optimizations for level1 and level2 BLAS functions (tags/v0.2.12^2)
| @@ -339,7 +339,7 @@ FCOMMON_OPT += -m128bit-long-double | |||
| endif | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| EXPRECISION = 1 | |||
| CCOMMON_OPT += -DEXPRECISION | |||
| CCOMMON_OPT += -DEXPRECISION | |||
| FCOMMON_OPT += -m128bit-long-double | |||
| endif | |||
| endif | |||
| @@ -350,6 +350,7 @@ ifeq ($(C_COMPILER), INTEL) | |||
| CCOMMON_OPT += -wd981 | |||
| endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| # ifeq logical or. GCC or LSB | |||
| ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) | |||
| @@ -35,7 +35,10 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ | |||
| ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ | |||
| sger.goto dger.goto \ | |||
| ssymv.goto dsymv.goto \ | |||
| sdot.goto ddot.goto \ | |||
| saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ | |||
| ssymv.goto dsymv.goto csymv.goto zsymv.goto \ | |||
| chemv.goto zhemv.goto \ | |||
| chemm.goto zhemm.goto \ | |||
| cherk.goto zherk.goto \ | |||
| cher2k.goto zher2k.goto \ | |||
| @@ -53,7 +56,10 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | |||
| ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ | |||
| ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ | |||
| sger.acml dger.acml \ | |||
| ssymv.acml dsymv.acml \ | |||
| sdot.acml ddot.acml \ | |||
| saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ | |||
| ssymv.acml dsymv.acml csymv.acml zsymv.acml \ | |||
| chemv.acml zhemv.acml \ | |||
| chemm.acml zhemm.acml \ | |||
| cherk.acml zherk.acml \ | |||
| cher2k.acml zher2k.acml \ | |||
| @@ -71,7 +77,10 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ | |||
| ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ | |||
| ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ | |||
| sger.atlas dger.atlas \ | |||
| ssymv.atlas dsymv.atlas \ | |||
| sdot.atlas ddot.atlas \ | |||
| saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ | |||
| ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ | |||
| chemv.atlas zhemv.atlas \ | |||
| chemm.atlas zhemm.atlas \ | |||
| cherk.atlas zherk.atlas \ | |||
| @@ -90,7 +99,10 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ | |||
| ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ | |||
| ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ | |||
| sger.mkl dger.mkl \ | |||
| ssymv.mkl dsymv.mkl \ | |||
| sdot.mkl ddot.mkl \ | |||
| saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ | |||
| ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ | |||
| chemv.mkl zhemv.mkl \ | |||
| chemm.mkl zhemm.mkl \ | |||
| cherk.mkl zherk.mkl \ | |||
| cher2k.mkl zher2k.mkl \ | |||
| @@ -100,7 +112,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ | |||
| spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ | |||
| ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl | |||
| all :: goto atlas acml mkl | |||
| all :: goto mkl atlas acml | |||
| ##################################### Slinpack #################################################### | |||
| slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) | |||
| @@ -732,6 +744,32 @@ dsymv.atlas : dsymv.$(SUFFIX) | |||
| dsymv.mkl : dsymv.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Csymv #################################################### | |||
| # Link the csymv benchmark object once per BLAS implementation. | |||
| # The .goto binary links ../$(LIBNAME); its recipe has no '-' prefix, so a link failure stops make. | |||
| # The .acml/.atlas/.mkl recipes start with '-', so make ignores a failed link | |||
| # (presumably because those vendor libraries may not be installed -- confirm). | |||
| csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm | |||
| csymv.acml : csymv.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| csymv.atlas : csymv.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| csymv.mkl : csymv.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Zsymv #################################################### | |||
| # Same four link rules for the zsymv benchmark object. | |||
| zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm | |||
| zsymv.acml : zsymv.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| zsymv.atlas : zsymv.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| zsymv.mkl : zsymv.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Sgeev #################################################### | |||
| sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm | |||
| @@ -896,6 +934,115 @@ zpotrf.atlas : zpotrf.$(SUFFIX) | |||
| zpotrf.mkl : zpotrf.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Chemv #################################################### | |||
| # Link the chemv benchmark object once per BLAS implementation. | |||
| # The .goto binary links ../$(LIBNAME) and must succeed; the vendor links are | |||
| # prefixed with '-' so make carries on when one of them fails. | |||
| chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm | |||
| chemv.acml : chemv.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| chemv.atlas : chemv.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| chemv.mkl : chemv.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Zhemv #################################################### | |||
| # Same four link rules for the zhemv benchmark object. | |||
| zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm | |||
| zhemv.acml : zhemv.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| zhemv.atlas : zhemv.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| zhemv.mkl : zhemv.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Sdot #################################################### | |||
| # Link the sdot benchmark object once per BLAS implementation. | |||
| # Only the .goto link (against ../$(LIBNAME)) is mandatory. The vendor links carry a | |||
| # leading '-' like the other vendor link recipes in this file, so a failed | |||
| # ACML/ATLAS/MKL link does not abort the rest of the build. | |||
| sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm | |||
| sdot.acml : sdot.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| sdot.atlas : sdot.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| sdot.mkl : sdot.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Ddot #################################################### | |||
| # Same four link rules for the ddot benchmark object; vendor links are best-effort ('-'). | |||
| ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm | |||
| ddot.acml : ddot.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ddot.atlas : ddot.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ddot.mkl : ddot.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Saxpy #################################################### | |||
| # Link rules for the four axpy benchmark objects (s/d/c/z prefixes). | |||
| # Each .goto binary links ../$(LIBNAME) and must succeed; each vendor link is | |||
| # prefixed with '-' so make carries on when it fails. | |||
| saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm | |||
| saxpy.acml : saxpy.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| saxpy.atlas : saxpy.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| saxpy.mkl : saxpy.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Daxpy #################################################### | |||
| daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm | |||
| daxpy.acml : daxpy.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| daxpy.atlas : daxpy.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| daxpy.mkl : daxpy.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Caxpy #################################################### | |||
| caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm | |||
| caxpy.acml : caxpy.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| caxpy.atlas : caxpy.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| caxpy.mkl : caxpy.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Zaxpy #################################################### | |||
| zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm | |||
| zaxpy.acml : zaxpy.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| zaxpy.atlas : zaxpy.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| zaxpy.mkl : zaxpy.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ################################################################################################### | |||
| @@ -1037,6 +1184,12 @@ ssymv.$(SUFFIX) : symv.c | |||
| # Object rules: one source file is compiled several times, each with a different | |||
| # combination of -D/-U for COMPLEX and DOUBLE, to produce the per-prefix objects | |||
| # (presumably the source picks the matching BLAS routine from those macros -- confirm in symv.c / geev.c). | |||
| dsymv.$(SUFFIX) : symv.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| csymv.$(SUFFIX) : symv.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| zsymv.$(SUFFIX) : symv.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| sgeev.$(SUFFIX) : geev.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| @@ -1073,7 +1226,29 @@ cpotrf.$(SUFFIX) : potrf.c | |||
| # Object rules for the potrf, hemv, dot and axpy benchmarks. | |||
| # Each source is compiled once per prefix; COMPLEX and DOUBLE are defined (-D) or | |||
| # undefined (-U) on the command line to choose the variant being built. | |||
| zpotrf.$(SUFFIX) : potrf.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| # hemv.c is built for the complex prefixes only (c and z). | |||
| chemv.$(SUFFIX) : hemv.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| zhemv.$(SUFFIX) : hemv.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| # dot.c is built for the real prefixes only (s and d). | |||
| sdot.$(SUFFIX) : dot.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| ddot.$(SUFFIX) : dot.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| # axpy.c is built for all four prefixes. | |||
| saxpy.$(SUFFIX) : axpy.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| daxpy.$(SUFFIX) : axpy.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| caxpy.$(SUFFIX) : axpy.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| zaxpy.$(SUFFIX) : axpy.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| @@ -0,0 +1,201 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #ifdef __CYGWIN32__ | |||
| #include <sys/time.h> | |||
| #endif | |||
| #include "common.h" | |||
| #undef AXPY | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define AXPY BLASFUNC(zaxpy) | |||
| #else | |||
| #define AXPY BLASFUNC(caxpy) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define AXPY BLASFUNC(daxpy) | |||
| #else | |||
| #define AXPY BLASFUNC(saxpy) | |||
| #endif | |||
| #endif | |||
| #if defined(__WIN32__) || defined(__WIN64__) | |||
| #ifndef DELTA_EPOCH_IN_MICROSECS | |||
| #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL | |||
| #endif | |||
| /* | |||
|  * gettimeofday() replacement, compiled only when __WIN32__ or __WIN64__ is defined. | |||
|  * Reads the system time with GetSystemTimeAsFileTime(), converts it to | |||
|  * microseconds, subtracts DELTA_EPOCH_IN_MICROSECS and stores the result in | |||
|  * tv->tv_sec / tv->tv_usec. | |||
|  * tv : destination; nothing is written when it is NULL. | |||
|  * tz : accepted for signature compatibility and ignored. | |||
|  * Always returns 0. | |||
|  */ | |||
| int gettimeofday(struct timeval *tv, void *tz){ | |||
|   FILETIME ft; | |||
|   unsigned __int64 tmpres = 0; | |||
|   if (NULL != tv) | |||
|   { | |||
|     GetSystemTimeAsFileTime(&ft); | |||
|     /* assemble the 64-bit value from its high and low 32-bit halves */ | |||
|     tmpres |= ft.dwHighDateTime; | |||
|     tmpres <<= 32; | |||
|     tmpres |= ft.dwLowDateTime; | |||
|     /*converting file time to unix epoch*/ | |||
|     tmpres /= 10; /*convert into microseconds*/ | |||
|     tmpres -= DELTA_EPOCH_IN_MICROSECS; | |||
|     tv->tv_sec = (long)(tmpres / 1000000UL); | |||
|     tv->tv_usec = (long)(tmpres % 1000000UL); | |||
|   } | |||
|   return 0; | |||
| } | |||
| #endif | |||
| #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 | |||
| /* | |||
|  * Allocate `size` bytes from a SysV shared-memory segment requested with | |||
|  * SHM_HUGETLB. The requested length is (size + HUGE_PAGESIZE) masked down to a | |||
|  * multiple of HUGE_PAGESIZE. The segment is marked for removal right after it | |||
|  * is attached. Prints a message and exits with status 1 if shmget() or | |||
|  * shmat() fails. Returns the attached address. | |||
|  * NOTE: the enclosing #if ends in "&& 0", so this code is currently compiled out. | |||
|  */ | |||
| static void *huge_malloc(BLASLONG size){ | |||
|   int segment; | |||
|   void *mapped; | |||
| #ifndef SHM_HUGETLB | |||
| #define SHM_HUGETLB 04000 | |||
| #endif | |||
|   segment = shmget(IPC_PRIVATE, | |||
|                    (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), | |||
|                    SHM_HUGETLB | IPC_CREAT |0600); | |||
|   if (segment < 0) { | |||
|     printf( "Memory allocation failed(shmget).\n"); | |||
|     exit(1); | |||
|   } | |||
|   mapped = shmat(segment, NULL, SHM_RND); | |||
|   if ((BLASLONG)mapped == -1){ | |||
|     printf( "Memory allocation failed(shmat).\n"); | |||
|     exit(1); | |||
|   } | |||
|   /* mark the segment for removal now that it is attached */ | |||
|   shmctl(segment, IPC_RMID, 0); | |||
|   return mapped; | |||
| } | |||
| #define malloc huge_malloc | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for AXPY. | |||
|  * For every vector length m in from..to (advancing by step) it fills x and y | |||
|  * with random values, times one AXPY call per repetition, averages the time | |||
|  * over the repetitions and prints an MFlops figure to stderr. | |||
|  * Command line : [from [to [step]]], defaults 1, 200, 1. | |||
|  * Environment  : OPENBLAS_LOOPS (repetitions per length), | |||
|  *                OPENBLAS_INCX / OPENBLAS_INCY (vector increments). | |||
|  * Returns 0; exits with status 1 when a vector cannot be allocated. | |||
|  */ | |||
| int MAIN__(int argc, char *argv[]){ | |||
|   FLOAT *x, *y; | |||
|   FLOAT alpha[2] = { 2.0, 2.0 }; | |||
|   blasint m, i; | |||
|   blasint inc_x=1,inc_y=1; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   struct timeval start, stop; | |||
|   double time1,timeg; | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
|   /* a step below 1 would keep m from ever passing `to` */ | |||
|   if (step < 1) step = 1; | |||
|   /* with no repetition nothing is timed and the average below is 0/0 */ | |||
|   if (loops < 1) loops = 1; | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
|   if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6d : ", (int)m); | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       /* refill both vectors before every timed call */ | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ | |||
|         y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       gettimeofday( &start, (struct timezone *)0); | |||
|       AXPY (&m, alpha, x, &inc_x, y, &inc_y ); | |||
|       gettimeofday( &stop, (struct timezone *)0); | |||
|       time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
|       timeg += time1; | |||
|     } | |||
|     /* average time per call */ | |||
|     timeg /= loops; | |||
|     fprintf(stderr, | |||
|             " %10.2f MFlops\n", | |||
|             COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); | |||
|   } | |||
|   return 0; | |||
| } | |||
| /* Weak alias so MAIN__ also serves as main; declared int to match MAIN__'s return type. */ | |||
| int main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -0,0 +1,195 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #ifdef __CYGWIN32__ | |||
| #include <sys/time.h> | |||
| #endif | |||
| #include "common.h" | |||
| #undef DOT | |||
| #ifdef DOUBLE | |||
| #define DOT BLASFUNC(ddot) | |||
| #else | |||
| #define DOT BLASFUNC(sdot) | |||
| #endif | |||
| #if defined(__WIN32__) || defined(__WIN64__) | |||
| #ifndef DELTA_EPOCH_IN_MICROSECS | |||
| #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL | |||
| #endif | |||
| /* | |||
|  * gettimeofday() replacement, compiled only when __WIN32__ or __WIN64__ is defined. | |||
|  * Reads the system time with GetSystemTimeAsFileTime(), converts it to | |||
|  * microseconds, subtracts DELTA_EPOCH_IN_MICROSECS and stores the result in | |||
|  * tv->tv_sec / tv->tv_usec. | |||
|  * tv : destination; nothing is written when it is NULL. | |||
|  * tz : accepted for signature compatibility and ignored. | |||
|  * Always returns 0. | |||
|  */ | |||
| int gettimeofday(struct timeval *tv, void *tz){ | |||
|   FILETIME ft; | |||
|   unsigned __int64 tmpres = 0; | |||
|   if (NULL != tv) | |||
|   { | |||
|     GetSystemTimeAsFileTime(&ft); | |||
|     /* assemble the 64-bit value from its high and low 32-bit halves */ | |||
|     tmpres |= ft.dwHighDateTime; | |||
|     tmpres <<= 32; | |||
|     tmpres |= ft.dwLowDateTime; | |||
|     /*converting file time to unix epoch*/ | |||
|     tmpres /= 10; /*convert into microseconds*/ | |||
|     tmpres -= DELTA_EPOCH_IN_MICROSECS; | |||
|     tv->tv_sec = (long)(tmpres / 1000000UL); | |||
|     tv->tv_usec = (long)(tmpres % 1000000UL); | |||
|   } | |||
|   return 0; | |||
| } | |||
| #endif | |||
| #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 | |||
| /* | |||
|  * Allocate `size` bytes from a SysV shared-memory segment requested with | |||
|  * SHM_HUGETLB. The requested length is (size + HUGE_PAGESIZE) masked down to a | |||
|  * multiple of HUGE_PAGESIZE. The segment is marked for removal right after it | |||
|  * is attached. Prints a message and exits with status 1 if shmget() or | |||
|  * shmat() fails. Returns the attached address. | |||
|  * NOTE: the enclosing #if ends in "&& 0", so this code is currently compiled out. | |||
|  */ | |||
| static void *huge_malloc(BLASLONG size){ | |||
|   int segment; | |||
|   void *mapped; | |||
| #ifndef SHM_HUGETLB | |||
| #define SHM_HUGETLB 04000 | |||
| #endif | |||
|   segment = shmget(IPC_PRIVATE, | |||
|                    (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), | |||
|                    SHM_HUGETLB | IPC_CREAT |0600); | |||
|   if (segment < 0) { | |||
|     printf( "Memory allocation failed(shmget).\n"); | |||
|     exit(1); | |||
|   } | |||
|   mapped = shmat(segment, NULL, SHM_RND); | |||
|   if ((BLASLONG)mapped == -1){ | |||
|     printf( "Memory allocation failed(shmat).\n"); | |||
|     exit(1); | |||
|   } | |||
|   /* mark the segment for removal now that it is attached */ | |||
|   shmctl(segment, IPC_RMID, 0); | |||
|   return mapped; | |||
| } | |||
| #define malloc huge_malloc | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for DOT. | |||
|  * For every vector length m in from..to (advancing by step) it fills x and y | |||
|  * with random values, times one DOT call per repetition, averages the time | |||
|  * over the repetitions and prints an MFlops figure to stderr. | |||
|  * Command line : [from [to [step]]], defaults 1, 200, 1. | |||
|  * Environment  : OPENBLAS_LOOPS (repetitions per length), | |||
|  *                OPENBLAS_INCX / OPENBLAS_INCY (vector increments). | |||
|  * Returns 0; exits with status 1 when a vector cannot be allocated. | |||
|  */ | |||
| int MAIN__(int argc, char *argv[]){ | |||
|   FLOAT *x, *y; | |||
|   FLOAT result; | |||
|   blasint m, i; | |||
|   blasint inc_x=1,inc_y=1; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   struct timeval start, stop; | |||
|   double time1,timeg; | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
|   /* a step below 1 would keep m from ever passing `to` */ | |||
|   if (step < 1) step = 1; | |||
|   /* with no repetition nothing is timed and the average below is 0/0 */ | |||
|   if (loops < 1) loops = 1; | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
|   if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6d : ", (int)m); | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       /* refill both vectors before every timed call */ | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ | |||
|         y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       gettimeofday( &start, (struct timezone *)0); | |||
|       /* the value is stored but not used afterwards; only the call is timed */ | |||
|       result = DOT (&m, x, &inc_x, y, &inc_y ); | |||
|       gettimeofday( &stop, (struct timezone *)0); | |||
|       time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
|       timeg += time1; | |||
|     } | |||
|     /* average time per call */ | |||
|     timeg /= loops; | |||
|     fprintf(stderr, | |||
|             " %10.2f MFlops\n", | |||
|             COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); | |||
|   } | |||
|   return 0; | |||
| } | |||
| /* Weak alias so MAIN__ also serves as main; declared int to match MAIN__'s return type. */ | |||
| int main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -142,7 +142,9 @@ int MAIN__(int argc, char *argv[]){ | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); | |||
| if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; | |||
| fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| @@ -0,0 +1,208 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #ifdef __CYGWIN32__ | |||
| #include <sys/time.h> | |||
| #endif | |||
| #include "common.h" | |||
| #undef HEMV | |||
| #ifdef DOUBLE | |||
| #define HEMV BLASFUNC(zhemv) | |||
| #else | |||
| #define HEMV BLASFUNC(chemv) | |||
| #endif | |||
| #if defined(__WIN32__) || defined(__WIN64__) | |||
| #ifndef DELTA_EPOCH_IN_MICROSECS | |||
| #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL | |||
| #endif | |||
| /* | |||
|  * gettimeofday() replacement, compiled only when __WIN32__ or __WIN64__ is defined. | |||
|  * Reads the system time with GetSystemTimeAsFileTime(), converts it to | |||
|  * microseconds, subtracts DELTA_EPOCH_IN_MICROSECS and stores the result in | |||
|  * tv->tv_sec / tv->tv_usec. | |||
|  * tv : destination; nothing is written when it is NULL. | |||
|  * tz : accepted for signature compatibility and ignored. | |||
|  * Always returns 0. | |||
|  */ | |||
| int gettimeofday(struct timeval *tv, void *tz){ | |||
|   FILETIME ft; | |||
|   unsigned __int64 tmpres = 0; | |||
|   if (NULL != tv) | |||
|   { | |||
|     GetSystemTimeAsFileTime(&ft); | |||
|     /* assemble the 64-bit value from its high and low 32-bit halves */ | |||
|     tmpres |= ft.dwHighDateTime; | |||
|     tmpres <<= 32; | |||
|     tmpres |= ft.dwLowDateTime; | |||
|     /*converting file time to unix epoch*/ | |||
|     tmpres /= 10; /*convert into microseconds*/ | |||
|     tmpres -= DELTA_EPOCH_IN_MICROSECS; | |||
|     tv->tv_sec = (long)(tmpres / 1000000UL); | |||
|     tv->tv_usec = (long)(tmpres % 1000000UL); | |||
|   } | |||
|   return 0; | |||
| } | |||
| #endif | |||
| #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 | |||
| /* | |||
|  * Allocate `size` bytes from a SysV shared-memory segment requested with | |||
|  * SHM_HUGETLB. The requested length is (size + HUGE_PAGESIZE) masked down to a | |||
|  * multiple of HUGE_PAGESIZE. The segment is marked for removal right after it | |||
|  * is attached. Prints a message and exits with status 1 if shmget() or | |||
|  * shmat() fails. Returns the attached address. | |||
|  * NOTE: the enclosing #if ends in "&& 0", so this code is currently compiled out. | |||
|  */ | |||
| static void *huge_malloc(BLASLONG size){ | |||
|   int segment; | |||
|   void *mapped; | |||
| #ifndef SHM_HUGETLB | |||
| #define SHM_HUGETLB 04000 | |||
| #endif | |||
|   segment = shmget(IPC_PRIVATE, | |||
|                    (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), | |||
|                    SHM_HUGETLB | IPC_CREAT |0600); | |||
|   if (segment < 0) { | |||
|     printf( "Memory allocation failed(shmget).\n"); | |||
|     exit(1); | |||
|   } | |||
|   mapped = shmat(segment, NULL, SHM_RND); | |||
|   if ((BLASLONG)mapped == -1){ | |||
|     printf( "Memory allocation failed(shmat).\n"); | |||
|     exit(1); | |||
|   } | |||
|   /* mark the segment for removal now that it is attached */ | |||
|   shmctl(segment, IPC_RMID, 0); | |||
|   return mapped; | |||
| } | |||
| #define malloc huge_malloc | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for HEMV. | |||
|  * For every order m in from..to (advancing by step) it fills an m x m matrix | |||
|  * once, then per repetition refills x and y, times one HEMV call, and finally | |||
|  * prints an MFlops figure based on the averaged time to stderr. | |||
|  * Command line : [from [to [step]]], defaults 1, 200, 1. | |||
|  * Environment  : OPENBLAS_LOOPS (repetitions per size), | |||
|  *                OPENBLAS_INCX / OPENBLAS_INCY (vector increments), | |||
|  *                OPENBLAS_UPLO (first character is passed as uplo, default 'L'). | |||
|  * Returns 0; exits with status 1 when a buffer cannot be allocated. | |||
|  */ | |||
| int MAIN__(int argc, char *argv[]){ | |||
|   FLOAT *a, *x, *y; | |||
|   FLOAT alpha[] = {1.0, 1.0}; | |||
|   FLOAT beta [] = {1.0, 1.0}; | |||
|   char uplo='L'; | |||
|   blasint m, i, j; | |||
|   blasint inc_x=1,inc_y=1; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   struct timeval start, stop; | |||
|   double time1,timeg; | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
|   /* a step below 1 would keep m from ever passing `to` */ | |||
|   if (step < 1) step = 1; | |||
|   /* with no repetition nothing is timed and the average below is 0/0 */ | |||
|   if (loops < 1) loops = 1; | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); | |||
|   if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
|   if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6dx%d : ", (int)m,(int)m); | |||
|     /* the matrix is filled once per size, with leading dimension m */ | |||
|     for(j = 0; j < m; j++){ | |||
|       for(i = 0; i < m * COMPSIZE; i++){ | |||
|         a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|     } | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       /* refill both vectors before every timed call */ | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ | |||
|         y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       gettimeofday( &start, (struct timezone *)0); | |||
|       HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
|       gettimeofday( &stop, (struct timezone *)0); | |||
|       time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
|       timeg += time1; | |||
|     } | |||
|     /* average time per call */ | |||
|     timeg /= loops; | |||
|     fprintf(stderr, | |||
|             " %10.2f MFlops\n", | |||
|             COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); | |||
|   } | |||
|   return 0; | |||
| } | |||
| /* Weak alias so MAIN__ also serves as main; declared int to match MAIN__'s return type. */ | |||
| int main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -0,0 +1,70 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| /* | |||
|  * Plain-C symv kernel (the disabled debug message tags it "Symv_L"). | |||
|  * For each column j in [0, offset) it uses the entries a[j*lda + i] with | |||
|  * i >= j only: the diagonal entry updates y at position j, and every entry | |||
|  * below it contributes both to y at position i (scaled by alpha * x_j) and to | |||
|  * a running sum that is added to y at position j (scaled by alpha). | |||
|  * x and y are walked with increments inc_x and inc_y; buffer is not used. | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
|   BLASLONG col, row; | |||
|   BLASLONG x_col, y_col;   /* positions of the column's own x / y element */ | |||
|   BLASLONG x_row, y_row;   /* positions walked by the inner loop */ | |||
|   FLOAT scaled_x; | |||
|   FLOAT acc; | |||
|   FLOAT *column; | |||
| #if 0 | |||
|   if ( m != offset ) | |||
|     printf("Symv_L: m=%d offset=%d\n",m,offset); | |||
| #endif | |||
|   x_col = 0; | |||
|   y_col = 0; | |||
|   for (col = 0; col < offset; col++, x_col += inc_x, y_col += inc_y) | |||
|   { | |||
|     column = a + col * lda; | |||
|     scaled_x = alpha * x[x_col]; | |||
|     acc = 0.0; | |||
|     /* diagonal entry */ | |||
|     y[y_col] += scaled_x * column[col]; | |||
|     x_row = x_col; | |||
|     y_row = y_col; | |||
|     row = col + 1; | |||
|     while (row < m) | |||
|     { | |||
|       x_row += inc_x; | |||
|       y_row += inc_y; | |||
|       y[y_row] += scaled_x * column[row]; | |||
|       acc += column[row] * x[x_row]; | |||
|       row++; | |||
|     } | |||
|     y[y_col] += alpha * acc; | |||
|   } | |||
|   return(0); | |||
| } | |||
| @@ -0,0 +1,71 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| /* | |||
|  * Reference symv kernel that reads the diagonal of a and the entries above it. | |||
|  * | |||
|  * Handles the last `offset` columns j = m-offset .. m-1 (column j starts at | |||
|  * a[j*lda]). For each such column: | |||
|  *   - y[i] receives alpha * x[j] * a(i,j) for i = 0 .. j-1, and | |||
|  *   - y[j] receives alpha * x[j] * a(j,j) + alpha * sum over i < j of a(i,j) * x[i]. | |||
|  * x and y are walked with strides inc_x and inc_y. `buffer` is not used. | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
|     BLASLONG col, row; | |||
|     BLASLONG base;          /* index of a(0,col) inside a */ | |||
|     BLASLONG xpos, ypos;    /* positions of x[row] / y[row] */ | |||
|     BLASLONG xdiag, ydiag;  /* positions of x[col] / y[col] */ | |||
|     FLOAT scaled_x;         /* alpha * x[col] */ | |||
|     FLOAT above_sum;        /* sum of a(row,col) * x[row] for row < col */ | |||
| #if 0 | |||
| if( m != offset ) | |||
| printf("Symv_U: m=%d offset=%d\n",m,offset); | |||
| #endif | |||
|     BLASLONG first_col = m - offset; | |||
|     xdiag = first_col * inc_x; | |||
|     ydiag = first_col * inc_y; | |||
|     for (col = first_col; col < m; col++) | |||
|     { | |||
|         base = col * lda; | |||
|         scaled_x = alpha * x[xdiag]; | |||
|         above_sum = 0.0; | |||
|         /* strictly-upper part of the column: update y[row] and gather the dot product */ | |||
|         for (row = 0, xpos = 0, ypos = 0; row < col; row++, xpos += inc_x, ypos += inc_y) | |||
|         { | |||
|             y[ypos] += scaled_x * a[base + row]; | |||
|             above_sum += a[base + row] * x[xpos]; | |||
|         } | |||
|         /* diagonal term and the gathered dot product go into y[col] in one update */ | |||
|         y[ydiag] += scaled_x * a[base + col] + alpha * above_sum; | |||
|         xdiag += inc_x; | |||
|         ydiag += inc_y; | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -1,3 +1,15 @@ | |||
| DAXPYKERNEL = daxpy.c | |||
| CAXPYKERNEL = caxpy.c | |||
| ZAXPYKERNEL = zaxpy.c | |||
| SDOTKERNEL = sdot.c | |||
| DDOTKERNEL = ddot.c | |||
| DSYMV_U_KERNEL = dsymv_U.c | |||
| DSYMV_L_KERNEL = dsymv_L.c | |||
| SSYMV_U_KERNEL = ssymv_U.c | |||
| SSYMV_L_KERNEL = ssymv_L.c | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| @@ -1,5 +1,17 @@ | |||
| SAXPYKERNEL = saxpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
| SDOTKERNEL = sdot.c | |||
| DDOTKERNEL = ddot.c | |||
| DSYMV_U_KERNEL = dsymv_U.c | |||
| DSYMV_L_KERNEL = dsymv_L.c | |||
| SSYMV_U_KERNEL = ssymv_U.c | |||
| SSYMV_L_KERNEL = ssymv_L.c | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| DGEMVNKERNEL = dgemv_n.c | |||
| SGEMMKERNEL = gemm_kernel_4x8_nehalem.S | |||
| SGEMMINCOPY = gemm_ncopy_4.S | |||
| @@ -0,0 +1,131 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) | |||
| #include "caxpy_microk_bulldozer-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| /* | |||
|  * Portable fallback for the unit-stride complex axpy inner kernel, compiled | |||
|  * only when no micro-kernel has defined HAVE_KERNEL_8. | |||
|  * | |||
|  * x and y hold interleaved (real, imag) pairs; n counts complex elements and | |||
|  * two of them are handled per iteration. alpha[0] / alpha[1] are the real / | |||
|  * imaginary parts of the scalar. With CONJ defined the imaginary part of x | |||
|  * enters with the opposite sign. | |||
|  */ | |||
| static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
|     register BLASLONG done;      /* complex elements processed so far */ | |||
|     register BLASLONG p = 0;     /* index of the current real part in x / y */ | |||
|     FLOAT da_r = alpha[0]; | |||
|     FLOAT da_i = alpha[1]; | |||
|     for (done = 0; done < n; done += 2, p += 4) | |||
|     { | |||
| #if !defined(CONJ) | |||
|         y[p]   += ( da_r * x[p]   - da_i * x[p+1] ) ; | |||
|         y[p+1] += ( da_r * x[p+1] + da_i * x[p]   ) ; | |||
|         y[p+2] += ( da_r * x[p+2] - da_i * x[p+3] ) ; | |||
|         y[p+3] += ( da_r * x[p+3] + da_i * x[p+2] ) ; | |||
| #else | |||
|         y[p]   += ( da_r * x[p]   + da_i * x[p+1] ) ; | |||
|         y[p+1] -= ( da_r * x[p+1] - da_i * x[p]   ) ; | |||
|         y[p+2] += ( da_r * x[p+2] + da_i * x[p+3] ) ; | |||
|         y[p+3] -= ( da_r * x[p+3] - da_i * x[p+2] ) ; | |||
| #endif | |||
|     } | |||
| } | |||
| #endif | |||
| /* | |||
|  * Complex axpy driver: y += (da_r + i*da_i) * x, or with CONJ defined | |||
|  * y += (da_r + i*da_i) * conj(x). | |||
|  * | |||
|  * n counts complex elements; x and y hold interleaved (real, imag) pairs and | |||
|  * inc_x / inc_y are strides in complex elements. dummy0, dummy1, dummy and | |||
|  * dummy2 are not used. Returns 0 (also when n <= 0, in which case nothing is | |||
|  * touched). | |||
|  * | |||
|  * For unit strides the largest multiple of 8 elements is handed to | |||
|  * caxpy_kernel_8 and the remainder is finished with scalar code. | |||
|  */ | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
|     BLASLONG i=0; | |||
|     BLASLONG ix=0,iy=0; | |||
|     FLOAT da[2]; | |||
|     if ( n <= 0 ) return(0); | |||
|     if ( (inc_x == 1) && (inc_y == 1) ) | |||
|     { | |||
|         /* BLASLONG, not int: n may exceed INT_MAX and 2*n1 is used as an index below */ | |||
|         BLASLONG n1 = n & -8; | |||
|         if ( n1 ) | |||
|         { | |||
|             da[0] = da_r; | |||
|             da[1] = da_i; | |||
|             /* pass the array itself (decays to FLOAT *), not a pointer to the array */ | |||
|             caxpy_kernel_8(n1, x, y , da ); | |||
|             ix = 2 * n1; | |||
|         } | |||
|         i = n1; | |||
|         /* scalar tail: at most 7 remaining complex elements */ | |||
|         while(i < n) | |||
|         { | |||
| #if !defined(CONJ) | |||
|             y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; | |||
|             y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; | |||
| #else | |||
|             y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; | |||
|             y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; | |||
| #endif | |||
|             i++ ; | |||
|             ix += 2; | |||
|         } | |||
|         return(0); | |||
|     } | |||
|     /* general strides: convert from complex-element strides to FLOAT strides */ | |||
|     inc_x *=2; | |||
|     inc_y *=2; | |||
|     while(i < n) | |||
|     { | |||
| #if !defined(CONJ) | |||
|         y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; | |||
|         y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; | |||
| #else | |||
|         y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; | |||
|         y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; | |||
| #endif | |||
|         ix += inc_x ; | |||
|         iy += inc_y ; | |||
|         i++ ; | |||
|     } | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,135 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| /* | |||
|  * Inline-assembly micro-kernel for unit-stride single-precision complex axpy. | |||
|  * | |||
|  * x and y hold interleaved (real, imag) floats; alpha points at the real part | |||
|  * followed by the imaginary part. Each loop iteration loads four 16-byte | |||
|  * vectors from x and y (8 complex elements), advances the float index i by 16 | |||
|  * and subtracts 8 from n. The loop only exits when n reaches exactly zero, so | |||
|  * n must be a positive multiple of 8. | |||
|  * | |||
|  * i and n are declared as read-write ("+r") operands because the asm updates | |||
|  * both registers; declaring them as plain inputs would let the compiler | |||
|  * assume the registers are unchanged. | |||
|  */ | |||
| static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha | |||
| "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "prefetcht0 768(%2,%0,4) \n\t" | |||
| "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x | |||
| "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x | |||
| "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x | |||
| "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x | |||
| "prefetcht0 768(%3,%0,4) \n\t" | |||
| #if !defined(CONJ) | |||
| "vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm12 \n\t" | |||
| "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part | |||
| "vmulps %%xmm1, %%xmm4 , %%xmm4 \n\t" | |||
| "vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm13 \n\t" | |||
| "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part | |||
| "vmulps %%xmm1, %%xmm6 , %%xmm6 \n\t" | |||
| "vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm14 \n\t" | |||
| "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part | |||
| "vmulps %%xmm1, %%xmm8 , %%xmm8 \n\t" | |||
| "vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm15 \n\t" | |||
| "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part | |||
| "vmulps %%xmm1, %%xmm10, %%xmm10 \n\t" | |||
| "vaddsubps %%xmm4, %%xmm12, %%xmm12 \n\t" | |||
| "vaddsubps %%xmm6, %%xmm13, %%xmm13 \n\t" | |||
| "vaddsubps %%xmm8, %%xmm14, %%xmm14 \n\t" | |||
| "vaddsubps %%xmm10,%%xmm15, %%xmm15 \n\t" | |||
| #else | |||
| "vmulps %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i | |||
| "vmulps %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i | |||
| "vmulps %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i | |||
| "vmulps %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i | |||
| "vmulps %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i | |||
| "vmulps %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i | |||
| "vmulps %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i | |||
| "vmulps %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i | |||
| "vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part | |||
| "vaddsubps %%xmm4 ,%%xmm5 , %%xmm4 \n\t" | |||
| "vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part | |||
| "vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part | |||
| "vaddsubps %%xmm6 ,%%xmm7 , %%xmm6 \n\t" | |||
| "vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part | |||
| "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part | |||
| "vaddsubps %%xmm8 ,%%xmm9 , %%xmm8 \n\t" | |||
| "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part | |||
| "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part | |||
| "vaddsubps %%xmm10,%%xmm11, %%xmm10 \n\t" | |||
| "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part | |||
| "vaddps (%3,%0,4) ,%%xmm4 , %%xmm12 \n\t" | |||
| "vaddps 16(%3,%0,4) ,%%xmm6 , %%xmm13 \n\t" | |||
| "vaddps 32(%3,%0,4) ,%%xmm8 , %%xmm14 \n\t" | |||
| "vaddps 48(%3,%0,4) ,%%xmm10, %%xmm15 \n\t" | |||
| #endif | |||
| "vmovups %%xmm12, (%3,%0,4) \n\t" | |||
| "vmovups %%xmm13, 16(%3,%0,4) \n\t" | |||
| "vmovups %%xmm14, 32(%3,%0,4) \n\t" | |||
| "vmovups %%xmm15, 48(%3,%0,4) \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| "+r" (i), // 0 (advanced by the loop) | |||
| "+r" (n) // 1 (counted down by the loop) | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -227,8 +227,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| VFMADDPS_I( %ymm7 ,%ymm3,%ymm1 ) | |||
| addq $6*SIZE, BO | |||
| addq $16*SIZE, AO | |||
| addq $ 6*SIZE, BO | |||
| addq $ 16*SIZE, AO | |||
| decq %rax | |||
| .endm | |||
| @@ -356,8 +356,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 ) | |||
| VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 ) | |||
| addq $6*SIZE, BO | |||
| addq $8*SIZE, AO | |||
| addq $ 6*SIZE, BO | |||
| addq $ 8*SIZE, AO | |||
| decq %rax | |||
| .endm | |||
| @@ -447,8 +447,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 ) | |||
| VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 ) | |||
| addq $6*SIZE, BO | |||
| addq $4*SIZE, AO | |||
| addq $ 6*SIZE, BO | |||
| addq $ 4*SIZE, AO | |||
| decq %rax | |||
| .endm | |||
| @@ -540,8 +540,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 ) | |||
| VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 ) | |||
| addq $6*SIZE, BO | |||
| addq $2*SIZE, AO | |||
| addq $ 6*SIZE, BO | |||
| addq $ 2*SIZE, AO | |||
| decq %rax | |||
| .endm | |||
| @@ -0,0 +1,105 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(NEHALEM) | |||
| #include "daxpy_microk_nehalem-2.c" | |||
| #elif defined(BULLDOZER) | |||
| #include "daxpy_microk_bulldozer-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| /* | |||
|  * Portable fallback for the unit-stride axpy inner kernel, compiled only when | |||
|  * no micro-kernel has defined HAVE_KERNEL_8. | |||
|  * | |||
|  * Performs y[k] += (*alpha) * x[k] in groups of 8 consecutive elements until | |||
|  * the group start reaches n. The caller in this file passes a multiple of 8. | |||
|  */ | |||
| static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
|     register BLASLONG start;   /* first index of the current group of 8 */ | |||
|     BLASLONG lane; | |||
|     FLOAT scale = *alpha; | |||
|     for (start = 0; start < n; start += 8) | |||
|     { | |||
|         /* same eight updates as the hand-unrolled form, in the same order */ | |||
|         for (lane = 0; lane < 8; lane++) | |||
|         { | |||
|             y[start + lane] += scale * x[start + lane]; | |||
|         } | |||
|     } | |||
| } | |||
| #endif | |||
| /* | |||
|  * Real axpy driver: y += da * x over n elements with strides inc_x / inc_y. | |||
|  * | |||
|  * dummy0, dummy1, dummy and dummy2 are not used. Returns 0 (also when | |||
|  * n <= 0, in which case nothing is touched). | |||
|  * | |||
|  * For unit strides the largest multiple of 8 elements is handed to | |||
|  * daxpy_kernel_8 and the remainder is finished with scalar code. | |||
|  */ | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
|     BLASLONG i=0; | |||
|     BLASLONG ix=0,iy=0; | |||
|     if ( n <= 0 ) return(0); | |||
|     if ( (inc_x == 1) && (inc_y == 1) ) | |||
|     { | |||
|         /* BLASLONG, not int: n may exceed INT_MAX and must not be truncated */ | |||
|         BLASLONG n1 = n & -8; | |||
|         if ( n1 ) | |||
|             daxpy_kernel_8(n1, x, y , &da ); | |||
|         i = n1; | |||
|         /* scalar tail: at most 7 remaining elements */ | |||
|         while(i < n) | |||
|         { | |||
|             y[i] += da * x[i] ; | |||
|             i++ ; | |||
|         } | |||
|         return(0); | |||
|     } | |||
|     /* general strides */ | |||
|     while(i < n) | |||
|     { | |||
|         y[iy] += da * x[ix] ; | |||
|         ix += inc_x ; | |||
|         iy += inc_y ; | |||
|         i++ ; | |||
|     } | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,82 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| /* | |||
|  * Inline-assembly micro-kernel for unit-stride double-precision axpy using | |||
|  * four-operand vfmaddpd. | |||
|  * | |||
|  * Each loop iteration handles 8 doubles (four 16-byte vectors): it computes | |||
|  * y = alpha * x + y, advances the element index i by 8 and subtracts 8 from | |||
|  * n. The loop only exits when n reaches exactly zero, so n must be a positive | |||
|  * multiple of 8. | |||
|  * | |||
|  * i and n are declared as read-write ("+r") operands because the asm updates | |||
|  * both registers; declaring them as plain inputs would let the compiler | |||
|  * assume the registers are unchanged. | |||
|  */ | |||
| static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vmovddup (%4), %%xmm0 \n\t" // alpha | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "prefetcht0 768(%3,%0,8) \n\t" | |||
| "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x | |||
| "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm12, %%xmm8 \n\t" // y += alpha * x | |||
| "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x | |||
| ".align 2 \n\t" | |||
| "vmovups %%xmm8 , (%3,%0,8) \n\t" | |||
| "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm13, %%xmm9 \n\t" // y += alpha * x | |||
| ".align 2 \n\t" | |||
| "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x | |||
| "vmovups %%xmm9 , 16(%3,%0,8) \n\t" | |||
| "prefetcht0 768(%2,%0,8) \n\t" | |||
| ".align 2 \n\t" | |||
| "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm14, %%xmm10 \n\t" // y += alpha * x | |||
| "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x | |||
| "vmovups %%xmm10, 32(%3,%0,8) \n\t" | |||
| "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm15, %%xmm11 \n\t" // y += alpha * x | |||
| "vmovups %%xmm11, 48(%3,%0,8) \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| "+r" (i), // 0 (advanced by the loop) | |||
| "+r" (n) // 1 (counted down by the loop) | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| : "cc", | |||
| "%xmm0", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,91 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| /* | |||
|  * Inline-assembly micro-kernel for unit-stride double-precision axpy using | |||
|  * SSE2 packed-double instructions (movups / mulpd / addpd). | |||
|  * | |||
|  * Each loop iteration loads 8 doubles from x and from y, computes | |||
|  * y += alpha * x, stores y, advances the element index i by 8 and subtracts | |||
|  * 8 from n. The loop only exits when n reaches exactly zero, so n must be a | |||
|  * positive multiple of 8. | |||
|  * | |||
|  * i and n are declared as read-write ("+r") operands because the asm updates | |||
|  * both registers; declaring them as plain inputs would let the compiler | |||
|  * assume the registers are unchanged. | |||
|  */ | |||
| static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movsd (%4), %%xmm0 \n\t" // alpha | |||
| "shufpd $0, %%xmm0, %%xmm0 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| // "prefetcht0 192(%2,%0,8) \n\t" | |||
| // "prefetcht0 192(%3,%0,8) \n\t" | |||
| "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x | |||
| "movups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x | |||
| "movups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x | |||
| "movups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x | |||
| "movups (%3,%0,8), %%xmm8 \n\t" // 2 * y | |||
| "movups 16(%3,%0,8), %%xmm9 \n\t" // 2 * y | |||
| "movups 32(%3,%0,8), %%xmm10 \n\t" // 2 * y | |||
| "movups 48(%3,%0,8), %%xmm11 \n\t" // 2 * y | |||
| "mulpd %%xmm0 , %%xmm12 \n\t" // alpha * x | |||
| "mulpd %%xmm0 , %%xmm13 \n\t" | |||
| "mulpd %%xmm0 , %%xmm14 \n\t" | |||
| "mulpd %%xmm0 , %%xmm15 \n\t" | |||
| "addpd %%xmm12, %%xmm8 \n\t" // y += alpha *x | |||
| "addpd %%xmm13, %%xmm9 \n\t" | |||
| "addpd %%xmm14, %%xmm10 \n\t" | |||
| "addpd %%xmm15, %%xmm11 \n\t" | |||
| "movups %%xmm8 , (%3,%0,8) \n\t" | |||
| "movups %%xmm9 , 16(%3,%0,8) \n\t" | |||
| "movups %%xmm10, 32(%3,%0,8) \n\t" | |||
| "movups %%xmm11, 48(%3,%0,8) \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| "+r" (i), // 0 (advanced by the loop) | |||
| "+r" (n) // 1 (counted down by the loop) | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| : "cc", | |||
| "%xmm0", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,110 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #include "ddot_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "ddot_microk_nehalem-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| /* | |||
|  * Portable fallback for the unit-stride dot-product inner kernel, compiled | |||
|  * only when no micro-kernel has defined HAVE_KERNEL_8. | |||
|  * | |||
|  * Walks x and y in groups of 8 consecutive elements until the group start | |||
|  * reaches n, summing the eight products of each group before adding them to | |||
|  * a local accumulator, and finally adds that accumulator to *d. The caller | |||
|  * in this file passes a multiple of 8. | |||
|  */ | |||
| static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||
| { | |||
|     register BLASLONG k; | |||
|     FLOAT acc = 0.0; | |||
|     for (k = 0; k < n; k += 8) | |||
|     { | |||
|         /* keep the eight products in one expression so the summation order is unchanged */ | |||
|         acc += y[k] * x[k] | |||
|              + y[k+1] * x[k+1] | |||
|              + y[k+2] * x[k+2] | |||
|              + y[k+3] * x[k+3] | |||
|              + y[k+4] * x[k+4] | |||
|              + y[k+5] * x[k+5] | |||
|              + y[k+6] * x[k+6] | |||
|              + y[k+7] * x[k+7] ; | |||
|     } | |||
|     *d += acc; | |||
| } | |||
| #endif | |||
| /* | |||
|  * Dot-product driver: returns the sum of y[k*inc_y] * x[k*inc_x] for | |||
|  * k = 0 .. n-1, or 0.0 when n <= 0. | |||
|  * | |||
|  * For unit strides the largest multiple of 8 elements is handed to | |||
|  * ddot_kernel_8 (with the accumulator still zero) and the remainder is added | |||
|  * with scalar code. | |||
|  */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
|     BLASLONG i=0; | |||
|     BLASLONG ix=0,iy=0; | |||
|     FLOAT dot = 0.0 ; | |||
|     if ( n <= 0 ) return(dot); | |||
|     if ( (inc_x == 1) && (inc_y == 1) ) | |||
|     { | |||
|         /* BLASLONG, not int: n may exceed INT_MAX and must not be truncated */ | |||
|         BLASLONG n1 = n & -8; | |||
|         if ( n1 ) | |||
|             ddot_kernel_8(n1, x, y , &dot ); | |||
|         i = n1; | |||
|         /* scalar tail: at most 7 remaining elements */ | |||
|         while(i < n) | |||
|         { | |||
|             dot += y[i] * x[i] ; | |||
|             i++ ; | |||
|         } | |||
|         return(dot); | |||
|     } | |||
|     /* general strides */ | |||
|     while(i < n) | |||
|     { | |||
|         dot += y[iy] * x[ix] ; | |||
|         ix += inc_x ; | |||
|         iy += inc_y ; | |||
|         i++ ; | |||
|     } | |||
|     return(dot); | |||
| } | |||
| @@ -0,0 +1,84 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| /* | |||
|  * Inline-assembly micro-kernel for the unit-stride double-precision dot | |||
|  * product using four-operand vfmaddpd. | |||
|  * | |||
|  * Four vector accumulators (xmm4..xmm7) are zeroed, then each loop iteration | |||
|  * multiplies 8 doubles of x with 8 doubles of y into them, advances the | |||
|  * element index i by 8 and subtracts 8 from n. The loop only exits when n | |||
|  * reaches exactly zero, so n must be a positive multiple of 8. After the loop | |||
|  * the accumulators are reduced to one value which is stored to *dot; the | |||
|  * previous content of *dot is overwritten, not added to. | |||
|  * | |||
|  * i and n are declared as read-write ("+r") operands because the asm updates | |||
|  * both registers; declaring them as plain inputs would let the compiler | |||
|  * assume the registers are unchanged. | |||
|  */ | |||
| static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); | |||
| static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x | |||
| "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x | |||
| "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x | |||
| "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x | |||
| "vfmaddpd %%xmm4, (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y | |||
| "vfmaddpd %%xmm5, 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y | |||
| "vfmaddpd %%xmm6, 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y | |||
| "vfmaddpd %%xmm7, 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" | |||
| "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" | |||
| "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" | |||
| "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vmovsd %%xmm4, (%4) \n\t" | |||
| : | |||
| "+r" (i), // 0 (advanced by the loop) | |||
| "+r" (n) // 1 (counted down by the loop) | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,94 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); | |||
| static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorpd %%xmm4, %%xmm4 \n\t" | |||
| "xorpd %%xmm5, %%xmm5 \n\t" | |||
| "xorpd %%xmm6, %%xmm6 \n\t" | |||
| "xorpd %%xmm7, %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x | |||
| "movups (%3,%0,8), %%xmm8 \n\t" // 2 * y | |||
| "movups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x | |||
| "movups 16(%3,%0,8), %%xmm9 \n\t" // 2 * y | |||
| "movups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x | |||
| "movups 32(%3,%0,8), %%xmm10 \n\t" // 2 * y | |||
| "movups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x | |||
| "movups 48(%3,%0,8), %%xmm11 \n\t" // 2 * y | |||
| "mulpd %%xmm8 , %%xmm12 \n\t" | |||
| "mulpd %%xmm9 , %%xmm13 \n\t" | |||
| "mulpd %%xmm10, %%xmm14 \n\t" | |||
| "mulpd %%xmm11, %%xmm15 \n\t" | |||
| "addpd %%xmm12, %%xmm4 \n\t" | |||
| "addpd %%xmm13, %%xmm5 \n\t" | |||
| "addpd %%xmm14, %%xmm6 \n\t" | |||
| "addpd %%xmm15, %%xmm7 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "addpd %%xmm5, %%xmm4 \n\t" | |||
| "addpd %%xmm7, %%xmm6 \n\t" | |||
| "addpd %%xmm6, %%xmm4 \n\t" | |||
| "haddpd %%xmm4, %%xmm4 \n\t" | |||
| "movsd %%xmm4, (%4) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(HASWELL) | |||
| #include "dgemv_n_microk_haswell-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "dgemv_n_microk_nehalem-2.c" | |||
| #endif | |||
| @@ -0,0 +1,137 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_16x4 1 | |||
| static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movddup (%2), %%xmm12 \n\t" // x0 | |||
| "movddup 8(%2), %%xmm13 \n\t" // x1 | |||
| "movddup 16(%2), %%xmm14 \n\t" // x2 | |||
| "movddup 24(%2), %%xmm15 \n\t" // x3 | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "prefetcht0 192(%3,%0,8) \n\t" | |||
| "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y | |||
| "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y | |||
| "movups 32(%3,%0,8), %%xmm6 \n\t" // 2 * y | |||
| "movups 48(%3,%0,8), %%xmm7 \n\t" // 2 * y | |||
| "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a | |||
| "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a | |||
| "movups 32(%4,%0,8), %%xmm10 \n\t" // 2 * a | |||
| "movups 48(%4,%0,8), %%xmm11 \n\t" // 2 * a | |||
| "prefetcht0 192(%4,%0,8) \n\t" | |||
| "mulpd %%xmm12 , %%xmm8 \n\t" // a * x | |||
| "mulpd %%xmm12 , %%xmm9 \n\t" // a * x | |||
| "mulpd %%xmm12 , %%xmm10 \n\t" // a * x | |||
| "mulpd %%xmm12 , %%xmm11 \n\t" // a * x | |||
| "addpd %%xmm8 , %%xmm4 \n\t" // y += a * x | |||
| "addpd %%xmm9 , %%xmm5 \n\t" // y += a * x | |||
| "addpd %%xmm10 , %%xmm6 \n\t" // y += a * x | |||
| "addpd %%xmm11 , %%xmm7 \n\t" // y += a * x | |||
| "prefetcht0 192(%5,%0,8) \n\t" | |||
| "movups (%5,%0,8), %%xmm8 \n\t" // 2 * a | |||
| "movups 16(%5,%0,8), %%xmm9 \n\t" // 2 * a | |||
| "movups 32(%5,%0,8), %%xmm10 \n\t" // 2 * a | |||
| "movups 48(%5,%0,8), %%xmm11 \n\t" // 2 * a | |||
| "mulpd %%xmm13 , %%xmm8 \n\t" // a * x | |||
| "mulpd %%xmm13 , %%xmm9 \n\t" // a * x | |||
| "mulpd %%xmm13 , %%xmm10 \n\t" // a * x | |||
| "mulpd %%xmm13 , %%xmm11 \n\t" // a * x | |||
| "addpd %%xmm8 , %%xmm4 \n\t" // y += a * x | |||
| "addpd %%xmm9 , %%xmm5 \n\t" // y += a * x | |||
| "addpd %%xmm10 , %%xmm6 \n\t" // y += a * x | |||
| "addpd %%xmm11 , %%xmm7 \n\t" // y += a * x | |||
| "prefetcht0 192(%6,%0,8) \n\t" | |||
| "movups (%6,%0,8), %%xmm8 \n\t" // 2 * a | |||
| "movups 16(%6,%0,8), %%xmm9 \n\t" // 2 * a | |||
| "movups 32(%6,%0,8), %%xmm10 \n\t" // 2 * a | |||
| "movups 48(%6,%0,8), %%xmm11 \n\t" // 2 * a | |||
| "mulpd %%xmm14 , %%xmm8 \n\t" // a * x | |||
| "mulpd %%xmm14 , %%xmm9 \n\t" // a * x | |||
| "mulpd %%xmm14 , %%xmm10 \n\t" // a * x | |||
| "mulpd %%xmm14 , %%xmm11 \n\t" // a * x | |||
| "addpd %%xmm8 , %%xmm4 \n\t" // y += a * x | |||
| "addpd %%xmm9 , %%xmm5 \n\t" // y += a * x | |||
| "addpd %%xmm10 , %%xmm6 \n\t" // y += a * x | |||
| "addpd %%xmm11 , %%xmm7 \n\t" // y += a * x | |||
| "prefetcht0 192(%7,%0,8) \n\t" | |||
| "movups (%7,%0,8), %%xmm8 \n\t" // 2 * a | |||
| "movups 16(%7,%0,8), %%xmm9 \n\t" // 2 * a | |||
| "movups 32(%7,%0,8), %%xmm10 \n\t" // 2 * a | |||
| "movups 48(%7,%0,8), %%xmm11 \n\t" // 2 * a | |||
| "mulpd %%xmm15 , %%xmm8 \n\t" // a * x | |||
| "mulpd %%xmm15 , %%xmm9 \n\t" // a * x | |||
| "mulpd %%xmm15 , %%xmm10 \n\t" // a * x | |||
| "mulpd %%xmm15 , %%xmm11 \n\t" // a * x | |||
| "addpd %%xmm8 , %%xmm4 \n\t" // y += a * x | |||
| "addpd %%xmm9 , %%xmm5 \n\t" // y += a * x | |||
| "addpd %%xmm10 , %%xmm6 \n\t" // y += a * x | |||
| "addpd %%xmm11 , %%xmm7 \n\t" // y += a * x | |||
| "movups %%xmm4, (%3,%0,8) \n\t" // 4 * y | |||
| "movups %%xmm5, 16(%3,%0,8) \n\t" // 4 * y | |||
| "movups %%xmm6, 32(%3,%0,8) \n\t" // 4 * y | |||
| "movups %%xmm7, 48(%3,%0,8) \n\t" // 4 * y | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,299 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) | |||
| #include "dsymv_L_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "dsymv_L_microk_nehalem-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x4 | |||
| static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2) | |||
| { | |||
| FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; | |||
| BLASLONG i; | |||
| for (i=from; i<to; i+=4) | |||
| { | |||
| y[i] += tmp1[0] * ap[0][i]; | |||
| tmp2[0] += ap[0][i] * x[i]; | |||
| y[i] += tmp1[1] * ap[1][i]; | |||
| tmp2[1] += ap[1][i] * x[i]; | |||
| y[i] += tmp1[2] * ap[2][i]; | |||
| tmp2[2] += ap[2][i] * x[i]; | |||
| y[i] += tmp1[3] * ap[3][i]; | |||
| tmp2[3] += ap[3][i] * x[i]; | |||
| y[i+1] += tmp1[0] * ap[0][i+1]; | |||
| tmp2[0] += ap[0][i+1] * x[i+1]; | |||
| y[i+1] += tmp1[1] * ap[1][i+1]; | |||
| tmp2[1] += ap[1][i+1] * x[i+1]; | |||
| y[i+1] += tmp1[2] * ap[2][i+1]; | |||
| tmp2[2] += ap[2][i+1] * x[i+1]; | |||
| y[i+1] += tmp1[3] * ap[3][i+1]; | |||
| tmp2[3] += ap[3][i+1] * x[i+1]; | |||
| y[i+2] += tmp1[0] * ap[0][i+2]; | |||
| tmp2[0] += ap[0][i+2] * x[i+2]; | |||
| y[i+2] += tmp1[1] * ap[1][i+2]; | |||
| tmp2[1] += ap[1][i+2] * x[i+2]; | |||
| y[i+2] += tmp1[2] * ap[2][i+2]; | |||
| tmp2[2] += ap[2][i+2] * x[i+2]; | |||
| y[i+2] += tmp1[3] * ap[3][i+2]; | |||
| tmp2[3] += ap[3][i+2] * x[i+2]; | |||
| y[i+3] += tmp1[0] * ap[0][i+3]; | |||
| tmp2[0] += ap[0][i+3] * x[i+3]; | |||
| y[i+3] += tmp1[1] * ap[1][i+3]; | |||
| tmp2[1] += ap[1][i+3] * x[i+3]; | |||
| y[i+3] += tmp1[2] * ap[2][i+3]; | |||
| tmp2[2] += ap[2][i+3] * x[i+3]; | |||
| y[i+3] += tmp1[3] * ap[3][i+3]; | |||
| tmp2[3] += ap[3][i+3] * x[i+3]; | |||
| } | |||
| temp2[0] += tmp2[0]; | |||
| temp2[1] += tmp2[1]; | |||
| temp2[2] += tmp2[2]; | |||
| temp2[3] += tmp2[3]; | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| BLASLONG jx,jy; | |||
| BLASLONG j; | |||
| FLOAT temp1; | |||
| FLOAT temp2; | |||
| FLOAT tmp1[4]; | |||
| FLOAT tmp2[4]; | |||
| FLOAT *ap[4]; | |||
| #if 0 | |||
| if ( m != offset ) | |||
| printf("Symv_L: m=%d offset=%d\n",m,offset); | |||
| #endif | |||
| if ( (inc_x != 1) || (inc_y != 1) ) | |||
| { | |||
| jx = 0; | |||
| jy = 0; | |||
| for (j=0; j<offset; j++) | |||
| { | |||
| temp1 = alpha * x[jx]; | |||
| temp2 = 0.0; | |||
| y[jy] += temp1 * a[j*lda+j]; | |||
| iy = jy; | |||
| ix = jx; | |||
| for (i=j+1; i<m; i++) | |||
| { | |||
| ix += inc_x; | |||
| iy += inc_y; | |||
| y[iy] += temp1 * a[j*lda+i]; | |||
| temp2 += a[j*lda+i] * x[ix]; | |||
| } | |||
| y[jy] += alpha * temp2; | |||
| jx += inc_x; | |||
| jy += inc_y; | |||
| } | |||
| return(0); | |||
| } | |||
| BLASLONG offset1 = (offset/4)*4; | |||
| for (j=0; j<offset1; j+=4) | |||
| { | |||
| tmp1[0] = alpha * x[j]; | |||
| tmp1[1] = alpha * x[j+1]; | |||
| tmp1[2] = alpha * x[j+2]; | |||
| tmp1[3] = alpha * x[j+3]; | |||
| tmp2[0] = 0.0; | |||
| tmp2[1] = 0.0; | |||
| tmp2[2] = 0.0; | |||
| tmp2[3] = 0.0; | |||
| ap[0] = &a[j*lda]; | |||
| ap[1] = ap[0] + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| y[j] += tmp1[0] * ap[0][j]; | |||
| y[j+1] += tmp1[1] * ap[1][j+1]; | |||
| y[j+2] += tmp1[2] * ap[2][j+2]; | |||
| y[j+3] += tmp1[3] * ap[3][j+3]; | |||
| BLASLONG from = j+1; | |||
| if ( m - from >=12 ) | |||
| { | |||
| BLASLONG m2 = (m/4)*4; | |||
| for (i=j+1; i<j+4; i++) | |||
| { | |||
| y[i] += tmp1[0] * ap[0][i]; | |||
| tmp2[0] += ap[0][i] * x[i]; | |||
| } | |||
| for (i=j+2; i<j+4; i++) | |||
| { | |||
| y[i] += tmp1[1] * ap[1][i]; | |||
| tmp2[1] += ap[1][i] * x[i]; | |||
| } | |||
| for (i=j+3; i<j+4; i++) | |||
| { | |||
| y[i] += tmp1[2] * ap[2][i]; | |||
| tmp2[2] += ap[2][i] * x[i]; | |||
| } | |||
| if ( m2 > j+4 ) | |||
| dsymv_kernel_4x4(j+4,m2,ap,x,y,tmp1,tmp2); | |||
| for (i=m2; i<m; i++) | |||
| { | |||
| y[i] += tmp1[0] * ap[0][i]; | |||
| tmp2[0] += ap[0][i] * x[i]; | |||
| y[i] += tmp1[1] * ap[1][i]; | |||
| tmp2[1] += ap[1][i] * x[i]; | |||
| y[i] += tmp1[2] * ap[2][i]; | |||
| tmp2[2] += ap[2][i] * x[i]; | |||
| y[i] += tmp1[3] * ap[3][i]; | |||
| tmp2[3] += ap[3][i] * x[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for (i=j+1; i<j+4; i++) | |||
| { | |||
| y[i] += tmp1[0] * ap[0][i]; | |||
| tmp2[0] += ap[0][i] * x[i]; | |||
| } | |||
| for (i=j+2; i<j+4; i++) | |||
| { | |||
| y[i] += tmp1[1] * ap[1][i]; | |||
| tmp2[1] += ap[1][i] * x[i]; | |||
| } | |||
| for (i=j+3; i<j+4; i++) | |||
| { | |||
| y[i] += tmp1[2] * ap[2][i]; | |||
| tmp2[2] += ap[2][i] * x[i]; | |||
| } | |||
| for (i=j+4; i<m; i++) | |||
| { | |||
| y[i] += tmp1[0] * ap[0][i]; | |||
| tmp2[0] += ap[0][i] * x[i]; | |||
| y[i] += tmp1[1] * ap[1][i]; | |||
| tmp2[1] += ap[1][i] * x[i]; | |||
| y[i] += tmp1[2] * ap[2][i]; | |||
| tmp2[2] += ap[2][i] * x[i]; | |||
| y[i] += tmp1[3] * ap[3][i]; | |||
| tmp2[3] += ap[3][i] * x[i]; | |||
| } | |||
| } | |||
| y[j] += alpha * tmp2[0]; | |||
| y[j+1] += alpha * tmp2[1]; | |||
| y[j+2] += alpha * tmp2[2]; | |||
| y[j+3] += alpha * tmp2[3]; | |||
| } | |||
| for (j=offset1; j<offset; j++) | |||
| { | |||
| temp1 = alpha * x[j]; | |||
| temp2 = 0.0; | |||
| y[j] += temp1 * a[j*lda+j]; | |||
| BLASLONG from = j+1; | |||
| if ( m - from >=8 ) | |||
| { | |||
| BLASLONG j1 = ((from + 4)/4)*4; | |||
| BLASLONG j2 = (m/4)*4; | |||
| for (i=from; i<j1; i++) | |||
| { | |||
| y[i] += temp1 * a[j*lda+i]; | |||
| temp2 += a[j*lda+i] * x[i]; | |||
| } | |||
| for (i=j1; i<j2; i++) | |||
| { | |||
| y[i] += temp1 * a[j*lda+i]; | |||
| temp2 += a[j*lda+i] * x[i]; | |||
| } | |||
| for (i=j2; i<m; i++) | |||
| { | |||
| y[i] += temp1 * a[j*lda+i]; | |||
| temp2 += a[j*lda+i] * x[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for (i=from; i<m; i++) | |||
| { | |||
| y[i] += temp1 * a[j*lda+i]; | |||
| temp2 += a[j*lda+i] * x[i]; | |||
| } | |||
| } | |||
| y[j] += alpha * temp2; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,137 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); | |||
| static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vxorpd %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0] | |||
| "vxorpd %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1] | |||
| "vxorpd %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2] | |||
| "vxorpd %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3] | |||
| "vmovddup (%8), %%xmm4 \n\t" // temp1[0] | |||
| "vmovddup 8(%8), %%xmm5 \n\t" // temp1[1] | |||
| "vmovddup 16(%8), %%xmm6 \n\t" // temp1[1] | |||
| "vmovddup 24(%8), %%xmm7 \n\t" // temp1[1] | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a | |||
| "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x | |||
| "vmovups (%3,%0,8), %%xmm9 \n\t" // 2 * y | |||
| "vmovups (%5,%0,8), %%xmm13 \n\t" // 2 * a | |||
| "vfmaddpd %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a | |||
| "vfmaddpd %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vmovups (%6,%0,8), %%xmm14 \n\t" // 2 * a | |||
| "vfmaddpd %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a | |||
| "vfmaddpd %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vmovups (%7,%0,8), %%xmm15 \n\t" // 2 * a | |||
| "vmovups 16(%3,%0,8), %%xmm11 \n\t" // 2 * y | |||
| "vfmaddpd %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a | |||
| "vmovups 16(%4,%0,8), %%xmm12 \n\t" // 2 * a | |||
| "vfmaddpd %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vmovups 16(%2,%0,8), %%xmm10 \n\t" // 2 * x | |||
| "vfmaddpd %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a | |||
| "vfmaddpd %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vmovups 16(%5,%0,8), %%xmm13 \n\t" // 2 * a | |||
| "vmovups 16(%6,%0,8), %%xmm14 \n\t" // 2 * a | |||
| "vfmaddpd %%xmm0 , %%xmm10, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a | |||
| "vfmaddpd %%xmm11 , %%xmm4, %%xmm12 , %%xmm11 \n\t" // y += temp1 * a | |||
| "vmovups 16(%7,%0,8), %%xmm15 \n\t" // 2 * a | |||
| "vfmaddpd %%xmm1 , %%xmm10, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a | |||
| "vfmaddpd %%xmm11 , %%xmm5, %%xmm13 , %%xmm11 \n\t" // y += temp1 * a | |||
| "vfmaddpd %%xmm2 , %%xmm10, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a | |||
| "vfmaddpd %%xmm11 , %%xmm6, %%xmm14 , %%xmm11 \n\t" // y += temp1 * a | |||
| "vfmaddpd %%xmm3 , %%xmm10, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a | |||
| "vfmaddpd %%xmm11 , %%xmm7, %%xmm15 , %%xmm11 \n\t" // y += temp1 * a | |||
| "addq $4 , %0 \n\t" | |||
| "vmovups %%xmm9 , -32(%3,%0,8) \n\t" | |||
| "vmovups %%xmm11 , -16(%3,%0,8) \n\t" | |||
| "cmpq %0 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmovsd (%9), %%xmm4 \n\t" | |||
| "vmovsd 8(%9), %%xmm5 \n\t" | |||
| "vmovsd 16(%9), %%xmm6 \n\t" | |||
| "vmovsd 24(%9), %%xmm7 \n\t" | |||
| "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" | |||
| "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t" | |||
| "vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t" | |||
| "vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t" | |||
| "vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t" | |||
| "vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t" | |||
| "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t" | |||
| "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" | |||
| "vmovsd %%xmm0 , (%9) \n\t" // save temp2 | |||
| "vmovsd %%xmm1 , 8(%9) \n\t" // save temp2 | |||
| "vmovsd %%xmm2 ,16(%9) \n\t" // save temp2 | |||
| "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 | |||
| : | |||
| : | |||
| "r" (from), // 0 | |||
| "r" (to), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (a[0]), // 4 | |||
| "r" (a[1]), // 5 | |||
| "r" (a[2]), // 6 | |||
| "r" (a[3]), // 8 | |||
| "r" (temp1), // 8 | |||
| "r" (temp2) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,132 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); | |||
| static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorpd %%xmm0 , %%xmm0 \n\t" // temp2[0] | |||
| "xorpd %%xmm1 , %%xmm1 \n\t" // temp2[1] | |||
| "xorpd %%xmm2 , %%xmm2 \n\t" // temp2[2] | |||
| "xorpd %%xmm3 , %%xmm3 \n\t" // temp2[3] | |||
| "movsd (%8), %%xmm4 \n\t" // temp1[0] | |||
| "movsd 8(%8), %%xmm5 \n\t" // temp1[1] | |||
| "movsd 16(%8), %%xmm6 \n\t" // temp1[2] | |||
| "movsd 24(%8), %%xmm7 \n\t" // temp1[3] | |||
| "shufpd $0, %%xmm4, %%xmm4 \n\t" | |||
| "shufpd $0, %%xmm5, %%xmm5 \n\t" | |||
| "shufpd $0, %%xmm6, %%xmm6 \n\t" | |||
| "shufpd $0, %%xmm7, %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a | |||
| "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x | |||
| "movups %%xmm12 , %%xmm11 \n\t" | |||
| "movups (%3,%0,8), %%xmm9 \n\t" // 2 * y | |||
| "movups (%5,%0,8), %%xmm13 \n\t" // 2 * a | |||
| "mulpd %%xmm4 , %%xmm11 \n\t" // temp1 * a | |||
| "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulpd %%xmm8 , %%xmm12 \n\t" // a * x | |||
| "addpd %%xmm12 , %%xmm0 \n\t" // temp2 += x * a | |||
| "movups (%6,%0,8), %%xmm14 \n\t" // 2 * a | |||
| "movups (%7,%0,8), %%xmm15 \n\t" // 2 * a | |||
| "movups %%xmm13 , %%xmm11 \n\t" | |||
| "mulpd %%xmm5 , %%xmm11 \n\t" // temp1 * a | |||
| "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulpd %%xmm8 , %%xmm13 \n\t" // a * x | |||
| "addpd %%xmm13 , %%xmm1 \n\t" // temp2 += x * a | |||
| "movups %%xmm14 , %%xmm11 \n\t" | |||
| "mulpd %%xmm6 , %%xmm11 \n\t" // temp1 * a | |||
| "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulpd %%xmm8 , %%xmm14 \n\t" // a * x | |||
| "addpd %%xmm14 , %%xmm2 \n\t" // temp2 += x * a | |||
| "addq $2 , %0 \n\t" | |||
| "movups %%xmm15 , %%xmm11 \n\t" | |||
| "mulpd %%xmm7 , %%xmm11 \n\t" // temp1 * a | |||
| "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulpd %%xmm8 , %%xmm15 \n\t" // a * x | |||
| "addpd %%xmm15 , %%xmm3 \n\t" // temp2 += x * a | |||
| "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y | |||
| "cmpq %0 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "movsd (%9), %%xmm4 \n\t" // temp1[0] | |||
| "movsd 8(%9), %%xmm5 \n\t" // temp1[1] | |||
| "movsd 16(%9), %%xmm6 \n\t" // temp1[2] | |||
| "movsd 24(%9), %%xmm7 \n\t" // temp1[3] | |||
| "haddpd %%xmm0, %%xmm0 \n\t" | |||
| "haddpd %%xmm1, %%xmm1 \n\t" | |||
| "haddpd %%xmm2, %%xmm2 \n\t" | |||
| "haddpd %%xmm3, %%xmm3 \n\t" | |||
| "addsd %%xmm4, %%xmm0 \n\t" | |||
| "addsd %%xmm5, %%xmm1 \n\t" | |||
| "addsd %%xmm6, %%xmm2 \n\t" | |||
| "addsd %%xmm7, %%xmm3 \n\t" | |||
| "movsd %%xmm0 , (%9) \n\t" // save temp2 | |||
| "movsd %%xmm1 , 8(%9) \n\t" // save temp2 | |||
| "movsd %%xmm2 , 16(%9) \n\t" // save temp2 | |||
| "movsd %%xmm3 , 24(%9) \n\t" // save temp2 | |||
| : | |||
| : | |||
| "r" (from), // 0 | |||
| "r" (to), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (a[0]), // 4 | |||
| "r" (a[1]), // 5 | |||
| "r" (a[2]), // 6 | |||
| "r" (a[3]), // 7 | |||
| "r" (temp1), // 8 | |||
| "r" (temp2) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,273 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) | |||
| #include "dsymv_U_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "dsymv_U_microk_nehalem-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x4 | |||
| static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| FLOAT at0,at1,at2,at3; | |||
| FLOAT x; | |||
| FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; | |||
| FLOAT tp0; | |||
| FLOAT tp1; | |||
| FLOAT tp2; | |||
| FLOAT tp3; | |||
| BLASLONG i; | |||
| tp0 = temp1[0]; | |||
| tp1 = temp1[1]; | |||
| tp2 = temp1[2]; | |||
| tp3 = temp1[3]; | |||
| for (i=0; i<n; i++) | |||
| { | |||
| at0 = a0[i]; | |||
| at1 = a1[i]; | |||
| at2 = a2[i]; | |||
| at3 = a3[i]; | |||
| x = xp[i]; | |||
| yp[i] += tp0 * at0 + tp1 *at1 + tp2 * at2 + tp3 * at3; | |||
| tmp2[0] += at0 * x; | |||
| tmp2[1] += at1 * x; | |||
| tmp2[2] += at2 * x; | |||
| tmp2[3] += at3 * x; | |||
| } | |||
| temp2[0] += tmp2[0]; | |||
| temp2[1] += tmp2[1]; | |||
| temp2[2] += tmp2[2]; | |||
| temp2[3] += tmp2[3]; | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_1x4 | |||
| static void dsymv_kernel_1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| FLOAT at0,at1,at2,at3; | |||
| FLOAT x; | |||
| FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; | |||
| FLOAT tp0; | |||
| FLOAT tp1; | |||
| FLOAT tp2; | |||
| FLOAT tp3; | |||
| BLASLONG i; | |||
| tp0 = temp1[0]; | |||
| tp1 = temp1[1]; | |||
| tp2 = temp1[2]; | |||
| tp3 = temp1[3]; | |||
| for (i=from; i<to; i++) | |||
| { | |||
| at0 = a0[i]; | |||
| at1 = a1[i]; | |||
| at2 = a2[i]; | |||
| at3 = a3[i]; | |||
| x = xp[i]; | |||
| yp[i] += tp0 * at0 + tp1 *at1 + tp2 * at2 + tp3 * at3; | |||
| tmp2[0] += at0 * x; | |||
| tmp2[1] += at1 * x; | |||
| tmp2[2] += at2 * x; | |||
| tmp2[3] += at3 * x; | |||
| } | |||
| temp2[0] += tmp2[0]; | |||
| temp2[1] += tmp2[1]; | |||
| temp2[2] += tmp2[2]; | |||
| temp2[3] += tmp2[3]; | |||
| } | |||
| #endif | |||
| static void dsymv_kernel_8x1(BLASLONG n, FLOAT *a0, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| FLOAT at0,at1,at2,at3; | |||
| FLOAT temp = 0.0; | |||
| FLOAT t1 = *temp1; | |||
| BLASLONG i; | |||
| for (i=0; i<(n/4)*4; i+=4) | |||
| { | |||
| at0 = a0[i]; | |||
| at1 = a0[i+1]; | |||
| at2 = a0[i+2]; | |||
| at3 = a0[i+3]; | |||
| yp[i] += t1 * at0; | |||
| temp += at0 * xp[i]; | |||
| yp[i+1] += t1 * at1; | |||
| temp += at1 * xp[i+1]; | |||
| yp[i+2] += t1 * at2; | |||
| temp += at2 * xp[i+2]; | |||
| yp[i+3] += t1 * at3; | |||
| temp += at3 * xp[i+3]; | |||
| } | |||
| *temp2 = temp; | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| BLASLONG jx,jy; | |||
| BLASLONG j; | |||
| BLASLONG j1; | |||
| BLASLONG j2; | |||
| BLASLONG m2; | |||
| FLOAT temp1; | |||
| FLOAT temp2; | |||
| FLOAT *xp, *yp; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| FLOAT at0,at1,at2,at3; | |||
| FLOAT tmp1[4]; | |||
| FLOAT tmp2[4]; | |||
| #if 0 | |||
| if( m != offset ) | |||
| printf("Symv_U: m=%d offset=%d\n",m,offset); | |||
| #endif | |||
| BLASLONG m1 = m - offset; | |||
| BLASLONG mrange = m -m1; | |||
| if ( (inc_x!=1) || (inc_y!=1) || (mrange<16) ) | |||
| { | |||
| jx = m1 * inc_x; | |||
| jy = m1 * inc_y; | |||
| for (j=m1; j<m; j++) | |||
| { | |||
| temp1 = alpha * x[jx]; | |||
| temp2 = 0.0; | |||
| iy = 0; | |||
| ix = 0; | |||
| for (i=0; i<j; i++) | |||
| { | |||
| y[iy] += temp1 * a[j*lda+i]; | |||
| temp2 += a[j*lda+i] * x[ix]; | |||
| ix += inc_x; | |||
| iy += inc_y; | |||
| } | |||
| y[jy] += temp1 * a[j*lda+j] + alpha * temp2; | |||
| jx += inc_x; | |||
| jy += inc_y; | |||
| } | |||
| return(0); | |||
| } | |||
| xp = x; | |||
| yp = y; | |||
| m2 = m - ( mrange % 4 ); | |||
| for (j=m1; j<m2; j+=4) | |||
| { | |||
| tmp1[0] = alpha * xp[j]; | |||
| tmp1[1] = alpha * xp[j+1]; | |||
| tmp1[2] = alpha * xp[j+2]; | |||
| tmp1[3] = alpha * xp[j+3]; | |||
| tmp2[0] = 0.0; | |||
| tmp2[1] = 0.0; | |||
| tmp2[2] = 0.0; | |||
| tmp2[3] = 0.0; | |||
| a0 = &a[j*lda]; | |||
| a1 = a0+lda; | |||
| a2 = a1+lda; | |||
| a3 = a2+lda; | |||
| j1 = (j/8)*8; | |||
| if ( j1 ) | |||
| dsymv_kernel_4x4(j1, a0, a1, a2, a3, xp, yp, tmp1, tmp2); | |||
| if ( j1 < j ) | |||
| dsymv_kernel_1x4(j1, j, a0, a1, a2, a3, xp, yp, tmp1, tmp2); | |||
| j2 = 0; | |||
| for ( j1 = j ; j1 < j+4 ; j1++ ) | |||
| { | |||
| temp1 = tmp1[j2]; | |||
| temp2 = tmp2[j2]; | |||
| a0 = &a[j1*lda]; | |||
| for ( i=j ; i<j1; i++ ) | |||
| { | |||
| yp[i] += temp1 * a0[i]; | |||
| temp2 += a0[i] * xp[i]; | |||
| } | |||
| y[j1] += temp1 * a0[j1] + alpha * temp2; | |||
| j2++; | |||
| } | |||
| } | |||
| for ( ; j<m; j++) | |||
| { | |||
| temp1 = alpha * xp[j]; | |||
| temp2 = 0.0; | |||
| a0 = &a[j*lda]; | |||
| FLOAT at0; | |||
| j1 = (j/8)*8; | |||
| if ( j1 ) | |||
| dsymv_kernel_8x1(j1, a0, xp, yp, &temp1, &temp2); | |||
| for (i=j1 ; i<j; i++) | |||
| { | |||
| at0 = a0[i]; | |||
| yp[i] += temp1 * at0; | |||
| temp2 += at0 * xp[i]; | |||
| } | |||
| yp[j] += temp1 * a0[j] + alpha * temp2; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,130 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void dsymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); | |||
| static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vxorpd %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0] | |||
| "vxorpd %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1] | |||
| "vxorpd %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2] | |||
| "vxorpd %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3] | |||
| "vmovddup (%8), %%xmm4 \n\t" // temp1[0] | |||
| "vmovddup 8(%8), %%xmm5 \n\t" // temp1[1] | |||
| "vmovddup 16(%8), %%xmm6 \n\t" // temp1[1] | |||
| "vmovddup 24(%8), %%xmm7 \n\t" // temp1[1] | |||
| "xorq %0,%0 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a | |||
| "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x | |||
| "vmovups (%3,%0,8), %%xmm9 \n\t" // 2 * y | |||
| "vmovups (%5,%0,8), %%xmm13 \n\t" // 2 * a | |||
| "vfmaddpd %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a | |||
| "vfmaddpd %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vmovups (%6,%0,8), %%xmm14 \n\t" // 2 * a | |||
| "vfmaddpd %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a | |||
| "vfmaddpd %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vmovups (%7,%0,8), %%xmm15 \n\t" // 2 * a | |||
| "vmovups 16(%3,%0,8), %%xmm11 \n\t" // 2 * y | |||
| "vfmaddpd %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a | |||
| "vmovups 16(%4,%0,8), %%xmm12 \n\t" // 2 * a | |||
| "vfmaddpd %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vmovups 16(%2,%0,8), %%xmm10 \n\t" // 2 * x | |||
| "vfmaddpd %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a | |||
| "vfmaddpd %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vmovups 16(%5,%0,8), %%xmm13 \n\t" // 2 * a | |||
| "vmovups 16(%6,%0,8), %%xmm14 \n\t" // 2 * a | |||
| "vfmaddpd %%xmm0 , %%xmm10, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a | |||
| "vfmaddpd %%xmm11 , %%xmm4, %%xmm12 , %%xmm11 \n\t" // y += temp1 * a | |||
| "vmovups 16(%7,%0,8), %%xmm15 \n\t" // 2 * a | |||
| "vfmaddpd %%xmm1 , %%xmm10, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a | |||
| "vfmaddpd %%xmm11 , %%xmm5, %%xmm13 , %%xmm11 \n\t" // y += temp1 * a | |||
| "vfmaddpd %%xmm2 , %%xmm10, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a | |||
| "addq $4 , %0 \n\t" | |||
| "vfmaddpd %%xmm11 , %%xmm6, %%xmm14 , %%xmm11 \n\t" // y += temp1 * a | |||
| "vfmaddpd %%xmm3 , %%xmm10, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a | |||
| "vfmaddpd %%xmm11 , %%xmm7, %%xmm15 , %%xmm11 \n\t" // y += temp1 * a | |||
| "subq $4 , %1 \n\t" | |||
| "vmovups %%xmm9 , -32(%3,%0,8) \n\t" | |||
| "vmovups %%xmm11 , -16(%3,%0,8) \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" | |||
| "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t" | |||
| "vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t" | |||
| "vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t" | |||
| "vmovsd %%xmm0 , (%9) \n\t" // save temp2 | |||
| "vmovsd %%xmm1 , 8(%9) \n\t" // save temp2 | |||
| "vmovsd %%xmm2 ,16(%9) \n\t" // save temp2 | |||
| "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (a0), // 4 | |||
| "r" (a1), // 5 | |||
| "r" (a2), // 6 | |||
| "r" (a3), // 7 | |||
| "r" (temp1), // 8 | |||
| "r" (temp2) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,125 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void dsymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); | |||
| static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorpd %%xmm0 , %%xmm0 \n\t" // temp2[0] | |||
| "xorpd %%xmm1 , %%xmm1 \n\t" // temp2[1] | |||
| "xorpd %%xmm2 , %%xmm2 \n\t" // temp2[2] | |||
| "xorpd %%xmm3 , %%xmm3 \n\t" // temp2[3] | |||
| "movsd (%8), %%xmm4 \n\t" // temp1[0] | |||
| "movsd 8(%8), %%xmm5 \n\t" // temp1[1] | |||
| "movsd 16(%8), %%xmm6 \n\t" // temp1[2] | |||
| "movsd 24(%8), %%xmm7 \n\t" // temp1[3] | |||
| "shufpd $0, %%xmm4, %%xmm4 \n\t" | |||
| "shufpd $0, %%xmm5, %%xmm5 \n\t" | |||
| "shufpd $0, %%xmm6, %%xmm6 \n\t" | |||
| "shufpd $0, %%xmm7, %%xmm7 \n\t" | |||
| "xorq %0,%0 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a | |||
| "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x | |||
| "movups %%xmm12 , %%xmm11 \n\t" | |||
| "movups (%3,%0,8), %%xmm9 \n\t" // 2 * y | |||
| "movups (%5,%0,8), %%xmm13 \n\t" // 2 * a | |||
| "mulpd %%xmm4 , %%xmm11 \n\t" // temp1 * a | |||
| "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulpd %%xmm8 , %%xmm12 \n\t" // a * x | |||
| "addpd %%xmm12 , %%xmm0 \n\t" // temp2 += x * a | |||
| "movups (%6,%0,8), %%xmm14 \n\t" // 2 * a | |||
| "movups (%7,%0,8), %%xmm15 \n\t" // 2 * a | |||
| "movups %%xmm13 , %%xmm11 \n\t" | |||
| "mulpd %%xmm5 , %%xmm11 \n\t" // temp1 * a | |||
| "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulpd %%xmm8 , %%xmm13 \n\t" // a * x | |||
| "addpd %%xmm13 , %%xmm1 \n\t" // temp2 += x * a | |||
| "movups %%xmm14 , %%xmm11 \n\t" | |||
| "mulpd %%xmm6 , %%xmm11 \n\t" // temp1 * a | |||
| "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulpd %%xmm8 , %%xmm14 \n\t" // a * x | |||
| "addpd %%xmm14 , %%xmm2 \n\t" // temp2 += x * a | |||
| "addq $2 , %0 \n\t" | |||
| "movups %%xmm15 , %%xmm11 \n\t" | |||
| "mulpd %%xmm7 , %%xmm11 \n\t" // temp1 * a | |||
| "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulpd %%xmm8 , %%xmm15 \n\t" // a * x | |||
| "addpd %%xmm15 , %%xmm3 \n\t" // temp2 += x * a | |||
| "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y | |||
| "subq $2 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "haddpd %%xmm0, %%xmm0 \n\t" | |||
| "haddpd %%xmm1, %%xmm1 \n\t" | |||
| "haddpd %%xmm2, %%xmm2 \n\t" | |||
| "haddpd %%xmm3, %%xmm3 \n\t" | |||
| "movsd %%xmm0 , (%9) \n\t" // save temp2 | |||
| "movsd %%xmm1 , 8(%9) \n\t" // save temp2 | |||
| "movsd %%xmm2 , 16(%9) \n\t" // save temp2 | |||
| "movsd %%xmm3 , 24(%9) \n\t" // save temp2 | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (a0), // 4 | |||
| "r" (a1), // 5 | |||
| "r" (a2), // 6 | |||
| "r" (a3), // 7 | |||
| "r" (temp1), // 8 | |||
| "r" (temp2) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,103 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(NEHALEM) | |||
| #include "saxpy_microk_nehalem-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| static void saxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| FLOAT a = *alpha; | |||
| while(i < n) | |||
| { | |||
| y[i] += a * x[i]; | |||
| y[i+1] += a * x[i+1]; | |||
| y[i+2] += a * x[i+2]; | |||
| y[i+3] += a * x[i+3]; | |||
| y[i+4] += a * x[i+4]; | |||
| y[i+5] += a * x[i+5]; | |||
| y[i+6] += a * x[i+6]; | |||
| y[i+7] += a * x[i+7]; | |||
| i+=8 ; | |||
| } | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| int n1 = n & -16; | |||
| if ( n1 ) | |||
| saxpy_kernel_16(n1, x, y , &da ); | |||
| i = n1; | |||
| while(i < n) | |||
| { | |||
| y[i] += da * x[i] ; | |||
| i++ ; | |||
| } | |||
| return(0); | |||
| } | |||
| while(i < n) | |||
| { | |||
| y[iy] += da * x[ix] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,91 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movss (%4), %%xmm0 \n\t" // alpha | |||
| "shufps $0, %%xmm0, %%xmm0 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| // "prefetcht0 192(%2,%0,4) \n\t" | |||
| // "prefetcht0 192(%3,%0,4) \n\t" | |||
| "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x | |||
| "movups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x | |||
| "movups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x | |||
| "movups (%3,%0,4), %%xmm8 \n\t" // 4 * y | |||
| "movups 16(%3,%0,4), %%xmm9 \n\t" // 4 * y | |||
| "movups 32(%3,%0,4), %%xmm10 \n\t" // 4 * y | |||
| "movups 48(%3,%0,4), %%xmm11 \n\t" // 4 * y | |||
| "mulps %%xmm0 , %%xmm12 \n\t" // alpha * x | |||
| "mulps %%xmm0 , %%xmm13 \n\t" | |||
| "mulps %%xmm0 , %%xmm14 \n\t" | |||
| "mulps %%xmm0 , %%xmm15 \n\t" | |||
| "addps %%xmm12, %%xmm8 \n\t" // y += alpha *x | |||
| "addps %%xmm13, %%xmm9 \n\t" | |||
| "addps %%xmm14, %%xmm10 \n\t" | |||
| "addps %%xmm15, %%xmm11 \n\t" | |||
| "movups %%xmm8 , (%3,%0,4) \n\t" | |||
| "movups %%xmm9 , 16(%3,%0,4) \n\t" | |||
| "movups %%xmm10, 32(%3,%0,4) \n\t" | |||
| "movups %%xmm11, 48(%3,%0,4) \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| : "cc", | |||
| "%xmm0", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,109 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #include "sdot_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "sdot_microk_nehalem-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||
| { | |||
| BLASLONG register i = 0; | |||
| FLOAT dot = 0.0; | |||
| while(i < n) | |||
| { | |||
| dot += y[i] * x[i] | |||
| + y[i+1] * x[i+1] | |||
| + y[i+2] * x[i+2] | |||
| + y[i+3] * x[i+3] | |||
| + y[i+4] * x[i+4] | |||
| + y[i+5] * x[i+5] | |||
| + y[i+6] * x[i+6] | |||
| + y[i+7] * x[i+7] ; | |||
| i+=8 ; | |||
| } | |||
| *d += dot; | |||
| } | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT dot = 0.0 ; | |||
| if ( n <= 0 ) return(dot); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| int n1 = n & -16; | |||
| if ( n1 ) | |||
| sdot_kernel_16(n1, x, y , &dot ); | |||
| i = n1; | |||
| while(i < n) | |||
| { | |||
| dot += y[i] * x[i] ; | |||
| i++ ; | |||
| } | |||
| return(dot); | |||
| } | |||
| while(i < n) | |||
| { | |||
| dot += y[iy] * x[ix] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| return(dot); | |||
| } | |||
| @@ -0,0 +1,85 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); | |||
| static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x | |||
| "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x | |||
| "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x | |||
| "vfmaddps %%xmm4, (%3,%0,4), %%xmm12, %%xmm4 \n\t" // 4 * y | |||
| "vfmaddps %%xmm5, 16(%3,%0,4), %%xmm13, %%xmm5 \n\t" // 4 * y | |||
| "vfmaddps %%xmm6, 32(%3,%0,4), %%xmm14, %%xmm6 \n\t" // 4 * y | |||
| "vfmaddps %%xmm7, 48(%3,%0,4), %%xmm15, %%xmm7 \n\t" // 4 * y | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" | |||
| "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" | |||
| "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" | |||
| "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vmovss %%xmm4, (%4) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,94 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); | |||
| static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorps %%xmm4, %%xmm4 \n\t" | |||
| "xorps %%xmm5, %%xmm5 \n\t" | |||
| "xorps %%xmm6, %%xmm6 \n\t" | |||
| "xorps %%xmm7, %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "movups (%3,%0,4), %%xmm8 \n\t" // 4 * x | |||
| "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x | |||
| "movups 16(%3,%0,4), %%xmm9 \n\t" // 4 * x | |||
| "movups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x | |||
| "movups 32(%3,%0,4), %%xmm10 \n\t" // 4 * x | |||
| "movups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x | |||
| "movups 48(%3,%0,4), %%xmm11 \n\t" // 4 * x | |||
| "mulps %%xmm8 , %%xmm12 \n\t" | |||
| "mulps %%xmm9 , %%xmm13 \n\t" | |||
| "mulps %%xmm10, %%xmm14 \n\t" | |||
| "mulps %%xmm11, %%xmm15 \n\t" | |||
| "addps %%xmm12, %%xmm4 \n\t" | |||
| "addps %%xmm13, %%xmm5 \n\t" | |||
| "addps %%xmm14, %%xmm6 \n\t" | |||
| "addps %%xmm15, %%xmm7 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "addps %%xmm5, %%xmm4 \n\t" | |||
| "addps %%xmm7, %%xmm6 \n\t" | |||
| "addps %%xmm6, %%xmm4 \n\t" | |||
| "haddps %%xmm4, %%xmm4 \n\t" | |||
| "haddps %%xmm4, %%xmm4 \n\t" | |||
| "movss %%xmm4, (%4) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -181,8 +181,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) | |||
| VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) | |||
| addq $6*SIZE, BO | |||
| addq $16*SIZE, AO | |||
| addq $ 6*SIZE, BO | |||
| addq $ 16*SIZE, AO | |||
| decq %rax | |||
| .endm | |||
| @@ -268,8 +268,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) | |||
| VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) | |||
| addq $6*SIZE, BO | |||
| addq $8*SIZE, AO | |||
| addq $ 6*SIZE, BO | |||
| addq $ 8*SIZE, AO | |||
| decq %rax | |||
| .endm | |||
| @@ -327,8 +327,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) | |||
| VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) | |||
| addq $6*SIZE, BO | |||
| addq $4*SIZE, AO | |||
| addq $ 6*SIZE, BO | |||
| addq $ 4*SIZE, AO | |||
| decq %rax | |||
| .endm | |||
| @@ -392,8 +392,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) | |||
| VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) | |||
| addq $6*SIZE, BO | |||
| addq $2*SIZE, AO | |||
| addq $ 6*SIZE, BO | |||
| addq $ 2*SIZE, AO | |||
| decq %rax | |||
| .endm | |||
| @@ -478,8 +478,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) | |||
| VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) | |||
| addq $6*SIZE, BO | |||
| addq $1*SIZE, AO | |||
| addq $ 6*SIZE, BO | |||
| addq $ 1*SIZE, AO | |||
| decq %rax | |||
| .endm | |||
| @@ -0,0 +1,299 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) | |||
| #include "ssymv_L_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "ssymv_L_microk_nehalem-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x4 | |||
| static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2) | |||
| { | |||
| FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; | |||
| BLASLONG i; | |||
| for (i=from; i<to; i+=4) | |||
| { | |||
| y[i] += tmp1[0] * ap[0][i]; | |||
| tmp2[0] += ap[0][i] * x[i]; | |||
| y[i] += tmp1[1] * ap[1][i]; | |||
| tmp2[1] += ap[1][i] * x[i]; | |||
| y[i] += tmp1[2] * ap[2][i]; | |||
| tmp2[2] += ap[2][i] * x[i]; | |||
| y[i] += tmp1[3] * ap[3][i]; | |||
| tmp2[3] += ap[3][i] * x[i]; | |||
| y[i+1] += tmp1[0] * ap[0][i+1]; | |||
| tmp2[0] += ap[0][i+1] * x[i+1]; | |||
| y[i+1] += tmp1[1] * ap[1][i+1]; | |||
| tmp2[1] += ap[1][i+1] * x[i+1]; | |||
| y[i+1] += tmp1[2] * ap[2][i+1]; | |||
| tmp2[2] += ap[2][i+1] * x[i+1]; | |||
| y[i+1] += tmp1[3] * ap[3][i+1]; | |||
| tmp2[3] += ap[3][i+1] * x[i+1]; | |||
| y[i+2] += tmp1[0] * ap[0][i+2]; | |||
| tmp2[0] += ap[0][i+2] * x[i+2]; | |||
| y[i+2] += tmp1[1] * ap[1][i+2]; | |||
| tmp2[1] += ap[1][i+2] * x[i+2]; | |||
| y[i+2] += tmp1[2] * ap[2][i+2]; | |||
| tmp2[2] += ap[2][i+2] * x[i+2]; | |||
| y[i+2] += tmp1[3] * ap[3][i+2]; | |||
| tmp2[3] += ap[3][i+2] * x[i+2]; | |||
| y[i+3] += tmp1[0] * ap[0][i+3]; | |||
| tmp2[0] += ap[0][i+3] * x[i+3]; | |||
| y[i+3] += tmp1[1] * ap[1][i+3]; | |||
| tmp2[1] += ap[1][i+3] * x[i+3]; | |||
| y[i+3] += tmp1[2] * ap[2][i+3]; | |||
| tmp2[2] += ap[2][i+3] * x[i+3]; | |||
| y[i+3] += tmp1[3] * ap[3][i+3]; | |||
| tmp2[3] += ap[3][i+3] * x[i+3]; | |||
| } | |||
| temp2[0] += tmp2[0]; | |||
| temp2[1] += tmp2[1]; | |||
| temp2[2] += tmp2[2]; | |||
| temp2[3] += tmp2[3]; | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| BLASLONG jx,jy; | |||
| BLASLONG j; | |||
| FLOAT temp1; | |||
| FLOAT temp2; | |||
| FLOAT tmp1[4]; | |||
| FLOAT tmp2[4]; | |||
| FLOAT *ap[4]; | |||
| #if 0 | |||
| if ( m != offset ) | |||
| printf("Symv_L: m=%d offset=%d\n",m,offset); | |||
| #endif | |||
| if ( (inc_x != 1) || (inc_y != 1) ) | |||
| { | |||
| jx = 0; | |||
| jy = 0; | |||
| for (j=0; j<offset; j++) | |||
| { | |||
| temp1 = alpha * x[jx]; | |||
| temp2 = 0.0; | |||
| y[jy] += temp1 * a[j*lda+j]; | |||
| iy = jy; | |||
| ix = jx; | |||
| for (i=j+1; i<m; i++) | |||
| { | |||
| ix += inc_x; | |||
| iy += inc_y; | |||
| y[iy] += temp1 * a[j*lda+i]; | |||
| temp2 += a[j*lda+i] * x[ix]; | |||
| } | |||
| y[jy] += alpha * temp2; | |||
| jx += inc_x; | |||
| jy += inc_y; | |||
| } | |||
| return(0); | |||
| } | |||
| BLASLONG offset1 = (offset/4)*4; | |||
| for (j=0; j<offset1; j+=4) | |||
| { | |||
| tmp1[0] = alpha * x[j]; | |||
| tmp1[1] = alpha * x[j+1]; | |||
| tmp1[2] = alpha * x[j+2]; | |||
| tmp1[3] = alpha * x[j+3]; | |||
| tmp2[0] = 0.0; | |||
| tmp2[1] = 0.0; | |||
| tmp2[2] = 0.0; | |||
| tmp2[3] = 0.0; | |||
| ap[0] = &a[j*lda]; | |||
| ap[1] = ap[0] + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| y[j] += tmp1[0] * ap[0][j]; | |||
| y[j+1] += tmp1[1] * ap[1][j+1]; | |||
| y[j+2] += tmp1[2] * ap[2][j+2]; | |||
| y[j+3] += tmp1[3] * ap[3][j+3]; | |||
| BLASLONG from = j+1; | |||
| if ( m - from >=12 ) | |||
| { | |||
| BLASLONG m2 = (m/4)*4; | |||
| for (i=j+1; i<j+4; i++) | |||
| { | |||
| y[i] += tmp1[0] * ap[0][i]; | |||
| tmp2[0] += ap[0][i] * x[i]; | |||
| } | |||
| for (i=j+2; i<j+4; i++) | |||
| { | |||
| y[i] += tmp1[1] * ap[1][i]; | |||
| tmp2[1] += ap[1][i] * x[i]; | |||
| } | |||
| for (i=j+3; i<j+4; i++) | |||
| { | |||
| y[i] += tmp1[2] * ap[2][i]; | |||
| tmp2[2] += ap[2][i] * x[i]; | |||
| } | |||
| if ( m2 > j+4 ) | |||
| ssymv_kernel_4x4(j+4,m2,ap,x,y,tmp1,tmp2); | |||
| for (i=m2; i<m; i++) | |||
| { | |||
| y[i] += tmp1[0] * ap[0][i]; | |||
| tmp2[0] += ap[0][i] * x[i]; | |||
| y[i] += tmp1[1] * ap[1][i]; | |||
| tmp2[1] += ap[1][i] * x[i]; | |||
| y[i] += tmp1[2] * ap[2][i]; | |||
| tmp2[2] += ap[2][i] * x[i]; | |||
| y[i] += tmp1[3] * ap[3][i]; | |||
| tmp2[3] += ap[3][i] * x[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for (i=j+1; i<j+4; i++) | |||
| { | |||
| y[i] += tmp1[0] * ap[0][i]; | |||
| tmp2[0] += ap[0][i] * x[i]; | |||
| } | |||
| for (i=j+2; i<j+4; i++) | |||
| { | |||
| y[i] += tmp1[1] * ap[1][i]; | |||
| tmp2[1] += ap[1][i] * x[i]; | |||
| } | |||
| for (i=j+3; i<j+4; i++) | |||
| { | |||
| y[i] += tmp1[2] * ap[2][i]; | |||
| tmp2[2] += ap[2][i] * x[i]; | |||
| } | |||
| for (i=j+4; i<m; i++) | |||
| { | |||
| y[i] += tmp1[0] * ap[0][i]; | |||
| tmp2[0] += ap[0][i] * x[i]; | |||
| y[i] += tmp1[1] * ap[1][i]; | |||
| tmp2[1] += ap[1][i] * x[i]; | |||
| y[i] += tmp1[2] * ap[2][i]; | |||
| tmp2[2] += ap[2][i] * x[i]; | |||
| y[i] += tmp1[3] * ap[3][i]; | |||
| tmp2[3] += ap[3][i] * x[i]; | |||
| } | |||
| } | |||
| y[j] += alpha * tmp2[0]; | |||
| y[j+1] += alpha * tmp2[1]; | |||
| y[j+2] += alpha * tmp2[2]; | |||
| y[j+3] += alpha * tmp2[3]; | |||
| } | |||
| for (j=offset1; j<offset; j++) | |||
| { | |||
| temp1 = alpha * x[j]; | |||
| temp2 = 0.0; | |||
| y[j] += temp1 * a[j*lda+j]; | |||
| BLASLONG from = j+1; | |||
| if ( m - from >=8 ) | |||
| { | |||
| BLASLONG j1 = ((from + 4)/4)*4; | |||
| BLASLONG j2 = (m/4)*4; | |||
| for (i=from; i<j1; i++) | |||
| { | |||
| y[i] += temp1 * a[j*lda+i]; | |||
| temp2 += a[j*lda+i] * x[i]; | |||
| } | |||
| for (i=j1; i<j2; i++) | |||
| { | |||
| y[i] += temp1 * a[j*lda+i]; | |||
| temp2 += a[j*lda+i] * x[i]; | |||
| } | |||
| for (i=j2; i<m; i++) | |||
| { | |||
| y[i] += temp1 * a[j*lda+i]; | |||
| temp2 += a[j*lda+i] * x[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for (i=from; i<m; i++) | |||
| { | |||
| y[i] += temp1 * a[j*lda+i]; | |||
| temp2 += a[j*lda+i] * x[i]; | |||
| } | |||
| } | |||
| y[j] += alpha * temp2; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,122 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void ssymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); | |||
| static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vxorps %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0] | |||
| "vxorps %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1] | |||
| "vxorps %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2] | |||
| "vxorps %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3] | |||
| "vbroadcastss (%8), %%xmm4 \n\t" // temp1[0] | |||
| "vbroadcastss 4(%8), %%xmm5 \n\t" // temp1[1] | |||
| "vbroadcastss 8(%8), %%xmm6 \n\t" // temp1[2] | |||
| "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[3] | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovups (%4,%0,4), %%xmm12 \n\t" // 2 * a | |||
| "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x | |||
| "vmovups (%3,%0,4), %%xmm9 \n\t" // 2 * y | |||
| "vmovups (%5,%0,4), %%xmm13 \n\t" // 2 * a | |||
| "vfmaddps %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a | |||
| "vfmaddps %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vmovups (%6,%0,4), %%xmm14 \n\t" // 2 * a | |||
| "vfmaddps %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a | |||
| "vfmaddps %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vmovups (%7,%0,4), %%xmm15 \n\t" // 2 * a | |||
| "vfmaddps %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a | |||
| "vfmaddps %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vfmaddps %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a | |||
| "vfmaddps %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a | |||
| "addq $4 , %0 \n\t" | |||
| "vmovups %%xmm9 , -16(%3,%0,4) \n\t" | |||
| "cmpq %0 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmovss (%9), %%xmm4 \n\t" | |||
| "vmovss 4(%9), %%xmm5 \n\t" | |||
| "vmovss 8(%9), %%xmm6 \n\t" | |||
| "vmovss 12(%9), %%xmm7 \n\t" | |||
| "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" | |||
| "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" | |||
| "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" | |||
| "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" | |||
| "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" | |||
| "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" | |||
| "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" | |||
| "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" | |||
| "vaddss %%xmm4, %%xmm0, %%xmm0 \n\t" | |||
| "vaddss %%xmm5, %%xmm1, %%xmm1 \n\t" | |||
| "vaddss %%xmm6, %%xmm2, %%xmm2 \n\t" | |||
| "vaddss %%xmm7, %%xmm3, %%xmm3 \n\t" | |||
| "vmovss %%xmm0 , (%9) \n\t" // save temp2 | |||
| "vmovss %%xmm1 , 4(%9) \n\t" // save temp2 | |||
| "vmovss %%xmm2 , 8(%9) \n\t" // save temp2 | |||
| "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 | |||
| : | |||
| : | |||
| "r" (from), // 0 | |||
| "r" (to), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (a[0]), // 4 | |||
| "r" (a[1]), // 5 | |||
| "r" (a[2]), // 6 | |||
| "r" (a[3]), // 8 | |||
| "r" (temp1), // 8 | |||
| "r" (temp2) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,137 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void ssymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); | |||
| static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorps %%xmm0 , %%xmm0 \n\t" // temp2[0] | |||
| "xorps %%xmm1 , %%xmm1 \n\t" // temp2[1] | |||
| "xorps %%xmm2 , %%xmm2 \n\t" // temp2[2] | |||
| "xorps %%xmm3 , %%xmm3 \n\t" // temp2[3] | |||
| "movss (%8), %%xmm4 \n\t" // temp1[0] | |||
| "movss 4(%8), %%xmm5 \n\t" // temp1[1] | |||
| "movss 8(%8), %%xmm6 \n\t" // temp1[2] | |||
| "movss 12(%8), %%xmm7 \n\t" // temp1[3] | |||
| "shufps $0, %%xmm4, %%xmm4 \n\t" | |||
| "shufps $0, %%xmm5, %%xmm5 \n\t" | |||
| "shufps $0, %%xmm6, %%xmm6 \n\t" | |||
| "shufps $0, %%xmm7, %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x | |||
| "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y | |||
| "movups (%4,%0,4), %%xmm12 \n\t" // 4 * a | |||
| "movups (%5,%0,4), %%xmm13 \n\t" // 4 * a | |||
| "movups %%xmm12 , %%xmm11 \n\t" | |||
| "mulps %%xmm4 , %%xmm11 \n\t" // temp1 * a | |||
| "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulps %%xmm8 , %%xmm12 \n\t" // a * x | |||
| "addps %%xmm12 , %%xmm0 \n\t" // temp2 += x * a | |||
| "movups (%6,%0,4), %%xmm14 \n\t" // 4 * a | |||
| "movups (%7,%0,4), %%xmm15 \n\t" // 4 * a | |||
| "movups %%xmm13 , %%xmm11 \n\t" | |||
| "mulps %%xmm5 , %%xmm11 \n\t" // temp1 * a | |||
| "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulps %%xmm8 , %%xmm13 \n\t" // a * x | |||
| "addps %%xmm13 , %%xmm1 \n\t" // temp2 += x * a | |||
| "movups %%xmm14 , %%xmm11 \n\t" | |||
| "mulps %%xmm6 , %%xmm11 \n\t" // temp1 * a | |||
| "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulps %%xmm8 , %%xmm14 \n\t" // a * x | |||
| "addps %%xmm14 , %%xmm2 \n\t" // temp2 += x * a | |||
| "movups %%xmm15 , %%xmm11 \n\t" | |||
| "mulps %%xmm7 , %%xmm11 \n\t" // temp1 * a | |||
| "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulps %%xmm8 , %%xmm15 \n\t" // a * x | |||
| "addps %%xmm15 , %%xmm3 \n\t" // temp2 += x * a | |||
| "movups %%xmm9, (%3,%0,4) \n\t" // 4 * y | |||
| "addq $4 , %0 \n\t" | |||
| "cmpq %0 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "movss (%9), %%xmm4 \n\t" // temp1[0] | |||
| "movss 4(%9), %%xmm5 \n\t" // temp1[1] | |||
| "movss 8(%9), %%xmm6 \n\t" // temp1[2] | |||
| "movss 12(%9), %%xmm7 \n\t" // temp1[3] | |||
| "haddps %%xmm0, %%xmm0 \n\t" | |||
| "haddps %%xmm1, %%xmm1 \n\t" | |||
| "haddps %%xmm2, %%xmm2 \n\t" | |||
| "haddps %%xmm3, %%xmm3 \n\t" | |||
| "haddps %%xmm0, %%xmm0 \n\t" | |||
| "haddps %%xmm1, %%xmm1 \n\t" | |||
| "haddps %%xmm2, %%xmm2 \n\t" | |||
| "haddps %%xmm3, %%xmm3 \n\t" | |||
| "addss %%xmm4, %%xmm0 \n\t" | |||
| "addss %%xmm5, %%xmm1 \n\t" | |||
| "addss %%xmm6, %%xmm2 \n\t" | |||
| "addss %%xmm7, %%xmm3 \n\t" | |||
| "movss %%xmm0 , (%9) \n\t" // save temp2 | |||
| "movss %%xmm1 , 4(%9) \n\t" // save temp2 | |||
| "movss %%xmm2 , 8(%9) \n\t" // save temp2 | |||
| "movss %%xmm3 , 12(%9) \n\t" // save temp2 | |||
| : | |||
| : | |||
| "r" (from), // 0 | |||
| "r" (to), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (a[0]), // 4 | |||
| "r" (a[1]), // 5 | |||
| "r" (a[2]), // 6 | |||
| "r" (a[3]), // 7 | |||
| "r" (temp1), // 8 | |||
| "r" (temp2) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,273 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) | |||
| #include "ssymv_U_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "ssymv_U_microk_nehalem-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x4 | |||
| static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| FLOAT at0,at1,at2,at3; | |||
| FLOAT x; | |||
| FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; | |||
| FLOAT tp0; | |||
| FLOAT tp1; | |||
| FLOAT tp2; | |||
| FLOAT tp3; | |||
| BLASLONG i; | |||
| tp0 = temp1[0]; | |||
| tp1 = temp1[1]; | |||
| tp2 = temp1[2]; | |||
| tp3 = temp1[3]; | |||
| for (i=0; i<n; i++) | |||
| { | |||
| at0 = a0[i]; | |||
| at1 = a1[i]; | |||
| at2 = a2[i]; | |||
| at3 = a3[i]; | |||
| x = xp[i]; | |||
| yp[i] += tp0 * at0 + tp1 *at1 + tp2 * at2 + tp3 * at3; | |||
| tmp2[0] += at0 * x; | |||
| tmp2[1] += at1 * x; | |||
| tmp2[2] += at2 * x; | |||
| tmp2[3] += at3 * x; | |||
| } | |||
| temp2[0] += tmp2[0]; | |||
| temp2[1] += tmp2[1]; | |||
| temp2[2] += tmp2[2]; | |||
| temp2[3] += tmp2[3]; | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_1x4 | |||
| static void ssymv_kernel_1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| FLOAT at0,at1,at2,at3; | |||
| FLOAT x; | |||
| FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; | |||
| FLOAT tp0; | |||
| FLOAT tp1; | |||
| FLOAT tp2; | |||
| FLOAT tp3; | |||
| BLASLONG i; | |||
| tp0 = temp1[0]; | |||
| tp1 = temp1[1]; | |||
| tp2 = temp1[2]; | |||
| tp3 = temp1[3]; | |||
| for (i=from; i<to; i++) | |||
| { | |||
| at0 = a0[i]; | |||
| at1 = a1[i]; | |||
| at2 = a2[i]; | |||
| at3 = a3[i]; | |||
| x = xp[i]; | |||
| yp[i] += tp0 * at0 + tp1 *at1 + tp2 * at2 + tp3 * at3; | |||
| tmp2[0] += at0 * x; | |||
| tmp2[1] += at1 * x; | |||
| tmp2[2] += at2 * x; | |||
| tmp2[3] += at3 * x; | |||
| } | |||
| temp2[0] += tmp2[0]; | |||
| temp2[1] += tmp2[1]; | |||
| temp2[2] += tmp2[2]; | |||
| temp2[3] += tmp2[3]; | |||
| } | |||
| #endif | |||
| static void ssymv_kernel_8x1(BLASLONG n, FLOAT *a0, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| FLOAT at0,at1,at2,at3; | |||
| FLOAT temp = 0.0; | |||
| FLOAT t1 = *temp1; | |||
| BLASLONG i; | |||
| for (i=0; i<(n/4)*4; i+=4) | |||
| { | |||
| at0 = a0[i]; | |||
| at1 = a0[i+1]; | |||
| at2 = a0[i+2]; | |||
| at3 = a0[i+3]; | |||
| yp[i] += t1 * at0; | |||
| temp += at0 * xp[i]; | |||
| yp[i+1] += t1 * at1; | |||
| temp += at1 * xp[i+1]; | |||
| yp[i+2] += t1 * at2; | |||
| temp += at2 * xp[i+2]; | |||
| yp[i+3] += t1 * at3; | |||
| temp += at3 * xp[i+3]; | |||
| } | |||
| *temp2 = temp; | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| BLASLONG jx,jy; | |||
| BLASLONG j; | |||
| BLASLONG j1; | |||
| BLASLONG j2; | |||
| BLASLONG m2; | |||
| FLOAT temp1; | |||
| FLOAT temp2; | |||
| FLOAT *xp, *yp; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| FLOAT at0,at1,at2,at3; | |||
| FLOAT tmp1[4]; | |||
| FLOAT tmp2[4]; | |||
| #if 0 | |||
| if( m != offset ) | |||
| printf("Symv_U: m=%d offset=%d\n",m,offset); | |||
| #endif | |||
| BLASLONG m1 = m - offset; | |||
| BLASLONG mrange = m -m1; | |||
| if ( (inc_x!=1) || (inc_y!=1) || (mrange<16) ) | |||
| { | |||
| jx = m1 * inc_x; | |||
| jy = m1 * inc_y; | |||
| for (j=m1; j<m; j++) | |||
| { | |||
| temp1 = alpha * x[jx]; | |||
| temp2 = 0.0; | |||
| iy = 0; | |||
| ix = 0; | |||
| for (i=0; i<j; i++) | |||
| { | |||
| y[iy] += temp1 * a[j*lda+i]; | |||
| temp2 += a[j*lda+i] * x[ix]; | |||
| ix += inc_x; | |||
| iy += inc_y; | |||
| } | |||
| y[jy] += temp1 * a[j*lda+j] + alpha * temp2; | |||
| jx += inc_x; | |||
| jy += inc_y; | |||
| } | |||
| return(0); | |||
| } | |||
| xp = x; | |||
| yp = y; | |||
| m2 = m - ( mrange % 4 ); | |||
| for (j=m1; j<m2; j+=4) | |||
| { | |||
| tmp1[0] = alpha * xp[j]; | |||
| tmp1[1] = alpha * xp[j+1]; | |||
| tmp1[2] = alpha * xp[j+2]; | |||
| tmp1[3] = alpha * xp[j+3]; | |||
| tmp2[0] = 0.0; | |||
| tmp2[1] = 0.0; | |||
| tmp2[2] = 0.0; | |||
| tmp2[3] = 0.0; | |||
| a0 = &a[j*lda]; | |||
| a1 = a0+lda; | |||
| a2 = a1+lda; | |||
| a3 = a2+lda; | |||
| j1 = (j/8)*8; | |||
| if ( j1 ) | |||
| ssymv_kernel_4x4(j1, a0, a1, a2, a3, xp, yp, tmp1, tmp2); | |||
| if ( j1 < j ) | |||
| ssymv_kernel_1x4(j1, j, a0, a1, a2, a3, xp, yp, tmp1, tmp2); | |||
| j2 = 0; | |||
| for ( j1 = j ; j1 < j+4 ; j1++ ) | |||
| { | |||
| temp1 = tmp1[j2]; | |||
| temp2 = tmp2[j2]; | |||
| a0 = &a[j1*lda]; | |||
| for ( i=j ; i<j1; i++ ) | |||
| { | |||
| yp[i] += temp1 * a0[i]; | |||
| temp2 += a0[i] * xp[i]; | |||
| } | |||
| y[j1] += temp1 * a0[j1] + alpha * temp2; | |||
| j2++; | |||
| } | |||
| } | |||
| for ( ; j<m; j++) | |||
| { | |||
| temp1 = alpha * xp[j]; | |||
| temp2 = 0.0; | |||
| a0 = &a[j*lda]; | |||
| FLOAT at0; | |||
| j1 = (j/8)*8; | |||
| if ( j1 ) | |||
| ssymv_kernel_8x1(j1, a0, xp, yp, &temp1, &temp2); | |||
| for (i=j1 ; i<j; i++) | |||
| { | |||
| at0 = a0[i]; | |||
| yp[i] += temp1 * at0; | |||
| temp2 += at0 * xp[i]; | |||
| } | |||
| yp[j] += temp1 * a0[j] + alpha * temp2; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,114 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void ssymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); | |||
| static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vxorps %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0] | |||
| "vxorps %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1] | |||
| "vxorps %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2] | |||
| "vxorps %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3] | |||
| "vbroadcastss (%8), %%xmm4 \n\t" // temp1[0] | |||
| "vbroadcastss 4(%8), %%xmm5 \n\t" // temp1[1] | |||
| "vbroadcastss 8(%8), %%xmm6 \n\t" // temp1[1] | |||
| "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[1] | |||
| "xorq %0,%0 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovups (%2,%0,4), %%xmm8 \n\t" // 4 * x | |||
| "vmovups (%3,%0,4), %%xmm9 \n\t" // 4 * y | |||
| "vmovups (%4,%0,4), %%xmm12 \n\t" // 4 * a | |||
| "vmovups (%5,%0,4), %%xmm13 \n\t" // 4 * a | |||
| "vfmaddps %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a | |||
| "vfmaddps %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vfmaddps %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a | |||
| "vmovups (%6,%0,4), %%xmm14 \n\t" // 4 * a | |||
| "vfmaddps %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vfmaddps %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a | |||
| "vmovups (%7,%0,4), %%xmm15 \n\t" // 4 * a | |||
| "vfmaddps %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vfmaddps %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a | |||
| "vfmaddps %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a | |||
| "vmovups %%xmm9 , (%3,%0,4) \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" | |||
| "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" | |||
| "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" | |||
| "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" | |||
| "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" | |||
| "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" | |||
| "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" | |||
| "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" | |||
| "vmovss %%xmm0 , (%9) \n\t" // save temp2 | |||
| "vmovss %%xmm1 , 4(%9) \n\t" // save temp2 | |||
| "vmovss %%xmm2 , 8(%9) \n\t" // save temp2 | |||
| "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (a0), // 4 | |||
| "r" (a1), // 5 | |||
| "r" (a2), // 6 | |||
| "r" (a3), // 7 | |||
| "r" (temp1), // 8 | |||
| "r" (temp2) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,130 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void ssymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); | |||
| static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorps %%xmm0 , %%xmm0 \n\t" // temp2[0] | |||
| "xorps %%xmm1 , %%xmm1 \n\t" // temp2[1] | |||
| "xorps %%xmm2 , %%xmm2 \n\t" // temp2[2] | |||
| "xorps %%xmm3 , %%xmm3 \n\t" // temp2[3] | |||
| "movss (%8), %%xmm4 \n\t" // temp1[0] | |||
| "movss 4(%8), %%xmm5 \n\t" // temp1[1] | |||
| "movss 8(%8), %%xmm6 \n\t" // temp1[2] | |||
| "movss 12(%8), %%xmm7 \n\t" // temp1[3] | |||
| "shufps $0, %%xmm4, %%xmm4 \n\t" | |||
| "shufps $0, %%xmm5, %%xmm5 \n\t" | |||
| "shufps $0, %%xmm6, %%xmm6 \n\t" | |||
| "shufps $0, %%xmm7, %%xmm7 \n\t" | |||
| "xorq %0,%0 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x | |||
| "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y | |||
| "movups (%4,%0,4), %%xmm12 \n\t" // 4 * a | |||
| "movups (%5,%0,4), %%xmm13 \n\t" // 4 * a | |||
| "movups %%xmm12 , %%xmm11 \n\t" | |||
| "mulps %%xmm4 , %%xmm11 \n\t" // temp1 * a | |||
| "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulps %%xmm8 , %%xmm12 \n\t" // a * x | |||
| "addps %%xmm12 , %%xmm0 \n\t" // temp2 += x * a | |||
| "movups (%6,%0,4), %%xmm14 \n\t" // 4 * a | |||
| "movups (%7,%0,4), %%xmm15 \n\t" // 4 * a | |||
| "movups %%xmm13 , %%xmm11 \n\t" | |||
| "mulps %%xmm5 , %%xmm11 \n\t" // temp1 * a | |||
| "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulps %%xmm8 , %%xmm13 \n\t" // a * x | |||
| "addps %%xmm13 , %%xmm1 \n\t" // temp2 += x * a | |||
| "movups %%xmm14 , %%xmm11 \n\t" | |||
| "mulps %%xmm6 , %%xmm11 \n\t" // temp1 * a | |||
| "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulps %%xmm8 , %%xmm14 \n\t" // a * x | |||
| "addps %%xmm14 , %%xmm2 \n\t" // temp2 += x * a | |||
| "movups %%xmm15 , %%xmm11 \n\t" | |||
| "mulps %%xmm7 , %%xmm11 \n\t" // temp1 * a | |||
| "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a | |||
| "mulps %%xmm8 , %%xmm15 \n\t" // a * x | |||
| "addps %%xmm15 , %%xmm3 \n\t" // temp2 += x * a | |||
| "movups %%xmm9, (%3,%0,4) \n\t" // 4 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "haddps %%xmm0, %%xmm0 \n\t" | |||
| "haddps %%xmm1, %%xmm1 \n\t" | |||
| "haddps %%xmm2, %%xmm2 \n\t" | |||
| "haddps %%xmm3, %%xmm3 \n\t" | |||
| "haddps %%xmm0, %%xmm0 \n\t" | |||
| "haddps %%xmm1, %%xmm1 \n\t" | |||
| "haddps %%xmm2, %%xmm2 \n\t" | |||
| "haddps %%xmm3, %%xmm3 \n\t" | |||
| "movss %%xmm0 , (%9) \n\t" // save temp2 | |||
| "movss %%xmm1 , 4(%9) \n\t" // save temp2 | |||
| "movss %%xmm2 , 8(%9) \n\t" // save temp2 | |||
| "movss %%xmm3 , 12(%9) \n\t" // save temp2 | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (a0), // 4 | |||
| "r" (a1), // 5 | |||
| "r" (a2), // 6 | |||
| "r" (a3), // 7 | |||
| "r" (temp1), // 8 | |||
| "r" (temp2) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,131 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) | |||
| #include "zaxpy_microk_bulldozer-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4 | |||
| static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| BLASLONG register ix = 0; | |||
| FLOAT da_r = alpha[0]; | |||
| FLOAT da_i = alpha[1]; | |||
| while(i < n) | |||
| { | |||
| #if !defined(CONJ) | |||
| y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; | |||
| y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; | |||
| y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; | |||
| y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; | |||
| #else | |||
| y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; | |||
| y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; | |||
| y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; | |||
| y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; | |||
| #endif | |||
| ix+=4 ; | |||
| i+=2 ; | |||
| } | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT da[2]; | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| int n1 = n & -4; | |||
| if ( n1 ) | |||
| { | |||
| da[0] = da_r; | |||
| da[1] = da_i; | |||
| zaxpy_kernel_4(n1, x, y , &da ); | |||
| ix = 2 * n1; | |||
| } | |||
| i = n1; | |||
| while(i < n) | |||
| { | |||
| #if !defined(CONJ) | |||
| y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; | |||
| y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; | |||
| #else | |||
| y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; | |||
| y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; | |||
| #endif | |||
| i++ ; | |||
| ix += 2; | |||
| } | |||
| return(0); | |||
| } | |||
| inc_x *=2; | |||
| inc_y *=2; | |||
| while(i < n) | |||
| { | |||
| #if !defined(CONJ) | |||
| y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; | |||
| y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; | |||
| #else | |||
| y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; | |||
| y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; | |||
| #endif | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,135 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4 1 | |||
| static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vmovddup (%4), %%xmm0 \n\t" // real part of alpha | |||
| "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "prefetcht0 768(%2,%0,8) \n\t" | |||
| "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x | |||
| "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex values from x | |||
| "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex values from x | |||
| "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex values from x | |||
| "prefetcht0 768(%3,%0,8) \n\t" | |||
| #if !defined(CONJ) | |||
| "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm12 \n\t" | |||
| "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part | |||
| "vmulpd %%xmm1, %%xmm4 , %%xmm4 \n\t" | |||
| "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm13 \n\t" | |||
| "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part | |||
| "vmulpd %%xmm1, %%xmm6 , %%xmm6 \n\t" | |||
| "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm14 \n\t" | |||
| "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part | |||
| "vmulpd %%xmm1, %%xmm8 , %%xmm8 \n\t" | |||
| "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm15 \n\t" | |||
| "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part | |||
| "vmulpd %%xmm1, %%xmm10, %%xmm10 \n\t" | |||
| "vaddsubpd %%xmm4, %%xmm12, %%xmm12 \n\t" | |||
| "vaddsubpd %%xmm6, %%xmm13, %%xmm13 \n\t" | |||
| "vaddsubpd %%xmm8, %%xmm14, %%xmm14 \n\t" | |||
| "vaddsubpd %%xmm10,%%xmm15, %%xmm15 \n\t" | |||
| #else | |||
| "vmulpd %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i | |||
| "vmulpd %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i | |||
| "vmulpd %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i | |||
| "vmulpd %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i | |||
| "vmulpd %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i | |||
| "vmulpd %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i | |||
| "vmulpd %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i | |||
| "vmulpd %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i | |||
| "vpermilpd $0x1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part | |||
| "vaddsubpd %%xmm4 ,%%xmm5 , %%xmm4 \n\t" | |||
| "vpermilpd $0x1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part | |||
| "vpermilpd $0x1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part | |||
| "vaddsubpd %%xmm6 ,%%xmm7 , %%xmm6 \n\t" | |||
| "vpermilpd $0x1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part | |||
| "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part | |||
| "vaddsubpd %%xmm8 ,%%xmm9 , %%xmm8 \n\t" | |||
| "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part | |||
| "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part | |||
| "vaddsubpd %%xmm10,%%xmm11, %%xmm10 \n\t" | |||
| "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part | |||
| "vaddpd (%3,%0,8) ,%%xmm4 , %%xmm12 \n\t" | |||
| "vaddpd 16(%3,%0,8) ,%%xmm6 , %%xmm13 \n\t" | |||
| "vaddpd 32(%3,%0,8) ,%%xmm8 , %%xmm14 \n\t" | |||
| "vaddpd 48(%3,%0,8) ,%%xmm10, %%xmm15 \n\t" | |||
| #endif | |||
| "vmovups %%xmm12, (%3,%0,8) \n\t" | |||
| "vmovups %%xmm13, 16(%3,%0,8) \n\t" | |||
| "vmovups %%xmm14, 32(%3,%0,8) \n\t" | |||
| "vmovups %%xmm15, 48(%3,%0,8) \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -222,8 +222,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) | |||
| VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) | |||
| addq $6*SIZE, BO | |||
| addq $8*SIZE, AO | |||
| addq $ 6*SIZE, BO | |||
| addq $ 8*SIZE, AO | |||
| decq %rax | |||
| .endm | |||
| @@ -362,8 +362,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) | |||
| VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) | |||
| addq $6*SIZE, BO | |||
| addq $4*SIZE, AO | |||
| addq $ 6*SIZE, BO | |||
| addq $ 4*SIZE, AO | |||
| decq %rax | |||
| .endm | |||
| @@ -491,8 +491,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) | |||
| VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) | |||
| addq $6*SIZE, BO | |||
| addq $2*SIZE, AO | |||
| addq $ 6*SIZE, BO | |||
| addq $ 2*SIZE, AO | |||
| decq %rax | |||
| .endm | |||