Refs #533. Added optimized saxpy and daxpy kernels for Haswell and Sandy Bridge (tags/v0.2.15^2)
| @@ -10,7 +10,7 @@ include $(TOPDIR)/Makefile.system | |||
| #LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm | |||
| # ACML 6.1 custom | |||
| ACML=/home/werner/project/acml6.1/gfortran64_mp/lib | |||
| ACML=/home/saar/acml6.1/gfortran64_mp/lib | |||
| LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm | |||
| @@ -40,7 +40,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ | |||
| ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ | |||
| sger.goto dger.goto \ | |||
| sdot.goto ddot.goto \ | |||
| sdot.goto ddot.goto cdot.goto zdot.goto \ | |||
| saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ | |||
| ssymv.goto dsymv.goto csymv.goto zsymv.goto \ | |||
| chemv.goto zhemv.goto \ | |||
| @@ -61,7 +61,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | |||
| ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ | |||
| ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ | |||
| sger.acml dger.acml \ | |||
| sdot.acml ddot.acml \ | |||
| sdot.acml ddot.acml cdot.acml zdot.acml \ | |||
| saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ | |||
| ssymv.acml dsymv.acml csymv.acml zsymv.acml \ | |||
| chemv.acml zhemv.acml \ | |||
| @@ -104,7 +104,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ | |||
| ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ | |||
| ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ | |||
| sger.mkl dger.mkl \ | |||
| sdot.mkl ddot.mkl \ | |||
| sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ | |||
| saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ | |||
| ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ | |||
| chemv.mkl zhemv.mkl \ | |||
| @@ -998,6 +998,32 @@ ddot.atlas : ddot.$(SUFFIX) | |||
| ddot.mkl : ddot.$(SUFFIX) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
##################################### Cdot / Zdot ####################################################
# Link rules for the complex dot-product benchmarks. The stem (%) is the
# benchmark name (cdot or zdot); each recipe links all prerequisites ($^)
# into the file part of the target ($(@F)).

# .goto variants additionally depend on, and link, ../$(LIBNAME).
cdot.goto zdot.goto : %.goto : %.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm

cdot.acml zdot.acml : %.acml : %.$(SUFFIX)
	$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cdot.atlas zdot.atlas : %.atlas : %.$(SUFFIX)
	$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

# .mkl variants are linked from the *-intel objects instead.
cdot.mkl zdot.mkl : %.mkl : %-intel.$(SUFFIX)
	$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
| ##################################### Saxpy #################################################### | |||
| saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm | |||
| @@ -1264,6 +1290,20 @@ sdot.$(SUFFIX) : dot.c | |||
| ddot.$(SUFFIX) : dot.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ | |||
# Objects for the complex dot-product benchmarks. COMPLEX is always defined;
# DOUBLE is undefined for the c* objects and defined for the z* objects.
# The plain objects are compiled from zdot.c, the *-intel ones from
# zdot-intel.c.
cdot.$(SUFFIX) : zdot.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $<

zdot.$(SUFFIX) : zdot.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $<

cdot-intel.$(SUFFIX) : zdot-intel.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $<

zdot-intel.$(SUFFIX) : zdot-intel.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $<
| saxpy.$(SUFFIX) : axpy.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| @@ -0,0 +1,196 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #ifdef __CYGWIN32__ | |||
| #include <sys/time.h> | |||
| #endif | |||
| #define RETURN_BY_STACK 1 | |||
| #include "common.h" | |||
| #undef DOT | |||
| #ifdef DOUBLE | |||
| #define DOT BLASFUNC(zdotu) | |||
| #else | |||
| #define DOT BLASFUNC(cdotu) | |||
| #endif | |||
| #if defined(__WIN32__) || defined(__WIN64__) | |||
| #ifndef DELTA_EPOCH_IN_MICROSECS | |||
| #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL | |||
| #endif | |||
| int gettimeofday(struct timeval *tv, void *tz){ | |||
| FILETIME ft; | |||
| unsigned __int64 tmpres = 0; | |||
| static int tzflag; | |||
| if (NULL != tv) | |||
| { | |||
| GetSystemTimeAsFileTime(&ft); | |||
| tmpres |= ft.dwHighDateTime; | |||
| tmpres <<= 32; | |||
| tmpres |= ft.dwLowDateTime; | |||
| /*converting file time to unix epoch*/ | |||
| tmpres /= 10; /*convert into microseconds*/ | |||
| tmpres -= DELTA_EPOCH_IN_MICROSECS; | |||
| tv->tv_sec = (long)(tmpres / 1000000UL); | |||
| tv->tv_usec = (long)(tmpres % 1000000UL); | |||
| } | |||
| return 0; | |||
| } | |||
| #endif | |||
| #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 | |||
| static void *huge_malloc(BLASLONG size){ | |||
| int shmid; | |||
| void *address; | |||
| #ifndef SHM_HUGETLB | |||
| #define SHM_HUGETLB 04000 | |||
| #endif | |||
| if ((shmid =shmget(IPC_PRIVATE, | |||
| (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), | |||
| SHM_HUGETLB | IPC_CREAT |0600)) < 0) { | |||
| printf( "Memory allocation failed(shmget).\n"); | |||
| exit(1); | |||
| } | |||
| address = shmat(shmid, NULL, SHM_RND); | |||
| if ((BLASLONG)address == -1){ | |||
| printf( "Memory allocation failed(shmat).\n"); | |||
| exit(1); | |||
| } | |||
| shmctl(shmid, IPC_RMID, 0); | |||
| return address; | |||
| } | |||
| #define malloc huge_malloc | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x, *y; | |||
| FLOAT _Complex result; | |||
| blasint m, i; | |||
| blasint inc_x=1,inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| struct timeval start, stop; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| DOT (&result, &m, x, &inc_x, y, &inc_y ); | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -0,0 +1,195 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #ifdef __CYGWIN32__ | |||
| #include <sys/time.h> | |||
| #endif | |||
| #include "common.h" | |||
| #undef DOT | |||
| #ifdef DOUBLE | |||
| #define DOT BLASFUNC(zdotu) | |||
| #else | |||
| #define DOT BLASFUNC(cdotu) | |||
| #endif | |||
| #if defined(__WIN32__) || defined(__WIN64__) | |||
| #ifndef DELTA_EPOCH_IN_MICROSECS | |||
| #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL | |||
| #endif | |||
| int gettimeofday(struct timeval *tv, void *tz){ | |||
| FILETIME ft; | |||
| unsigned __int64 tmpres = 0; | |||
| static int tzflag; | |||
| if (NULL != tv) | |||
| { | |||
| GetSystemTimeAsFileTime(&ft); | |||
| tmpres |= ft.dwHighDateTime; | |||
| tmpres <<= 32; | |||
| tmpres |= ft.dwLowDateTime; | |||
| /*converting file time to unix epoch*/ | |||
| tmpres /= 10; /*convert into microseconds*/ | |||
| tmpres -= DELTA_EPOCH_IN_MICROSECS; | |||
| tv->tv_sec = (long)(tmpres / 1000000UL); | |||
| tv->tv_usec = (long)(tmpres % 1000000UL); | |||
| } | |||
| return 0; | |||
| } | |||
| #endif | |||
| #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 | |||
| static void *huge_malloc(BLASLONG size){ | |||
| int shmid; | |||
| void *address; | |||
| #ifndef SHM_HUGETLB | |||
| #define SHM_HUGETLB 04000 | |||
| #endif | |||
| if ((shmid =shmget(IPC_PRIVATE, | |||
| (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), | |||
| SHM_HUGETLB | IPC_CREAT |0600)) < 0) { | |||
| printf( "Memory allocation failed(shmget).\n"); | |||
| exit(1); | |||
| } | |||
| address = shmat(shmid, NULL, SHM_RND); | |||
| if ((BLASLONG)address == -1){ | |||
| printf( "Memory allocation failed(shmat).\n"); | |||
| exit(1); | |||
| } | |||
| shmctl(shmid, IPC_RMID, 0); | |||
| return address; | |||
| } | |||
| #define malloc huge_malloc | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x, *y; | |||
| FLOAT _Complex result; | |||
| blasint m, i; | |||
| blasint inc_x=1,inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| struct timeval start, stop; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| result = DOT (&m, x, &inc_x, y, &inc_y ); | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -13,6 +13,8 @@ CGEMVTKERNEL = cgemv_t_4.c | |||
| SDOTKERNEL = sdot.c | |||
| DDOTKERNEL = ddot.c | |||
| DAXPYKERNEL = daxpy.c | |||
| SAXPYKERNEL = saxpy.c | |||
| SGEMMKERNEL = sgemm_kernel_16x4_haswell.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| @@ -1,3 +1,7 @@ | |||
| DAXPYKERNEL = daxpy.c | |||
| CAXPYKERNEL = caxpy.c | |||
| ZAXPYKERNEL = zaxpy.c | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| @@ -7,6 +11,7 @@ ZGEMVTKERNEL = zgemv_t_4.c | |||
| DGEMVNKERNEL = dgemv_n_bulldozer.S | |||
| DGEMVTKERNEL = dgemv_t_bulldozer.S | |||
| SDOTKERNEL = sdot.c | |||
| DDOTKERNEL = ddot_bulldozer.S | |||
| DCOPYKERNEL = dcopy_bulldozer.S | |||
| @@ -6,6 +6,9 @@ ZGEMVNKERNEL = zgemv_n_4.c | |||
| SDOTKERNEL = sdot.c | |||
| DDOTKERNEL = ddot.c | |||
| SAXPYKERNEL = saxpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
| SGEMMKERNEL = sgemm_kernel_16x4_sandy.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| @@ -33,6 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "daxpy_microk_nehalem-2.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #include "daxpy_microk_bulldozer-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "daxpy_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "daxpy_microk_sandy-2.c" | |||
| #endif | |||
| @@ -71,7 +75,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| int n1 = n & -8; | |||
| #if defined(SANDYBRIDGE) | |||
| int n1 = n & -32; | |||
| #else | |||
| int n1 = n & -16; | |||
| #endif | |||
| if ( n1 ) | |||
| daxpy_kernel_8(n1, x, y , &da ); | |||
| @@ -0,0 +1,78 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vbroadcastsd (%4), %%ymm0 \n\t" // alpha | |||
| ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%3,%0,8), %%ymm12 \n\t" // 4 * y | |||
| "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 4 * y | |||
| "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 4 * y | |||
| "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 4 * y | |||
| "vfmadd231pd (%2,%0,8), %%ymm0 , %%ymm12 \n\t" // y += alpha * x | |||
| "vfmadd231pd 32(%2,%0,8), %%ymm0 , %%ymm13 \n\t" // y += alpha * x | |||
| "vfmadd231pd 64(%2,%0,8), %%ymm0 , %%ymm14 \n\t" // y += alpha * x | |||
| "vfmadd231pd 96(%2,%0,8), %%ymm0 , %%ymm15 \n\t" // y += alpha * x | |||
| "vmovups %%ymm12, (%3,%0,8) \n\t" | |||
| "vmovups %%ymm13, 32(%3,%0,8) \n\t" | |||
| "vmovups %%ymm14, 64(%3,%0,8) \n\t" | |||
| "vmovups %%ymm15, 96(%3,%0,8) \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| : "cc", | |||
| "%xmm0", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,100 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vbroadcastsd (%4), %%ymm0 \n\t" // alpha | |||
| ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%3,%0,8), %%ymm8 \n\t" | |||
| "vmovups 32(%3,%0,8), %%ymm9 \n\t" | |||
| "vmovups 64(%3,%0,8), %%ymm10 \n\t" | |||
| "vmovups 96(%3,%0,8), %%ymm11 \n\t" | |||
| "vmovups 128(%3,%0,8), %%ymm12 \n\t" | |||
| "vmovups 160(%3,%0,8), %%ymm13 \n\t" | |||
| "vmovups 192(%3,%0,8), %%ymm14 \n\t" | |||
| "vmovups 224(%3,%0,8), %%ymm15 \n\t" | |||
| "vmulpd (%2,%0,8), %%ymm0, %%ymm1 \n\t" | |||
| "vmulpd 32(%2,%0,8), %%ymm0, %%ymm2 \n\t" | |||
| "vaddpd %%ymm8 , %%ymm1, %%ymm8 \n\t" | |||
| "vmulpd 64(%2,%0,8), %%ymm0, %%ymm3 \n\t" | |||
| "vaddpd %%ymm9 , %%ymm2, %%ymm9 \n\t" | |||
| "vmulpd 96(%2,%0,8), %%ymm0, %%ymm4 \n\t" | |||
| "vaddpd %%ymm10, %%ymm3, %%ymm10 \n\t" | |||
| "vmulpd 128(%2,%0,8), %%ymm0, %%ymm5 \n\t" | |||
| "vaddpd %%ymm11, %%ymm4, %%ymm11 \n\t" | |||
| "vmulpd 160(%2,%0,8), %%ymm0, %%ymm6 \n\t" | |||
| "vaddpd %%ymm12, %%ymm5, %%ymm12 \n\t" | |||
| "vmulpd 192(%2,%0,8), %%ymm0, %%ymm7 \n\t" | |||
| "vmulpd 224(%2,%0,8), %%ymm0, %%ymm1 \n\t" | |||
| "vaddpd %%ymm13, %%ymm6, %%ymm13 \n\t" | |||
| "vmovups %%ymm8 , (%3,%0,8) \n\t" | |||
| "vaddpd %%ymm14, %%ymm7, %%ymm14 \n\t" | |||
| "vmovups %%ymm9 , 32(%3,%0,8) \n\t" | |||
| "vaddpd %%ymm15, %%ymm1, %%ymm15 \n\t" | |||
| "vmovups %%ymm10, 64(%3,%0,8) \n\t" | |||
| "vmovups %%ymm11, 96(%3,%0,8) \n\t" | |||
| "vmovups %%ymm12,128(%3,%0,8) \n\t" | |||
| "vmovups %%ymm13,160(%3,%0,8) \n\t" | |||
| "vmovups %%ymm14,192(%3,%0,8) \n\t" | |||
| "vmovups %%ymm15,224(%3,%0,8) \n\t" | |||
| "addq $32, %0 \n\t" | |||
| "subq $32, %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -31,6 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NEHALEM) | |||
| #include "saxpy_microk_nehalem-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "saxpy_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "saxpy_microk_sandy-2.c" | |||
| #endif | |||
| @@ -69,7 +73,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| int n1 = n & -16; | |||
| #if defined(SANDYBRIDGE) | |||
| int n1 = n & -64; | |||
| #else | |||
| int n1 = n & -32; | |||
| #endif | |||
| if ( n1 ) | |||
| saxpy_kernel_16(n1, x, y , &da ); | |||
| @@ -0,0 +1,78 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vbroadcastss (%4), %%ymm0 \n\t" // alpha | |||
| ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%3,%0,4), %%ymm12 \n\t" // 8 * y | |||
| "vmovups 32(%3,%0,4), %%ymm13 \n\t" // 8 * y | |||
| "vmovups 64(%3,%0,4), %%ymm14 \n\t" // 8 * y | |||
| "vmovups 96(%3,%0,4), %%ymm15 \n\t" // 8 * y | |||
| "vfmadd231ps (%2,%0,4), %%ymm0 , %%ymm12 \n\t" // y += alpha * x | |||
| "vfmadd231ps 32(%2,%0,4), %%ymm0 , %%ymm13 \n\t" // y += alpha * x | |||
| "vfmadd231ps 64(%2,%0,4), %%ymm0 , %%ymm14 \n\t" // y += alpha * x | |||
| "vfmadd231ps 96(%2,%0,4), %%ymm0 , %%ymm15 \n\t" // y += alpha * x | |||
| "vmovups %%ymm12, (%3,%0,4) \n\t" | |||
| "vmovups %%ymm13, 32(%3,%0,4) \n\t" | |||
| "vmovups %%ymm14, 64(%3,%0,4) \n\t" | |||
| "vmovups %%ymm15, 96(%3,%0,4) \n\t" | |||
| "addq $32, %0 \n\t" | |||
| "subq $32, %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| : "cc", | |||
| "%xmm0", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,100 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vbroadcastss (%4), %%ymm0 \n\t" // alpha | |||
| ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%3,%0,4), %%ymm8 \n\t" | |||
| "vmovups 32(%3,%0,4), %%ymm9 \n\t" | |||
| "vmovups 64(%3,%0,4), %%ymm10 \n\t" | |||
| "vmovups 96(%3,%0,4), %%ymm11 \n\t" | |||
| "vmovups 128(%3,%0,4), %%ymm12 \n\t" | |||
| "vmovups 160(%3,%0,4), %%ymm13 \n\t" | |||
| "vmovups 192(%3,%0,4), %%ymm14 \n\t" | |||
| "vmovups 224(%3,%0,4), %%ymm15 \n\t" | |||
| "vmulps (%2,%0,4), %%ymm0, %%ymm1 \n\t" | |||
| "vmulps 32(%2,%0,4), %%ymm0, %%ymm2 \n\t" | |||
| "vaddps %%ymm8 , %%ymm1, %%ymm8 \n\t" | |||
| "vmulps 64(%2,%0,4), %%ymm0, %%ymm3 \n\t" | |||
| "vaddps %%ymm9 , %%ymm2, %%ymm9 \n\t" | |||
| "vmulps 96(%2,%0,4), %%ymm0, %%ymm4 \n\t" | |||
| "vaddps %%ymm10, %%ymm3, %%ymm10 \n\t" | |||
| "vmulps 128(%2,%0,4), %%ymm0, %%ymm5 \n\t" | |||
| "vaddps %%ymm11, %%ymm4, %%ymm11 \n\t" | |||
| "vmulps 160(%2,%0,4), %%ymm0, %%ymm6 \n\t" | |||
| "vaddps %%ymm12, %%ymm5, %%ymm12 \n\t" | |||
| "vmulps 192(%2,%0,4), %%ymm0, %%ymm7 \n\t" | |||
| "vmulps 224(%2,%0,4), %%ymm0, %%ymm1 \n\t" | |||
| "vaddps %%ymm13, %%ymm6, %%ymm13 \n\t" | |||
| "vmovups %%ymm8 , (%3,%0,4) \n\t" | |||
| "vaddps %%ymm14, %%ymm7, %%ymm14 \n\t" | |||
| "vmovups %%ymm9 , 32(%3,%0,4) \n\t" | |||
| "vaddps %%ymm15, %%ymm1, %%ymm15 \n\t" | |||
| "vmovups %%ymm10, 64(%3,%0,4) \n\t" | |||
| "vmovups %%ymm11, 96(%3,%0,4) \n\t" | |||
| "vmovups %%ymm12,128(%3,%0,4) \n\t" | |||
| "vmovups %%ymm13,160(%3,%0,4) \n\t" | |||
| "vmovups %%ymm14,192(%3,%0,4) \n\t" | |||
| "vmovups %%ymm15,224(%3,%0,4) \n\t" | |||
| "addq $64, %0 \n\t" | |||
| "subq $64, %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||