| @@ -1,16 +1,23 @@ | |||||
| *.obj | *.obj | ||||
| *.lib | *.lib | ||||
| *.dll | *.dll | ||||
| *.dylib | |||||
| *.def | *.def | ||||
| *.o | *.o | ||||
| lapack-3.1.1 | lapack-3.1.1 | ||||
| lapack-3.1.1.tgz | lapack-3.1.1.tgz | ||||
| lapack-3.4.1 | |||||
| lapack-3.4.1.tgz | |||||
| *.so | *.so | ||||
| *.a | *.a | ||||
| .svn | .svn | ||||
| *~ | *~ | ||||
| lib.grd | |||||
| nohup.out | |||||
| config.h | config.h | ||||
| Makefile.conf | Makefile.conf | ||||
| Makefile.conf_last | |||||
| config_last.h | |||||
| getarch | getarch | ||||
| getarch_2nd | getarch_2nd | ||||
| utest/openblas_utest | utest/openblas_utest | ||||
| @@ -1,4 +1,17 @@ | |||||
| OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
| ==================================================================== | |||||
| Version 0.2.0 | |||||
| 26-Jun-2012 | |||||
| common: | |||||
| * Removed the limitation (64) of numbers of CPU cores. | |||||
| Now, it supports 256 cores at max. | |||||
| * Supported clang compiler. | |||||
| * Fixed some build bugs on FreeBSD | |||||
| x86/x86-64: | |||||
| * Optimized Level-3 BLAS on Intel Sandy Bridge x86-64 by AVX instructions. | |||||
| Please use gcc >= 4.6 or clang >=3.1. | |||||
| * Support AMD Bobcat by using GotoBLAS2 AMD Barcelona codes. | |||||
| ==================================================================== | ==================================================================== | ||||
| Version 0.1.1 | Version 0.1.1 | ||||
| 29-Apr-2012 | 29-Apr-2012 | ||||
| @@ -7,6 +20,8 @@ common: | |||||
| * Supported LAPACKE, a C interface to LAPACKE. (Thank Zaheer Chothia) | * Supported LAPACKE, a C interface to LAPACKE. (Thank Zaheer Chothia) | ||||
| * Fixed the build bug (MD5 and download) on Mac OSX. | * Fixed the build bug (MD5 and download) on Mac OSX. | ||||
| * Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1. | * Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1. | ||||
| * Fixed the compatibility issue for compilers without C99 complex number | |||||
| (e.g. Visual Studio) | |||||
| x86/x86_64: | x86/x86_64: | ||||
| * Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX. | * Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX. | ||||
| * Test alpha=Nan in dscale. | * Test alpha=Nan in dscale. | ||||
| @@ -90,6 +90,15 @@ | |||||
| number of threads will consume extra resource. I recommend you to | number of threads will consume extra resource. I recommend you to | ||||
| specify minimum number of threads. | specify minimum number of threads. | ||||
| 1.9 Q I have segfaults when I compile with USE_OPENMP=1. What's wrong? | |||||
| A This may be related to a bug in the Linux kernel 2.6.32. Try applying | |||||
| the patch segfaults.patch using | |||||
| patch < segfaults.patch | |||||
| and see if the crashes persist. Note that this patch will lead to many | |||||
| compiler warnings. | |||||
| 2. Architecture Specific issue or Implementation | 2. Architecture Specific issue or Implementation | ||||
| @@ -256,12 +256,17 @@ LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.1.tgz | |||||
| lapack-3.4.1.tgz : | lapack-3.4.1.tgz : | ||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| ifeq ($(OSNAME), Darwin) | |||||
| #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or | |||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD)) | |||||
| curl -O $(LAPACK_URL) | curl -O $(LAPACK_URL) | ||||
| else | |||||
| ifeq ($(OSNAME), FreeBSD) | |||||
| fetch $(LAPACK_URL) | |||||
| else | else | ||||
| wget $(LAPACK_URL) | wget $(LAPACK_URL) | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| large.tgz : | large.tgz : | ||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.1.1 | |||||
| VERSION = 0.2.0 | |||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
| @@ -94,8 +94,8 @@ VERSION = 0.1.1 | |||||
| # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | ||||
| # with single thread. You can use this flag to avoid the overhead of multi-threading | # with single thread. You can use this flag to avoid the overhead of multi-threading | ||||
| # in small matrix sizes. The default value is 4. | |||||
| # GEMM_MULTITHREAD_THRESHOLD = 4 | |||||
| # in small matrix sizes. The default value is 50. | |||||
| # GEMM_MULTITHREAD_THRESHOLD = 50 | |||||
| # If you need santy check by comparing reference BLAS. It'll be very | # If you need santy check by comparing reference BLAS. It'll be very | ||||
| # slow (Not implemented yet). | # slow (Not implemented yet). | ||||
| @@ -14,7 +14,15 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.1 | |||||
| endif | endif | ||||
| # Default C compiler | # Default C compiler | ||||
| # - Only set if not specified on the command line or inherited from the environment. | |||||
| # - CC is an implicit variable so neither '?=' or 'ifndef' can be used. | |||||
| # http://stackoverflow.com/questions/4029274/mingw-and-make-variables | |||||
| # - Default value is 'cc' which is not always a valid command (e.g. MinGW). | |||||
| ifeq ($(origin CC),default) | |||||
| CC = gcc | CC = gcc | ||||
| endif | |||||
| # Default Fortran compiler (FC) is selected by f_check. | |||||
| ifndef MAKEFILE_RULE | ifndef MAKEFILE_RULE | ||||
| include $(TOPDIR)/Makefile.rule | include $(TOPDIR)/Makefile.rule | ||||
| @@ -45,7 +53,7 @@ GETARCH_FLAGS += -DUSE64BITINT | |||||
| endif | endif | ||||
| ifndef GEMM_MULTITHREAD_THRESHOLD | ifndef GEMM_MULTITHREAD_THRESHOLD | ||||
| GEMM_MULTITHREAD_THRESHOLD=4 | |||||
| GEMM_MULTITHREAD_THRESHOLD=50 | |||||
| endif | endif | ||||
| GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) | GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) | ||||
| @@ -108,6 +116,14 @@ export MACOSX_DEPLOYMENT_TARGET=10.2 | |||||
| MD5SUM = md5 -r | MD5SUM = md5 -r | ||||
| endif | endif | ||||
| ifeq ($(OSNAME), FreeBSD) | |||||
| MD5SUM = md5 -r | |||||
| endif | |||||
| ifeq ($(OSNAME), NetBSD) | |||||
| MD5SUM = md5 -r | |||||
| endif | |||||
| ifeq ($(OSNAME), Linux) | ifeq ($(OSNAME), Linux) | ||||
| EXTRALIB += -lm | EXTRALIB += -lm | ||||
| endif | endif | ||||
| @@ -231,11 +247,11 @@ endif | |||||
| ifdef DYNAMIC_ARCH | ifdef DYNAMIC_ARCH | ||||
| ifeq ($(ARCH), x86) | ifeq ($(ARCH), x86) | ||||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | ||||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO | |||||
| CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||||
| endif | endif | ||||
| ifeq ($(ARCH), x86_64) | ifeq ($(ARCH), x86_64) | ||||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO | |||||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||||
| endif | endif | ||||
| ifndef DYNAMIC_CORE | ifndef DYNAMIC_CORE | ||||
| @@ -754,6 +770,7 @@ export HAVE_SSE4_1 | |||||
| export HAVE_SSE4_2 | export HAVE_SSE4_2 | ||||
| export HAVE_SSE4A | export HAVE_SSE4A | ||||
| export HAVE_SSE5 | export HAVE_SSE5 | ||||
| export HAVE_AVX | |||||
| export KERNELDIR | export KERNELDIR | ||||
| export FUNCTION_PROFILE | export FUNCTION_PROFILE | ||||
| export TARGET_CORE | export TARGET_CORE | ||||
| @@ -1,84 +0,0 @@ | |||||
| OpenBLAS Readme | |||||
| 1.Introduction | |||||
| OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn) | |||||
| Please read the documents on OpenBLAS wiki pages(http://github.com/xianyi/OpenBLAS/wiki). | |||||
| 2.Intallation | |||||
| Download from project homepage. http://xianyi.github.com/OpenBLAS/ | |||||
| Or, | |||||
| check out codes from git://github.com/xianyi/OpenBLAS.git | |||||
| 1)Normal compile | |||||
| (a) type "make" to detect the CPU automatically. | |||||
| or | |||||
| (b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. | |||||
| 2)Cross compile | |||||
| Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. | |||||
| examples: | |||||
| On X86 box, compile this library for loongson3a CPU. | |||||
| make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A | |||||
| 3)Debug version | |||||
| make DEBUG=1 | |||||
| 4)Intall to the directory (Optional) | |||||
| e.g. | |||||
| make install PREFIX=your_installation_directory | |||||
| The default directory is /opt/OpenBLAS | |||||
| 3.Support CPU & OS | |||||
| Please read GotoBLAS_01Readme.txt | |||||
| Additional support CPU: | |||||
| x86_64: | |||||
| Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. | |||||
| MIPS64: | |||||
| ICT Loongson 3A //Level 3 BLAS subroutines are optimized. | |||||
| 4.Usages | |||||
| Link with libopenblas.a or -lopenblas for shared library. | |||||
| 4.1 Set the number of threads with environment variables. for example, | |||||
| export OPENBLAS_NUM_THREADS=4 | |||||
| or | |||||
| export GOTO_NUM_THREADS=4 | |||||
| or | |||||
| export OMP_NUM_THREADS=4 | |||||
| The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. | |||||
| If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. | |||||
| 4.2 Set the number of threads with calling functions. for example, | |||||
| void goto_set_num_threads(int num_threads); | |||||
| or | |||||
| void openblas_set_num_threads(int num_threads); | |||||
| If you compile this lib with USE_OPENMP=1, you should use the above functions, too. | |||||
| 5.Report Bugs | |||||
| Please add a issue in https://github.com/xianyi/OpenBLAS/issues | |||||
| 6.To-Do List: | |||||
| Optimization on ICT Loongson 3A CPU | |||||
| 7.Contact | |||||
| OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas | |||||
| 8.ChangeLog | |||||
| Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. | |||||
| 9.Known Issues | |||||
| * The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit | |||||
| is 64. On 32 bits, it is 32. | |||||
| * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. I don't think this is a bug in OpenBLAS. | |||||
| 10. Specification of Git Branches | |||||
| We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | |||||
| Now, there are 4 branches in github.com. | |||||
| * The master branch. This a main branch to reflect a production-ready state. | |||||
| * The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release. | |||||
| * The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future. | |||||
| * The gh-pages branch. This is for web pages | |||||
| @@ -0,0 +1,110 @@ | |||||
| # OpenBLAS | |||||
| ## Introduction | |||||
| OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>. | |||||
| Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>. | |||||
| ## Installation | |||||
| Download from project homepage. http://xianyi.github.com/OpenBLAS/ | |||||
| Or, check out codes from git://github.com/xianyi/OpenBLAS.git | |||||
| ### Normal compile | |||||
| * type "make" to detect the CPU automatically. | |||||
| or | |||||
| * type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. | |||||
| ### Cross compile | |||||
| Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. | |||||
| Examples: | |||||
| On X86 box, compile this library for loongson3a CPU. | |||||
| make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A | |||||
| ### Debug version | |||||
| make DEBUG=1 | |||||
| ### Install to the directory (Optional) | |||||
| Example: | |||||
| make install PREFIX=your_installation_directory | |||||
| The default directory is /opt/OpenBLAS | |||||
| ## Support CPU & OS | |||||
| Please read GotoBLAS_01Readme.txt | |||||
| ### Additional support CPU: | |||||
| #### x86/x86-64: | |||||
| - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. | |||||
| - **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. | |||||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | |||||
| #### MIPS64: | |||||
| - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. | |||||
| - **ICT Loongson 3B**: Experimental | |||||
| ### Support OS: | |||||
| - **GNU/Linux** | |||||
| - **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | |||||
| - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. | |||||
| - **FreeBSD**: Supported by the community. We didn't test the library on this OS. | |||||
| ## Usages | |||||
| Link with libopenblas.a or -lopenblas for shared library. | |||||
| ### Set the number of threads with environment variables. | |||||
| Examples: | |||||
| export OPENBLAS_NUM_THREADS=4 | |||||
| or | |||||
| export GOTO_NUM_THREADS=4 | |||||
| or | |||||
| export OMP_NUM_THREADS=4 | |||||
| The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. | |||||
| If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. | |||||
| ### Set the number of threads on runtime. | |||||
| We provide the functions below to control the number of threads at runtime. So far, changing the number of threads is not supported on Windows; on Windows, these functions are dummies. | |||||
| void goto_set_num_threads(int num_threads); | |||||
| void openblas_set_num_threads(int num_threads); | |||||
| If you compile this lib with USE_OPENMP=1, you should use the above functions, too. | |||||
| ## Report Bugs | |||||
| Please add an issue at https://github.com/xianyi/OpenBLAS/issues | |||||
| ## Contact | |||||
| OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas | |||||
| ## ChangeLog | |||||
| Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. | |||||
| ## Troubleshooting | |||||
| * Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. | |||||
| * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. | |||||
| * The number of CPUs/Cores should be less than or equal to 256. | |||||
| * On Loongson 3A, "make test" fails because of a pthread_create error. The error code is EAGAIN. However, the same test case runs fine when you run it from the shell. | |||||
| ## Specification of Git Branches | |||||
| We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | |||||
| Now, there are 4 branches in github.com. | |||||
| * The master branch. This is the main branch that reflects a production-ready state. | |||||
| * The develop branch. This is the main branch that reflects the latest delivered development changes for the next release. | |||||
| * The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future. | |||||
| * The gh-pages branch. This is for web pages | |||||
| @@ -18,6 +18,7 @@ CORE2 | |||||
| PENRYN | PENRYN | ||||
| DUNNINGTON | DUNNINGTON | ||||
| NEHALEM | NEHALEM | ||||
| SANDYBRIDGE | |||||
| ATOM | ATOM | ||||
| b)AMD CPU: | b)AMD CPU: | ||||
| @@ -27,6 +28,7 @@ OPTERON_SSE3 | |||||
| BARCELONA | BARCELONA | ||||
| SHANGHAI | SHANGHAI | ||||
| ISTANBUL | ISTANBUL | ||||
| BOBCAT | |||||
| c)VIA CPU: | c)VIA CPU: | ||||
| SSE_GENERIC | SSE_GENERIC | ||||
| @@ -47,6 +49,7 @@ CELL | |||||
| 3.MIPS64 CPU: | 3.MIPS64 CPU: | ||||
| SICORTEX | SICORTEX | ||||
| LOONGSON3A | LOONGSON3A | ||||
| LOONGSON3B | |||||
| 4.IA64 CPU: | 4.IA64 CPU: | ||||
| ITANIUM2 | ITANIUM2 | ||||
| @@ -43,14 +43,14 @@ $compiler = DEC if ($data =~ /COMPILER_DEC/); | |||||
| $compiler = GCC if ($compiler eq ""); | $compiler = GCC if ($compiler eq ""); | ||||
| $os = Linux if ($data =~ /OS_LINUX/); | $os = Linux if ($data =~ /OS_LINUX/); | ||||
| $os = FreeBSD if ($data =~ /OS_FreeBSD/); | |||||
| $os = NetBSD if ($data =~ /OS_NetBSD/); | |||||
| $os = Darwin if ($data =~ /OS_Darwin/); | |||||
| $os = SunOS if ($data =~ /OS_SunOS/); | |||||
| $os = FreeBSD if ($data =~ /OS_FREEBSD/); | |||||
| $os = NetBSD if ($data =~ /OS_NETBSD/); | |||||
| $os = Darwin if ($data =~ /OS_DARWIN/); | |||||
| $os = SunOS if ($data =~ /OS_SUNOS/); | |||||
| $os = AIX if ($data =~ /OS_AIX/); | $os = AIX if ($data =~ /OS_AIX/); | ||||
| $os = osf if ($data =~ /OS_OSF/); | $os = osf if ($data =~ /OS_OSF/); | ||||
| $os = WINNT if ($data =~ /OS_WINNT/); | $os = WINNT if ($data =~ /OS_WINNT/); | ||||
| $os = CYGWIN_NT if ($data =~ /OS_CYGWIN/); | |||||
| $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); | |||||
| $os = Interix if ($data =~ /OS_INTERIX/); | $os = Interix if ($data =~ /OS_INTERIX/); | ||||
| $architecture = x86 if ($data =~ /ARCH_X86/); | $architecture = x86 if ($data =~ /ARCH_X86/); | ||||
| @@ -9,6 +9,10 @@ extern "C" { | |||||
| #include <stddef.h> | #include <stddef.h> | ||||
| #include "common.h" | #include "common.h" | ||||
| /*Set the number of threads on runtime.*/ | |||||
| void openblas_set_num_threads(int num_threads); | |||||
| void goto_set_num_threads(int num_threads); | |||||
| #define CBLAS_INDEX size_t | #define CBLAS_INDEX size_t | ||||
| enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; | enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; | ||||
| @@ -68,7 +68,7 @@ extern "C" { | |||||
| #define SMP | #define SMP | ||||
| #endif | #endif | ||||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_Interix) | |||||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) | |||||
| #define WINDOWS_ABI | #define WINDOWS_ABI | ||||
| #define OS_WINDOWS | #define OS_WINDOWS | ||||
| @@ -89,7 +89,7 @@ extern "C" { | |||||
| #include <sched.h> | #include <sched.h> | ||||
| #endif | #endif | ||||
| #ifdef OS_DARWIN | |||||
| #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) | |||||
| #include <sched.h> | #include <sched.h> | ||||
| #endif | #endif | ||||
| @@ -45,6 +45,8 @@ extern "C" { | |||||
| int BLASFUNC(xerbla)(char *, blasint *info, blasint); | int BLASFUNC(xerbla)(char *, blasint *info, blasint); | ||||
| void BLASFUNC(openblas_set_num_threads)(int *); | |||||
| FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); | FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); | ||||
| FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); | FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); | ||||
| @@ -135,7 +135,7 @@ static __inline int num_cpu_avail(int level) { | |||||
| int openmp_nthreads=0; | int openmp_nthreads=0; | ||||
| #endif | #endif | ||||
| if ((blas_cpu_number == 1) | |||||
| if (blas_cpu_number == 1 | |||||
| #ifdef USE_OPENMP | #ifdef USE_OPENMP | ||||
| || omp_in_parallel() | || omp_in_parallel() | ||||
| @@ -254,7 +254,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
| #define PROFCODE | #define PROFCODE | ||||
| #endif | #endif | ||||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) | |||||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) | |||||
| #define SAVEREGISTERS \ | #define SAVEREGISTERS \ | ||||
| subl $32, %esp;\ | subl $32, %esp;\ | ||||
| movups %xmm6, 0(%esp);\ | movups %xmm6, 0(%esp);\ | ||||
| @@ -269,7 +269,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
| #define RESTOREREGISTERS | #define RESTOREREGISTERS | ||||
| #endif | #endif | ||||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) | |||||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) | |||||
| #define PROLOGUE \ | #define PROLOGUE \ | ||||
| .text; \ | .text; \ | ||||
| .align 16; \ | .align 16; \ | ||||
| @@ -282,7 +282,7 @@ REALNAME: | |||||
| #define EPILOGUE .end REALNAME | #define EPILOGUE .end REALNAME | ||||
| #endif | #endif | ||||
| #if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) | |||||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) | |||||
| #define PROLOGUE \ | #define PROLOGUE \ | ||||
| .text; \ | .text; \ | ||||
| .align 16; \ | .align 16; \ | ||||
| @@ -356,4 +356,11 @@ REALNAME: | |||||
| #ifndef ALIGN_6 | #ifndef ALIGN_6 | ||||
| #define ALIGN_6 .align 64 | #define ALIGN_6 .align 64 | ||||
| // ffreep %st(0). | |||||
| // Because Clang didn't support ffreep, we directly use the opcode. | |||||
| // Please check out http://www.sandpile.org/x86/opc_fpu.htm | |||||
| #ifndef ffreep | |||||
| #define ffreep .byte 0xdf, 0xc0 # | |||||
| #endif | |||||
| #endif | #endif | ||||
| @@ -353,7 +353,7 @@ REALNAME: | |||||
| #define EPILOGUE .end REALNAME | #define EPILOGUE .end REALNAME | ||||
| #endif | #endif | ||||
| #if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) | |||||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) | |||||
| #define PROLOGUE \ | #define PROLOGUE \ | ||||
| .text; \ | .text; \ | ||||
| .align 512; \ | .align 512; \ | ||||
| @@ -425,6 +425,7 @@ REALNAME: | |||||
| #define ALIGN_2 .align 2 | #define ALIGN_2 .align 2 | ||||
| #define ALIGN_3 .align 3 | #define ALIGN_3 .align 3 | ||||
| #define ALIGN_4 .align 4 | #define ALIGN_4 .align 4 | ||||
| #define ALIGN_5 .align 5 | |||||
| #define ffreep fstp | #define ffreep fstp | ||||
| #endif | #endif | ||||
| @@ -448,4 +449,10 @@ REALNAME: | |||||
| #define ALIGN_6 .align 64 | #define ALIGN_6 .align 64 | ||||
| #endif | #endif | ||||
| // ffreep %st(0). | |||||
| // Because Clang didn't support ffreep, we directly use the opcode. | |||||
| // Please check out http://www.sandpile.org/x86/opc_fpu.htm | |||||
| #ifndef ffreep | |||||
| #define ffreep .byte 0xdf, 0xc0 # | |||||
| #endif | |||||
| #endif | #endif | ||||
| @@ -103,6 +103,8 @@ | |||||
| #define CORE_NEHALEM 17 | #define CORE_NEHALEM 17 | ||||
| #define CORE_ATOM 18 | #define CORE_ATOM 18 | ||||
| #define CORE_NANO 19 | #define CORE_NANO 19 | ||||
| #define CORE_SANDYBRIDGE 20 | |||||
| #define CORE_BOBCAT 21 | |||||
| #define HAVE_SSE (1 << 0) | #define HAVE_SSE (1 << 0) | ||||
| #define HAVE_SSE2 (1 << 1) | #define HAVE_SSE2 (1 << 1) | ||||
| @@ -122,6 +124,7 @@ | |||||
| #define HAVE_MISALIGNSSE (1 << 15) | #define HAVE_MISALIGNSSE (1 << 15) | ||||
| #define HAVE_128BITFPU (1 << 16) | #define HAVE_128BITFPU (1 << 16) | ||||
| #define HAVE_FASTMOVU (1 << 17) | #define HAVE_FASTMOVU (1 << 17) | ||||
| #define HAVE_AVX (1 << 18) | |||||
| #define CACHE_INFO_L1_I 1 | #define CACHE_INFO_L1_I 1 | ||||
| #define CACHE_INFO_L1_D 2 | #define CACHE_INFO_L1_D 2 | ||||
| @@ -188,4 +191,6 @@ typedef struct { | |||||
| #define CPUTYPE_NSGEODE 41 | #define CPUTYPE_NSGEODE 41 | ||||
| #define CPUTYPE_VIAC3 42 | #define CPUTYPE_VIAC3 42 | ||||
| #define CPUTYPE_NANO 43 | #define CPUTYPE_NANO 43 | ||||
| #define CPUTYPE_SANDYBRIDGE 44 | |||||
| #define CPUTYPE_BOBCAT 45 | |||||
| #endif | #endif | ||||
| @@ -189,6 +189,7 @@ int get_cputype(int gettype){ | |||||
| if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; | if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; | ||||
| if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; | if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; | ||||
| if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; | if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; | ||||
| if ((ecx & (1 << 28)) != 0) feature |= HAVE_AVX; | |||||
| if (have_excpuid() >= 0x01) { | if (have_excpuid() >= 0x01) { | ||||
| cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | ||||
| @@ -983,13 +984,13 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| case 10: | case 10: | ||||
| //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | ||||
| return CPUTYPE_NEHALEM; | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| case 12: | case 12: | ||||
| //Xeon Processor 5600 (Westmere-EP) | //Xeon Processor 5600 (Westmere-EP) | ||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| case 13: | case 13: | ||||
| //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) | //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) | ||||
| return CPUTYPE_NEHALEM; | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| case 15: | case 15: | ||||
| //Xeon Processor E7 (Westmere-EX) | //Xeon Processor E7 (Westmere-EX) | ||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| @@ -1027,6 +1028,8 @@ int get_cpuname(void){ | |||||
| case 1: | case 1: | ||||
| case 10: | case 10: | ||||
| return CPUTYPE_BARCELONA; | return CPUTYPE_BARCELONA; | ||||
| case 5: | |||||
| return CPUTYPE_BOBCAT; | |||||
| } | } | ||||
| break; | break; | ||||
| } | } | ||||
| @@ -1146,6 +1149,8 @@ static char *cpuname[] = { | |||||
| "NSGEODE", | "NSGEODE", | ||||
| "VIAC3", | "VIAC3", | ||||
| "NANO", | "NANO", | ||||
| "SANDYBRIDGE", | |||||
| "BOBCAT", | |||||
| }; | }; | ||||
| static char *lowercpuname[] = { | static char *lowercpuname[] = { | ||||
| @@ -1192,6 +1197,8 @@ static char *lowercpuname[] = { | |||||
| "tms3x00", | "tms3x00", | ||||
| "nsgeode", | "nsgeode", | ||||
| "nano", | "nano", | ||||
| "sandybridge", | |||||
| "bobcat", | |||||
| }; | }; | ||||
| static char *corename[] = { | static char *corename[] = { | ||||
| @@ -1215,6 +1222,8 @@ static char *corename[] = { | |||||
| "NEHALEM", | "NEHALEM", | ||||
| "ATOM", | "ATOM", | ||||
| "NANO", | "NANO", | ||||
| "SANDYBRIDGE", | |||||
| "BOBCAT", | |||||
| }; | }; | ||||
| static char *corename_lower[] = { | static char *corename_lower[] = { | ||||
| @@ -1238,6 +1247,8 @@ static char *corename_lower[] = { | |||||
| "nehalem", | "nehalem", | ||||
| "atom", | "atom", | ||||
| "nano", | "nano", | ||||
| "sandybridge", | |||||
| "bobcat", | |||||
| }; | }; | ||||
| @@ -1321,13 +1332,13 @@ int get_coretype(void){ | |||||
| return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
| case 10: | case 10: | ||||
| //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | ||||
| return CORE_NEHALEM; | |||||
| return CORE_SANDYBRIDGE; | |||||
| case 12: | case 12: | ||||
| //Xeon Processor 5600 (Westmere-EP) | //Xeon Processor 5600 (Westmere-EP) | ||||
| return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
| case 13: | case 13: | ||||
| //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) | //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) | ||||
| return CORE_NEHALEM; | |||||
| return CORE_SANDYBRIDGE; | |||||
| case 15: | case 15: | ||||
| //Xeon Processor E7 (Westmere-EX) | //Xeon Processor E7 (Westmere-EX) | ||||
| return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
| @@ -1346,7 +1357,9 @@ int get_coretype(void){ | |||||
| if (family <= 0x5) return CORE_80486; | if (family <= 0x5) return CORE_80486; | ||||
| if (family <= 0xe) return CORE_ATHLON; | if (family <= 0xe) return CORE_ATHLON; | ||||
| if (family == 0xf){ | if (family == 0xf){ | ||||
| if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; | |||||
| if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | |||||
| else if (exfamily == 5) return CORE_BOBCAT; | |||||
| else return CORE_BARCELONA; | |||||
| } | } | ||||
| } | } | ||||
| @@ -1426,6 +1439,7 @@ void get_cpuconfig(void){ | |||||
| if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); | if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); | ||||
| if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); | if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); | ||||
| if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); | if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); | ||||
| if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); | |||||
| if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | ||||
| if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | ||||
| if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); | if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); | ||||
| @@ -1491,6 +1505,7 @@ void get_sse(void){ | |||||
| if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); | if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); | ||||
| if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); | if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); | ||||
| if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); | if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); | ||||
| if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); | |||||
| if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | ||||
| if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | ||||
| @@ -35,19 +35,19 @@ OS_LINUX | |||||
| #endif | #endif | ||||
| #if defined(__FreeBSD__) | #if defined(__FreeBSD__) | ||||
| OS_FreeBSD | |||||
| OS_FREEBSD | |||||
| #endif | #endif | ||||
| #if defined(__NetBSD__) | #if defined(__NetBSD__) | ||||
| OS_NetBSD | |||||
| OS_NETBSD | |||||
| #endif | #endif | ||||
| #if defined(__sun) | #if defined(__sun) | ||||
| OS_SunOS | |||||
| OS_SUNOS | |||||
| #endif | #endif | ||||
| #if defined(__APPLE__) | #if defined(__APPLE__) | ||||
| OS_Darwin | |||||
| OS_DARWIN | |||||
| #endif | #endif | ||||
| #if defined(_AIX) | #if defined(_AIX) | ||||
| @@ -63,7 +63,7 @@ OS_WINNT | |||||
| #endif | #endif | ||||
| #if defined(__CYGWIN__) | #if defined(__CYGWIN__) | ||||
| OS_CYGWIN | |||||
| OS_CYGWIN_NT | |||||
| #endif | #endif | ||||
| #if defined(__INTERIX) | #if defined(__INTERIX) | ||||
| @@ -1,12 +1,12 @@ | |||||
| TOPDIR = ../.. | TOPDIR = ../.. | ||||
| include ../../Makefile.system | include ../../Makefile.system | ||||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) | |||||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) | |||||
| COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | ||||
| ifdef SMP | ifdef SMP | ||||
| COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) | |||||
| COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) | |||||
| ifndef NO_AFFINITY | ifndef NO_AFFINITY | ||||
| COMMONOBJS += init.$(SUFFIX) | COMMONOBJS += init.$(SUFFIX) | ||||
| endif | endif | ||||
| @@ -63,6 +63,14 @@ static blas_pool_t pool; | |||||
| static HANDLE blas_threads [MAX_CPU_NUMBER]; | static HANDLE blas_threads [MAX_CPU_NUMBER]; | ||||
| static DWORD blas_threads_id[MAX_CPU_NUMBER]; | static DWORD blas_threads_id[MAX_CPU_NUMBER]; | ||||
| void goto_set_num_threads(int num) | |||||
| { | |||||
| } | |||||
| void openblas_set_num_threads(int num) | |||||
| { | |||||
| } | |||||
| static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | ||||
| if (!(mode & BLAS_COMPLEX)){ | if (!(mode & BLAS_COMPLEX)){ | ||||
| @@ -1,5 +1,5 @@ | |||||
| /***************************************************************************** | /***************************************************************************** | ||||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -85,6 +85,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define MAX_NODES 16 | #define MAX_NODES 16 | ||||
| #define MAX_CPUS 256 | #define MAX_CPUS 256 | ||||
| #define NCPUBITS (8*sizeof(unsigned long)) | |||||
| #define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS) | |||||
| #define CPUELT(cpu) ((cpu) / NCPUBITS) | |||||
| #define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS)) | |||||
| #define SH_MAGIC 0x510510 | #define SH_MAGIC 0x510510 | ||||
| @@ -103,10 +108,10 @@ typedef struct { | |||||
| int num_nodes; | int num_nodes; | ||||
| int num_procs; | int num_procs; | ||||
| int final_num_procs; | int final_num_procs; | ||||
| unsigned long avail; | |||||
| unsigned long avail [MAX_BITMASK_LEN]; | |||||
| int avail_count; | |||||
| unsigned long cpu_info [MAX_CPUS]; | unsigned long cpu_info [MAX_CPUS]; | ||||
| unsigned long node_info [MAX_NODES]; | |||||
| unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN]; | |||||
| int cpu_use[MAX_CPUS]; | int cpu_use[MAX_CPUS]; | ||||
| } shm_t; | } shm_t; | ||||
| @@ -126,7 +131,8 @@ static shm_t *common = (void *)-1; | |||||
| static int shmid, pshmid; | static int shmid, pshmid; | ||||
| static void *paddr; | static void *paddr; | ||||
| static unsigned long lprocmask, lnodemask; | |||||
| static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask; | |||||
| static int lprocmask_count = 0; | |||||
| static int numprocs = 1; | static int numprocs = 1; | ||||
| static int numnodes = 1; | static int numnodes = 1; | ||||
| @@ -177,70 +183,114 @@ static inline int rcount(unsigned long number) { | |||||
| than sizeof(unsigned long). On 64 bits, the limit | than sizeof(unsigned long). On 64 bits, the limit | ||||
| is 64. On 32 bits, it is 32. | is 64. On 32 bits, it is 32. | ||||
| ***/ | ***/ | ||||
| static inline unsigned long get_cpumap(int node) { | |||||
| static inline void get_cpumap(int node, unsigned long * node_info) { | |||||
| int infile; | int infile; | ||||
| unsigned long affinity; | |||||
| unsigned long affinity[32]; | |||||
| char name[160]; | char name[160]; | ||||
| char cpumap[160]; | char cpumap[160]; | ||||
| char *p, *dummy; | |||||
| char *dummy; | |||||
| int i=0; | int i=0; | ||||
| int count=0; | |||||
| int k=0; | |||||
| sprintf(name, CPUMAP_NAME, node); | sprintf(name, CPUMAP_NAME, node); | ||||
| infile = open(name, O_RDONLY); | infile = open(name, O_RDONLY); | ||||
| for(i=0; i<32; i++){ | |||||
| affinity[i] = 0; | |||||
| } | |||||
| affinity = 0; | |||||
| if (infile != -1) { | if (infile != -1) { | ||||
| read(infile, cpumap, sizeof(cpumap)); | read(infile, cpumap, sizeof(cpumap)); | ||||
| p = cpumap; | |||||
| while (*p != '\n' && i<160){ | |||||
| if(*p != ',') { | |||||
| name[i++]=*p; | |||||
| } | |||||
| p++; | |||||
| } | |||||
| p = name; | |||||
| // while ((*p == '0') || (*p == ',')) p++; | |||||
| for(i=0; i<160; i++){ | |||||
| if(cpumap[i] == '\n') | |||||
| break; | |||||
| if(cpumap[i] != ','){ | |||||
| name[k++]=cpumap[i]; | |||||
| //Enough data for Hex | |||||
| if(k >= NCPUBITS/4){ | |||||
| affinity[count++] = strtoul(name, &dummy, 16); | |||||
| k=0; | |||||
| } | |||||
| } | |||||
| affinity = strtoul(p, &dummy, 16); | |||||
| } | |||||
| if(k!=0){ | |||||
| name[k]='\0'; | |||||
| affinity[count++] = strtoul(name, &dummy, 16); | |||||
| k=0; | |||||
| } | |||||
| // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... | |||||
| // revert the sequence | |||||
| for(i=0; i<count && i<MAX_BITMASK_LEN; i++){ | |||||
| node_info[i]=affinity[count-i-1]; | |||||
| } | |||||
| close(infile); | close(infile); | ||||
| } | } | ||||
| return affinity; | |||||
| return ; | |||||
| } | } | ||||
| static inline unsigned long get_share(int cpu, int level) { | |||||
| static inline void get_share(int cpu, int level, unsigned long * share) { | |||||
| int infile; | int infile; | ||||
| unsigned long affinity; | |||||
| unsigned long affinity[32]; | |||||
| char cpumap[160]; | |||||
| char name[160]; | char name[160]; | ||||
| char *p; | |||||
| char *dummy; | |||||
| int count=0; | |||||
| int i=0,k=0; | |||||
| int bitmask_idx = 0; | |||||
| sprintf(name, SHARE_NAME, cpu, level); | sprintf(name, SHARE_NAME, cpu, level); | ||||
| infile = open(name, O_RDONLY); | infile = open(name, O_RDONLY); | ||||
| affinity = (1UL << cpu); | |||||
| // Init share | |||||
| for(i=0; i<MAX_BITMASK_LEN; i++){ | |||||
| share[i]=0; | |||||
| } | |||||
| bitmask_idx = CPUELT(cpu); | |||||
| share[bitmask_idx] = CPUMASK(cpu); | |||||
| if (infile != -1) { | if (infile != -1) { | ||||
| read(infile, name, sizeof(name)); | |||||
| p = name; | |||||
| read(infile, cpumap, sizeof(cpumap)); | |||||
| while ((*p == '0') || (*p == ',')) p++; | |||||
| for(i=0; i<160; i++){ | |||||
| if(cpumap[i] == '\n') | |||||
| break; | |||||
| if(cpumap[i] != ','){ | |||||
| name[k++]=cpumap[i]; | |||||
| //Enough data | |||||
| if(k >= NCPUBITS/4){ | |||||
| affinity[count++] = strtoul(name, &dummy, 16); | |||||
| k=0; | |||||
| } | |||||
| } | |||||
| affinity = strtol(p, &p, 16); | |||||
| } | |||||
| if(k!=0){ | |||||
| name[k]='\0'; | |||||
| affinity[count++] = strtoul(name, &dummy, 16); | |||||
| k=0; | |||||
| } | |||||
| // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... | |||||
| // revert the sequence | |||||
| for(i=0; i<count && i<MAX_BITMASK_LEN; i++){ | |||||
| share[i]=affinity[count-i-1]; | |||||
| } | |||||
| close(infile); | close(infile); | ||||
| } | } | ||||
| return affinity; | |||||
| return ; | |||||
| } | } | ||||
| static int numa_check(void) { | static int numa_check(void) { | ||||
| @@ -248,6 +298,7 @@ static int numa_check(void) { | |||||
| DIR *dp; | DIR *dp; | ||||
| struct dirent *dir; | struct dirent *dir; | ||||
| int node; | int node; | ||||
| int j; | |||||
| common -> num_nodes = 0; | common -> num_nodes = 0; | ||||
| @@ -258,7 +309,9 @@ static int numa_check(void) { | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0; | |||||
| for (node = 0; node < MAX_NODES; node ++) { | |||||
| for (j = 0; j<MAX_BITMASK_LEN; j++) common -> node_info[node][j] = 0; | |||||
| } | |||||
| while ((dir = readdir(dp)) != NULL) { | while ((dir = readdir(dp)) != NULL) { | ||||
| if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { | if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { | ||||
| @@ -266,12 +319,12 @@ static int numa_check(void) { | |||||
| node = atoi(&dir -> d_name[4]); | node = atoi(&dir -> d_name[4]); | ||||
| if (node > MAX_NODES) { | if (node > MAX_NODES) { | ||||
| fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n"); | |||||
| fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. Terminated.\n"); | |||||
| exit(1); | exit(1); | ||||
| } | } | ||||
| common -> num_nodes ++; | common -> num_nodes ++; | ||||
| common -> node_info[node] = get_cpumap(node); | |||||
| get_cpumap(node, common->node_info[node]); | |||||
| } | } | ||||
| } | } | ||||
| @@ -284,7 +337,7 @@ static int numa_check(void) { | |||||
| fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); | fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); | ||||
| for (node = 0; node < common -> num_nodes; node ++) | for (node = 0; node < common -> num_nodes; node ++) | ||||
| fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]); | |||||
| fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]); | |||||
| #endif | #endif | ||||
| return common -> num_nodes; | return common -> num_nodes; | ||||
| @@ -296,11 +349,13 @@ static void numa_mapping(void) { | |||||
| int i, j, h; | int i, j, h; | ||||
| unsigned long work, bit; | unsigned long work, bit; | ||||
| int count = 0; | int count = 0; | ||||
| int bitmask_idx = 0; | |||||
| for (node = 0; node < common -> num_nodes; node ++) { | for (node = 0; node < common -> num_nodes; node ++) { | ||||
| core = 0; | core = 0; | ||||
| for (cpu = 0; cpu < common -> num_procs; cpu ++) { | for (cpu = 0; cpu < common -> num_procs; cpu ++) { | ||||
| if (common -> node_info[node] & common -> avail & (1UL << cpu)) { | |||||
| bitmask_idx = CPUELT(cpu); | |||||
| if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) { | |||||
| common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); | common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); | ||||
| count ++; | count ++; | ||||
| core ++; | core ++; | ||||
| @@ -357,58 +412,89 @@ static void numa_mapping(void) { | |||||
| static void disable_hyperthread(void) { | static void disable_hyperthread(void) { | ||||
| unsigned long share; | |||||
| unsigned long share[MAX_BITMASK_LEN]; | |||||
| int cpu; | int cpu; | ||||
| int bitmask_idx = 0; | |||||
| int i=0, count=0; | |||||
| bitmask_idx = CPUELT(common -> num_procs); | |||||
| if(common->num_procs > 64){ | |||||
| fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); | |||||
| exit(1); | |||||
| }else if(common->num_procs == 64){ | |||||
| common -> avail = 0xFFFFFFFFFFFFFFFFUL; | |||||
| }else | |||||
| common -> avail = (1UL << common -> num_procs) - 1; | |||||
| for(i=0; i< bitmask_idx; i++){ | |||||
| common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL; | |||||
| } | |||||
| if(CPUMASK(common -> num_procs) != 1){ | |||||
| common -> avail[count++] = CPUMASK(common -> num_procs) - 1; | |||||
| } | |||||
| common -> avail_count = count; | |||||
| /* if(common->num_procs > 64){ */ | |||||
| /* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */ | |||||
| /* exit(1); */ | |||||
| /* }else if(common->num_procs == 64){ */ | |||||
| /* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */ | |||||
| /* }else */ | |||||
| /* common -> avail = (1UL << common -> num_procs) - 1; */ | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); | |||||
| fprintf(stderr, "\nAvail CPUs : "); | |||||
| for(i=0; i<count; i++) | |||||
| fprintf(stderr, "%04lx ", common -> avail[i]); | |||||
| fprintf(stderr, ".\n"); | |||||
| #endif | #endif | ||||
| for (cpu = 0; cpu < common -> num_procs; cpu ++) { | for (cpu = 0; cpu < common -> num_procs; cpu ++) { | ||||
| share = (get_share(cpu, 1) & common -> avail); | |||||
| if (popcount(share) > 1) { | |||||
| get_share(cpu, 1, share); | |||||
| //When the shared cpu are in different element of share & avail array, this may be a bug. | |||||
| for (i = 0; i < count ; i++){ | |||||
| if (popcount(share[i]) > 1) { | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", | |||||
| cpu, share & ~(1UL << cpu)); | |||||
| fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", | |||||
| cpu, share[i] & ~(CPUMASK(cpu))); | |||||
| #endif | #endif | ||||
| common -> avail &= ~((share & ~(1UL << cpu))); | |||||
| common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu))); | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| static void disable_affinity(void) { | static void disable_affinity(void) { | ||||
| int i=0; | |||||
| int bitmask_idx=0; | |||||
| int count=0; | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail); | |||||
| fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]); | |||||
| fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); | fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); | ||||
| #endif | #endif | ||||
| if(common->final_num_procs > 64){ | |||||
| fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); | |||||
| exit(1); | |||||
| }else if(common->final_num_procs == 64){ | |||||
| lprocmask = 0xFFFFFFFFFFFFFFFFUL; | |||||
| }else | |||||
| lprocmask = (1UL << common -> final_num_procs) - 1; | |||||
| /* if(common->final_num_procs > 64){ */ | |||||
| /* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); */ | |||||
| /* exit(1); */ | |||||
| /* }else if(common->final_num_procs == 64){ */ | |||||
| /* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */ | |||||
| /* }else */ | |||||
| /* lprocmask = (1UL << common -> final_num_procs) - 1; */ | |||||
| bitmask_idx = CPUELT(common -> final_num_procs); | |||||
| for(i=0; i< bitmask_idx; i++){ | |||||
| lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL; | |||||
| } | |||||
| if(CPUMASK(common -> final_num_procs) != 1){ | |||||
| lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1; | |||||
| } | |||||
| lprocmask_count = count; | |||||
| #ifndef USE_OPENMP | #ifndef USE_OPENMP | ||||
| lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; | |||||
| for(i=0; i< count; i++){ | |||||
| lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i]; | |||||
| } | |||||
| #endif | #endif | ||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask); | |||||
| fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]); | |||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -498,7 +584,7 @@ static void create_pshmem(void) { | |||||
| static void local_cpu_map(void) { | static void local_cpu_map(void) { | ||||
| int cpu, id, mapping; | int cpu, id, mapping; | ||||
| int bitmask_idx = 0; | |||||
| cpu = 0; | cpu = 0; | ||||
| mapping = 0; | mapping = 0; | ||||
| @@ -508,8 +594,9 @@ static void local_cpu_map(void) { | |||||
| if (id > 0) { | if (id > 0) { | ||||
| if (is_dead(id)) common -> cpu_use[cpu] = 0; | if (is_dead(id)) common -> cpu_use[cpu] = 0; | ||||
| } | } | ||||
| if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) { | |||||
| bitmask_idx = CPUELT(cpu); | |||||
| if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) { | |||||
| common -> cpu_use[cpu] = pshmid; | common -> cpu_use[cpu] = pshmid; | ||||
| cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); | cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); | ||||
| @@ -595,6 +682,7 @@ void gotoblas_affinity_init(void) { | |||||
| #ifndef USE_OPENMP | #ifndef USE_OPENMP | ||||
| cpu_set_t cpu_mask; | cpu_set_t cpu_mask; | ||||
| #endif | #endif | ||||
| int i; | |||||
| if (initialized) return; | if (initialized) return; | ||||
| @@ -646,6 +734,11 @@ void gotoblas_affinity_init(void) { | |||||
| common -> num_procs = get_nprocs(); | common -> num_procs = get_nprocs(); | ||||
| if(common -> num_procs > MAX_CPUS) { | |||||
| fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS); | |||||
| exit(1); | |||||
| } | |||||
| for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; | for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; | ||||
| numa_check(); | numa_check(); | ||||
| @@ -654,7 +747,8 @@ void gotoblas_affinity_init(void) { | |||||
| if (common -> num_nodes > 1) numa_mapping(); | if (common -> num_nodes > 1) numa_mapping(); | ||||
| common -> final_num_procs = popcount(common -> avail); | |||||
| common -> final_num_procs = 0; | |||||
| for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]); | |||||
| for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; | for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; | ||||
| @@ -664,7 +758,8 @@ void gotoblas_affinity_init(void) { | |||||
| disable_affinity(); | disable_affinity(); | ||||
| num_avail = popcount(lprocmask); | |||||
| num_avail = 0; | |||||
| for(i=0; i<lprocmask_count; i++) num_avail += popcount(lprocmask[i]); | |||||
| if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail; | if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail; | ||||
| @@ -103,7 +103,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include <sys/syscall.h> | #include <sys/syscall.h> | ||||
| #endif | #endif | ||||
| #if defined(OS_FreeBSD) || defined(OS_Darwin) | |||||
| #if defined(OS_FREEBSD) || defined(OS_DARWIN) | |||||
| #include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
| #endif | #endif | ||||
| @@ -185,7 +185,7 @@ int get_num_procs(void) { | |||||
| #endif | #endif | ||||
| #if defined(OS_FreeBSD) || defined(OS_Darwin) | |||||
| #if defined(OS_FREEBSD) || defined(OS_DARWIN) | |||||
| int get_num_procs(void) { | int get_num_procs(void) { | ||||
| @@ -215,7 +215,7 @@ int goto_get_num_procs (void) { | |||||
| int blas_get_cpu_number(void){ | int blas_get_cpu_number(void){ | ||||
| char *p; | char *p; | ||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) | |||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) | |||||
| int max_num; | int max_num; | ||||
| #endif | #endif | ||||
| int blas_goto_num = 0; | int blas_goto_num = 0; | ||||
| @@ -223,7 +223,7 @@ int blas_get_cpu_number(void){ | |||||
| if (blas_num_threads) return blas_num_threads; | if (blas_num_threads) return blas_num_threads; | ||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) | |||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) | |||||
| max_num = get_num_procs(); | max_num = get_num_procs(); | ||||
| #endif | #endif | ||||
| @@ -250,7 +250,7 @@ int blas_get_cpu_number(void){ | |||||
| else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | ||||
| else blas_num_threads = MAX_CPU_NUMBER; | else blas_num_threads = MAX_CPU_NUMBER; | ||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) | |||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) | |||||
| if (blas_num_threads > max_num) blas_num_threads = max_num; | if (blas_num_threads > max_num) blas_num_threads = max_num; | ||||
| #endif | #endif | ||||
| @@ -1,5 +1,5 @@ | |||||
| /***************************************************************************** | /***************************************************************************** | ||||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -33,7 +33,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #ifdef SMP_SERVER | #ifdef SMP_SERVER | ||||
| #ifdef OS_LINUX | |||||
| extern void openblas_set_num_threads(int num_threads) ; | extern void openblas_set_num_threads(int num_threads) ; | ||||
| @@ -41,5 +40,13 @@ void NAME(int* num_threads){ | |||||
| openblas_set_num_threads(*num_threads); | openblas_set_num_threads(*num_threads); | ||||
| } | } | ||||
| #endif | |||||
| #else | |||||
| //Single thread | |||||
| void openblas_set_num_threads(int num_threads) { | |||||
| } | |||||
| void NAME(int* num_threads){ | |||||
| } | |||||
| #endif | #endif | ||||
| @@ -163,9 +163,9 @@ int get_L2_size(void){ | |||||
| int eax, ebx, ecx, edx; | int eax, ebx, ecx, edx; | ||||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ | |||||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \ | |||||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | ||||
| defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC) | |||||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) | |||||
| cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | ||||
| @@ -384,6 +384,17 @@ void blas_set_parameter(void){ | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #if defined(SANDYBRIDGE) | |||||
| sgemm_p = 1024; | |||||
| dgemm_p = 512; | |||||
| cgemm_p = 512; | |||||
| zgemm_p = 256; | |||||
| #ifdef EXPRECISION | |||||
| qgemm_p = 256; | |||||
| xgemm_p = 128; | |||||
| #endif | |||||
| #endif | |||||
| #if defined(CORE_PRESCOTT) || defined(GENERIC) | #if defined(CORE_PRESCOTT) || defined(GENERIC) | ||||
| size >>= 6; | size >>= 6; | ||||
| @@ -435,7 +446,7 @@ void blas_set_parameter(void){ | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #if defined(CORE_BARCELONA) | |||||
| #if defined(CORE_BARCELONA) || defined(CORE_BOBCAT) | |||||
| size >>= 8; | size >>= 8; | ||||
| sgemm_p = 232 * size; | sgemm_p = 232 * size; | ||||
| @@ -70,11 +70,11 @@ dll2 : libgoto2_shared.dll | |||||
| $(RANLIB) ../$(LIBNAME) | $(RANLIB) ../$(LIBNAME) | ||||
| ifeq ($(BINARY32), 1) | ifeq ($(BINARY32), 1) | ||||
| $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ | $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ | ||||
| --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) | |||||
| --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB) | |||||
| -lib /machine:i386 /def:libopenblas.def | -lib /machine:i386 /def:libopenblas.def | ||||
| else | else | ||||
| $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ | $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ | ||||
| --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) | |||||
| --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB) | |||||
| -lib /machine:X64 /def:libopenblas.def | -lib /machine:X64 /def:libopenblas.def | ||||
| endif | endif | ||||
| @@ -121,7 +121,7 @@ so : ../$(LIBSONAME) | |||||
| ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c | ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c | ||||
| $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ | $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ | ||||
| -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ | -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ | ||||
| -Wl,--retain-symbols-file=linux.def $(EXTRALIB) | |||||
| -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB) | |||||
| $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | ||||
| rm -f linktest | rm -f linktest | ||||
| @@ -32,11 +32,12 @@ if ($compiler eq "") { | |||||
| "pgf95", "pgf90", "pgf77", | "pgf95", "pgf90", "pgf77", | ||||
| "ifort"); | "ifort"); | ||||
| OUTER: | |||||
| foreach $lists (@lists) { | foreach $lists (@lists) { | ||||
| foreach $path (@path) { | foreach $path (@path) { | ||||
| if (-f $path . "/" . $lists) { | |||||
| if (-x $path . "/" . $lists) { | |||||
| $compiler = $lists; | $compiler = $lists; | ||||
| break; | |||||
| last OUTER; | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /***************************************************************************** | /***************************************************************************** | ||||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -102,6 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /* #define FORCE_BARCELONA */ | /* #define FORCE_BARCELONA */ | ||||
| /* #define FORCE_SHANGHAI */ | /* #define FORCE_SHANGHAI */ | ||||
| /* #define FORCE_ISTANBUL */ | /* #define FORCE_ISTANBUL */ | ||||
| /* #define FORCE_BOBCAT */ | |||||
| /* #define FORCE_SSE_GENERIC */ | /* #define FORCE_SSE_GENERIC */ | ||||
| /* #define FORCE_VIAC3 */ | /* #define FORCE_VIAC3 */ | ||||
| /* #define FORCE_NANO */ | /* #define FORCE_NANO */ | ||||
| @@ -278,6 +279,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "NEHALEM" | #define CORENAME "NEHALEM" | ||||
| #endif | #endif | ||||
| #ifdef FORCE_SANDYBRIDGE | |||||
| #define FORCE | |||||
| #define FORCE_INTEL | |||||
| #define ARCHITECTURE "X86" | |||||
| #define SUBARCHITECTURE "SANDYBRIDGE" | |||||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | |||||
| #define LIBNAME "sandybridge" | |||||
| #define CORENAME "SANDYBRIDGE" | |||||
| #endif | |||||
| #ifdef FORCE_ATOM | #ifdef FORCE_ATOM | ||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| @@ -349,6 +364,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "BARCELONA" | #define CORENAME "BARCELONA" | ||||
| #endif | #endif | ||||
| #if defined(FORCE_BOBCAT) | |||||
| #define FORCE | |||||
| #define FORCE_INTEL | |||||
| #define ARCHITECTURE "X86" | |||||
| #define SUBARCHITECTURE "BOBCAT" | |||||
| #define ARCHCONFIG "-DBOBCAT " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \ | |||||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV" | |||||
| #define LIBNAME "bobcat" | |||||
| #define CORENAME "BOBCAT" | |||||
| #endif | |||||
| #ifdef FORCE_SSE_GENERIC | #ifdef FORCE_SSE_GENERIC | ||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| @@ -0,0 +1,235 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| /* | |||||
|  * Copies a matrix of two-FLOAT elements from src into dest, regrouping the | |||||
|  * columns into panels of 4, then 2, then 1. | |||||
|  * | |||||
|  * row    - number of elements per column; expected to be non-negative. | |||||
|  * col    - number of columns. | |||||
|  * src    - source; adjacent columns are 2*srcdim FLOATs apart and adjacent | |||||
|  *          elements of one column are 2 FLOATs apart (presumably the real | |||||
|  *          and imaginary parts of a complex value -- TODO confirm). | |||||
|  * srcdim - column stride of src, counted in two-FLOAT elements. | |||||
|  * dest   - output buffer, filled contiguously from its start. | |||||
|  * | |||||
|  * Output order: col/4 panels of four columns, then one two-column panel if | |||||
|  * (col & 2), then one single-column panel if (col & 1).  Inside a panel the | |||||
|  * data is stored row by row; each row holds that row's element from every | |||||
|  * column of the panel, in column order. | |||||
|  * | |||||
|  * The per-row column loops have constant trip counts so the compiler is | |||||
|  * free to unroll them.  Always returns 0. | |||||
|  */ | |||||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||||
| { | |||||
|     const BLASLONG colstep = 2*srcdim;  /* FLOATs between adjacent columns */ | |||||
|     BLASLONG p,r,k; | |||||
|     FLOAT *in,*out; | |||||
|     /* Full panels of four columns. */ | |||||
|     for (p=0; p<col/4; p+=1) | |||||
|     { | |||||
|         out = dest; | |||||
|         for (r=0; r<row; r+=1) | |||||
|         { | |||||
|             for (k=0; k<4; k+=1) | |||||
|             { | |||||
|                 in = src+k*colstep+2*r; | |||||
|                 out[0] = in[0]; | |||||
|                 out[1] = in[1]; | |||||
|                 out = out+2; | |||||
|             } | |||||
|         } | |||||
|         src = src+4*colstep; | |||||
|         dest = dest+8*row;  /* 4 columns * 2 FLOATs * row elements */ | |||||
|     } | |||||
|     /* Leftover panel of two columns. */ | |||||
|     if (col&2) | |||||
|     { | |||||
|         out = dest; | |||||
|         for (r=0; r<row; r+=1) | |||||
|         { | |||||
|             for (k=0; k<2; k+=1) | |||||
|             { | |||||
|                 in = src+k*colstep+2*r; | |||||
|                 out[0] = in[0]; | |||||
|                 out[1] = in[1]; | |||||
|                 out = out+2; | |||||
|             } | |||||
|         } | |||||
|         src = src+2*colstep; | |||||
|         dest = dest+4*row;  /* 2 columns * 2 FLOATs * row elements */ | |||||
|     } | |||||
|     /* Leftover single column: a straight copy of 2*row FLOATs. */ | |||||
|     if (col&1) | |||||
|     { | |||||
|         out = dest; | |||||
|         for (r=0; r<row; r+=1) | |||||
|         { | |||||
|             in = src+2*r; | |||||
|             out[0] = in[0]; | |||||
|             out[1] = in[1]; | |||||
|             out = out+2; | |||||
|         } | |||||
|     } | |||||
|     return 0; | |||||
| } | |||||
| @@ -0,0 +1,401 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| /* | |||||
|  * Copies a matrix of two-FLOAT elements from src into dest, regrouping the | |||||
|  * columns into panels of 8, then 4, then 2, then 1. | |||||
|  * | |||||
|  * row    - number of elements per column; expected to be non-negative. | |||||
|  * col    - number of columns. | |||||
|  * src    - source; adjacent columns are 2*srcdim FLOATs apart and adjacent | |||||
|  *          elements of one column are 2 FLOATs apart (presumably the real | |||||
|  *          and imaginary parts of a complex value -- TODO confirm). | |||||
|  * srcdim - column stride of src, counted in two-FLOAT elements. | |||||
|  * dest   - output buffer, filled contiguously from its start. | |||||
|  * | |||||
|  * Output order: col/8 panels of eight columns, then one four-column panel | |||||
|  * if (col & 4), one two-column panel if (col & 2) and one single-column | |||||
|  * panel if (col & 1).  Inside a panel the data is stored row by row; each | |||||
|  * row holds that row's element from every column of the panel, in column | |||||
|  * order. | |||||
|  * | |||||
|  * The per-row column loops have constant trip counts so the compiler is | |||||
|  * free to unroll them.  Always returns 0. | |||||
|  */ | |||||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||||
| { | |||||
|     const BLASLONG colstep = 2*srcdim;  /* FLOATs between adjacent columns */ | |||||
|     BLASLONG p,r,k; | |||||
|     FLOAT *in,*out; | |||||
|     /* Full panels of eight columns. */ | |||||
|     for (p=0; p<col/8; p+=1) | |||||
|     { | |||||
|         out = dest; | |||||
|         for (r=0; r<row; r+=1) | |||||
|         { | |||||
|             for (k=0; k<8; k+=1) | |||||
|             { | |||||
|                 in = src+k*colstep+2*r; | |||||
|                 out[0] = in[0]; | |||||
|                 out[1] = in[1]; | |||||
|                 out = out+2; | |||||
|             } | |||||
|         } | |||||
|         src = src+8*colstep; | |||||
|         dest = dest+16*row;  /* 8 columns * 2 FLOATs * row elements */ | |||||
|     } | |||||
|     /* Leftover panel of four columns. */ | |||||
|     if (col&4) | |||||
|     { | |||||
|         out = dest; | |||||
|         for (r=0; r<row; r+=1) | |||||
|         { | |||||
|             for (k=0; k<4; k+=1) | |||||
|             { | |||||
|                 in = src+k*colstep+2*r; | |||||
|                 out[0] = in[0]; | |||||
|                 out[1] = in[1]; | |||||
|                 out = out+2; | |||||
|             } | |||||
|         } | |||||
|         src = src+4*colstep; | |||||
|         dest = dest+8*row;  /* 4 columns * 2 FLOATs * row elements */ | |||||
|     } | |||||
|     /* Leftover panel of two columns. */ | |||||
|     if (col&2) | |||||
|     { | |||||
|         out = dest; | |||||
|         for (r=0; r<row; r+=1) | |||||
|         { | |||||
|             for (k=0; k<2; k+=1) | |||||
|             { | |||||
|                 in = src+k*colstep+2*r; | |||||
|                 out[0] = in[0]; | |||||
|                 out[1] = in[1]; | |||||
|                 out = out+2; | |||||
|             } | |||||
|         } | |||||
|         src = src+2*colstep; | |||||
|         dest = dest+4*row;  /* 2 columns * 2 FLOATs * row elements */ | |||||
|     } | |||||
|     /* Leftover single column: a straight copy of 2*row FLOATs. */ | |||||
|     if (col&1) | |||||
|     { | |||||
|         out = dest; | |||||
|         for (r=0; r<row; r+=1) | |||||
|         { | |||||
|             in = src+2*r; | |||||
|             out[0] = in[0]; | |||||
|             out[1] = in[1]; | |||||
|             out = out+2; | |||||
|         } | |||||
|     } | |||||
|     return 0; | |||||
| } | |||||
| @@ -0,0 +1,237 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| /* | |||||
|  * Copies `row` source lines of `col` two-FLOAT elements from src into dest, | |||||
|  * cutting every line into blocks of 4 elements, then 2, then 1. | |||||
|  * | |||||
|  * row    - number of source lines; expected to be non-negative. | |||||
|  * col    - number of two-FLOAT elements per line. | |||||
|  * src    - source; adjacent lines are 2*srcdim FLOATs apart and the | |||||
|  *          elements of one line are contiguous (each element is 2 FLOATs, | |||||
|  *          presumably real and imaginary part -- TODO confirm). | |||||
|  * srcdim - line stride of src, counted in two-FLOAT elements. | |||||
|  * dest   - output buffer, split into three regions: | |||||
|  *            4-wide blocks start at dest; block b of a line lands in the | |||||
|  *              slab that begins 8*row*b FLOATs into the buffer, | |||||
|  *            2-wide blocks start at dest + (col & -4)*2*row, | |||||
|  *            1-wide blocks start at dest + (col & -2)*2*row. | |||||
|  *          Inside a slab or region the lines follow each other in order. | |||||
|  * | |||||
|  * Lines are handled in groups of 4, then 2 (row & 2), then 1 (row & 1); | |||||
|  * within a group each block position is copied for every line of the group | |||||
|  * before moving to the next block position. | |||||
|  * | |||||
|  * All copy loops have constant trip counts so the compiler is free to | |||||
|  * unroll them.  Always returns 0. | |||||
|  */ | |||||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||||
| { | |||||
|     const BLASLONG linestep = 2*srcdim;  /* FLOATs between adjacent lines */ | |||||
|     BLASLONG g,b,k,t; | |||||
|     BLASLONG off;                        /* FLOATs already consumed per line */ | |||||
|     FLOAT *in,*wide; | |||||
|     FLOAT *pairs = dest+(col&-4)*(2*row);    /* region for 2-wide blocks */ | |||||
|     FLOAT *singles = dest+(col&-2)*(2*row);  /* region for 1-wide blocks */ | |||||
|     /* Groups of four source lines. */ | |||||
|     for (g=0; g<row/4; g+=1) | |||||
|     { | |||||
|         wide = dest; | |||||
|         off = 0; | |||||
|         for (b=0; b<col/4; b+=1) | |||||
|         { | |||||
|             for (k=0; k<4; k+=1) | |||||
|             { | |||||
|                 in = src+k*linestep+off; | |||||
|                 for (t=0; t<8; t+=1) | |||||
|                     wide[8*k+t] = in[t]; | |||||
|             } | |||||
|             off = off+8; | |||||
|             wide = wide+8*row;  /* same lines, next 4-wide slab */ | |||||
|         } | |||||
|         if (col&2) | |||||
|         { | |||||
|             for (k=0; k<4; k+=1) | |||||
|             { | |||||
|                 in = src+k*linestep+off; | |||||
|                 for (t=0; t<4; t+=1) | |||||
|                     pairs[4*k+t] = in[t]; | |||||
|             } | |||||
|             off = off+4; | |||||
|             pairs = pairs+16; | |||||
|         } | |||||
|         if (col&1) | |||||
|         { | |||||
|             for (k=0; k<4; k+=1) | |||||
|             { | |||||
|                 in = src+k*linestep+off; | |||||
|                 singles[2*k] = in[0]; | |||||
|                 singles[2*k+1] = in[1]; | |||||
|             } | |||||
|             singles = singles+8; | |||||
|         } | |||||
|         src = src+4*linestep; | |||||
|         dest = dest+32;  /* 4 lines * 4 elements * 2 FLOATs */ | |||||
|     } | |||||
|     /* Leftover group of two source lines. */ | |||||
|     if (row&2) | |||||
|     { | |||||
|         wide = dest; | |||||
|         off = 0; | |||||
|         for (b=0; b<col/4; b+=1) | |||||
|         { | |||||
|             for (k=0; k<2; k+=1) | |||||
|             { | |||||
|                 in = src+k*linestep+off; | |||||
|                 for (t=0; t<8; t+=1) | |||||
|                     wide[8*k+t] = in[t]; | |||||
|             } | |||||
|             off = off+8; | |||||
|             wide = wide+8*row; | |||||
|         } | |||||
|         if (col&2) | |||||
|         { | |||||
|             for (k=0; k<2; k+=1) | |||||
|             { | |||||
|                 in = src+k*linestep+off; | |||||
|                 for (t=0; t<4; t+=1) | |||||
|                     pairs[4*k+t] = in[t]; | |||||
|             } | |||||
|             off = off+4; | |||||
|             pairs = pairs+8; | |||||
|         } | |||||
|         if (col&1) | |||||
|         { | |||||
|             for (k=0; k<2; k+=1) | |||||
|             { | |||||
|                 in = src+k*linestep+off; | |||||
|                 singles[2*k] = in[0]; | |||||
|                 singles[2*k+1] = in[1]; | |||||
|             } | |||||
|             singles = singles+4; | |||||
|         } | |||||
|         src = src+2*linestep; | |||||
|         dest = dest+16;  /* 2 lines * 4 elements * 2 FLOATs */ | |||||
|     } | |||||
|     /* Leftover single source line. */ | |||||
|     if (row&1) | |||||
|     { | |||||
|         wide = dest; | |||||
|         off = 0; | |||||
|         for (b=0; b<col/4; b+=1) | |||||
|         { | |||||
|             in = src+off; | |||||
|             for (t=0; t<8; t+=1) | |||||
|                 wide[t] = in[t]; | |||||
|             off = off+8; | |||||
|             wide = wide+8*row; | |||||
|         } | |||||
|         if (col&2) | |||||
|         { | |||||
|             in = src+off; | |||||
|             for (t=0; t<4; t+=1) | |||||
|                 pairs[t] = in[t]; | |||||
|             off = off+4; | |||||
|         } | |||||
|         if (col&1) | |||||
|         { | |||||
|             in = src+off; | |||||
|             singles[0] = in[0]; | |||||
|             singles[1] = in[1]; | |||||
|         } | |||||
|     } | |||||
|     return 0; | |||||
| } | |||||
| @@ -0,0 +1,370 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||||
| { | |||||
| BLASLONG i,j; | |||||
| BLASLONG idx=0; | |||||
| BLASLONG ii; | |||||
| FLOAT *src0,*src1,*src2,*src3,*dest0; | |||||
| FLOAT *dest1,*dest2,*dest4; | |||||
| ii = col&-8; | |||||
| ii = ii*(2*row); | |||||
| dest4 = dest+ii; | |||||
| ii = col&-4; | |||||
| ii = ii*(2*row); | |||||
| dest2 = dest+ii; | |||||
| ii = col&-2; | |||||
| ii = ii*(2*row); | |||||
| dest1 = dest+ii; | |||||
| for (j=0; j<row/4; j+=1) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src2 = src1+2*srcdim; | |||||
| src3 = src2+2*srcdim; | |||||
| src = src3+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (4<<4); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<col/8; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| dest0[4] = src0[4]; | |||||
| dest0[5] = src0[5]; | |||||
| dest0[6] = src0[6]; | |||||
| dest0[7] = src0[7]; | |||||
| dest0[8] = src0[8]; | |||||
| dest0[9] = src0[9]; | |||||
| dest0[10] = src0[10]; | |||||
| dest0[11] = src0[11]; | |||||
| dest0[12] = src0[12]; | |||||
| dest0[13] = src0[13]; | |||||
| dest0[14] = src0[14]; | |||||
| dest0[15] = src0[15]; | |||||
| dest0[16] = src1[0]; | |||||
| dest0[17] = src1[1]; | |||||
| dest0[18] = src1[2]; | |||||
| dest0[19] = src1[3]; | |||||
| dest0[20] = src1[4]; | |||||
| dest0[21] = src1[5]; | |||||
| dest0[22] = src1[6]; | |||||
| dest0[23] = src1[7]; | |||||
| dest0[24] = src1[8]; | |||||
| dest0[25] = src1[9]; | |||||
| dest0[26] = src1[10]; | |||||
| dest0[27] = src1[11]; | |||||
| dest0[28] = src1[12]; | |||||
| dest0[29] = src1[13]; | |||||
| dest0[30] = src1[14]; | |||||
| dest0[31] = src1[15]; | |||||
| dest0[32] = src2[0]; | |||||
| dest0[33] = src2[1]; | |||||
| dest0[34] = src2[2]; | |||||
| dest0[35] = src2[3]; | |||||
| dest0[36] = src2[4]; | |||||
| dest0[37] = src2[5]; | |||||
| dest0[38] = src2[6]; | |||||
| dest0[39] = src2[7]; | |||||
| dest0[40] = src2[8]; | |||||
| dest0[41] = src2[9]; | |||||
| dest0[42] = src2[10]; | |||||
| dest0[43] = src2[11]; | |||||
| dest0[44] = src2[12]; | |||||
| dest0[45] = src2[13]; | |||||
| dest0[46] = src2[14]; | |||||
| dest0[47] = src2[15]; | |||||
| dest0[48] = src3[0]; | |||||
| dest0[49] = src3[1]; | |||||
| dest0[50] = src3[2]; | |||||
| dest0[51] = src3[3]; | |||||
| dest0[52] = src3[4]; | |||||
| dest0[53] = src3[5]; | |||||
| dest0[54] = src3[6]; | |||||
| dest0[55] = src3[7]; | |||||
| dest0[56] = src3[8]; | |||||
| dest0[57] = src3[9]; | |||||
| dest0[58] = src3[10]; | |||||
| dest0[59] = src3[11]; | |||||
| dest0[60] = src3[12]; | |||||
| dest0[61] = src3[13]; | |||||
| dest0[62] = src3[14]; | |||||
| dest0[63] = src3[15]; | |||||
| src0 = src0+16; | |||||
| src1 = src1+16; | |||||
| src2 = src2+16; | |||||
| src3 = src3+16; | |||||
| ii = (row<<4); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (col&4) | |||||
| { | |||||
| dest4[0] = src0[0]; | |||||
| dest4[1] = src0[1]; | |||||
| dest4[2] = src0[2]; | |||||
| dest4[3] = src0[3]; | |||||
| dest4[4] = src0[4]; | |||||
| dest4[5] = src0[5]; | |||||
| dest4[6] = src0[6]; | |||||
| dest4[7] = src0[7]; | |||||
| dest4[8] = src1[0]; | |||||
| dest4[9] = src1[1]; | |||||
| dest4[10] = src1[2]; | |||||
| dest4[11] = src1[3]; | |||||
| dest4[12] = src1[4]; | |||||
| dest4[13] = src1[5]; | |||||
| dest4[14] = src1[6]; | |||||
| dest4[15] = src1[7]; | |||||
| dest4[16] = src2[0]; | |||||
| dest4[17] = src2[1]; | |||||
| dest4[18] = src2[2]; | |||||
| dest4[19] = src2[3]; | |||||
| dest4[20] = src2[4]; | |||||
| dest4[21] = src2[5]; | |||||
| dest4[22] = src2[6]; | |||||
| dest4[23] = src2[7]; | |||||
| dest4[24] = src3[0]; | |||||
| dest4[25] = src3[1]; | |||||
| dest4[26] = src3[2]; | |||||
| dest4[27] = src3[3]; | |||||
| dest4[28] = src3[4]; | |||||
| dest4[29] = src3[5]; | |||||
| dest4[30] = src3[6]; | |||||
| dest4[31] = src3[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| src2 = src2+8; | |||||
| src3 = src3+8; | |||||
| dest4 = dest4+32; | |||||
| } | |||||
| if (col&2) | |||||
| { | |||||
| dest2[0] = src0[0]; | |||||
| dest2[1] = src0[1]; | |||||
| dest2[2] = src0[2]; | |||||
| dest2[3] = src0[3]; | |||||
| dest2[4] = src1[0]; | |||||
| dest2[5] = src1[1]; | |||||
| dest2[6] = src1[2]; | |||||
| dest2[7] = src1[3]; | |||||
| dest2[8] = src2[0]; | |||||
| dest2[9] = src2[1]; | |||||
| dest2[10] = src2[2]; | |||||
| dest2[11] = src2[3]; | |||||
| dest2[12] = src3[0]; | |||||
| dest2[13] = src3[1]; | |||||
| dest2[14] = src3[2]; | |||||
| dest2[15] = src3[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| src2 = src2+4; | |||||
| src3 = src3+4; | |||||
| dest2 = dest2+16; | |||||
| } | |||||
| if (col&1) | |||||
| { | |||||
| dest1[0] = src0[0]; | |||||
| dest1[1] = src0[1]; | |||||
| dest1[2] = src1[0]; | |||||
| dest1[3] = src1[1]; | |||||
| dest1[4] = src2[0]; | |||||
| dest1[5] = src2[1]; | |||||
| dest1[6] = src3[0]; | |||||
| dest1[7] = src3[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| src2 = src2+2; | |||||
| src3 = src3+2; | |||||
| dest1 = dest1+8; | |||||
| } | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src = src1+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (2<<4); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<col/8; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| dest0[4] = src0[4]; | |||||
| dest0[5] = src0[5]; | |||||
| dest0[6] = src0[6]; | |||||
| dest0[7] = src0[7]; | |||||
| dest0[8] = src0[8]; | |||||
| dest0[9] = src0[9]; | |||||
| dest0[10] = src0[10]; | |||||
| dest0[11] = src0[11]; | |||||
| dest0[12] = src0[12]; | |||||
| dest0[13] = src0[13]; | |||||
| dest0[14] = src0[14]; | |||||
| dest0[15] = src0[15]; | |||||
| dest0[16] = src1[0]; | |||||
| dest0[17] = src1[1]; | |||||
| dest0[18] = src1[2]; | |||||
| dest0[19] = src1[3]; | |||||
| dest0[20] = src1[4]; | |||||
| dest0[21] = src1[5]; | |||||
| dest0[22] = src1[6]; | |||||
| dest0[23] = src1[7]; | |||||
| dest0[24] = src1[8]; | |||||
| dest0[25] = src1[9]; | |||||
| dest0[26] = src1[10]; | |||||
| dest0[27] = src1[11]; | |||||
| dest0[28] = src1[12]; | |||||
| dest0[29] = src1[13]; | |||||
| dest0[30] = src1[14]; | |||||
| dest0[31] = src1[15]; | |||||
| src0 = src0+16; | |||||
| src1 = src1+16; | |||||
| ii = (row<<4); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (col&4) | |||||
| { | |||||
| dest4[0] = src0[0]; | |||||
| dest4[1] = src0[1]; | |||||
| dest4[2] = src0[2]; | |||||
| dest4[3] = src0[3]; | |||||
| dest4[4] = src0[4]; | |||||
| dest4[5] = src0[5]; | |||||
| dest4[6] = src0[6]; | |||||
| dest4[7] = src0[7]; | |||||
| dest4[8] = src1[0]; | |||||
| dest4[9] = src1[1]; | |||||
| dest4[10] = src1[2]; | |||||
| dest4[11] = src1[3]; | |||||
| dest4[12] = src1[4]; | |||||
| dest4[13] = src1[5]; | |||||
| dest4[14] = src1[6]; | |||||
| dest4[15] = src1[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| dest4 = dest4+16; | |||||
| } | |||||
| if (col&2) | |||||
| { | |||||
| dest2[0] = src0[0]; | |||||
| dest2[1] = src0[1]; | |||||
| dest2[2] = src0[2]; | |||||
| dest2[3] = src0[3]; | |||||
| dest2[4] = src1[0]; | |||||
| dest2[5] = src1[1]; | |||||
| dest2[6] = src1[2]; | |||||
| dest2[7] = src1[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| dest2 = dest2+8; | |||||
| } | |||||
| if (col&1) | |||||
| { | |||||
| dest1[0] = src0[0]; | |||||
| dest1[1] = src0[1]; | |||||
| dest1[2] = src1[0]; | |||||
| dest1[3] = src1[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| dest1 = dest1+4; | |||||
| } | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| src0 = src; | |||||
| src = src0+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (1<<4); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<col/8; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| dest0[4] = src0[4]; | |||||
| dest0[5] = src0[5]; | |||||
| dest0[6] = src0[6]; | |||||
| dest0[7] = src0[7]; | |||||
| dest0[8] = src0[8]; | |||||
| dest0[9] = src0[9]; | |||||
| dest0[10] = src0[10]; | |||||
| dest0[11] = src0[11]; | |||||
| dest0[12] = src0[12]; | |||||
| dest0[13] = src0[13]; | |||||
| dest0[14] = src0[14]; | |||||
| dest0[15] = src0[15]; | |||||
| src0 = src0+16; | |||||
| ii = (row<<4); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (col&4) | |||||
| { | |||||
| dest4[0] = src0[0]; | |||||
| dest4[1] = src0[1]; | |||||
| dest4[2] = src0[2]; | |||||
| dest4[3] = src0[3]; | |||||
| dest4[4] = src0[4]; | |||||
| dest4[5] = src0[5]; | |||||
| dest4[6] = src0[6]; | |||||
| dest4[7] = src0[7]; | |||||
| src0 = src0+8; | |||||
| dest4 = dest4+8; | |||||
| } | |||||
| if (col&2) | |||||
| { | |||||
| dest2[0] = src0[0]; | |||||
| dest2[1] = src0[1]; | |||||
| dest2[2] = src0[2]; | |||||
| dest2[3] = src0[3]; | |||||
| src0 = src0+4; | |||||
| dest2 = dest2+4; | |||||
| } | |||||
| if (col&1) | |||||
| { | |||||
| dest1[0] = src0[0]; | |||||
| dest1[1] = src0[1]; | |||||
| src0 = src0+2; | |||||
| dest1 = dest1+2; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -746,6 +746,22 @@ static void init_parameter(void) { | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE /* compiled only when SANDYBRIDGE is defined */ | |||||
| #ifdef DEBUG /* debug builds report the selected branch on stderr */ | |||||
| fprintf(stderr, "Sandybridge\n"); | |||||
| #endif | |||||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; /* copy each *GEMM_DEFAULT_P macro into the matching table field */ | |||||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||||
| #ifdef EXPRECISION /* q/x fields are set only when EXPRECISION is defined */ | |||||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||||
| #endif | |||||
| #endif | |||||
| #ifdef OPTERON | #ifdef OPTERON | ||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| @@ -778,6 +794,22 @@ static void init_parameter(void) { | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifdef BOBCAT /* compiled only when BOBCAT is defined */ | |||||
| #ifdef DEBUG /* debug builds report the selected branch on stderr */ | |||||
| fprintf(stderr, "Bobcat\n"); /* message spelling fixed: was "Bobcate" */ | |||||
| #endif | |||||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; /* copy each *GEMM_DEFAULT_P macro into the matching table field */ | |||||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||||
| #ifdef EXPRECISION /* q/x fields are set only when EXPRECISION is defined */ | |||||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||||
| #endif /* EXPRECISION */ | |||||
| #endif /* BOBCAT */ | |||||
| #ifdef NANO | #ifdef NANO | ||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| @@ -0,0 +1,59 @@ | |||||
# Kernel selection list: each variable names the source file (or object name)
# used for one routine. NOTE(review): the hunk header shows this is a newly
# added 59-line file; its name is not visible here - presumably the Bobcat
# kernel list, since most entries name *_barcelona.S sources. Confirm.
#
# SGEMM: 4x4 Barcelona kernel. The IN/IT copy sources and objects are left
# empty; only the ON/OT copies are set (generic gemm_{n,t}copy_4.c).
| SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||||
| SGEMMINCOPY = | |||||
| SGEMMITCOPY = | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMINCOPYOBJ = | |||||
| SGEMMITCOPYOBJ = | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# DGEMM: 2x4 Barcelona kernel. IN/IT use the generic *_2.c copies, ON/OT the
# generic *_4.c copies; all four copy objects are named.
| DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# CGEMM: zgemm 2x2 Barcelona kernel source. IN/IT copy sources and objects are
# left empty; ON/OT use the generic zgemm_{n,t}copy_2.c.
| CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
| CGEMMINCOPY = | |||||
| CGEMMITCOPY = | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMINCOPYOBJ = | |||||
| CGEMMITCOPYOBJ = | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# ZGEMM: zgemm 1x2 Barcelona kernel source. IN/IT use the generic *_1.c copies,
# ON/OT the generic *_2.c copies; all four copy objects are named.
| ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
# TRSM kernels. In every group the _RN entry names the same source file as
# the _LT entry.
| STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||||
| STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||||
# NOTE(review): unlike the three groups above, ZTRSMKERNEL_LN also names the
# LT source (no separate LN file is referenced). Presumably intentional -
# confirm that the LT source covers the LN case.
| ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||||
# GEMM3M kernels: 4x4 source for CGEMM3M, 2x4 source for ZGEMM3M.
| CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S | |||||
| @@ -0,0 +1 @@ | |||||
# Reuse the kernel selections from KERNEL.PENRYN in $(KERNELDIR) unchanged.
# NOTE(review): the hunk header shows a new one-line file; which target it
# belongs to is not visible here - confirm against the commit.
| include $(KERNELDIR)/KERNEL.PENRYN | |||||
| @@ -76,6 +76,12 @@ | |||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE (8 * 1 - 4) | |||||
| #define PREFETCHW prefetcht0 | |||||
| #define PREFETCHB prefetcht0 | |||||
| #endif | |||||
| #ifndef PREFETCH | #ifndef PREFETCH | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #endif | #endif | ||||
| @@ -69,6 +69,12 @@ | |||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE (16 * 1 - 8) | |||||
| #define PREFETCHW prefetcht0 | |||||
| #define PREFETCHB prefetcht0 | |||||
| #endif | |||||
| #ifndef PREFETCH | #ifndef PREFETCH | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #endif | #endif | ||||
| @@ -262,7 +268,7 @@ | |||||
| movaps -16 * SIZE(AA), %xmm0 | movaps -16 * SIZE(AA), %xmm0 | ||||
| addps %xmm2, %xmm7 | addps %xmm2, %xmm7 | ||||
| #ifndef NEHALEM | |||||
| #if !(defined(NEHALEM) || defined(SANDYBRIDGE)) | |||||
| PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| pshufd $0x93, %xmm1, %xmm2 | pshufd $0x93, %xmm1, %xmm2 | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define PREFETCHSIZE (16 * 4) | #define PREFETCHSIZE (16 * 4) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 7) | #define PREFETCHSIZE (16 * 7) | ||||
| @@ -45,7 +45,7 @@ | |||||
| #define PREFETCHSIZE (8 * 2) | #define PREFETCHSIZE (8 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 7) | #define PREFETCHSIZE (8 * 7) | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define PREFETCHSIZE (16 * 4) | #define PREFETCHSIZE (16 * 4) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 7) | #define PREFETCHSIZE (16 * 7) | ||||
| @@ -45,7 +45,7 @@ | |||||
| #define PREFETCHSIZE (8 * 2) | #define PREFETCHSIZE (8 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 7) | #define PREFETCHSIZE (8 * 7) | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -439,7 +439,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -488,7 +488,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -1697,7 +1697,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1727,7 +1727,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -437,7 +437,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -833,7 +833,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -1848,7 +1848,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2109,7 +2109,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2429,7 +2429,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -2459,7 +2459,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -2952,7 +2952,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -3148,7 +3148,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -3389,7 +3389,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -3404,7 +3404,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -910,7 +910,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -959,7 +959,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -1439,7 +1439,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1469,7 +1469,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -872,7 +872,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -1316,7 +1316,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -1855,7 +1855,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -1885,7 +1885,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -2249,7 +2249,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2562,7 +2562,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2957,7 +2957,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -2972,7 +2972,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -3280,7 +3280,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -3515,7 +3515,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -1036,7 +1036,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1066,7 +1066,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -2224,7 +2224,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -2273,7 +2273,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -439,7 +439,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -454,7 +454,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -758,7 +758,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -993,7 +993,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -1324,7 +1324,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -1354,7 +1354,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -1718,7 +1718,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2031,7 +2031,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2859,7 +2859,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -3303,7 +3303,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCHSIZE (8 * 1 - 4) | #define PREFETCHSIZE (8 * 1 - 4) | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCHSIZE (16 * 1 + 8) | #define PREFETCHSIZE (16 * 1 + 8) | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define PREFETCHSIZE (16 * 2) | #define PREFETCHSIZE (16 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 7) | #define PREFETCHSIZE (16 * 7) | ||||
| @@ -45,7 +45,7 @@ | |||||
| #define PREFETCHSIZE (8 * 2) | #define PREFETCHSIZE (8 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 7) | #define PREFETCHSIZE (8 * 7) | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define PREFETCHSIZE (16 * 2) | #define PREFETCHSIZE (16 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 7) | #define PREFETCHSIZE (16 * 7) | ||||
| @@ -45,7 +45,7 @@ | |||||
| #define PREFETCHSIZE (8 * 2) | #define PREFETCHSIZE (8 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 7) | #define PREFETCHSIZE (8 * 7) | ||||
| @@ -55,7 +55,7 @@ | |||||
| #define XX %edi | #define XX %edi | ||||
| #define FLAG %ebp | #define FLAG %ebp | ||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) | |||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) | |||||
| #define USE_PSHUFD | #define USE_PSHUFD | ||||
| #else | #else | ||||
| #define USE_PSHUFD_HALF | #define USE_PSHUFD_HALF | ||||
| @@ -697,7 +697,7 @@ | |||||
| cmpl $2 * SIZE, INCX | cmpl $2 * SIZE, INCX | ||||
| jne .L120 | jne .L120 | ||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) | |||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) | |||||
| PSHUFD2($0, %xmm0, %xmm6) | PSHUFD2($0, %xmm0, %xmm6) | ||||
| PSHUFD2($0, %xmm1, %xmm1) | PSHUFD2($0, %xmm1, %xmm1) | ||||
| @@ -57,7 +57,7 @@ | |||||
| #include "l1param.h" | #include "l1param.h" | ||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) | |||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) | |||||
| #define USE_PSHUFD | #define USE_PSHUFD | ||||
| #else | #else | ||||
| #define USE_PSHUFD_HALF | #define USE_PSHUFD_HALF | ||||
| @@ -860,7 +860,7 @@ | |||||
| cmpl $2 * SIZE, INCX | cmpl $2 * SIZE, INCX | ||||
| jne .L220 | jne .L220 | ||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) | |||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) | |||||
| #ifdef HAVE_SSE3 | #ifdef HAVE_SSE3 | ||||
| movddup %xmm0, %xmm6 | movddup %xmm0, %xmm6 | ||||
| @@ -61,7 +61,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -533,7 +533,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -63,7 +63,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -61,7 +61,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -994,7 +994,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -63,7 +63,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -61,7 +61,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -1820,7 +1820,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -0,0 +1,62 @@ | |||||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||||
| ZGEMVTKERNEL = zgemv_t_dup.S | |||||
| SGEMMKERNEL = gemm_kernel_8x4_barcelona.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMONCOPY = gemm_ncopy_4_opteron.S | |||||
| SGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||||
| DGEMMINCOPY = | |||||
| DGEMMITCOPY = | |||||
| DGEMMONCOPY = gemm_ncopy_4_opteron.S | |||||
| DGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||||
| DGEMMINCOPYOBJ = | |||||
| DGEMMITCOPYOBJ = | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMONCOPY = zgemm_ncopy_2.S | |||||
| CGEMMOTCOPY = zgemm_tcopy_2.S | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
| ZGEMMINCOPY = | |||||
| ZGEMMITCOPY = | |||||
| ZGEMMONCOPY = zgemm_ncopy_2.S | |||||
| ZGEMMOTCOPY = zgemm_tcopy_2.S | |||||
| ZGEMMINCOPYOBJ = | |||||
| ZGEMMITCOPYOBJ = | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S | |||||
| STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
| @@ -0,0 +1,84 @@ | |||||
| SGEMMKERNEL = sgemm_kernel_8x8_sandy.S | |||||
| SGEMMINCOPY = | |||||
| SGEMMITCOPY = | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMINCOPYOBJ = | |||||
| SGEMMITCOPYOBJ = | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = dgemm_kernel_4x8_sandy.S | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
| #DGEMMONCOPY = gemm_ncopy_4.S | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| #DGEMMOTCOPY = gemm_tcopy_4.S | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| #CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S | |||||
| CGEMMKERNEL = cgemm_kernel_4x8_sandy.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| #ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S | |||||
| ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S | |||||
| ZGEMMINCOPY = | |||||
| ZGEMMITCOPY = | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
| ZGEMMINCOPYOBJ = | |||||
| ZGEMMITCOPYOBJ = | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| #STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S | |||||
| #STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | |||||
| #STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | |||||
| #STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S | |||||
| #DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S | |||||
| #DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S | |||||
| #DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S | |||||
| #DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S | |||||
| #CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S | |||||
| #CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S | |||||
| #CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S | |||||
| #CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S | |||||
| #ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| #ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| #ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| #ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S | |||||
| @@ -45,6 +45,12 @@ | |||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE 16 | |||||
| #define PREFETCH prefetcht0 | |||||
| #define PREFETCHW prefetcht0 | |||||
| #endif | |||||
| #ifndef MOVAPS | #ifndef MOVAPS | ||||
| #define MOVAPS movaps | #define MOVAPS movaps | ||||
| #endif | #endif | ||||
| @@ -45,7 +45,7 @@ | |||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCHSIZE 16 | #define PREFETCHSIZE 16 | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| @@ -45,6 +45,12 @@ | |||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE 12 | |||||
| #define PREFETCH prefetcht0 | |||||
| #define PREFETCHW prefetcht0 | |||||
| #endif | |||||
| #ifndef MOVAPS | #ifndef MOVAPS | ||||
| #define MOVAPS movaps | #define MOVAPS movaps | ||||
| #endif | #endif | ||||
| @@ -52,6 +52,13 @@ | |||||
| #define MOVUPS_A movups | #define MOVUPS_A movups | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE 12 | |||||
| #define PREFETCH prefetcht0 | |||||
| #define PREFETCHW prefetcht0 | |||||
| #define MOVUPS_A movups | |||||
| #endif | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) | #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) | ||||
| #define PREFETCHSIZE 16 | #define PREFETCHSIZE 16 | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| @@ -51,6 +51,12 @@ | |||||
| #define MOVUPS_A movups | #define MOVUPS_A movups | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE 12 | |||||
| #define PREFETCH prefetcht0 | |||||
| #define MOVUPS_A movups | |||||
| #endif | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) | #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) | ||||
| #define PREFETCHSIZE 16 | #define PREFETCHSIZE 16 | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| @@ -46,6 +46,13 @@ | |||||
| #define MOVUPS_A movups | #define MOVUPS_A movups | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE 16 | |||||
| #define PREFETCH prefetcht0 | |||||
| #define PREFETCHW prefetcht0 | |||||
| #define MOVUPS_A movups | |||||
| #endif | |||||
| #ifdef MOVUPS_A | #ifdef MOVUPS_A | ||||
| #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS | #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS | ||||
| #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS | #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS | ||||
| @@ -46,6 +46,13 @@ | |||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #endif | #endif | ||||
| #if defined(SANDYBRIDGE) | |||||
| #define RPREFETCHSIZE 12 | |||||
| #define WPREFETCHSIZE (RPREFETCHSIZE * 2) | |||||
| #define PREFETCH prefetcht0 | |||||
| #define PREFETCHW prefetcht0 | |||||
| #endif | |||||
| #ifndef WINDOWS_ABI | #ifndef WINDOWS_ABI | ||||
| #define M ARG1 /* rdi */ | #define M ARG1 /* rdi */ | ||||
| @@ -46,7 +46,7 @@ | |||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define RPREFETCHSIZE 12 | #define RPREFETCHSIZE 12 | ||||
| #define WPREFETCHSIZE (RPREFETCHSIZE * 4) | #define WPREFETCHSIZE (RPREFETCHSIZE * 4) | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| @@ -46,6 +46,13 @@ | |||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #endif | #endif | ||||
| #if defined(SANDYBRIDGE) | |||||
| #define RPREFETCHSIZE 12 | |||||
| #define WPREFETCHSIZE (RPREFETCHSIZE * 2) | |||||
| #define PREFETCH prefetcht0 | |||||
| #define PREFETCHW prefetcht0 | |||||
| #endif | |||||
| #ifndef WINDOWS_ABI | #ifndef WINDOWS_ABI | ||||
| #define M ARG1 /* rdi */ | #define M ARG1 /* rdi */ | ||||
| @@ -46,7 +46,7 @@ | |||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define RPREFETCHSIZE 12 | #define RPREFETCHSIZE 12 | ||||
| #define WPREFETCHSIZE (RPREFETCHSIZE * 4) | #define WPREFETCHSIZE (RPREFETCHSIZE * 4) | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 12) | #define PREFETCHSIZE (16 * 12) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 12) | #define PREFETCHSIZE (16 * 12) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 12) | #define PREFETCHSIZE (16 * 12) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 12) | #define PREFETCHSIZE (16 * 12) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 12) | #define PREFETCHSIZE (16 * 12) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 12) | #define PREFETCHSIZE (16 * 12) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 12) | #define PREFETCHSIZE (16 * 12) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -685,7 +685,7 @@ | |||||
| cmpq $2 * SIZE, INCX | cmpq $2 * SIZE, INCX | ||||
| jne .L120 | jne .L120 | ||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) | |||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) | |||||
| pshufd $0, %xmm0, %xmm14 | pshufd $0, %xmm0, %xmm14 | ||||
| pshufd $0, %xmm1, %xmm1 | pshufd $0, %xmm1, %xmm1 | ||||
| @@ -55,7 +55,7 @@ | |||||
| #include "l1param.h" | #include "l1param.h" | ||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) | |||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) || defined(SANDYBRIDGE) | |||||
| #define USE_PSHUFD | #define USE_PSHUFD | ||||
| #else | #else | ||||
| #define USE_PSHUFD_HALF | #define USE_PSHUFD_HALF | ||||
| @@ -803,7 +803,7 @@ | |||||
| cmpq $2 * SIZE, INCX | cmpq $2 * SIZE, INCX | ||||
| jne .L220 | jne .L220 | ||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) | |||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) | |||||
| movddup %xmm0, %xmm14 | movddup %xmm0, %xmm14 | ||||
| pxor %xmm15, %xmm15 | pxor %xmm15, %xmm15 | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| @@ -160,7 +160,7 @@ | |||||
| #define a3 %xmm14 | #define a3 %xmm14 | ||||
| #define xt1 %xmm15 | #define xt1 %xmm15 | ||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define MOVDDUP(a, b, c) movddup a(b), c | #define MOVDDUP(a, b, c) movddup a(b), c | ||||
| #define MOVDDUP2(a, b, c) movddup a##b, c | #define MOVDDUP2(a, b, c) movddup a##b, c | ||||
| #else | #else | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -57,7 +57,7 @@ | |||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 24) | #define PREFETCHSIZE (16 * 24) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -86,7 +86,7 @@ | |||||
| #define BORIG 72(%rsp) | #define BORIG 72(%rsp) | ||||
| #define BUFFER 128(%rsp) | #define BUFFER 128(%rsp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHNTA prefetchnta | #define PREFETCHNTA prefetchnta | ||||
| @@ -95,7 +95,7 @@ | |||||
| #define PREFETCHSIZE (8 * 6 + 4) | #define PREFETCHSIZE (8 * 6 + 4) | ||||
| #endif | #endif | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHNTA prefetchnta | #define PREFETCHNTA prefetchnta | ||||
| @@ -86,7 +86,7 @@ | |||||
| #define BORIG 72(%rsp) | #define BORIG 72(%rsp) | ||||
| #define BUFFER 128(%rsp) | #define BUFFER 128(%rsp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHNTA prefetchnta | #define PREFETCHNTA prefetchnta | ||||
| @@ -95,7 +95,7 @@ | |||||
| #define PREFETCHSIZE (8 * 6 + 4) | #define PREFETCHSIZE (8 * 6 + 4) | ||||
| #endif | #endif | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHNTA prefetchnta | #define PREFETCHNTA prefetchnta | ||||