| @@ -25,6 +25,7 @@ before_install: | |||
| - if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi | |||
| script: | |||
| - set -e | |||
| - make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE | |||
| - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi | |||
| - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi | |||
| @@ -141,5 +141,11 @@ In chronological order: | |||
| * Martin Koehler <https://github.com/grisuthedragon/> | |||
| * [2015-09-07] Improved imatcopy | |||
| * Ashwin Sekhar T K <https://github.com/ashwinyes/> | |||
| * [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8) | |||
| * [2015-11-20] lapack-test fixes for Cortex-A57 | |||
| * [2016-03-14] Additional functional Assembly Kernels for Cortex-A57 | |||
| * [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57 | |||
| * [Your name or handle] <[email or website]> | |||
| * [Date] [Brief summary of your changes] | |||
| @@ -83,20 +83,20 @@ shared : | |||
| ifndef NO_SHARED | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) | |||
| @$(MAKE) -C exports so | |||
| @-ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| ifeq ($(OSNAME), FreeBSD) | |||
| @$(MAKE) -C exports so | |||
| @-ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| ifeq ($(OSNAME), NetBSD) | |||
| @$(MAKE) -C exports so | |||
| @-ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| ifeq ($(OSNAME), Darwin) | |||
| @$(MAKE) -C exports dyn | |||
| @-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib | |||
| @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| @$(MAKE) -C exports dll | |||
| @@ -0,0 +1,199 @@ | |||
| # Notes on OpenBLAS usage | |||
| ## Usage | |||
| #### Program is Terminated. Because you tried to allocate too many memory regions | |||
| In OpenBLAS, we manage a pool of memory buffers and allocate the number of | |||
| buffers as follows. | |||
| ``` | |||
| #define NUM_BUFFERS (MAX_CPU_NUMBER * 2) | |||
| ``` | |||
| This error indicates that the program exceeded the number of buffers. | |||
| Please build OpenBLAS with a larger `NUM_THREADS`. For example, `make | |||
| NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set | |||
| `MAX_CPU_NUMBER=NUM_THREADS`. | |||
| #### How can I use OpenBLAS in multi-threaded applications? | |||
| If your application is already multi-threaded, it will conflict with OpenBLAS | |||
| multi-threading. Thus, you must set OpenBLAS to use a single thread in any of the | |||
| following ways: | |||
| * `export OPENBLAS_NUM_THREADS=1` in the environment variables. | |||
| * Call `openblas_set_num_threads(1)` in the application at runtime. | |||
| * Build the single-threaded version of OpenBLAS, e.g. `make USE_THREAD=0` | |||
| If the application is parallelized by OpenMP, please use OpenBLAS built with | |||
| `USE_OPENMP=1` | |||
| #### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH | |||
| The environment variable which controls the kernel selection is | |||
| `OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`), e.g. `export | |||
| OPENBLAS_CORETYPE=Haswell`. The function `char* openblas_get_corename()` | |||
| returns the target in use. | |||
| #### How could I disable OpenBLAS threading affinity on runtime? | |||
| You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment | |||
| variable to disable threading affinity at runtime. For example, before | |||
| running the program, | |||
| ``` | |||
| export OPENBLAS_MAIN_FREE=1 | |||
| ``` | |||
| Alternatively, you can disable the affinity feature by enabling `NO_AFFINITY=1` | |||
| in `Makefile.rule`. | |||
| ## Linking with the library | |||
| * Link with shared library | |||
| `gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas` | |||
| If the library is multithreaded, please add `-lpthread`. If the library | |||
| contains LAPACK functions, please add `-lgfortran` or other Fortran libs. | |||
| * Link with static library | |||
| `gcc -o test test.c /your/path/libopenblas.a` | |||
| You can download `test.c` from https://gist.github.com/xianyi/5780018 | |||
| On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by | |||
| default), custom programs statically linked against `libopenblas.a` should also | |||
| link with the pthread library e.g.: | |||
| ``` | |||
| gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread | |||
| ``` | |||
| Failing to add the `-lpthread` flag will cause errors such as: | |||
| ``` | |||
| /opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory': | |||
| memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock' | |||
| memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock' | |||
| ... | |||
| ``` | |||
| ## Code examples | |||
| #### Call CBLAS interface | |||
| This example shows calling cblas_dgemm in C. https://gist.github.com/xianyi/6930656 | |||
| ``` | |||
| #include <cblas.h> | |||
| #include <stdio.h> | |||
| void main() | |||
| { | |||
| int i=0; | |||
| double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0}; | |||
| double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0}; | |||
| double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5}; | |||
| cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3); | |||
| for(i=0; i<9; i++) | |||
| printf("%lf ", C[i]); | |||
| printf("\n"); | |||
| } | |||
| ``` | |||
| `gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran` | |||
| #### Call BLAS Fortran interface | |||
| This example shows calling dgemm Fortran interface in C. https://gist.github.com/xianyi/5780018 | |||
| ``` | |||
| #include "stdio.h" | |||
| #include "stdlib.h" | |||
| #include "sys/time.h" | |||
| #include "time.h" | |||
| extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*); | |||
| int main(int argc, char* argv[]) | |||
| { | |||
| int i; | |||
| printf("test!\n"); | |||
| if(argc<4){ | |||
| printf("Input Error\n"); | |||
| return 1; | |||
| } | |||
| int m = atoi(argv[1]); | |||
| int n = atoi(argv[2]); | |||
| int k = atoi(argv[3]); | |||
| int sizeofa = m * k; | |||
| int sizeofb = k * n; | |||
| int sizeofc = m * n; | |||
| char ta = 'N'; | |||
| char tb = 'N'; | |||
| double alpha = 1.2; | |||
| double beta = 0.001; | |||
| struct timeval start,finish; | |||
| double duration; | |||
| double* A = (double*)malloc(sizeof(double) * sizeofa); | |||
| double* B = (double*)malloc(sizeof(double) * sizeofb); | |||
| double* C = (double*)malloc(sizeof(double) * sizeofc); | |||
| srand((unsigned)time(NULL)); | |||
| for (i=0; i<sizeofa; i++) | |||
| A[i] = i%3+1;//(rand()%100)/10.0; | |||
| for (i=0; i<sizeofb; i++) | |||
| B[i] = i%3+1;//(rand()%100)/10.0; | |||
| for (i=0; i<sizeofc; i++) | |||
| C[i] = i%3+1;//(rand()%100)/10.0; | |||
| //#if 0 | |||
| printf("m=%d,n=%d,k=%d,alpha=%lf,beta=%lf,sizeofc=%d\n",m,n,k,alpha,beta,sizeofc); | |||
| gettimeofday(&start, NULL); | |||
| dgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m); | |||
| gettimeofday(&finish, NULL); | |||
| duration = ((double)(finish.tv_sec-start.tv_sec)*1000000 + (double)(finish.tv_usec-start.tv_usec)) / 1000000; | |||
| double gflops = 2.0 * m *n*k; | |||
| gflops = gflops/duration*1.0e-6; | |||
| FILE *fp; | |||
| fp = fopen("timeDGEMM.txt", "a"); | |||
| fprintf(fp, "%dx%dx%d\t%lf s\t%lf MFLOPS\n", m, n, k, duration, gflops); | |||
| fclose(fp); | |||
| free(A); | |||
| free(B); | |||
| free(C); | |||
| return 0; | |||
| } | |||
| ``` | |||
| ` gcc -o time_dgemm time_dgemm.c /your/path/libopenblas.a` | |||
| ` ./time_dgemm <m> <n> <k> ` | |||
| ## Troubleshooting | |||
| * Please read the [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) first. | |||
| * Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. | |||
| * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. | |||
| * The number of CPUs/Cores should be less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1. | |||
| * OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting out the line NO_AFFINITY=1 in Makefile.rule. However, this may cause [a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). | |||
| * On Loongson 3A, `make test` may fail because of a pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same test case in a shell. | |||
| ## BLAS reference manual | |||
| If you want to understand every BLAS function and definition, please read | |||
| [Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm) | |||
| or [netlib.org](http://netlib.org/blas/) | |||
| Here are [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions) | |||
| ## How to reference OpenBLAS. | |||
| You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications). | |||
| Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly. | |||
| @@ -2134,7 +2134,7 @@ zgemm3m.$(SUFFIX) : gemm3m.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| smallscaling: smallscaling.c ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -lpthread -fopenmp -lm -o $(@F) $^ | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm | |||
| clean :: | |||
| @rm -f *.goto *.mkl *.acml *.atlas *.veclib | |||
| @@ -23,28 +23,32 @@ typedef struct { | |||
| void * s_create_matrix(int size) { | |||
| float * r = malloc(size * sizeof(double)); | |||
| for(int i = 0; i < size; i++) | |||
| int i; | |||
| for(i = 0; i < size; i++) | |||
| r[i] = 1e3 * i / size; | |||
| return r; | |||
| } | |||
| void * c_create_matrix(int size) { | |||
| float * r = malloc(size * 2 * sizeof(double)); | |||
| for(int i = 0; i < 2 * size; i++) | |||
| int i; | |||
| for(i = 0; i < 2 * size; i++) | |||
| r[i] = 1e3 * i / size; | |||
| return r; | |||
| } | |||
| void * z_create_matrix(int size) { | |||
| double * r = malloc(size * 2 * sizeof(double)); | |||
| for(int i = 0; i < 2 * size; i++) | |||
| int i; | |||
| for(i = 0; i < 2 * size; i++) | |||
| r[i] = 1e3 * i / size; | |||
| return r; | |||
| } | |||
| void * d_create_matrix(int size) { | |||
| double * r = malloc(size * sizeof(double)); | |||
| for(int i = 0; i < size; i++) | |||
| int i; | |||
| for(i = 0; i < size; i++) | |||
| r[i] = 1e3 * i / size; | |||
| return r; | |||
| } | |||
| @@ -188,4 +192,5 @@ int main(int argc, char * argv[]) { | |||
| size *= inc_factor; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -332,12 +332,13 @@ typedef int blasint; | |||
| #endif | |||
| #endif | |||
| /* | |||
| #ifdef PILEDRIVER | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| #endif | |||
| */ | |||
| /* | |||
| #ifdef STEAMROLLER | |||
| @@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define HAVE_PREFETCH | |||
| #endif | |||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) | |||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) | |||
| #define DCBT_ARG 0 | |||
| #else | |||
| #define DCBT_ARG 8 | |||
| @@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define L1_PREFETCH dcbtst | |||
| #endif | |||
| #if defined(POWER8) | |||
| #define L1_DUALFETCH | |||
| #define L1_PREFETCHSIZE (16 + 128 * 100) | |||
| #define L1_PREFETCH dcbtst | |||
| #endif | |||
| # | |||
| #ifndef L1_PREFETCH | |||
| #define L1_PREFETCH dcbt | |||
| #endif | |||
| @@ -790,6 +797,8 @@ Lmcount$lazy_ptr: | |||
| #define BUFFER_SIZE ( 2 << 20) | |||
| #elif defined(PPC440FP2) | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| #elif defined(POWER8) | |||
| #define BUFFER_SIZE ( 64 << 20) | |||
| #else | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| #endif | |||
| @@ -396,7 +396,7 @@ REALNAME: | |||
| #define PROFCODE | |||
| #define EPILOGUE .end REALNAME | |||
| #define EPILOGUE .end | |||
| #endif | |||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) | |||
| @@ -55,6 +55,7 @@ | |||
| #define CPUTYPE_POWER6 5 | |||
| #define CPUTYPE_CELL 6 | |||
| #define CPUTYPE_PPCG4 7 | |||
| #define CPUTYPE_POWER8 8 | |||
| char *cpuname[] = { | |||
| "UNKNOWN", | |||
| @@ -65,6 +66,7 @@ char *cpuname[] = { | |||
| "POWER6", | |||
| "CELL", | |||
| "PPCG4", | |||
| "POWER8" | |||
| }; | |||
| char *lowercpuname[] = { | |||
| @@ -76,6 +78,7 @@ char *lowercpuname[] = { | |||
| "power6", | |||
| "cell", | |||
| "ppcg4", | |||
| "power8" | |||
| }; | |||
| char *corename[] = { | |||
| @@ -87,6 +90,7 @@ char *corename[] = { | |||
| "POWER6", | |||
| "CELL", | |||
| "PPCG4", | |||
| "POWER8" | |||
| }; | |||
| int detect(void){ | |||
| @@ -115,7 +119,7 @@ int detect(void){ | |||
| if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; | |||
| if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | |||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | |||
| @@ -1,7 +1,7 @@ | |||
| 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15); | |||
| COPY_K(m, b, incb, buffer, 1); | |||
| } | |||
| @@ -33,6 +33,7 @@ set(COMMON_SOURCES | |||
| xerbla.c | |||
| openblas_set_num_threads.c | |||
| openblas_error_handle.c | |||
| openblas_env.c | |||
| openblas_get_num_procs.c | |||
| openblas_get_num_threads.c | |||
| ) | |||
| @@ -1,7 +1,7 @@ | |||
| TOPDIR = ../.. | |||
| include ../../Makefile.system | |||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) | |||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX) | |||
| #COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | |||
| @@ -118,6 +118,9 @@ openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c | |||
| openblas_error_handle.$(SUFFIX) : openblas_error_handle.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| openblas_env.$(SUFFIX) : openblas_env.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| @@ -92,6 +92,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #endif | |||
| extern unsigned int openblas_thread_timeout(); | |||
| #ifdef SMP_SERVER | |||
| #undef MONITOR | |||
| @@ -524,6 +526,7 @@ static int blas_monitor(void *arg){ | |||
| int blas_thread_init(void){ | |||
| BLASLONG i; | |||
| int ret; | |||
| int thread_timeout_env; | |||
| #ifdef NEED_STACKATTR | |||
| pthread_attr_t attr; | |||
| #endif | |||
| @@ -540,22 +543,12 @@ int blas_thread_init(void){ | |||
| if (!blas_server_avail){ | |||
| env_var_t p; | |||
| if (readenv(p,"THREAD_TIMEOUT")) { | |||
| thread_timeout = atoi(p); | |||
| if (thread_timeout < 4) thread_timeout = 4; | |||
| if (thread_timeout > 30) thread_timeout = 30; | |||
| thread_timeout = (1 << thread_timeout); | |||
| }else{ | |||
| if (readenv(p,"GOTO_THREAD_TIMEOUT")) { | |||
| thread_timeout = atoi(p); | |||
| if (thread_timeout < 4) thread_timeout = 4; | |||
| if (thread_timeout > 30) thread_timeout = 30; | |||
| thread_timeout = (1 << thread_timeout); | |||
| } | |||
| } | |||
| thread_timeout_env=openblas_thread_timeout(); | |||
| if (thread_timeout_env>0) { | |||
| if (thread_timeout_env < 4) thread_timeout_env = 4; | |||
| if (thread_timeout_env > 30) thread_timeout_env = 30; | |||
| thread_timeout = (1 << thread_timeout_env); | |||
| } | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| @@ -391,7 +391,7 @@ static char *corename[] = { | |||
| "Nehalem", | |||
| "Athlon", | |||
| "Opteron", | |||
| "Opteron(SSE3)", | |||
| "Opteron_SSE3", | |||
| "Barcelona", | |||
| "Nano", | |||
| "Sandybridge", | |||
| @@ -294,8 +294,11 @@ void openblas_fork_handler() | |||
| #endif | |||
| } | |||
| extern int openblas_num_threads_env(); | |||
| extern int openblas_goto_num_threads_env(); | |||
| extern int openblas_omp_num_threads_env(); | |||
| int blas_get_cpu_number(void){ | |||
| env_var_t p; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| int max_num; | |||
| #endif | |||
| @@ -310,18 +313,18 @@ int blas_get_cpu_number(void){ | |||
| blas_goto_num = 0; | |||
| #ifndef USE_OPENMP | |||
| if (readenv(p,"OPENBLAS_NUM_THREADS")) blas_goto_num = atoi(p); | |||
| blas_goto_num=openblas_num_threads_env(); | |||
| if (blas_goto_num < 0) blas_goto_num = 0; | |||
| if (blas_goto_num == 0) { | |||
| if (readenv(p,"GOTO_NUM_THREADS")) blas_goto_num = atoi(p); | |||
| if (blas_goto_num < 0) blas_goto_num = 0; | |||
| blas_goto_num=openblas_goto_num_threads_env(); | |||
| if (blas_goto_num < 0) blas_goto_num = 0; | |||
| } | |||
| #endif | |||
| blas_omp_num = 0; | |||
| if (readenv(p,"OMP_NUM_THREADS")) blas_omp_num = atoi(p); | |||
| blas_omp_num=openblas_omp_num_threads_env(); | |||
| if (blas_omp_num < 0) blas_omp_num = 0; | |||
| if (blas_goto_num > 0) blas_num_threads = blas_goto_num; | |||
| @@ -1340,6 +1343,7 @@ static void gotoblas_memory_init(void) { | |||
| /* Initialization for all function; this function should be called before main */ | |||
| static int gotoblas_initialized = 0; | |||
| extern void openblas_read_env(); | |||
| void CONSTRUCTOR gotoblas_init(void) { | |||
| @@ -1349,6 +1353,8 @@ void CONSTRUCTOR gotoblas_init(void) { | |||
| openblas_fork_handler(); | |||
| #endif | |||
| openblas_read_env(); | |||
| #ifdef PROFILE | |||
| moncontrol (0); | |||
| #endif | |||
| @@ -0,0 +1,84 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2011-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| /* File-scope cache of the environment settings. Every entry starts at 0 and is assigned in openblas_read_env(). */ | |||
| static int openblas_env_verbose=0; | |||
| static unsigned int openblas_env_thread_timeout=0; | |||
| static int openblas_env_block_factor=0; | |||
| static int openblas_env_openblas_num_threads=0; | |||
| static int openblas_env_goto_num_threads=0; | |||
| static int openblas_env_omp_num_threads=0; | |||
| /* Accessors: each one returns the cached value of the matching variable above; none of them reads the environment itself. */ | |||
| int openblas_verbose() { return openblas_env_verbose;} | |||
| unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;} | |||
| int openblas_block_factor() { return openblas_env_block_factor;} | |||
| int openblas_num_threads_env() { return openblas_env_openblas_num_threads;} | |||
| int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;} | |||
| int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;} | |||
| /* | |||
|  * Read the environment variables named below and store each result in the | |||
|  * corresponding file-scope openblas_env_* variable. | |||
|  * | |||
|  * For every variable the same steps are applied: start from 0, overwrite with | |||
|  * atoi() of the text when readenv() returns non-zero, then replace a negative | |||
|  * result with 0. Non-numeric text is not diagnosed (atoi() has no error | |||
|  * reporting). | |||
|  * | |||
|  * NOTE(review): readenv() and env_var_t come from common.h and are not visible | |||
|  * here; presumably readenv() returns non-zero only when the variable is set - | |||
|  * confirm against common.h. | |||
|  */ | |||
| void openblas_read_env() { | |||
| int ret=0; | |||
| env_var_t p; | |||
| /* OPENBLAS_VERBOSE -> value returned by openblas_verbose() */ | |||
| if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| openblas_env_verbose=ret; | |||
| /* OPENBLAS_BLOCK_FACTOR -> value returned by openblas_block_factor() */ | |||
| ret=0; | |||
| if (readenv(p,"OPENBLAS_BLOCK_FACTOR")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| openblas_env_block_factor=ret; | |||
| /* OPENBLAS_THREAD_TIMEOUT -> value returned by openblas_thread_timeout(); clamped to >= 0 before the unsigned cast */ | |||
| ret=0; | |||
| if (readenv(p,"OPENBLAS_THREAD_TIMEOUT")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| openblas_env_thread_timeout=(unsigned int)ret; | |||
| /* OPENBLAS_NUM_THREADS -> value returned by openblas_num_threads_env() */ | |||
| ret=0; | |||
| if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| openblas_env_openblas_num_threads=ret; | |||
| /* GOTO_NUM_THREADS -> value returned by openblas_goto_num_threads_env() */ | |||
| ret=0; | |||
| if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| openblas_env_goto_num_threads=ret; | |||
| /* OMP_NUM_THREADS -> value returned by openblas_omp_num_threads_env() */ | |||
| ret=0; | |||
| if (readenv(p,"OMP_NUM_THREADS")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| openblas_env_omp_num_threads=ret; | |||
| } | |||
| @@ -33,13 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| int openblas_verbose() { | |||
| int ret=0; | |||
| env_var_t p; | |||
| if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| return ret; | |||
| } | |||
| extern int openblas_verbose(); | |||
| void openblas_warning(int verbose, const char * msg) { | |||
| int current_verbose; | |||
| @@ -40,6 +40,7 @@ | |||
| #include <string.h> | |||
| #include "common.h" | |||
| extern int openblas_block_factor(); | |||
| int get_L2_size(void); | |||
| #define DEFAULT_GEMM_P 128 | |||
| @@ -249,7 +250,6 @@ int get_L2_size(void){ | |||
| void blas_set_parameter(void){ | |||
| env_var_t p; | |||
| int factor; | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) | |||
| int size = 16; | |||
| @@ -468,9 +468,8 @@ void blas_set_parameter(void){ | |||
| #endif | |||
| #endif | |||
| if (readenv(p,"GOTO_BLOCK_FACTOR")) { | |||
| factor = atoi(p); | |||
| factor=openblas_block_factor(); | |||
| if (factor>0) { | |||
| if (factor < 10) factor = 10; | |||
| if (factor > 200) factor = 200; | |||
| @@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "POWER5" | |||
| #endif | |||
| #if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8) | |||
| #if defined(FORCE_POWER6) || defined(FORCE_POWER7) | |||
| #define FORCE | |||
| #define ARCHITECTURE "POWER" | |||
| #define SUBARCHITECTURE "POWER6" | |||
| @@ -565,6 +565,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "POWER6" | |||
| #endif | |||
| #if defined(FORCE_POWER8) | |||
| #define FORCE | |||
| #define ARCHITECTURE "POWER" | |||
| #define SUBARCHITECTURE "POWER8" | |||
| #define SUBDIRNAME "power" | |||
| #define ARCHCONFIG "-DPOWER8 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \ | |||
| "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| #define LIBNAME "power8" | |||
| #define CORENAME "POWER8" | |||
| #endif | |||
| #ifdef FORCE_PPCG4 | |||
| #define FORCE | |||
| #define ARCHITECTURE "POWER" | |||
| @@ -36,6 +36,11 @@ ifeq ($(CORE), HASWELL) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), POWER8) | |||
| USE_TRMM = 1 | |||
| endif | |||
| SKERNELOBJS += \ | |||
| @@ -60,32 +60,55 @@ DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| STRMMKERNEL = strmm_kernel_4x4.S | |||
| DTRMMKERNEL = dtrmm_kernel_4x4.S | |||
| CTRMMKERNEL = ctrmm_kernel_4x4.S | |||
| ZTRMMKERNEL = ztrmm_kernel_4x4.S | |||
| SGEMMKERNEL = sgemm_kernel_4x4.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||
| endif | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| DGEMMKERNEL = dgemm_kernel_4x4.S | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| endif | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| CGEMMKERNEL = cgemm_kernel_4x4.S | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy.o | |||
| CGEMMITCOPYOBJ = cgemm_itcopy.o | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| ZGEMMKERNEL = zgemm_kernel_4x4.S | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy.o | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy.o | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| @@ -46,21 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define pA x15 | |||
| #define ppC x16 | |||
| #define ppCRow0 x17 | |||
| #define ppCRow1 x18 | |||
| #define ppCRow2 x19 | |||
| #define ppA x20 | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define ppC x17 | |||
| #define ppCRow0 x18 | |||
| #define ppCRow1 x19 | |||
| #define ppCRow2 x20 | |||
| #define ppCRow3 x21 | |||
| #define ppA x22 | |||
| #define alpha x23 | |||
| #define alpha0 d10 | |||
| #define alphaV0 v10.d[0] | |||
| #define alpha1 d11 | |||
| #define alphaV1 v11.d[0] | |||
| #define alpha2 d14 | |||
| #define alphaV2 v14.d[0] | |||
| #define alpha3 d15 | |||
| #define alphaV3 v15.d[0] | |||
| #define A_PRE_SIZE 1024 | |||
| #define B_PRE_SIZE 1024 | |||
| #define C_PRE_SIZE 128 | |||
| // 00 origM | |||
| // 01 origN | |||
| @@ -77,15 +78,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 pA | |||
| // 16 ppC | |||
| // 17 ppCRow0 | |||
| // 18 must save ppCRow1 | |||
| // 19 must save ppCRow2 | |||
| // 20 must save ppA | |||
| // 21 must save | |||
| // 22 must save | |||
| // 23 must save | |||
| // 15 pCRow3 | |||
| // 16 pA | |||
| // 17 ppC | |||
| // 18 must save ppCRow0 | |||
| // 19 must save ppCRow1 | |||
| // 20 must save ppCRow2 | |||
| // 21 must save ppCRow3 | |||
| // 22 must save ppA | |||
| // 23 must save alpha | |||
| // 24 must save | |||
| // 25 must save | |||
| // 26 must save | |||
| @@ -106,11 +107,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //v08 must save pB00, pB01 | |||
| //v09 must save pB02, pB03 | |||
| //v10 must save ALPHA0 | |||
| //v11 must save ALPHA1 | |||
| //v11 must save | |||
| //v12 must save pB10, pB11 | |||
| //v13 must save pB12, pB13 | |||
| //v14 must save ALPHA2 | |||
| //v15 must save ALPHA3 | |||
| //v14 must save | |||
| //v15 must save | |||
| //v16 must save C00, C01 | |||
| //v17 must save C02, C03 | |||
| //v18 ppC00, ppC01 | |||
| @@ -152,222 +153,254 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_I | |||
| ld1 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| ldp d8, d9, [pB] | |||
| add pB, pB, #16 | |||
| ldp d10, d11, [pB] | |||
| add pB, pB, #16 | |||
| ldp q0, q1, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.2d, v0.2d, v8.2d[0] | |||
| fmul v29.2d, v1.2d, v9.2d[1] | |||
| fmul v29.2d, v1.2d, v11.2d[0] | |||
| ld1 {v2.2d, v3.2d}, [ppA] | |||
| ldp q2, q3, [ppA] | |||
| add ppA, ppA, #32 | |||
| fmul v20.2d, v0.2d, v8.2d[1] | |||
| fmul v25.2d, v1.2d, v9.2d[0] | |||
| fmul v20.2d, v0.2d, v9.2d[0] | |||
| fmul v25.2d, v1.2d, v10.2d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmul v18.2d, v2.2d, v8.2d[0] | |||
| fmul v31.2d, v3.2d, v9.2d[1] | |||
| fmul v22.2d, v2.2d, v8.2d[1] | |||
| fmul v27.2d, v3.2d, v9.2d[0] | |||
| fmul v31.2d, v3.2d, v11.2d[0] | |||
| ld1 {v12.2d, v13.2d}, [pB] // for next round | |||
| add pB, pB, #32 | |||
| prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] | |||
| fmul v22.2d, v2.2d, v9.2d[0] | |||
| fmul v27.2d, v3.2d, v10.2d[0] | |||
| ldp d12, d13, [pB] | |||
| add pB, pB, #16 | |||
| fmul v24.2d, v0.2d, v9.2d[0] | |||
| fmul v21.2d, v1.2d, v8.2d[1] | |||
| fmul v24.2d, v0.2d, v10.2d[0] | |||
| fmul v21.2d, v1.2d, v9.2d[0] | |||
| ld1 {v4.2d, v5.2d} , [pA] // for next round | |||
| ldp q4, q5, [pA] // for next round | |||
| add pA, pA, #32 | |||
| fmul v26.2d, v2.2d, v9.2d[0] | |||
| fmul v23.2d, v3.2d, v8.2d[1] | |||
| fmul v26.2d, v2.2d, v10.2d[0] | |||
| fmul v23.2d, v3.2d, v9.2d[0] | |||
| ld1 {v6.2d, v7.2d} , [ppA] // for next round | |||
| ldp q6, q7, [ppA] // for next round | |||
| add ppA, ppA, #32 | |||
| fmul v28.2d, v0.2d, v9.2d[1] | |||
| fmul v28.2d, v0.2d, v11.2d[0] | |||
| fmul v17.2d, v1.2d, v8.2d[0] | |||
| fmul v30.2d, v2.2d, v9.2d[1] | |||
| ldp d14, d15, [pB] | |||
| add pB, pB, #16 | |||
| fmul v30.2d, v2.2d, v11.2d[0] | |||
| fmul v19.2d, v3.2d, v8.2d[0] | |||
| .endm | |||
| .macro KERNEL8x4_M2 | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v29.2d, v5.2d, v13.2d[1] | |||
| fmla v29.2d, v5.2d, v15.2d[0] | |||
| ld1 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ldp d8, d9, [pB] | |||
| add pB, pB, #16 | |||
| fmla v18.2d, v6.2d, v12.2d[0] | |||
| fmla v31.2d, v7.2d, v13.2d[1] | |||
| fmla v20.2d, v4.2d, v12.2d[1] | |||
| fmla v25.2d, v5.2d, v13.2d[0] | |||
| fmla v31.2d, v7.2d, v15.2d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| ldp d10, d11, [pB] | |||
| add pB, pB, #16 | |||
| fmla v22.2d, v6.2d, v12.2d[1] | |||
| fmla v27.2d, v7.2d, v13.2d[0] | |||
| fmla v24.2d, v4.2d, v13.2d[0] | |||
| fmla v21.2d, v5.2d, v12.2d[1] | |||
| fmla v20.2d, v4.2d, v13.2d[0] | |||
| fmla v25.2d, v5.2d, v14.2d[0] | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v22.2d, v6.2d, v13.2d[0] | |||
| fmla v27.2d, v7.2d, v14.2d[0] | |||
| fmla v24.2d, v4.2d, v14.2d[0] | |||
| fmla v21.2d, v5.2d, v13.2d[0] | |||
| ldp q0, q1, [pA] | |||
| add pA, pA, #32 | |||
| fmla v26.2d, v6.2d, v13.2d[0] | |||
| fmla v23.2d, v7.2d, v12.2d[1] | |||
| fmla v28.2d, v4.2d, v13.2d[1] | |||
| fmla v26.2d, v6.2d, v14.2d[0] | |||
| fmla v23.2d, v7.2d, v13.2d[0] | |||
| fmla v28.2d, v4.2d, v15.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| ld1 {v2.2d, v3.2d}, [ppA] | |||
| ldp q2, q3, [ppA] | |||
| add ppA, ppA, #32 | |||
| fmla v30.2d, v6.2d, v13.2d[1] | |||
| fmla v30.2d, v6.2d, v15.2d[0] | |||
| fmla v19.2d, v7.2d, v12.2d[0] | |||
| .endm | |||
| .macro KERNEL8x4_M1 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v29.2d, v1.2d, v9.2d[1] | |||
| fmla v29.2d, v1.2d, v11.2d[0] | |||
| ld1 {v12.2d, v13.2d}, [pB] // for next round | |||
| add pB, pB, #32 | |||
| ldp d12, d13, [pB] | |||
| add pB, pB, #16 | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v31.2d, v3.2d, v9.2d[1] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v25.2d, v1.2d, v9.2d[0] | |||
| fmla v31.2d, v3.2d, v11.2d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| ldp d14, d15, [pB] | |||
| add pB, pB, #16 | |||
| fmla v22.2d, v2.2d, v8.2d[1] | |||
| fmla v27.2d, v3.2d, v9.2d[0] | |||
| fmla v20.2d, v0.2d, v9.2d[0] | |||
| fmla v25.2d, v1.2d, v10.2d[0] | |||
| prfm PLDL1KEEP, [ppA, #512] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v22.2d, v2.2d, v9.2d[0] | |||
| fmla v27.2d, v3.2d, v10.2d[0] | |||
| prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v21.2d, v1.2d, v9.2d[0] | |||
| ld1 {v4.2d, v5.2d} , [pA] // for next round | |||
| ldp q4, q5, [pA] | |||
| add pA, pA, #32 | |||
| fmla v26.2d, v2.2d, v9.2d[0] | |||
| fmla v23.2d, v3.2d, v8.2d[1] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v26.2d, v2.2d, v10.2d[0] | |||
| fmla v23.2d, v3.2d, v9.2d[0] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| ld1 {v6.2d, v7.2d} , [ppA] // for next round | |||
| ldp q6, q7, [ppA] | |||
| add ppA, ppA, #32 | |||
| fmla v30.2d, v2.2d, v9.2d[1] | |||
| fmla v30.2d, v2.2d, v11.2d[0] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| .endm | |||
| .macro KERNEL8x4_E | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v25.2d, v5.2d, v13.2d[0] | |||
| fmla v25.2d, v5.2d, v14.2d[0] | |||
| fmla v18.2d, v6.2d, v12.2d[0] | |||
| fmla v27.2d, v7.2d, v13.2d[0] | |||
| fmla v27.2d, v7.2d, v14.2d[0] | |||
| fmla v20.2d, v4.2d, v12.2d[1] | |||
| fmla v29.2d, v5.2d, v13.2d[1] | |||
| fmla v22.2d, v6.2d, v12.2d[1] | |||
| fmla v31.2d, v7.2d, v13.2d[1] | |||
| fmla v20.2d, v4.2d, v13.2d[0] | |||
| fmla v29.2d, v5.2d, v15.2d[0] | |||
| fmla v22.2d, v6.2d, v13.2d[0] | |||
| fmla v31.2d, v7.2d, v15.2d[0] | |||
| fmla v24.2d, v4.2d, v13.2d[0] | |||
| fmla v24.2d, v4.2d, v14.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v26.2d, v6.2d, v13.2d[0] | |||
| fmla v26.2d, v6.2d, v14.2d[0] | |||
| fmla v19.2d, v7.2d, v12.2d[0] | |||
| fmla v28.2d, v4.2d, v13.2d[1] | |||
| fmla v21.2d, v5.2d, v12.2d[1] | |||
| fmla v30.2d, v6.2d, v13.2d[1] | |||
| fmla v23.2d, v7.2d, v12.2d[1] | |||
| fmla v28.2d, v4.2d, v15.2d[0] | |||
| fmla v21.2d, v5.2d, v13.2d[0] | |||
| fmla v30.2d, v6.2d, v15.2d[0] | |||
| fmla v23.2d, v7.2d, v13.2d[0] | |||
| .endm | |||
| .macro KERNEL8x4_SUB | |||
| ld1 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| ldp d8, d9, [pB] | |||
| add pB, pB, #16 | |||
| ldp d10, d11, [pB] | |||
| add pB, pB, #16 | |||
| ldp q0, q1, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v29.2d, v1.2d, v9.2d[1] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v25.2d, v1.2d, v9.2d[0] | |||
| fmla v29.2d, v1.2d, v11.2d[0] | |||
| fmla v20.2d, v0.2d, v9.2d[0] | |||
| fmla v25.2d, v1.2d, v10.2d[0] | |||
| ld1 {v2.2d, v3.2d}, [ppA] | |||
| ldp q2, q3, [ppA] | |||
| add ppA, ppA, #32 | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v21.2d, v1.2d, v9.2d[0] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v31.2d, v3.2d, v9.2d[1] | |||
| fmla v22.2d, v2.2d, v8.2d[1] | |||
| fmla v27.2d, v3.2d, v9.2d[0] | |||
| fmla v31.2d, v3.2d, v11.2d[0] | |||
| fmla v22.2d, v2.2d, v9.2d[0] | |||
| fmla v27.2d, v3.2d, v10.2d[0] | |||
| fmla v26.2d, v2.2d, v9.2d[0] | |||
| fmla v23.2d, v3.2d, v8.2d[1] | |||
| fmla v30.2d, v2.2d, v9.2d[1] | |||
| fmla v26.2d, v2.2d, v10.2d[0] | |||
| fmla v23.2d, v3.2d, v9.2d[0] | |||
| fmla v30.2d, v2.2d, v11.2d[0] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| .endm | |||
| .macro SAVE8x4 | |||
| fmov alpha0, alpha | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add ppCRow0, pCRow0, #32 | |||
| ld1 {v0.2d, v1.2d}, [pCRow0] | |||
| ldp q0, q1, [pCRow0] | |||
| fmla v0.2d, v16.2d, alphaV0 | |||
| fmla v1.2d, v17.2d, alphaV1 | |||
| st1 {v0.2d, v1.2d}, [pCRow0] | |||
| fmla v1.2d, v17.2d, alphaV0 | |||
| stp q0, q1, [pCRow0] | |||
| ld1 {v2.2d, v3.2d}, [ppCRow0] | |||
| fmla v2.2d, v18.2d, alphaV2 | |||
| fmla v3.2d, v19.2d, alphaV3 | |||
| st1 {v2.2d, v3.2d}, [ppCRow0] | |||
| add pCRow0, pCRow0, #64 | |||
| add pCRow1, pCRow0, LDC | |||
| add ppCRow1, ppCRow0, LDC | |||
| ldp q2, q3, [ppCRow0] | |||
| fmla v2.2d, v18.2d, alphaV0 | |||
| fmla v3.2d, v19.2d, alphaV0 | |||
| stp q2, q3, [ppCRow0] | |||
| ld1 {v4.2d, v5.2d}, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add ppCRow1, pCRow1, #32 | |||
| ldp q4, q5, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0 | |||
| fmla v5.2d, v21.2d, alphaV1 | |||
| st1 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v5.2d, v21.2d, alphaV0 | |||
| stp q4, q5, [pCRow1] | |||
| ld1 {v6.2d, v7.2d}, [ppCRow1] | |||
| fmla v6.2d, v22.2d, alphaV2 | |||
| fmla v7.2d, v23.2d, alphaV3 | |||
| st1 {v6.2d, v7.2d}, [ppCRow1] | |||
| add pCRow1, pCRow1, #64 | |||
| add pCRow2, pCRow1, LDC | |||
| add ppCRow2, ppCRow1, LDC | |||
| ldp q6, q7, [ppCRow1] | |||
| fmla v6.2d, v22.2d, alphaV0 | |||
| fmla v7.2d, v23.2d, alphaV0 | |||
| stp q6, q7, [ppCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add ppCRow2, pCRow2, #32 | |||
| ld1 {v0.2d, v1.2d}, [pCRow2] | |||
| ldp q0, q1, [pCRow2] | |||
| fmla v0.2d, v24.2d, alphaV0 | |||
| fmla v1.2d, v25.2d, alphaV1 | |||
| st1 {v0.2d, v1.2d}, [pCRow2] | |||
| fmla v1.2d, v25.2d, alphaV0 | |||
| stp q0, q1, [pCRow2] | |||
| ld1 {v2.2d, v3.2d}, [ppCRow2] | |||
| fmla v2.2d, v26.2d, alphaV2 | |||
| fmla v3.2d, v27.2d, alphaV3 | |||
| st1 {v2.2d, v3.2d}, [ppCRow2] | |||
| add pCRow2, pCRow2, #64 | |||
| add pCRow1, pCRow2, LDC | |||
| add ppCRow1, ppCRow2, LDC | |||
| ldp q2, q3, [ppCRow2] | |||
| fmla v2.2d, v26.2d, alphaV0 | |||
| fmla v3.2d, v27.2d, alphaV0 | |||
| stp q2, q3, [ppCRow2] | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| add ppCRow3, pCRow3, #32 | |||
| ld1 {v4.2d, v5.2d}, [pCRow1] | |||
| ldp q4, q5, [pCRow3] | |||
| fmla v4.2d, v28.2d, alphaV0 | |||
| fmla v5.2d, v29.2d, alphaV1 | |||
| st1 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v5.2d, v29.2d, alphaV0 | |||
| stp q4, q5, [pCRow3] | |||
| ld1 {v6.2d, v7.2d}, [ppCRow1] | |||
| fmla v6.2d, v30.2d, alphaV2 | |||
| fmla v7.2d, v31.2d, alphaV3 | |||
| st1 {v6.2d, v7.2d}, [ppCRow1] | |||
| add pCRow3, pCRow3, #64 | |||
| add pCRow0, pCRow0, #64 | |||
| ldp q6, q7, [ppCRow3] | |||
| fmla v6.2d, v30.2d, alphaV0 | |||
| fmla v7.2d, v31.2d, alphaV0 | |||
| stp q6, q7, [ppCRow3] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -403,30 +436,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x4 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d, v9.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| fmla v9.2d, v17.2d, alphaV1 | |||
| fmla v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v12.2d, v13.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV2 | |||
| fmla v13.2d, v21.2d, alphaV3 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| fmla v13.2d, v21.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1 {v8.2d, v9.2d}, [pCRow2] | |||
| fmla v8.2d, v24.2d, alphaV0 | |||
| fmla v9.2d, v25.2d, alphaV1 | |||
| fmla v9.2d, v25.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow2] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1 {v12.2d, v13.2d}, [pCRow1] | |||
| fmla v12.2d, v28.2d, alphaV2 | |||
| fmla v13.2d, v29.2d, alphaV3 | |||
| fmla v12.2d, v28.2d, alphaV0 | |||
| fmla v13.2d, v29.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -454,6 +489,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x4 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| @@ -461,19 +498,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v12.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV1 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1 {v8.2d}, [pCRow2] | |||
| fmla v8.2d, v24.2d, alphaV2 | |||
| fmla v8.2d, v24.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow2] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1 {v12.2d}, [pCRow1] | |||
| fmla v12.2d, v28.2d, alphaV3 | |||
| fmla v12.2d, v28.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -498,6 +535,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x4 | |||
| fmov alpha0, alpha | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v8.d}[0], [pCRow0] | |||
| @@ -511,7 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v12.d}[0], [pCRow2] | |||
| ld1 {v12.d}[1], [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV1 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.d}[0], [pCRow2] | |||
| st1 {v12.d}[1], [pCRow1] | |||
| @@ -540,16 +579,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x2 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d, v9.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| fmla v9.2d, v17.2d, alphaV1 | |||
| fmla v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v12.2d, v13.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV2 | |||
| fmla v13.2d, v21.2d, alphaV3 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| fmla v13.2d, v21.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -574,6 +615,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x2 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| @@ -581,7 +624,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pCRow1 , pCRow0, LDC | |||
| ld1 {v12.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV1 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -604,6 +647,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x2 | |||
| fmov alpha0, alpha | |||
| add pCRow1 , pCRow0, LDC | |||
| ld1 {v8.d}[0], [pCRow0] | |||
| @@ -634,9 +679,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x1 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d, v9.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| fmla v9.2d, v17.2d, alphaV1 | |||
| fmla v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -662,6 +709,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x1 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| @@ -686,6 +735,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x1 | |||
| fmov alpha0, alpha | |||
| ldr d8, [pCRow0] | |||
| fmadd d8, d16, alpha0, d8 | |||
| str d8, [pCRow0] | |||
| @@ -713,10 +764,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| fmov alpha0, d0 | |||
| fmov alpha1, d0 | |||
| fmov alpha2, d0 | |||
| fmov alpha3, d0 | |||
| fmov alpha, d0 | |||
| prfm PLDL1KEEP, [origPA] | |||
| prfm PLDL1KEEP, [origPB] | |||
| lsl LDC, LDC, #3 // ldc = ldc * 8 | |||
| @@ -728,12 +778,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ble dgemm_kernel_L2_BEGIN | |||
| dgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow3, pCRow2, LDC | |||
| add pC, pCRow3, LDC | |||
| lsl temp, origK, #5 // k * 4 * 8 | |||
| mov pA, origPA // pA = start of A array | |||
| add ppA, temp, pA | |||
| prfm PLDL1KEEP, [ppA] | |||
| //------------------------------------------------------------------------------ | |||
| @@ -744,43 +798,51 @@ dgemm_kernel_L4_M8_BEGIN: | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L4_M4_BEGIN | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| asr counterL , origK, #2 // L = K / 4 | |||
| cmp counterL , #2 | |||
| blt dgemm_kernel_L4_M8_32 | |||
| KERNEL8x4_I // do one in the K | |||
| KERNEL8x4_M2 // do another in the K | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble dgemm_kernel_L4_M8_22a | |||
| .align 5 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_22: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M8_22 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dgemm_kernel_L4_M8_44 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_32: | |||
| tst counterL, #1 | |||
| ble dgemm_kernel_L4_M8_40 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dgemm_kernel_L4_M8_44 | |||
| @@ -792,14 +854,22 @@ dgemm_kernel_L4_M8_40: | |||
| dgemm_kernel_L4_M8_44: | |||
| ands counterL , origK, #1 | |||
| ands counterL , origK, #3 | |||
| ble dgemm_kernel_L4_M8_100 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_46: | |||
| KERNEL8x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne dgemm_kernel_L4_M8_46 | |||
| dgemm_kernel_L4_M8_100: | |||
| lsl temp, origK, #5 | |||
| prfm PLDL1KEEP, [pA, temp] | |||
| prfm PLDL1KEEP, [ppA, temp] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVE8x4 | |||
| @@ -810,7 +880,6 @@ dgemm_kernel_L4_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bne dgemm_kernel_L4_M8_20 | |||
| dgemm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| @@ -1,57 +1,3 @@ | |||
| SGEMM_BETA = gemm_beta.S | |||
| DGEMM_BETA = gemm_beta.S | |||
| CGEMM_BETA = zgemm_beta.S | |||
| ZGEMM_BETA = zgemm_beta.S | |||
| ifndef SSYMV_U_KERNEL | |||
| SSYMV_U_KERNEL = symv_U.S | |||
| endif | |||
| ifndef SSYMV_L_KERNEL | |||
| SSYMV_L_KERNEL = symv_L.S | |||
| endif | |||
| ifndef DSYMV_U_KERNEL | |||
| DSYMV_U_KERNEL = symv_U.S | |||
| endif | |||
| ifndef DSYMV_L_KERNEL | |||
| DSYMV_L_KERNEL = symv_L.S | |||
| endif | |||
| ifndef CSYMV_U_KERNEL | |||
| CSYMV_U_KERNEL = zsymv_U.S | |||
| endif | |||
| ifndef CSYMV_L_KERNEL | |||
| CSYMV_L_KERNEL = zsymv_L.S | |||
| endif | |||
| ifndef ZSYMV_U_KERNEL | |||
| ZSYMV_U_KERNEL = zsymv_U.S | |||
| endif | |||
| ifndef ZSYMV_L_KERNEL | |||
| ZSYMV_L_KERNEL = zsymv_L.S | |||
| endif | |||
| ifndef CHEMV_U_KERNEL | |||
| CHEMV_U_KERNEL = zsymv_U.S | |||
| endif | |||
| ifndef CHEMV_L_KERNEL | |||
| CHEMV_L_KERNEL = zsymv_L.S | |||
| endif | |||
| ifndef ZHEMV_U_KERNEL | |||
| ZHEMV_U_KERNEL = zsymv_U.S | |||
| endif | |||
| ifndef ZHEMV_L_KERNEL | |||
| ZHEMV_L_KERNEL = zsymv_L.S | |||
| endif | |||
| ifndef STRSMKERNEL_LN | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| endif | |||
| @@ -84,3 +30,23 @@ ifndef CTRSMKERNEL_RT | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| endif | |||
| ifndef SGEMM_BETA | |||
| SGEMM_BETA = gemm_beta.S | |||
| endif | |||
| ifndef DGEMM_BETA | |||
| DGEMM_BETA = gemm_beta.S | |||
| endif | |||
| ifndef CGEMM_BETA | |||
| CGEMM_BETA = zgemm_beta.S | |||
| endif | |||
| ifndef ZGEMM_BETA | |||
| ZGEMM_BETA = zgemm_beta.S | |||
| endif | |||
| ifndef DSDOTKERNEL | |||
| DSDOTKERNEL = ../generic/dot.c | |||
| endif | |||
| @@ -0,0 +1,175 @@ | |||
| #SGEMM_BETA = ../generic/gemm_beta.c | |||
| #DGEMM_BETA = ../generic/gemm_beta.c | |||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||
| #ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| STRMMKERNEL = gemm_kernel_power6.S | |||
| DTRMMKERNEL = dtrmm_kernel_16x4_power8.S | |||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S | |||
| SGEMMKERNEL = gemm_kernel_power6.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| DGEMMKERNEL = dgemm_kernel_16x4_power8.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| DGEMMONCOPY = gemm_ncopy_4.S | |||
| DGEMMOTCOPY = gemm_tcopy_4.S | |||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| ZGEMMINCOPYOBJ = zgemm_incopy.o | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy.o | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| #Todo: CGEMM3MKERNEL should be 4x4 blocksizes. | |||
| #CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S | |||
| #ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S | |||
| #Pure C for other kernels | |||
| #SAMAXKERNEL = ../arm/amax.c | |||
| #DAMAXKERNEL = ../arm/amax.c | |||
| #CAMAXKERNEL = ../arm/zamax.c | |||
| #ZAMAXKERNEL = ../arm/zamax.c | |||
| # | |||
| #SAMINKERNEL = ../arm/amin.c | |||
| #DAMINKERNEL = ../arm/amin.c | |||
| #CAMINKERNEL = ../arm/zamin.c | |||
| #ZAMINKERNEL = ../arm/zamin.c | |||
| # | |||
| #SMAXKERNEL = ../arm/max.c | |||
| #DMAXKERNEL = ../arm/max.c | |||
| # | |||
| #SMINKERNEL = ../arm/min.c | |||
| #DMINKERNEL = ../arm/min.c | |||
| # | |||
| #ISAMAXKERNEL = ../arm/iamax.c | |||
| #IDAMAXKERNEL = ../arm/iamax.c | |||
| #ICAMAXKERNEL = ../arm/izamax.c | |||
| #IZAMAXKERNEL = ../arm/izamax.c | |||
| # | |||
| #ISAMINKERNEL = ../arm/iamin.c | |||
| #IDAMINKERNEL = ../arm/iamin.c | |||
| #ICAMINKERNEL = ../arm/izamin.c | |||
| #IZAMINKERNEL = ../arm/izamin.c | |||
| # | |||
| #ISMAXKERNEL = ../arm/imax.c | |||
| #IDMAXKERNEL = ../arm/imax.c | |||
| # | |||
| #ISMINKERNEL = ../arm/imin.c | |||
| #IDMINKERNEL = ../arm/imin.c | |||
| # | |||
| #SASUMKERNEL = ../arm/asum.c | |||
| #DASUMKERNEL = ../arm/asum.c | |||
| #CASUMKERNEL = ../arm/zasum.c | |||
| #ZASUMKERNEL = ../arm/zasum.c | |||
| # | |||
| #SAXPYKERNEL = ../arm/axpy.c | |||
| #DAXPYKERNEL = ../arm/axpy.c | |||
| #CAXPYKERNEL = ../arm/zaxpy.c | |||
| #ZAXPYKERNEL = ../arm/zaxpy.c | |||
| # | |||
| #SCOPYKERNEL = ../arm/copy.c | |||
| #DCOPYKERNEL = ../arm/copy.c | |||
| #CCOPYKERNEL = ../arm/zcopy.c | |||
| #ZCOPYKERNEL = ../arm/zcopy.c | |||
| # | |||
| #SDOTKERNEL = ../arm/dot.c | |||
| #DDOTKERNEL = ../arm/dot.c | |||
| #CDOTKERNEL = ../arm/zdot.c | |||
| #ZDOTKERNEL = ../arm/zdot.c | |||
| # | |||
| #SNRM2KERNEL = ../arm/nrm2.c | |||
| #DNRM2KERNEL = ../arm/nrm2.c | |||
| #CNRM2KERNEL = ../arm/znrm2.c | |||
| #ZNRM2KERNEL = ../arm/znrm2.c | |||
| # | |||
| #SROTKERNEL = ../arm/rot.c | |||
| #DROTKERNEL = ../arm/rot.c | |||
| #CROTKERNEL = ../arm/zrot.c | |||
| #ZROTKERNEL = ../arm/zrot.c | |||
| # | |||
| #SSCALKERNEL = ../arm/scal.c | |||
| #DSCALKERNEL = ../arm/scal.c | |||
| #CSCALKERNEL = ../arm/zscal.c | |||
| #ZSCALKERNEL = ../arm/zscal.c | |||
| # | |||
| #SSWAPKERNEL = ../arm/swap.c | |||
| #DSWAPKERNEL = ../arm/swap.c | |||
| #CSWAPKERNEL = ../arm/zswap.c | |||
| #ZSWAPKERNEL = ../arm/zswap.c | |||
| # | |||
| #SGEMVNKERNEL = ../arm/gemv_n.c | |||
| #DGEMVNKERNEL = ../arm/gemv_n.c | |||
| #CGEMVNKERNEL = ../arm/zgemv_n.c | |||
| #ZGEMVNKERNEL = ../arm/zgemv_n.c | |||
| # | |||
| #SGEMVTKERNEL = ../arm/gemv_t.c | |||
| #DGEMVTKERNEL = ../arm/gemv_t.c | |||
| #CGEMVTKERNEL = ../arm/zgemv_t.c | |||
| #ZGEMVTKERNEL = ../arm/zgemv_t.c | |||
| #SSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #SSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #DSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #DSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #QSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #QSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||
| #ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||
| LSAME_KERNEL = ../generic/lsame.c | |||
| SCABS_KERNEL = ../generic/cabs.c | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
| #Dump kernel | |||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| @@ -0,0 +1,64 @@ | |||
| #define vs0 0 | |||
| #define vs1 1 | |||
| #define vs2 2 | |||
| #define vs3 3 | |||
| #define vs4 4 | |||
| #define vs5 5 | |||
| #define vs6 6 | |||
| #define vs7 7 | |||
| #define vs8 8 | |||
| #define vs9 9 | |||
| #define vs10 10 | |||
| #define vs11 11 | |||
| #define vs12 12 | |||
| #define vs13 13 | |||
| #define vs14 14 | |||
| #define vs15 15 | |||
| #define vs16 16 | |||
| #define vs17 17 | |||
| #define vs18 18 | |||
| #define vs19 19 | |||
| #define vs20 20 | |||
| #define vs21 21 | |||
| #define vs22 22 | |||
| #define vs23 23 | |||
| #define vs24 24 | |||
| #define vs25 25 | |||
| #define vs26 26 | |||
| #define vs27 27 | |||
| #define vs28 28 | |||
| #define vs29 29 | |||
| #define vs30 30 | |||
| #define vs31 31 | |||
| #define vs32 32 | |||
| #define vs33 33 | |||
| #define vs34 34 | |||
| #define vs35 35 | |||
| #define vs36 36 | |||
| #define vs37 37 | |||
| #define vs38 38 | |||
| #define vs39 39 | |||
| #define vs40 40 | |||
| #define vs41 41 | |||
| #define vs42 42 | |||
| #define vs43 43 | |||
| #define vs44 44 | |||
| #define vs45 45 | |||
| #define vs46 46 | |||
| #define vs47 47 | |||
| #define vs48 48 | |||
| #define vs49 49 | |||
| #define vs50 50 | |||
| #define vs51 51 | |||
| #define vs52 52 | |||
| #define vs53 53 | |||
| #define vs54 54 | |||
| #define vs55 55 | |||
| #define vs56 56 | |||
| #define vs57 57 | |||
| #define vs58 58 | |||
| #define vs59 59 | |||
| #define vs60 60 | |||
| #define vs61 61 | |||
| #define vs62 62 | |||
| #define vs63 63 | |||
| @@ -0,0 +1,348 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/05 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #ifndef __64BIT__ /* LOAD: lwz when __64BIT__ is not defined, ld when it is */ | |||
| #define LOAD lwz | |||
| #else | |||
| #define LOAD ld | |||
| #endif | |||
| #ifdef __64BIT__ /* stack frame layout: STACKSIZE is the frame size; ALPHA_SP and FZERO are scratch slots above the register save area */ | |||
| #define STACKSIZE 320 | |||
| #define ALPHA_SP 296(SP) /* slot that receives f1 in the prologue */ | |||
| #define FZERO 304(SP) /* slot that receives a 32-bit zero in the prologue */ | |||
| #else | |||
| #define STACKSIZE 240 | |||
| #define ALPHA_SP 224(SP) | |||
| #define FZERO 232(SP) | |||
| #endif | |||
| #define M r3 /* M, N, K live in r3, r4, r5; the kernel branches to its exit when any of them is <= 0 */ | |||
| #define N r4 | |||
| #define K r5 | |||
| #ifdef linux /* linux: the registers named A, B, C, LDC and OFFSET differ between 32- and 64-bit builds */ | |||
| #ifndef __64BIT__ | |||
| #define A r6 | |||
| #define B r7 | |||
| #define C r8 | |||
| #define LDC r9 | |||
| #define OFFSET r10 | |||
| #else | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 /* for TRMM builds the prologue overwrites this from the caller frame */ | |||
| #endif | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) /* AIX/Apple: in the 32-bit DOUBLE case LDC is r7 and is reloaded from the caller frame in the prologue */ | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r7 | |||
| #define OFFSET r6 | |||
| #else | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 | |||
| #endif | |||
| #endif | |||
| #define alpha_r vs18 /* filled by lxvdsx from the ALPHA_SP slot after the prologue */ | |||
| #define o0 0 /* literal zero; the other oNN names below are registers */ | |||
| #define o8 r15 /* o8/o16/o24/o32/o48: registers the kernel loads with the constants 8/16/24/32/48 */ | |||
| #define o24 r16 | |||
| #define ALPHA r17 /* set to the address of the ALPHA_SP slot */ | |||
| #define L r18 /* L, T1, KK, BB, I, J, AO, BO, CO, T2: not written by the prologue/epilogue of this file; presumably working registers of the included macros and logic - confirm */ | |||
| #define T1 r19 | |||
| #define KK r20 | |||
| #define BB r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o16 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 /* loaded with 256; presumably a prefetch distance used by the included logic - confirm */ | |||
| #define T2 r31 | |||
| #include "dgemm_macros_16x4_power8.S" | |||
| #ifndef NEEDPARAM /* Kernel body: allocate a STACKSIZE-byte frame, save f14-f31 and r15-r31, spill f1 to ALPHA_SP, leave early when M, N or K is <= 0, run the included dgemm logic, then restore the saved registers and return with r3 = 0. */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* allocate the frame (plain addi: no back-chain word is written here) */ | |||
| li r0, 0 /* r0 = 0, stored to FZERO below */ | |||
| stfd f14, 0(SP) /* save f14-f31 at 0..136(SP) */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| #ifdef __64BIT__ /* save r15-r31: 8-byte slots at 144..272(SP) */ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| #else /* 32-bit build: 4-byte slots at 144..208(SP) */ | |||
| stw r31, 144(SP) | |||
| stw r30, 148(SP) | |||
| stw r29, 152(SP) | |||
| stw r28, 156(SP) | |||
| stw r27, 160(SP) | |||
| stw r26, 164(SP) | |||
| stw r25, 168(SP) | |||
| stw r24, 172(SP) | |||
| stw r23, 176(SP) | |||
| stw r22, 180(SP) | |||
| stw r21, 184(SP) | |||
| stw r20, 188(SP) | |||
| stw r19, 192(SP) | |||
| stw r18, 196(SP) | |||
| stw r17, 200(SP) | |||
| stw r16, 204(SP) | |||
| stw r15, 208(SP) | |||
| #endif | |||
| stfd f1, ALPHA_SP /* spill f1 so it can be reloaded through ALPHA with lxvdsx below */ | |||
| stw r0, FZERO /* write a 32-bit zero to the FZERO slot */ | |||
| #if defined(_AIX) || defined(__APPLE__) /* AIX/Apple 32-bit DOUBLE: LDC is read from the caller frame, hence the + STACKSIZE */ | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| slwi LDC, LDC, BASE_SHIFT /* LDC <<= BASE_SHIFT (BASE_SHIFT is defined outside this file); NOTE(review): slwi is a 32-bit shift - confirm that is acceptable for LDC on 64-bit builds */ | |||
| #if defined(TRMMKERNEL) /* TRMM builds: on the platforms below OFFSET is read from the caller frame */ | |||
| #if defined(linux) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #ifdef __64BIT__ | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #else | |||
| lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #endif | |||
| cmpwi cr0, M, 0 /* continue only when M, N and K are all > 0 */ | |||
| ble .L999_H1 /* .L999_H1 is not defined in the visible part of this file; presumably supplied by the included logic - confirm */ | |||
| cmpwi cr0, N, 0 | |||
| ble .L999_H1 | |||
| cmpwi cr0, K, 0 | |||
| ble .L999_H1 | |||
| #ifdef __64BIT__ /* ALPHA = address of the ALPHA_SP slot (same offsets as the ALPHA_SP define) */ | |||
| addi ALPHA, SP, 296 | |||
| #else | |||
| addi ALPHA, SP, 224 | |||
| #endif | |||
| li PRE, 256 | |||
| li o8 , 8 /* constant offsets kept in the registers o8..o48 */ | |||
| li o16, 16 | |||
| li o24, 24 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| lxvdsx alpha_r, 0, ALPHA /* load the doubleword at ALPHA and replicate it into both halves of alpha_r */ | |||
| #include "dgemm_logic_16x4_power8.S" | |||
| .L999: /* exit path: set the return value, restore registers, release the frame */ | |||
| addi r3, 0, 0 /* r3 = 0 */ | |||
| lfd f14, 0(SP) /* restore f14-f31 */ | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| #ifdef __64BIT__ /* restore r15-r31 from the slots written in the prologue */ | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| #else | |||
| lwz r31, 144(SP) | |||
| lwz r30, 148(SP) | |||
| lwz r29, 152(SP) | |||
| lwz r28, 156(SP) | |||
| lwz r27, 160(SP) | |||
| lwz r26, 164(SP) | |||
| lwz r25, 168(SP) | |||
| lwz r24, 172(SP) | |||
| lwz r23, 176(SP) | |||
| lwz r22, 180(SP) | |||
| lwz r21, 184(SP) | |||
| lwz r20, 188(SP) | |||
| lwz r19, 192(SP) | |||
| lwz r18, 196(SP) | |||
| lwz r17, 200(SP) | |||
| lwz r16, 204(SP) | |||
| lwz r15, 208(SP) | |||
| #endif | |||
| addi SP, SP, STACKSIZE /* release the frame */ | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -0,0 +1,362 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/05 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #ifndef __64BIT__ /* LOAD: lwz when __64BIT__ is not defined, ld when it is */ | |||
| #define LOAD lwz | |||
| #else | |||
| #define LOAD ld | |||
| #endif | |||
| #ifdef __64BIT__ /* stack frame layout: STACKSIZE is the frame size; ALPHA_SP and FZERO are scratch slots above the register save area */ | |||
| #define STACKSIZE 320 | |||
| #define ALPHA_SP 296(SP) /* slot that receives f1 in the prologue */ | |||
| #define FZERO 304(SP) /* slot that receives a 32-bit zero in the prologue */ | |||
| #else | |||
| #define STACKSIZE 240 | |||
| #define ALPHA_SP 224(SP) | |||
| #define FZERO 232(SP) | |||
| #endif | |||
| #define M r3 /* M, N, K live in r3, r4, r5; the kernel branches to its exit when any of them is <= 0 */ | |||
| #define N r4 | |||
| #define K r5 | |||
| #ifdef linux /* linux: the registers named A, B, C, LDC and OFFSET differ between 32- and 64-bit builds */ | |||
| #ifndef __64BIT__ | |||
| #define A r6 | |||
| #define B r7 | |||
| #define C r8 | |||
| #define LDC r9 | |||
| #define OFFSET r10 | |||
| #else | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 /* for TRMM builds the prologue overwrites this from the caller frame */ | |||
| #endif | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) /* AIX/Apple: in the 32-bit DOUBLE case LDC is r7 and is reloaded from the caller frame in the prologue */ | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r7 | |||
| #define OFFSET r6 | |||
| #else | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 | |||
| #endif | |||
| #endif | |||
| #define alpha_r vs18 /* filled by lxvdsx from the ALPHA_SP slot after the prologue */ | |||
| #define o0 0 /* literal zero; the other oNN names below are registers */ | |||
| #define K1 r13 /* NOTE(review): r13 is presumably reserved by the PowerPC ELF ABIs (thread pointer on 64-bit) - confirm that using it as scratch between the save and restore is safe */ | |||
| #define KKK r14 | |||
| #define o8 r15 /* o8/o16/o24/o32/o48: registers the kernel loads with the constants 8/16/24/32/48 */ | |||
| #define o24 r16 | |||
| #define ALPHA r17 /* set to the address of the ALPHA_SP slot */ | |||
| #define L r18 /* L, T1, BB, I, J, AO, BO, CO, T2 (and K1, KKK): not written by the prologue/epilogue of this file; presumably working registers of the included macros and logic - confirm */ | |||
| #define T1 r19 | |||
| #define KK r20 /* initialised from OFFSET in the prologue, negated for TRMM builds without LEFT */ | |||
| #define BB r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o16 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 /* loaded with 256; presumably a prefetch distance used by the included logic - confirm */ | |||
| #define T2 r31 | |||
| #include "dgemm_macros_16x4_power8.S" | |||
| #ifndef NEEDPARAM /* Kernel body: allocate a STACKSIZE-byte frame, save f14-f31 and r13-r31, spill f1 to ALPHA_SP, derive KK from OFFSET, leave early when M, N or K is <= 0, run the included dtrmm logic, then restore the saved registers and return with r3 = 0. */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* allocate the frame (plain addi: no back-chain word is written here) */ | |||
| li r0, 0 /* r0 = 0, stored to FZERO below */ | |||
| stfd f14, 0(SP) /* save f14-f31 at 0..136(SP) */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| #ifdef __64BIT__ /* save r13-r31: 8-byte slots at 144..288(SP) */ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) /* r14 and r13 are saved too: this kernel names them KKK and K1 */ | |||
| std r13, 288(SP) | |||
| #else /* 32-bit build: 4-byte slots at 144..216(SP) */ | |||
| stw r31, 144(SP) | |||
| stw r30, 148(SP) | |||
| stw r29, 152(SP) | |||
| stw r28, 156(SP) | |||
| stw r27, 160(SP) | |||
| stw r26, 164(SP) | |||
| stw r25, 168(SP) | |||
| stw r24, 172(SP) | |||
| stw r23, 176(SP) | |||
| stw r22, 180(SP) | |||
| stw r21, 184(SP) | |||
| stw r20, 188(SP) | |||
| stw r19, 192(SP) | |||
| stw r18, 196(SP) | |||
| stw r17, 200(SP) | |||
| stw r16, 204(SP) | |||
| stw r15, 208(SP) | |||
| stw r14, 212(SP) | |||
| stw r13, 216(SP) | |||
| #endif | |||
| stfd f1, ALPHA_SP /* spill f1 so it can be reloaded through ALPHA with lxvdsx below */ | |||
| stw r0, FZERO /* write a 32-bit zero to the FZERO slot */ | |||
| #if defined(_AIX) || defined(__APPLE__) /* AIX/Apple 32-bit DOUBLE: LDC is read from the caller frame, hence the + STACKSIZE */ | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| slwi LDC, LDC, BASE_SHIFT /* LDC <<= BASE_SHIFT (BASE_SHIFT is defined outside this file); NOTE(review): slwi is a 32-bit shift - confirm that is acceptable for LDC on 64-bit builds */ | |||
| #if defined(TRMMKERNEL) /* TRMM builds: on the platforms below OFFSET is read from the caller frame */ | |||
| #if defined(linux) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #ifdef __64BIT__ | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #else | |||
| lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #endif | |||
| mr KK, OFFSET /* KK = OFFSET (done regardless of TRMMKERNEL) */ | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) /* TRMM without LEFT: KK = -OFFSET */ | |||
| neg KK, KK | |||
| #endif | |||
| cmpwi cr0, M, 0 /* continue only when M, N and K are all > 0 */ | |||
| ble .L999_H1 /* .L999_H1 is not defined in the visible part of this file; presumably supplied by the included logic - confirm */ | |||
| cmpwi cr0, N, 0 | |||
| ble .L999_H1 | |||
| cmpwi cr0, K, 0 | |||
| ble .L999_H1 | |||
| #ifdef __64BIT__ /* ALPHA = address of the ALPHA_SP slot (same offsets as the ALPHA_SP define) */ | |||
| addi ALPHA, SP, 296 | |||
| #else | |||
| addi ALPHA, SP, 224 | |||
| #endif | |||
| li PRE, 256 | |||
| li o8 , 8 /* constant offsets kept in the registers o8..o48 */ | |||
| li o16, 16 | |||
| li o24, 24 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| lxvdsx alpha_r, 0, ALPHA /* load the doubleword at ALPHA and replicate it into both halves of alpha_r */ | |||
| #include "dtrmm_logic_16x4_power8.S" | |||
| .L999: /* exit path: set the return value, restore registers, release the frame */ | |||
| addi r3, 0, 0 /* r3 = 0 */ | |||
| lfd f14, 0(SP) /* restore f14-f31 */ | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| #ifdef __64BIT__ /* restore r13-r31 from the slots written in the prologue */ | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r13, 288(SP) | |||
| #else | |||
| lwz r31, 144(SP) | |||
| lwz r30, 148(SP) | |||
| lwz r29, 152(SP) | |||
| lwz r28, 156(SP) | |||
| lwz r27, 160(SP) | |||
| lwz r26, 164(SP) | |||
| lwz r25, 168(SP) | |||
| lwz r24, 172(SP) | |||
| lwz r23, 176(SP) | |||
| lwz r22, 180(SP) | |||
| lwz r21, 184(SP) | |||
| lwz r20, 188(SP) | |||
| lwz r19, 192(SP) | |||
| lwz r18, 196(SP) | |||
| lwz r17, 200(SP) | |||
| lwz r16, 204(SP) | |||
| lwz r15, 208(SP) | |||
| lwz r14, 212(SP) | |||
| lwz r13, 216(SP) | |||
| #endif | |||
| addi SP, SP, STACKSIZE /* release the frame */ | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -107,6 +107,11 @@ | |||
| #ifdef PPCG4 | |||
| #define PREFETCHSIZE 16 | |||
| #define PREFETCHWSIZE 72 | |||
| #endif | |||
| #ifdef POWER8 /* POWER8 uses the same PREFETCHSIZE / PREFETCHWSIZE values as the PPCG4 block directly above */ | |||
| #define PREFETCHSIZE 16 | |||
| #define PREFETCHWSIZE 72 | |||
| #endif | |||
| PROLOGUE | |||
| @@ -193,7 +198,7 @@ LL(12): | |||
| STFD c12, 14 * SIZE(B) | |||
| STFD c16, 15 * SIZE(B) | |||
| #ifdef POWER6 | |||
| #if defined(POWER6) || defined(POWER8) | |||
| dcbtst PREA, AO1 | |||
| dcbtst PREA, AO2 | |||
| dcbtst PREA, AO3 | |||
| @@ -111,6 +111,11 @@ | |||
| #ifdef PPCG4 | |||
| #define PREFETCHSIZE 16 | |||
| #define PREFETCHWSIZE 48 | |||
| #endif | |||
| #ifdef POWER8 /* POWER8 uses the same PREFETCHSIZE / PREFETCHWSIZE values as the PPCG4 block directly above */ | |||
| #define PREFETCHSIZE 16 | |||
| #define PREFETCHWSIZE 48 | |||
| #endif | |||
| PROLOGUE | |||
| @@ -224,7 +229,7 @@ LL(12): | |||
| STFD c15, 14 * SIZE(B1) | |||
| STFD c16, 15 * SIZE(B1) | |||
| #ifdef POWER6 | |||
| #if defined(POWER6) || defined(POWER8) | |||
| dcbtst PREA, AO1 | |||
| dcbtst PREA, AO2 | |||
| dcbtst PREA, AO3 | |||
| @@ -174,6 +174,12 @@ | |||
| #define PREFETCHSIZE_C 40 | |||
| #endif | |||
| #ifdef POWER8 /* POWER8 values for PREFETCHSIZE_A and PREFETCHSIZE_C; their units and the code that consumes them lie outside this hunk */ | |||
| #define PREFETCHSIZE_A 96 | |||
| #define PREFETCHSIZE_C 40 | |||
| #endif | |||
| #ifndef NEEDPARAM | |||
| #ifndef __64BIT__ | |||
| @@ -139,6 +139,11 @@ | |||
| #define PREFETCHSIZE_C 8 | |||
| #endif | |||
| #ifdef POWER8 /* POWER8 values for PREFETCHSIZE_A and PREFETCHSIZE_C; their units and the code that consumes them lie outside this hunk */ | |||
| #define PREFETCHSIZE_A 96 | |||
| #define PREFETCHSIZE_C 8 | |||
| #endif | |||
| #define y01 f0 | |||
| #define y02 f1 | |||
| #define y03 f2 | |||
| @@ -0,0 +1,367 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/05 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #ifndef __64BIT__ /* LOAD: lwz when __64BIT__ is not defined, ld when it is */ | |||
| #define LOAD lwz | |||
| #else | |||
| #define LOAD ld | |||
| #endif | |||
| #ifdef __64BIT__ /* stack frame layout: STACKSIZE is the frame size; ALPHA_R_SP, ALPHA_I_SP and FZERO are scratch slots above the register save area */ | |||
| #define STACKSIZE 320 | |||
| #define ALPHA_R_SP 296(SP) /* slot that receives f1 in the prologue */ | |||
| #define ALPHA_I_SP 304(SP) /* slot that receives f2; 8 bytes above ALPHA_R_SP */ | |||
| #define FZERO 312(SP) /* slot that receives a 32-bit zero in the prologue */ | |||
| #else | |||
| #define STACKSIZE 256 | |||
| #define ALPHA_R_SP 224(SP) | |||
| #define ALPHA_I_SP 232(SP) | |||
| #define FZERO 240(SP) | |||
| #endif | |||
| #define M r3 /* M, N, K live in r3, r4, r5; the kernel branches to .L999 when any of them is <= 0 */ | |||
| #define N r4 | |||
| #define K r5 | |||
| #ifdef linux /* linux: the registers named A, B, C, LDC and OFFSET differ between 32- and 64-bit builds */ | |||
| #ifndef __64BIT__ | |||
| #define A r6 | |||
| #define B r7 | |||
| #define C r8 | |||
| #define LDC r9 | |||
| #define OFFSET r10 | |||
| #else | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 /* loaded from the caller frame in the prologue */ | |||
| #define OFFSET r7 /* loaded from the caller frame for TRMM builds */ | |||
| #endif | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) /* AIX/Apple: in the 32-bit DOUBLE case B, C and LDC are all loaded from the caller frame in the prologue */ | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| #define A r10 | |||
| #define B r6 | |||
| #define C r7 | |||
| #define LDC r8 | |||
| #define OFFSET r9 | |||
| #else | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #endif | |||
| #endif | |||
| #define o0 0 /* literal zero; the other oNN names below are registers */ | |||
| #define alpha_r vs30 /* filled by lxvdsx from the ALPHA_R_SP slot */ | |||
| #define alpha_i vs31 /* filled by lxvdsx from the ALPHA_I_SP slot (ALPHA + o8) */ | |||
| #define L r15 /* inner loop counter in the included logic (derived from K) */ | |||
| #define ALPHA r16 /* set to the address of the ALPHA_R_SP slot */ | |||
| #define o24 r17 /* o8/o16/o24/o32/o48: registers the kernel loads with the constants 8/16/24/32/48 */ | |||
| #define T2 r19 | |||
| #define KK r20 /* set to -OFFSET for TRMM builds without LEFT */ | |||
| #define o8 r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o16 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 /* loaded with 256 and used as the index operand of dcbt AO, PRE in the logic file */ | |||
| #define T1 r31 | |||
| #ifndef NEEDPARAM /* Kernel body: allocate a STACKSIZE-byte frame, save f14-f31 and r15-r31, spill f1/f2 to the ALPHA_R_SP/ALPHA_I_SP slots, fetch stack-passed arguments, branch to .L999 when M, N or K is <= 0, run the included zgemm logic, then restore the saved registers and return with r3 = 0. */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* allocate the frame (plain addi: no back-chain word is written here) */ | |||
| li r0, 0 /* r0 = 0, stored to FZERO below */ | |||
| stfd f14, 0(SP) /* save f14-f31 at 0..136(SP) */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| #ifdef __64BIT__ /* save r15-r31: 8-byte slots at 144..272(SP) */ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) /* r18 has no symbolic name among this file's register defines; presumably saved for uniformity or used directly by included code - confirm */ | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| #else /* 32-bit build: 4-byte slots at 144..208(SP) */ | |||
| stw r31, 144(SP) | |||
| stw r30, 148(SP) | |||
| stw r29, 152(SP) | |||
| stw r28, 156(SP) | |||
| stw r27, 160(SP) | |||
| stw r26, 164(SP) | |||
| stw r25, 168(SP) | |||
| stw r24, 172(SP) | |||
| stw r23, 176(SP) | |||
| stw r22, 180(SP) | |||
| stw r21, 184(SP) | |||
| stw r20, 188(SP) | |||
| stw r19, 192(SP) | |||
| stw r18, 196(SP) | |||
| stw r17, 200(SP) | |||
| stw r16, 204(SP) | |||
| stw r15, 208(SP) | |||
| #endif | |||
| stfd f1, ALPHA_R_SP /* spill f1 and f2 so they can be reloaded through ALPHA with lxvdsx below */ | |||
| stfd f2, ALPHA_I_SP | |||
| stw r0, FZERO /* write a 32-bit zero to the FZERO slot */ | |||
| #ifdef linux /* 64-bit linux: LDC is read from slot 0 of the caller frame (hence the + STACKSIZE) */ | |||
| #ifdef __64BIT__ | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) /* AIX/Apple: LDC (and, for 32-bit DOUBLE, also B and C) are read from the caller frame */ | |||
| #ifdef __64BIT__ | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz B, FRAMESLOT(0) + STACKSIZE(SP) | |||
| lwz C, FRAMESLOT(1) + STACKSIZE(SP) | |||
| lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) | |||
| #else | |||
| lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #ifdef TRMMKERNEL /* TRMM builds: OFFSET is read from the caller frame slot that follows LDC */ | |||
| #if defined(linux) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #ifdef __64BIT__ | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) | |||
| #else | |||
| lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) /* TRMM without LEFT: KK = -OFFSET */ | |||
| neg KK, OFFSET | |||
| #endif | |||
| #endif | |||
| #include "zgemm_macros_8x2_power8.S" | |||
| cmpwi cr0, M, 0 /* continue only when M, N and K are all > 0 */ | |||
| ble .L999 | |||
| cmpwi cr0, N, 0 | |||
| ble .L999 | |||
| cmpwi cr0, K, 0 | |||
| ble .L999 | |||
| slwi LDC, LDC, ZBASE_SHIFT /* LDC <<= ZBASE_SHIFT (ZBASE_SHIFT is defined outside this file); NOTE(review): slwi is a 32-bit shift - confirm that is acceptable for LDC on 64-bit builds */ | |||
| li PRE, 256 | |||
| li o8 , 8 /* constant offsets kept in the registers o8..o48 */ | |||
| li o16 , 16 | |||
| li o24 , 24 | |||
| li o32 , 32 | |||
| li o48 , 48 | |||
| #ifdef __64BIT__ /* ALPHA = address of the ALPHA_R_SP slot; ALPHA_I_SP sits 8 bytes above it */ | |||
| addi ALPHA, SP, 296 | |||
| #else | |||
| addi ALPHA, SP, 224 | |||
| #endif | |||
| lxvdsx alpha_r, 0, ALPHA /* load the doubleword at ALPHA and replicate it into both halves of alpha_r */ | |||
| lxvdsx alpha_i, o8, ALPHA /* same for the doubleword at ALPHA + 8 into alpha_i */ | |||
| .align 5 | |||
| #include "zgemm_logic_8x2_power8.S" | |||
| .L999: /* exit path: set the return value, restore registers, release the frame */ | |||
| addi r3, 0, 0 /* r3 = 0 */ | |||
| lfd f14, 0(SP) /* restore f14-f31 */ | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| #ifdef __64BIT__ /* restore r15-r31 from the slots written in the prologue */ | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| #else | |||
| lwz r31, 144(SP) | |||
| lwz r30, 148(SP) | |||
| lwz r29, 152(SP) | |||
| lwz r28, 156(SP) | |||
| lwz r27, 160(SP) | |||
| lwz r26, 164(SP) | |||
| lwz r25, 168(SP) | |||
| lwz r24, 172(SP) | |||
| lwz r23, 176(SP) | |||
| lwz r22, 180(SP) | |||
| lwz r21, 184(SP) | |||
| lwz r20, 188(SP) | |||
| lwz r19, 192(SP) | |||
| lwz r18, 196(SP) | |||
| lwz r17, 200(SP) | |||
| lwz r16, 204(SP) | |||
| lwz r15, 208(SP) | |||
| #endif | |||
| addi SP, SP, STACKSIZE /* release the frame */ | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -0,0 +1,901 @@ | |||
| srawi. J, N, 1 /* Head of the loop over pairs of C columns: J = N >> 1, CR0 records the result */ | |||
| ble .LZGEMM_L2_END /* J <= 0: no column pair to process, continue with the single-column path */ | |||
| .LZGEMM_L2_BEGIN: /* one pass per column pair; J is decremented at the bottom of the pass */ | |||
| mr CO, C /* CO = C: output cursor for this pass */ | |||
| mr AO, A /* AO = A: A is read from its start again on every pass */ | |||
| slwi T1, LDC , 1 /* T1 = LDC * 2 */ | |||
| add C, C, T1 /* advance C by 2 * LDC for the next pass */ | |||
| srawi. I, M, 3 /* I = M >> 3: number of 8-row tiles */ | |||
| ble .LZGEMM_L2x8_END /* I <= 0: skip the 8-row tile loop */ | |||
| .LZGEMM_L2x8_BEGIN: /* 2x8 tile loop: one tile per pass, I counts the tiles that remain */ | |||
| mr BO, B /* BO = B: B is read from the start of the current panel for every tile */ | |||
| srawi. L, K, 3 /* L = K >> 3: number of groups of eight kernel-macro invocations */ | |||
| ble .LZGEMM_L2x8_SUB0 /* L <= 0: only the K & 7 tail is executed */ | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L2x8_SUB4 /* L == 1: a single group, handled by the SUB4 sequence */ | |||
| .LZGEMM_L2x8_LOOP_START: /* first group: LOAD and _I1 open the sequence, then _2/_1 alternate */ | |||
| dcbt AO, PRE /* cache-touch hint for address AO + PRE */ | |||
| LOAD2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_I1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| addic. L, L, -2 /* first and last group are unrolled outside the loop; L = groups left for the loop */ | |||
| ble .LZGEMM_L2x8_LOOP_END /* L was 2: go directly to the closing group */ | |||
| .align 5 | |||
| .LZGEMM_L2x8_LOOP: /* middle groups: eight kernel-macro invocations per pass */ | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x8_LOOP | |||
| .LZGEMM_L2x8_LOOP_END: /* closing group: same alternation, but the last macro is _E2 instead of _2 */ | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| KERNEL2x8_E2 | |||
| b .LZGEMM_L2x8_SUB1 /* continue with the K & 7 tail */ | |||
| .LZGEMM_L2x8_SUB4: /* single group (L == 1): _SUBI1 followed by seven _SUB1 */ | |||
| dcbt AO, PRE | |||
| KERNEL2x8_SUBI1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_SUB1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_SUB1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_SUB1 | |||
| KERNEL2x8_SUB1 | |||
| KERNEL2x8_SUB1 | |||
| KERNEL2x8_SUB1 | |||
| KERNEL2x8_SUB1 | |||
| b .LZGEMM_L2x8_SUB1 | |||
| .LZGEMM_L2x8_SUB0: /* reached when K >> 3 was not positive: no full group has run */ | |||
| andi. L, K, 7 /* L = K & 7 */ | |||
| KERNEL2x8_SUBI1 /* first step uses the _SUBI1 variant (presumably the accumulator-initialising form - confirm in the macros file) */ | |||
| addic. L, L, -1 /* one step done */ | |||
| ble .LZGEMM_L2x8_SAVE | |||
| b .LZGEMM_L2x8_SUB2 /* remaining steps use the shared _SUB1 loop */ | |||
| .LZGEMM_L2x8_SUB1: /* tail after at least one full group */ | |||
| andi. L, K, 7 /* L = K & 7 leftover steps */ | |||
| ble .LZGEMM_L2x8_SAVE /* andi. never yields a negative value, so this branches only when L == 0 */ | |||
| .LZGEMM_L2x8_SUB2: /* one _SUB1 invocation per leftover step */ | |||
| KERNEL2x8_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x8_SUB2 | |||
| .LZGEMM_L2x8_SAVE: | |||
| SAVE2x8 /* macro defined outside this chunk; presumably stores the tile through CO - confirm */ | |||
| addic. I, I, -1 /* next 8-row tile */ | |||
| bgt .LZGEMM_L2x8_BEGIN | |||
| .LZGEMM_L2x8_END: | |||
| .LZGEMM_L2x4_BEGIN: /* 2x4 remainder tile: executed at most once per column pair */ | |||
| andi. T2, M, 7 /* T2 = M & 7: rows left over after the 8-row tiles */ | |||
| ble .LZGEMM_L2x1_END /* nothing left over: skip the 4-, 2- and 1-row tiles altogether */ | |||
| andi. T1, M, 4 /* test bit 2 of M */ | |||
| ble .LZGEMM_L2x4_END /* bit clear: no 4-row tile */ | |||
| mr BO, B /* BO = B: restart at the beginning of the current B panel */ | |||
| srawi. L, K, 3 /* L = K >> 3: number of groups of eight kernel-macro invocations */ | |||
| ble .LZGEMM_L2x4_SUB0 /* L <= 0: only the K & 7 tail is executed */ | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L2x4_SUB4 /* L == 1: a single group, handled by the SUB4 sequence */ | |||
| .LZGEMM_L2x4_LOOP_START: /* first group: LOAD and _I1 open the sequence */ | |||
| LOAD2x4_1 | |||
| KERNEL2x4_I1 | |||
| KERNEL2x4_2 | |||
| KERNEL2x4_1 | |||
| KERNEL2x4_2 | |||
| KERNEL2x4_1 | |||
| KERNEL2x4_2 | |||
| KERNEL2x4_1 | |||
| KERNEL2x4_2 | |||
| addic. L, L, -2 /* first and last group are unrolled outside the loop */ | |||
| ble .LZGEMM_L2x4_LOOP_END | |||
| .align 5 | |||
| .LZGEMM_L2x4_LOOP: /* middle groups: eight kernel-macro invocations per pass */ | |||
| KERNEL2x4_1 | |||
| KERNEL2x4_2 | |||
| KERNEL2x4_1 | |||
| KERNEL2x4_2 | |||
| KERNEL2x4_1 | |||
| KERNEL2x4_2 | |||
| KERNEL2x4_1 | |||
| KERNEL2x4_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x4_LOOP | |||
| .LZGEMM_L2x4_LOOP_END: /* closing group: the last macro is _E2 instead of _2 */ | |||
| KERNEL2x4_1 | |||
| KERNEL2x4_2 | |||
| KERNEL2x4_1 | |||
| KERNEL2x4_2 | |||
| KERNEL2x4_1 | |||
| KERNEL2x4_2 | |||
| KERNEL2x4_1 | |||
| KERNEL2x4_E2 | |||
| b .LZGEMM_L2x4_SUB1 | |||
| .LZGEMM_L2x4_SUB4: /* single group (L == 1): _SUBI1 followed by seven _SUB1 */ | |||
| KERNEL2x4_SUBI1 | |||
| KERNEL2x4_SUB1 | |||
| KERNEL2x4_SUB1 | |||
| KERNEL2x4_SUB1 | |||
| KERNEL2x4_SUB1 | |||
| KERNEL2x4_SUB1 | |||
| KERNEL2x4_SUB1 | |||
| KERNEL2x4_SUB1 | |||
| b .LZGEMM_L2x4_SUB1 | |||
| .LZGEMM_L2x4_SUB0: /* reached when K >> 3 was not positive: no full group has run */ | |||
| andi. L, K, 7 /* L = K & 7 */ | |||
| KERNEL2x4_SUBI1 /* first step uses the _SUBI1 variant */ | |||
| addic. L, L, -1 | |||
| ble .LZGEMM_L2x4_SAVE | |||
| b .LZGEMM_L2x4_SUB2 | |||
| .LZGEMM_L2x4_SUB1: /* tail after at least one full group */ | |||
| andi. L, K, 7 /* L = K & 7 leftover steps */ | |||
| ble .LZGEMM_L2x4_SAVE /* branches only when L == 0 (andi. result is never negative) */ | |||
| .LZGEMM_L2x4_SUB2: /* one _SUB1 invocation per leftover step */ | |||
| KERNEL2x4_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x4_SUB2 | |||
| .LZGEMM_L2x4_SAVE: | |||
| SAVE2x4 /* macro defined outside this chunk */ | |||
| .LZGEMM_L2x4_END: | |||
| .LZGEMM_L2x2_BEGIN: /* 2x2 remainder tile: executed at most once per column pair */ | |||
| andi. T1, M, 2 /* test bit 1 of M */ | |||
| ble .LZGEMM_L2x2_END /* bit clear: no 2-row tile */ | |||
| mr BO, B /* BO = B: restart at the beginning of the current B panel */ | |||
| srawi. L, K, 3 /* L = K >> 3: number of groups of eight kernel-macro invocations */ | |||
| ble .LZGEMM_L2x2_SUB0 /* L <= 0: only the K & 7 tail is executed */ | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L2x2_SUB4 /* L == 1: a single group, handled by the SUB4 sequence */ | |||
| .LZGEMM_L2x2_LOOP_START: /* first group: LOAD and _I1 open the sequence */ | |||
| LOAD2x2_1 | |||
| KERNEL2x2_I1 | |||
| KERNEL2x2_2 | |||
| KERNEL2x2_1 | |||
| KERNEL2x2_2 | |||
| KERNEL2x2_1 | |||
| KERNEL2x2_2 | |||
| KERNEL2x2_1 | |||
| KERNEL2x2_2 | |||
| addic. L, L, -2 /* first and last group are unrolled outside the loop */ | |||
| ble .LZGEMM_L2x2_LOOP_END | |||
| .align 5 | |||
| .LZGEMM_L2x2_LOOP: /* middle groups: eight kernel-macro invocations per pass */ | |||
| KERNEL2x2_1 | |||
| KERNEL2x2_2 | |||
| KERNEL2x2_1 | |||
| KERNEL2x2_2 | |||
| KERNEL2x2_1 | |||
| KERNEL2x2_2 | |||
| KERNEL2x2_1 | |||
| KERNEL2x2_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x2_LOOP | |||
| .LZGEMM_L2x2_LOOP_END: /* closing group: the last macro is _E2 instead of _2 */ | |||
| KERNEL2x2_1 | |||
| KERNEL2x2_2 | |||
| KERNEL2x2_1 | |||
| KERNEL2x2_2 | |||
| KERNEL2x2_1 | |||
| KERNEL2x2_2 | |||
| KERNEL2x2_1 | |||
| KERNEL2x2_E2 | |||
| b .LZGEMM_L2x2_SUB1 | |||
| .LZGEMM_L2x2_SUB4: /* single group (L == 1): _SUBI1 followed by seven _SUB1 */ | |||
| KERNEL2x2_SUBI1 | |||
| KERNEL2x2_SUB1 | |||
| KERNEL2x2_SUB1 | |||
| KERNEL2x2_SUB1 | |||
| KERNEL2x2_SUB1 | |||
| KERNEL2x2_SUB1 | |||
| KERNEL2x2_SUB1 | |||
| KERNEL2x2_SUB1 | |||
| b .LZGEMM_L2x2_SUB1 | |||
| .LZGEMM_L2x2_SUB0: /* reached when K >> 3 was not positive: no full group has run */ | |||
| andi. L, K, 7 /* L = K & 7 */ | |||
| KERNEL2x2_SUBI1 /* first step uses the _SUBI1 variant */ | |||
| addic. L, L, -1 | |||
| ble .LZGEMM_L2x2_SAVE | |||
| b .LZGEMM_L2x2_SUB2 | |||
| .LZGEMM_L2x2_SUB1: /* tail after at least one full group */ | |||
| andi. L, K, 7 /* L = K & 7 leftover steps */ | |||
| ble .LZGEMM_L2x2_SAVE /* branches only when L == 0 (andi. result is never negative) */ | |||
| .LZGEMM_L2x2_SUB2: /* one _SUB1 invocation per leftover step */ | |||
| KERNEL2x2_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x2_SUB2 | |||
| .LZGEMM_L2x2_SAVE: | |||
| SAVE2x2 /* macro defined outside this chunk */ | |||
| .LZGEMM_L2x2_END: | |||
| .LZGEMM_L2x1_BEGIN: /* 2x1 remainder tile: executed at most once per column pair */ | |||
| andi. T1, M, 1 /* test bit 0 of M */ | |||
| ble .LZGEMM_L2x1_END /* bit clear: no 1-row tile */ | |||
| mr BO, B /* BO = B: restart at the beginning of the current B panel */ | |||
| srawi. L, K, 3 /* L = K >> 3: number of groups of eight kernel-macro invocations */ | |||
| ble .LZGEMM_L2x1_SUB0 /* L <= 0: only the K & 7 tail is executed */ | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L2x1_SUB4 /* L == 1: a single group, handled by the SUB4 sequence */ | |||
| .LZGEMM_L2x1_LOOP_START: /* first group: LOAD and _I1 open the sequence */ | |||
| LOAD2x1_1 | |||
| KERNEL2x1_I1 | |||
| KERNEL2x1_2 | |||
| KERNEL2x1_1 | |||
| KERNEL2x1_2 | |||
| KERNEL2x1_1 | |||
| KERNEL2x1_2 | |||
| KERNEL2x1_1 | |||
| KERNEL2x1_2 | |||
| addic. L, L, -2 /* first and last group are unrolled outside the loop */ | |||
| ble .LZGEMM_L2x1_LOOP_END | |||
| .align 5 | |||
| .LZGEMM_L2x1_LOOP: /* middle groups: eight kernel-macro invocations per pass */ | |||
| KERNEL2x1_1 | |||
| KERNEL2x1_2 | |||
| KERNEL2x1_1 | |||
| KERNEL2x1_2 | |||
| KERNEL2x1_1 | |||
| KERNEL2x1_2 | |||
| KERNEL2x1_1 | |||
| KERNEL2x1_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x1_LOOP | |||
| .LZGEMM_L2x1_LOOP_END: /* closing group: the last macro is _E2 instead of _2 */ | |||
| KERNEL2x1_1 | |||
| KERNEL2x1_2 | |||
| KERNEL2x1_1 | |||
| KERNEL2x1_2 | |||
| KERNEL2x1_1 | |||
| KERNEL2x1_2 | |||
| KERNEL2x1_1 | |||
| KERNEL2x1_E2 | |||
| b .LZGEMM_L2x1_SUB1 | |||
| .LZGEMM_L2x1_SUB4: /* single group (L == 1): _SUBI1 followed by seven _SUB1 */ | |||
| KERNEL2x1_SUBI1 | |||
| KERNEL2x1_SUB1 | |||
| KERNEL2x1_SUB1 | |||
| KERNEL2x1_SUB1 | |||
| KERNEL2x1_SUB1 | |||
| KERNEL2x1_SUB1 | |||
| KERNEL2x1_SUB1 | |||
| KERNEL2x1_SUB1 | |||
| b .LZGEMM_L2x1_SUB1 | |||
| .LZGEMM_L2x1_SUB0: /* reached when K >> 3 was not positive: no full group has run */ | |||
| andi. L, K, 7 /* L = K & 7 */ | |||
| KERNEL2x1_SUBI1 /* first step uses the _SUBI1 variant */ | |||
| addic. L, L, -1 | |||
| ble .LZGEMM_L2x1_SAVE | |||
| b .LZGEMM_L2x1_SUB2 | |||
| .LZGEMM_L2x1_SUB1: /* tail after at least one full group */ | |||
| andi. L, K, 7 /* L = K & 7 leftover steps */ | |||
| ble .LZGEMM_L2x1_SAVE /* branches only when L == 0 (andi. result is never negative) */ | |||
| .LZGEMM_L2x1_SUB2: /* one _SUB1 invocation per leftover step */ | |||
| KERNEL2x1_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x1_SUB2 | |||
| .LZGEMM_L2x1_SAVE: | |||
| SAVE2x1 /* macro defined outside this chunk */ | |||
| .LZGEMM_L2x1_END: | |||
| slwi T1, K, 5 /* Bottom of the column-pair loop: T1 = K * 32 */ | |||
| add B, B, T1 /* advance B by K * 32 for the next column pair */ | |||
| addic. J, J, -1 /* one column pair done */ | |||
| bgt .LZGEMM_L2_BEGIN | |||
| andi. T2, N, 1 /* all pairs done: is there an odd trailing column? */ | |||
| ble .L999 /* N even: leave through .L999, which is not defined in this chunk of the logic */ | |||
| .LZGEMM_L2_END: | |||
| b .LZGEMM_L1_BEGIN /* jump over the .L999_H1 stub below */ | |||
| .L999_H1: /* branch stub forwarding to .L999; no reference to it is visible in this chunk */ | |||
| b .L999 | |||
| .LZGEMM_L1_BEGIN: /* single-column path: runs once, only when N is odd */ | |||
| andi. T1, N, 1 /* test bit 0 of N */ | |||
| ble .LZGEMM_L1_END /* N even: nothing to do */ | |||
| mr CO, C /* CO = C: output cursor */ | |||
| mr AO, A /* AO = A: read A from its start */ | |||
| srawi. I, M, 3 /* I = M >> 3: number of 8-row tiles */ | |||
| ble .LZGEMM_L1x8_END /* I <= 0: skip the 8-row tile loop */ | |||
| .LZGEMM_L1x8_BEGIN: /* 1x8 tile loop: one tile per pass, I counts the tiles that remain */ | |||
| mr BO, B /* BO = B: B is read from the start of the current panel for every tile */ | |||
| srawi. L, K, 3 /* L = K >> 3: number of groups of eight kernel-macro invocations */ | |||
| ble .LZGEMM_L1x8_SUB0 /* L <= 0: only the K & 7 tail is executed */ | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L1x8_SUB4 /* L == 1: a single group, handled by the SUB4 sequence */ | |||
| .LZGEMM_L1x8_LOOP_START: /* first group: LOAD and _I1 open the sequence, then _2/_1 alternate */ | |||
| dcbt AO, PRE /* cache-touch hint for address AO + PRE */ | |||
| LOAD1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_I1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| addic. L, L, -2 /* first and last group are unrolled outside the loop; L = groups left for the loop */ | |||
| ble .LZGEMM_L1x8_LOOP_END /* L was 2: go directly to the closing group */ | |||
| .align 5 | |||
| .LZGEMM_L1x8_LOOP: /* middle groups: eight kernel-macro invocations per pass */ | |||
| dcbt AO, PRE | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x8_LOOP | |||
| .LZGEMM_L1x8_LOOP_END: /* closing group: same alternation, but the last macro is _E2 instead of _2 */ | |||
| dcbt AO, PRE | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_1 | |||
| KERNEL1x8_E2 | |||
| b .LZGEMM_L1x8_SUB1 /* continue with the K & 7 tail */ | |||
| .LZGEMM_L1x8_SUB4: /* single group (L == 1): _SUBI1 followed by seven _SUB1 */ | |||
| dcbt AO, PRE | |||
| KERNEL1x8_SUBI1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_SUB1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_SUB1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_SUB1 | |||
| KERNEL1x8_SUB1 | |||
| KERNEL1x8_SUB1 | |||
| KERNEL1x8_SUB1 | |||
| KERNEL1x8_SUB1 | |||
| b .LZGEMM_L1x8_SUB1 | |||
| .LZGEMM_L1x8_SUB0: /* reached when K >> 3 was not positive: no full group has run */ | |||
| andi. L, K, 7 /* L = K & 7 */ | |||
| KERNEL1x8_SUBI1 /* first step uses the _SUBI1 variant (presumably the accumulator-initialising form - confirm in the macros file) */ | |||
| addic. L, L, -1 /* one step done */ | |||
| ble .LZGEMM_L1x8_SAVE | |||
| b .LZGEMM_L1x8_SUB2 /* remaining steps use the shared _SUB1 loop */ | |||
| .LZGEMM_L1x8_SUB1: /* tail after at least one full group */ | |||
| andi. L, K, 7 /* L = K & 7 leftover steps */ | |||
| ble .LZGEMM_L1x8_SAVE /* andi. never yields a negative value, so this branches only when L == 0 */ | |||
| .LZGEMM_L1x8_SUB2: /* one _SUB1 invocation per leftover step */ | |||
| KERNEL1x8_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x8_SUB2 | |||
| .LZGEMM_L1x8_SAVE: | |||
| SAVE1x8 /* macro defined outside this chunk; presumably stores the tile through CO - confirm */ | |||
| addic. I, I, -1 /* next 8-row tile */ | |||
| bgt .LZGEMM_L1x8_BEGIN | |||
| .LZGEMM_L1x8_END: | |||
| .LZGEMM_L1x4_BEGIN: /* 1x4 remainder tile: executed at most once */ | |||
| andi. T2, M, 7 /* T2 = M & 7: rows left over after the 8-row tiles */ | |||
| ble .LZGEMM_L1x1_END /* nothing left over: skip the 4-, 2- and 1-row tiles altogether */ | |||
| andi. T1, M, 4 /* test bit 2 of M */ | |||
| ble .LZGEMM_L1x4_END /* bit clear: no 4-row tile */ | |||
| mr BO, B /* BO = B: restart at the beginning of the current B panel */ | |||
| srawi. L, K, 3 /* L = K >> 3: number of groups of eight kernel-macro invocations */ | |||
| ble .LZGEMM_L1x4_SUB0 /* L <= 0: only the K & 7 tail is executed */ | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L1x4_SUB4 /* L == 1: a single group, handled by the SUB4 sequence */ | |||
| .LZGEMM_L1x4_LOOP_START: /* first group: LOAD and _I1 open the sequence */ | |||
| LOAD1x4_1 | |||
| KERNEL1x4_I1 | |||
| KERNEL1x4_2 | |||
| KERNEL1x4_1 | |||
| KERNEL1x4_2 | |||
| KERNEL1x4_1 | |||
| KERNEL1x4_2 | |||
| KERNEL1x4_1 | |||
| KERNEL1x4_2 | |||
| addic. L, L, -2 /* first and last group are unrolled outside the loop */ | |||
| ble .LZGEMM_L1x4_LOOP_END | |||
| .align 5 | |||
| .LZGEMM_L1x4_LOOP: /* middle groups: eight kernel-macro invocations per pass */ | |||
| KERNEL1x4_1 | |||
| KERNEL1x4_2 | |||
| KERNEL1x4_1 | |||
| KERNEL1x4_2 | |||
| KERNEL1x4_1 | |||
| KERNEL1x4_2 | |||
| KERNEL1x4_1 | |||
| KERNEL1x4_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x4_LOOP | |||
| .LZGEMM_L1x4_LOOP_END: /* closing group: the last macro is _E2 instead of _2 */ | |||
| KERNEL1x4_1 | |||
| KERNEL1x4_2 | |||
| KERNEL1x4_1 | |||
| KERNEL1x4_2 | |||
| KERNEL1x4_1 | |||
| KERNEL1x4_2 | |||
| KERNEL1x4_1 | |||
| KERNEL1x4_E2 | |||
| b .LZGEMM_L1x4_SUB1 | |||
| .LZGEMM_L1x4_SUB4: /* single group (L == 1): _SUBI1 followed by seven _SUB1 */ | |||
| KERNEL1x4_SUBI1 | |||
| KERNEL1x4_SUB1 | |||
| KERNEL1x4_SUB1 | |||
| KERNEL1x4_SUB1 | |||
| KERNEL1x4_SUB1 | |||
| KERNEL1x4_SUB1 | |||
| KERNEL1x4_SUB1 | |||
| KERNEL1x4_SUB1 | |||
| b .LZGEMM_L1x4_SUB1 | |||
| .LZGEMM_L1x4_SUB0: /* reached when K >> 3 was not positive: no full group has run */ | |||
| andi. L, K, 7 /* L = K & 7 */ | |||
| KERNEL1x4_SUBI1 /* first step uses the _SUBI1 variant */ | |||
| addic. L, L, -1 | |||
| ble .LZGEMM_L1x4_SAVE | |||
| b .LZGEMM_L1x4_SUB2 | |||
| .LZGEMM_L1x4_SUB1: /* tail after at least one full group */ | |||
| andi. L, K, 7 /* L = K & 7 leftover steps */ | |||
| ble .LZGEMM_L1x4_SAVE /* branches only when L == 0 (andi. result is never negative) */ | |||
| .LZGEMM_L1x4_SUB2: /* one _SUB1 invocation per leftover step */ | |||
| KERNEL1x4_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x4_SUB2 | |||
| .LZGEMM_L1x4_SAVE: | |||
| SAVE1x4 /* macro defined outside this chunk */ | |||
| .LZGEMM_L1x4_END: | |||
| .LZGEMM_L1x2_BEGIN: /* 1x2 remainder tile: executed at most once */ | |||
| andi. T1, M, 2 /* test bit 1 of M */ | |||
| ble .LZGEMM_L1x2_END /* bit clear: no 2-row tile */ | |||
| mr BO, B /* BO = B: restart at the beginning of the current B panel */ | |||
| srawi. L, K, 3 /* L = K >> 3: number of groups of eight kernel-macro invocations */ | |||
| ble .LZGEMM_L1x2_SUB0 /* L <= 0: only the K & 7 tail is executed */ | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L1x2_SUB4 /* L == 1: a single group, handled by the SUB4 sequence */ | |||
| .LZGEMM_L1x2_LOOP_START: /* first group: LOAD and _I1 open the sequence */ | |||
| LOAD1x2_1 | |||
| KERNEL1x2_I1 | |||
| KERNEL1x2_2 | |||
| KERNEL1x2_1 | |||
| KERNEL1x2_2 | |||
| KERNEL1x2_1 | |||
| KERNEL1x2_2 | |||
| KERNEL1x2_1 | |||
| KERNEL1x2_2 | |||
| addic. L, L, -2 /* first and last group are unrolled outside the loop */ | |||
| ble .LZGEMM_L1x2_LOOP_END | |||
| .align 5 | |||
| .LZGEMM_L1x2_LOOP: /* middle groups: eight kernel-macro invocations per pass */ | |||
| KERNEL1x2_1 | |||
| KERNEL1x2_2 | |||
| KERNEL1x2_1 | |||
| KERNEL1x2_2 | |||
| KERNEL1x2_1 | |||
| KERNEL1x2_2 | |||
| KERNEL1x2_1 | |||
| KERNEL1x2_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x2_LOOP | |||
| .LZGEMM_L1x2_LOOP_END: /* closing group: the last macro is _E2 instead of _2 */ | |||
| KERNEL1x2_1 | |||
| KERNEL1x2_2 | |||
| KERNEL1x2_1 | |||
| KERNEL1x2_2 | |||
| KERNEL1x2_1 | |||
| KERNEL1x2_2 | |||
| KERNEL1x2_1 | |||
| KERNEL1x2_E2 | |||
| b .LZGEMM_L1x2_SUB1 | |||
| .LZGEMM_L1x2_SUB4: /* single group (L == 1): _SUBI1 followed by seven _SUB1 */ | |||
| KERNEL1x2_SUBI1 | |||
| KERNEL1x2_SUB1 | |||
| KERNEL1x2_SUB1 | |||
| KERNEL1x2_SUB1 | |||
| KERNEL1x2_SUB1 | |||
| KERNEL1x2_SUB1 | |||
| KERNEL1x2_SUB1 | |||
| KERNEL1x2_SUB1 | |||
| b .LZGEMM_L1x2_SUB1 | |||
| .LZGEMM_L1x2_SUB0: /* reached when K >> 3 was not positive: no full group has run */ | |||
| andi. L, K, 7 /* L = K & 7 */ | |||
| KERNEL1x2_SUBI1 /* first step uses the _SUBI1 variant */ | |||
| addic. L, L, -1 | |||
| ble .LZGEMM_L1x2_SAVE | |||
| b .LZGEMM_L1x2_SUB2 | |||
| .LZGEMM_L1x2_SUB1: /* tail after at least one full group */ | |||
| andi. L, K, 7 /* L = K & 7 leftover steps */ | |||
| ble .LZGEMM_L1x2_SAVE /* branches only when L == 0 (andi. result is never negative) */ | |||
| .LZGEMM_L1x2_SUB2: /* one _SUB1 invocation per leftover step */ | |||
| KERNEL1x2_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x2_SUB2 | |||
| .LZGEMM_L1x2_SAVE: | |||
| SAVE1x2 /* macro defined outside this chunk */ | |||
| .LZGEMM_L1x2_END: | |||
| .LZGEMM_L1x1_BEGIN: /* 1x1 remainder tile: executed at most once */ | |||
| andi. T1, M, 1 /* test bit 0 of M */ | |||
| ble .LZGEMM_L1x1_END /* bit clear: no 1-row tile */ | |||
| mr BO, B /* BO = B: restart at the beginning of the current B panel */ | |||
| srawi. L, K, 3 /* L = K >> 3: number of groups of eight kernel-macro invocations */ | |||
| ble .LZGEMM_L1x1_SUB0 /* L <= 0: only the K & 7 tail is executed */ | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L1x1_SUB4 /* L == 1: a single group, handled by the SUB4 sequence */ | |||
| .LZGEMM_L1x1_LOOP_START: /* first group: LOAD and _I1 open the sequence */ | |||
| LOAD1x1_1 | |||
| KERNEL1x1_I1 | |||
| KERNEL1x1_2 | |||
| KERNEL1x1_1 | |||
| KERNEL1x1_2 | |||
| KERNEL1x1_1 | |||
| KERNEL1x1_2 | |||
| KERNEL1x1_1 | |||
| KERNEL1x1_2 | |||
| addic. L, L, -2 /* first and last group are unrolled outside the loop */ | |||
| ble .LZGEMM_L1x1_LOOP_END | |||
| .align 5 | |||
| .LZGEMM_L1x1_LOOP: /* middle groups: eight kernel-macro invocations per pass */ | |||
| KERNEL1x1_1 | |||
| KERNEL1x1_2 | |||
| KERNEL1x1_1 | |||
| KERNEL1x1_2 | |||
| KERNEL1x1_1 | |||
| KERNEL1x1_2 | |||
| KERNEL1x1_1 | |||
| KERNEL1x1_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x1_LOOP | |||
| .LZGEMM_L1x1_LOOP_END: /* closing group: the last macro is _E2 instead of _2 */ | |||
| KERNEL1x1_1 | |||
| KERNEL1x1_2 | |||
| KERNEL1x1_1 | |||
| KERNEL1x1_2 | |||
| KERNEL1x1_1 | |||
| KERNEL1x1_2 | |||
| KERNEL1x1_1 | |||
| KERNEL1x1_E2 | |||
| b .LZGEMM_L1x1_SUB1 | |||
| .LZGEMM_L1x1_SUB4: /* single group (L == 1): _SUBI1 followed by seven _SUB1 */ | |||
| KERNEL1x1_SUBI1 | |||
| KERNEL1x1_SUB1 | |||
| KERNEL1x1_SUB1 | |||
| KERNEL1x1_SUB1 | |||
| KERNEL1x1_SUB1 | |||
| KERNEL1x1_SUB1 | |||
| KERNEL1x1_SUB1 | |||
| KERNEL1x1_SUB1 | |||
| b .LZGEMM_L1x1_SUB1 | |||
| .LZGEMM_L1x1_SUB0: /* reached when K >> 3 was not positive: no full group has run */ | |||
| andi. L, K, 7 /* L = K & 7 */ | |||
| KERNEL1x1_SUBI1 /* first step uses the _SUBI1 variant */ | |||
| addic. L, L, -1 | |||
| ble .LZGEMM_L1x1_SAVE | |||
| b .LZGEMM_L1x1_SUB2 | |||
| .LZGEMM_L1x1_SUB1: /* tail after at least one full group */ | |||
| andi. L, K, 7 /* L = K & 7 leftover steps */ | |||
| ble .LZGEMM_L1x1_SAVE /* branches only when L == 0 (andi. result is never negative) */ | |||
| .LZGEMM_L1x1_SUB2: /* one _SUB1 invocation per leftover step */ | |||
| KERNEL1x1_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x1_SUB2 | |||
| .LZGEMM_L1x1_SAVE: | |||
| SAVE1x1 /* macro defined outside this chunk */ | |||
| .LZGEMM_L1x1_END: | |||
| .LZGEMM_L1_END: /* end of the logic; execution falls through to whatever follows the point of inclusion */ | |||
| @@ -170,6 +170,11 @@ | |||
| #define PREFETCHSIZE_C 24 | |||
| #endif | |||
| #ifdef POWER8 /* POWER8 build: prefetch-size constants for this kernel */ | |||
| #define PREFETCHSIZE_A 24 | |||
| #define PREFETCHSIZE_C 24 /* units are not visible in this hunk - confirm at the use site */ | |||
| #endif | |||
| #ifndef XCONJ | |||
| #define FMADDR FMADD | |||
| #define FMSUBR FNMSUB | |||
| @@ -144,6 +144,12 @@ | |||
| #define PREFETCHSIZE_C 8 | |||
| #endif | |||
| #ifdef POWER8 /* POWER8 build: prefetch-size constants for this kernel */ | |||
| #define PREFETCHSIZE_A 24 | |||
| #define PREFETCHSIZE_C 8 /* units are not visible in this hunk - confirm at the use site */ | |||
| #endif | |||
| #if !(defined(CONJ) && defined(XCONJ)) | |||
| #define FMADDR FMADD | |||
| #define FMSUBR FNMSUB | |||
| @@ -0,0 +1,377 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/05 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #ifndef __64BIT__ /* LOAD: word-sized load mnemonic for the target (lwz for 32-bit, ld for 64-bit) */ | |||
| #define LOAD lwz | |||
| #else | |||
| #define LOAD ld | |||
| #endif | |||
| #ifdef __64BIT__ /* stack frame layout; the prologue below stores saved registers at offsets 0..288 */ | |||
| #define STACKSIZE 320 /* bytes subtracted from SP on entry */ | |||
| #define ALPHA_R_SP 296(SP) /* slot that receives f1 */ | |||
| #define ALPHA_I_SP 304(SP) /* slot that receives f2 */ | |||
| #define FZERO 312(SP) /* slot that receives a 32-bit zero */ | |||
| #else /* 32-bit layout: saved registers occupy offsets 0..216 */ | |||
| #define STACKSIZE 256 | |||
| #define ALPHA_R_SP 224(SP) | |||
| #define ALPHA_I_SP 232(SP) | |||
| #define FZERO 240(SP) | |||
| #endif | |||
| #define M r3 /* register aliases for the kernel arguments */ | |||
| #define N r4 | |||
| #define K r5 | |||
| #ifdef linux /* the remaining argument registers depend on OS and word size */ | |||
| #ifndef __64BIT__ | |||
| #define A r6 | |||
| #define B r7 | |||
| #define C r8 | |||
| #define LDC r9 | |||
| #define OFFSET r10 | |||
| #else | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 /* reloaded from the caller's frame in the prologue */ | |||
| #define OFFSET r7 /* reloaded from the caller's frame when TRMMKERNEL is defined */ | |||
| #endif | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| #define A r10 | |||
| #define B r6 /* B, C and LDC are reloaded from the caller's frame in the prologue */ | |||
| #define C r7 | |||
| #define LDC r8 | |||
| #define OFFSET r9 | |||
| #else | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #endif | |||
| #endif | |||
| #define o0 0 /* literal zero index */ | |||
| #define alpha_r vs30 /* VSX registers loaded from the ALPHA_R / ALPHA_I stack slots */ | |||
| #define alpha_i vs31 | |||
| #define KKK r13 /* r13..r31 aliases below; all of r13-r31 are saved and restored by this kernel */ | |||
| #define K1 r14 | |||
| #define L r15 | |||
| #define ALPHA r16 /* address of the ALPHA_R slot */ | |||
| #define o24 r17 /* o8/o16/o24/o32/o48 hold the constants 8/16/24/32/48 */ | |||
| #define T2 r19 | |||
| #define KK r20 | |||
| #define o8 r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o16 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 /* loaded with 256 before the logic is included */ | |||
| #define T1 r31 | |||
| #ifndef NEEDPARAM /* kernel body: save registers, fetch stack arguments, run the included logic, restore, return */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* reserve the local frame */ | |||
| li r0, 0 /* r0 = 0, stored to FZERO below */ | |||
| stfd f14, 0(SP) /* save f14-f31 at offsets 0..136 */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| #ifdef __64BIT__ | |||
| std r31, 144(SP) /* save r31-r13 as doublewords at offsets 144..288 */ | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| std r13, 288(SP) | |||
| #else | |||
| stw r31, 144(SP) /* save r31-r13 as words at offsets 144..216 */ | |||
| stw r30, 148(SP) | |||
| stw r29, 152(SP) | |||
| stw r28, 156(SP) | |||
| stw r27, 160(SP) | |||
| stw r26, 164(SP) | |||
| stw r25, 168(SP) | |||
| stw r24, 172(SP) | |||
| stw r23, 176(SP) | |||
| stw r22, 180(SP) | |||
| stw r21, 184(SP) | |||
| stw r20, 188(SP) | |||
| stw r19, 192(SP) | |||
| stw r18, 196(SP) | |||
| stw r17, 200(SP) | |||
| stw r16, 204(SP) | |||
| stw r15, 208(SP) | |||
| stw r14, 212(SP) | |||
| stw r13, 216(SP) | |||
| #endif | |||
| stfd f1, ALPHA_R_SP /* spill f1 and f2 to the alpha slots so they can be reloaded into VSX registers */ | |||
| stfd f2, ALPHA_I_SP | |||
| stw r0, FZERO /* 32-bit zero */ | |||
| #ifdef linux | |||
| #ifdef __64BIT__ | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) /* LDC comes from the caller's frame; FRAMESLOT is defined outside this file */ | |||
| #endif | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #ifdef __64BIT__ | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz B, FRAMESLOT(0) + STACKSIZE(SP) /* 32-bit DOUBLE case: B, C and LDC are all fetched from the caller's frame */ | |||
| lwz C, FRAMESLOT(1) + STACKSIZE(SP) | |||
| lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) | |||
| #else | |||
| lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #ifdef TRMMKERNEL /* TRMM builds also need OFFSET */ | |||
| #if defined(linux) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #ifdef __64BIT__ | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) | |||
| #else | |||
| lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) /* TRMMKERNEL is already known here; only LEFT matters */ | |||
| neg KK, OFFSET /* KK = -OFFSET */ | |||
| #endif | |||
| #endif | |||
| #include "zgemm_macros_8x2_power8.S" | |||
| cmpwi cr0, M, 0 /* leave through .L999 when M, N or K is not positive */ | |||
| ble .L999 | |||
| cmpwi cr0, N, 0 | |||
| ble .L999 | |||
| cmpwi cr0, K, 0 | |||
| ble .L999 | |||
| slwi LDC, LDC, ZBASE_SHIFT /* LDC <<= ZBASE_SHIFT (presumably elements to bytes - ZBASE_SHIFT is defined outside this file) */ | |||
| li PRE, 256 | |||
| li o8 , 8 /* constant index registers */ | |||
| li o16 , 16 | |||
| li o24 , 24 | |||
| li o32 , 32 | |||
| li o48 , 48 | |||
| #ifdef __64BIT__ | |||
| addi ALPHA, SP, 296 /* ALPHA = address of ALPHA_R_SP (offset must stay in sync with the define) */ | |||
| #else | |||
| addi ALPHA, SP, 224 | |||
| #endif | |||
| lxsdx alpha_r, 0, ALPHA /* alpha_r = doubleword at ALPHA + 0 */ | |||
| lxsdx alpha_i, o8, ALPHA /* alpha_i = doubleword at ALPHA + 8, i.e. the ALPHA_I_SP slot */ | |||
| .align 4 | |||
| #include "ztrmm_logic_8x2_power8.S" | |||
| .L999: /* common exit: restore saved registers and return */ | |||
| addi r3, 0, 0 /* r3 = 0 (presumably the return value - confirm against the caller) */ | |||
| lfd f14, 0(SP) /* restore f14-f31 */ | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| #ifdef __64BIT__ | |||
| ld r31, 144(SP) /* restore r31-r13 from the offsets used in the prologue */ | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r13, 288(SP) | |||
| #else | |||
| lwz r31, 144(SP) /* restore r31-r13 from the offsets used in the prologue */ | |||
| lwz r30, 148(SP) | |||
| lwz r29, 152(SP) | |||
| lwz r28, 156(SP) | |||
| lwz r27, 160(SP) | |||
| lwz r26, 164(SP) | |||
| lwz r25, 168(SP) | |||
| lwz r24, 172(SP) | |||
| lwz r23, 176(SP) | |||
| lwz r22, 180(SP) | |||
| lwz r21, 184(SP) | |||
| lwz r20, 188(SP) | |||
| lwz r19, 192(SP) | |||
| lwz r18, 196(SP) | |||
| lwz r17, 200(SP) | |||
| lwz r16, 204(SP) | |||
| lwz r15, 208(SP) | |||
| lwz r14, 212(SP) | |||
| lwz r13, 216(SP) | |||
| #endif | |||
| addi SP, SP, STACKSIZE /* release the local frame */ | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -389,19 +389,19 @@ DGEMVTKERNEL = dgemv_t.S | |||
| endif | |||
| ifndef CGEMVNKERNEL | |||
| CGEMVNKERNEL = cgemv_n.S | |||
| CGEMVNKERNEL = cgemv_n_4.c | |||
| endif | |||
| ifndef CGEMVTKERNEL | |||
| CGEMVTKERNEL = cgemv_t.S | |||
| CGEMVTKERNEL = cgemv_t_4.c | |||
| endif | |||
| ifndef ZGEMVNKERNEL | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n_4.c | |||
| endif | |||
| ifndef ZGEMVTKERNEL | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
| endif | |||
| ifndef QGEMVNKERNEL | |||
| @@ -1,6 +1,3 @@ | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SGEMMKERNEL = gemm_kernel_8x4_barcelona.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
| @@ -18,7 +18,7 @@ SSYMV_L_KERNEL = ssymv_L.c | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVNKERNEL = zgemv_n_4.c | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
| DGEMVNKERNEL = dgemv_n_bulldozer.S | |||
| @@ -11,7 +11,7 @@ ZAXPYKERNEL = zaxpy.c | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVNKERNEL = zgemv_n_4.c | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
| DGEMVNKERNEL = dgemv_n_bulldozer.S | |||
| @@ -24,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c | |||
| DGEMVNKERNEL = dgemv_n_4.c | |||
| DGEMVTKERNEL = dgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| # ZGEMVNKERNEL must name the _n_ source; the _t_ source is the one assigned to ZGEMVTKERNEL. | |||
| ZGEMVNKERNEL = zgemv_n_4.c | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
| DCOPYKERNEL = dcopy_bulldozer.S | |||
| @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(HASWELL) /* pick the CPU-specific micro-kernel source at compile time */ | |||
| #include "cgemv_n_microk_haswell-4.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) /* these three targets share one micro-kernel file */ | |||
| #include "cgemv_n_microk_bulldozer-4.c" | |||
| #endif /* no matching target macro: nothing is included by this selector */ | |||
| @@ -0,0 +1,541 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| /* | |||
| * cgemv_kernel_4x4 - y += a0*x[0] + a1*x[1] + a2*x[2] + a3*x[3] | |||
| * | |||
| * All data is accessed as interleaved (real, imag) pairs of packed 32-bit | |||
| * floats; the index register counts floats and is scaled by 4 bytes. | |||
| * | |||
| * n : complex elements per column. The main loop handles (n & -8) of them, | |||
| * eight per iteration; one extra group of four is handled when (n & 4) | |||
| * is set. A remainder of (n & 3) elements is not touched here. | |||
| * ap : ap[0]..ap[3] are the four column pointers. | |||
| * x : four complex multipliers (8 floats, read at byte offsets 0..28). | |||
| * y : accumulated in place. | |||
| * | |||
| * CONJ / XCONJ choose how the two partial products are merged: | |||
| * both or neither defined: re = ar*xr - ai*xi, im = ai*xr + ar*xi | |||
| * exactly one defined : re = ar*xr + ai*xi, im = ar*xi - ai*xr | |||
| * | |||
| * The asm advances the index (%0) and counts the loop counter (%1) down, so | |||
| * both are declared as read-write ("+r") operands. Declaring them input-only | |||
| * would let the compiler assume the registers still hold the original values. | |||
| */ | |||
| static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; // float index into the columns and y, advanced by the asm | |||
| BLASLONG register n1 = n & -8 ; // elements for the 8-wide main loop, counted down by the asm | |||
| BLASLONG register n2 = n & 4 ; // non-zero when a 4-element tail follows | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vbroadcastss (%2), %%ymm0 \n\t" // real part x0 | |||
| "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 | |||
| "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1 | |||
| "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1 | |||
| "vbroadcastss 16(%2), %%ymm4 \n\t" // real part x2 | |||
| "vbroadcastss 20(%2), %%ymm5 \n\t" // imag part x2 | |||
| "vbroadcastss 24(%2), %%ymm6 \n\t" // real part x3 | |||
| "vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3 | |||
| "cmpq $0 , %1 \n\t" // skip the main loop when n1 == 0 | |||
| "je 2f \n\t" | |||
| ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 384(%4,%0,4) \n\t" | |||
| "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values from a0 | |||
| "vmovups 32(%4,%0,4), %%ymm9 \n\t" // next 4 complex values from a0 | |||
| "prefetcht0 384(%5,%0,4) \n\t" | |||
| "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values from a1 | |||
| "vmovups 32(%5,%0,4), %%ymm11 \n\t" // next 4 complex values from a1 | |||
| "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a0[0..3] * x0_r | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a0[0..3] * x0_i | |||
| "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a0[4..7] * x0_r | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a0[4..7] * x0_i | |||
| "prefetcht0 384(%6,%0,4) \n\t" | |||
| "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values from a2 | |||
| "vmovups 32(%6,%0,4), %%ymm9 \n\t" // next 4 complex values from a2 | |||
| "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // += a1[0..3] * x1_r | |||
| "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // += a1[0..3] * x1_i | |||
| "vfmaddps %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // += a1[4..7] * x1_r | |||
| "vfmaddps %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // += a1[4..7] * x1_i | |||
| "prefetcht0 384(%7,%0,4) \n\t" | |||
| "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values from a3 | |||
| "vmovups 32(%7,%0,4), %%ymm11 \n\t" // next 4 complex values from a3 | |||
| "vfmaddps %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // += a2[0..3] * x2_r | |||
| "vfmaddps %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // += a2[0..3] * x2_i | |||
| "vfmaddps %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // += a2[4..7] * x2_r | |||
| "vfmaddps %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // += a2[4..7] * x2_i | |||
| "vfmaddps %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // += a3[0..3] * x3_r | |||
| "vfmaddps %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // += a3[0..3] * x3_i | |||
| "vfmaddps %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // += a3[4..7] * x3_r | |||
| "vfmaddps %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // += a3[4..7] * x3_i | |||
| "prefetcht0 384(%3,%0,4) \n\t" | |||
| "vmovups (%3,%0,4), %%ymm10 \n\t" // 4 complex values from y | |||
| "vmovups 32(%3,%0,4), %%ymm11 \n\t" // next 4 complex values from y | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" // swap re/im of the x_i products | |||
| "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" | |||
| "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" // re = ar*xr - ai*xi, im = ai*xr + ar*xi | |||
| "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t" | |||
| #else | |||
| "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" // swap re/im of the x_r products | |||
| "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" | |||
| "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" | |||
| "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t" | |||
| "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" // re = ar*xr + ai*xi, im = ar*xi - ai*xr | |||
| "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" | |||
| #endif | |||
| "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" // add the old y values | |||
| "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" | |||
| "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y | |||
| "vmovups %%ymm13, 32(%3,%0,4) \n\t" // next 4 complex values to y | |||
| "addq $16, %0 \n\t" // 8 complex values = 16 floats | |||
| "subq $8 , %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "2: \n\t" | |||
| "cmpq $4, %8 \n\t" // 4-element tail requested? | |||
| "jne 3f \n\t" | |||
| "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values from a0 | |||
| "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values from a1 | |||
| "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a0 * x0_r | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a0 * x0_i | |||
| "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // += a1 * x1_r | |||
| "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // += a1 * x1_i | |||
| "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values from a2 | |||
| "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values from a3 | |||
| "vfmaddps %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // += a2 * x2_r | |||
| "vfmaddps %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // += a2 * x2_i | |||
| "vfmaddps %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // += a3 * x3_r | |||
| "vfmaddps %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // += a3 * x3_i | |||
| "vmovups (%3,%0,4), %%ymm10 \n\t" // 4 complex values from y | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" | |||
| "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" | |||
| #else | |||
| "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" | |||
| "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" | |||
| "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" | |||
| #endif | |||
| "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" | |||
| "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y | |||
| "3: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n1) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (n2) // 8 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_4x2 1 | |||
| /* | |||
| * cgemv_kernel_4x2 - y += a0*x[0] + a1*x[1] | |||
| * | |||
| * Two-column variant working on interleaved (real, imag) pairs of packed | |||
| * 32-bit floats; the index register counts floats and is scaled by 4 bytes. | |||
| * | |||
| * n : complex elements per column. The main loop handles (n & -8) of them, | |||
| * eight per iteration; one extra group of four is handled when (n & 4) | |||
| * is set. A remainder of (n & 3) elements is not touched here. | |||
| * ap : ap[0] and ap[1] are the two column pointers. | |||
| * x : two complex multipliers (4 floats, read at byte offsets 0..12). | |||
| * y : accumulated in place. | |||
| * | |||
| * CONJ / XCONJ choose how the two partial products are merged: | |||
| * both or neither defined: re = ar*xr - ai*xi, im = ai*xr + ar*xi | |||
| * exactly one defined : re = ar*xr + ai*xi, im = ar*xi - ai*xr | |||
| * | |||
| * The asm advances the index (%0) and counts the loop counter (%1) down, so | |||
| * both are declared as read-write ("+r") operands rather than inputs. | |||
| */ | |||
| static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; // float index into the columns and y, advanced by the asm | |||
| BLASLONG register n1 = n & -8 ; // elements for the 8-wide main loop, counted down by the asm | |||
| BLASLONG register n2 = n & 4 ; // non-zero when a 4-element tail follows | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastss (%2), %%ymm0 \n\t" // real part x0 | |||
| "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 | |||
| "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1 | |||
| "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1 | |||
| "cmpq $0 , %1 \n\t" // skip the main loop when n1 == 0 | |||
| "je 2f \n\t" | |||
| // ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 384(%4,%0,4) \n\t" | |||
| "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values from a0 | |||
| "vmovups 32(%4,%0,4), %%ymm9 \n\t" // next 4 complex values from a0 | |||
| "prefetcht0 384(%5,%0,4) \n\t" | |||
| "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values from a1 | |||
| "vmovups 32(%5,%0,4), %%ymm11 \n\t" // next 4 complex values from a1 | |||
| "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a0[0..3] * x0_r | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a0[0..3] * x0_i | |||
| "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a0[4..7] * x0_r | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a0[4..7] * x0_i | |||
| "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // += a1[0..3] * x1_r | |||
| "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // += a1[0..3] * x1_i | |||
| "vfmaddps %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // += a1[4..7] * x1_r | |||
| "vfmaddps %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // += a1[4..7] * x1_i | |||
| "prefetcht0 384(%3,%0,4) \n\t" | |||
| "vmovups (%3,%0,4), %%ymm10 \n\t" // 4 complex values from y | |||
| "vmovups 32(%3,%0,4), %%ymm11 \n\t" // next 4 complex values from y | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" // swap re/im of the x_i products | |||
| "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" | |||
| "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" // re = ar*xr - ai*xi, im = ai*xr + ar*xi | |||
| "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t" | |||
| #else | |||
| "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" // swap re/im of the x_r products | |||
| "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" | |||
| "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" | |||
| "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t" | |||
| "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" // re = ar*xr + ai*xi, im = ar*xi - ai*xr | |||
| "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" | |||
| #endif | |||
| "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" // add the old y values | |||
| "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" | |||
| "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y | |||
| "vmovups %%ymm13, 32(%3,%0,4) \n\t" // next 4 complex values to y | |||
| "addq $16, %0 \n\t" // 8 complex values = 16 floats | |||
| "subq $8 , %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "2: \n\t" | |||
| "cmpq $4, %6 \n\t" // 4-element tail requested? | |||
| "jne 3f \n\t" | |||
| "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values from a0 | |||
| "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values from a1 | |||
| "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a0 * x0_r | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a0 * x0_i | |||
| "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // += a1 * x1_r | |||
| "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // += a1 * x1_i | |||
| "vmovups (%3,%0,4), %%ymm10 \n\t" // 4 complex values from y | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" | |||
| "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" | |||
| #else | |||
| "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" | |||
| "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" | |||
| "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" | |||
| #endif | |||
| "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" | |||
| "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y | |||
| "3: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n1) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (n2) // 6 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_4x1 1 | |||
| /* | |||
| * cgemv_kernel_4x1 - y += a0*x[0] | |||
| * | |||
| * Single-column variant working on interleaved (real, imag) pairs of packed | |||
| * 32-bit floats; the index register counts floats and is scaled by 4 bytes. | |||
| * | |||
| * n : complex elements in the column. The main loop handles (n & -8) of | |||
| * them, eight per iteration; one extra group of four is handled when | |||
| * (n & 4) is set. A remainder of (n & 3) elements is not touched here. | |||
| * ap : the column itself (a plain FLOAT pointer, unlike the 4x4/4x2 kernels). | |||
| * x : one complex multiplier (2 floats, read at byte offsets 0 and 4). | |||
| * y : accumulated in place. | |||
| * | |||
| * CONJ / XCONJ choose how the two partial products are merged: | |||
| * both or neither defined: re = ar*xr - ai*xi, im = ai*xr + ar*xi | |||
| * exactly one defined : re = ar*xr + ai*xi, im = ar*xi - ai*xr | |||
| * | |||
| * The asm advances the index (%0) and counts the loop counter (%1) down, so | |||
| * both are declared as read-write ("+r") operands rather than inputs. | |||
| */ | |||
| static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; // float index into ap and y, advanced by the asm | |||
| BLASLONG register n1 = n & -8 ; // elements for the 8-wide main loop, counted down by the asm | |||
| BLASLONG register n2 = n & 4 ; // non-zero when a 4-element tail follows | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastss (%2), %%ymm0 \n\t" // real part x0 | |||
| "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 | |||
| "cmpq $0 , %1 \n\t" // skip the main loop when n1 == 0 | |||
| "je 2f \n\t" | |||
| // ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 384(%4,%0,4) \n\t" | |||
| "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values from a0 | |||
| "vmovups 32(%4,%0,4), %%ymm9 \n\t" // next 4 complex values from a0 | |||
| "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a0[0..3] * x0_r | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a0[0..3] * x0_i | |||
| "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a0[4..7] * x0_r | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a0[4..7] * x0_i | |||
| "prefetcht0 384(%3,%0,4) \n\t" | |||
| "vmovups (%3,%0,4), %%ymm10 \n\t" // 4 complex values from y | |||
| "vmovups 32(%3,%0,4), %%ymm11 \n\t" // next 4 complex values from y | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" // swap re/im of the x_i products | |||
| "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" | |||
| "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" // re = ar*xr - ai*xi, im = ai*xr + ar*xi | |||
| "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t" | |||
| #else | |||
| "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" // swap re/im of the x_r products | |||
| "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" | |||
| "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" | |||
| "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t" | |||
| "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" // re = ar*xr + ai*xi, im = ar*xi - ai*xr | |||
| "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" | |||
| #endif | |||
| "addq $16, %0 \n\t" // index already points at the next group ... | |||
| "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" | |||
| "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" | |||
| "subq $8 , %1 \n\t" // sets ZF for the jnz below (the stores leave flags alone) | |||
| "vmovups %%ymm12,-64(%3,%0,4) \n\t" // ... so store 64 bytes back: 4 complex values to y | |||
| "vmovups %%ymm13,-32(%3,%0,4) \n\t" | |||
| "jnz 1b \n\t" | |||
| "2: \n\t" | |||
| "cmpq $4, %5 \n\t" // 4-element tail requested? | |||
| "jne 3f \n\t" | |||
| "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values from a0 | |||
| "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a0 * x0_r | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a0 * x0_i | |||
| "vmovups (%3,%0,4), %%ymm10 \n\t" // 4 complex values from y | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" | |||
| "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" | |||
| #else | |||
| "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" | |||
| "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" | |||
| "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" | |||
| #endif | |||
| "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" | |||
| "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y | |||
| "3: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n1) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap), // 4 | |||
| "r" (n2) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_ADDY 1 | |||
| /* | |||
| * add_y - dest += alpha * src (alpha * conj(src) when XCONJ is defined) | |||
| * | |||
| * src holds n complex values as consecutive (real, imag) pairs. | |||
| * | |||
| * n : number of complex elements. | |||
| * src : contiguous input, advanced by 2 floats per element. | |||
| * dest : accumulated in place; inc_dest is its stride in FLOAT units. | |||
| * alpha_r : real part of the scale factor. | |||
| * alpha_i : imaginary part of the scale factor. | |||
| * | |||
| * inc_dest != 2 : plain C loop over all n elements. | |||
| * inc_dest == 2 : vector path on packed 32-bit floats. It handles (n & -8) | |||
| * elements in the main loop plus four more when (n & 4) is | |||
| * set; a remainder of (n & 3) elements is not processed. | |||
| * | |||
| * The asm advances the index (%0) and counts the loop counter (%1) down, so | |||
| * both are declared as read-write ("+r") operands rather than inputs. | |||
| */ | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) | |||
| { | |||
| BLASLONG i; | |||
| if ( inc_dest != 2 ) | |||
| { | |||
| /* strided destination: scalar fallback */ | |||
| FLOAT temp_r; | |||
| FLOAT temp_i; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| #if !defined(XCONJ) | |||
| temp_r = alpha_r * src[0] - alpha_i * src[1]; | |||
| temp_i = alpha_r * src[1] + alpha_i * src[0]; | |||
| #else | |||
| temp_r = alpha_r * src[0] + alpha_i * src[1]; | |||
| temp_i = -alpha_r * src[1] + alpha_i * src[0]; | |||
| #endif | |||
| *dest += temp_r; | |||
| *(dest+1) += temp_i; | |||
| src+=2; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| i=0; // float index into src and dest, advanced by the asm | |||
| BLASLONG register n1 = n & -8 ; // elements for the 8-wide main loop, counted down by the asm | |||
| BLASLONG register n2 = n & 4 ; // non-zero when a 4-element tail follows | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastss (%4), %%ymm0 \n\t" // alpha_r | |||
| "vbroadcastss (%5), %%ymm1 \n\t" // alpha_i | |||
| "cmpq $0 , %1 \n\t" // skip the main loop when n1 == 0 | |||
| "je 2f \n\t" | |||
| // ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src | |||
| "vmovups 32(%2,%0,4), %%ymm9 \n\t" // next 4 complex values from src | |||
| "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // src[0..3] * alpha_r | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // src[0..3] * alpha_i | |||
| "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // src[4..7] * alpha_r | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // src[4..7] * alpha_i | |||
| "vmovups (%3,%0,4), %%ymm10 \n\t" // 4 complex values from dest | |||
| "vmovups 32(%3,%0,4), %%ymm11 \n\t" // next 4 complex values from dest | |||
| #if !defined(XCONJ) | |||
| "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" // swap re/im of the alpha_i products | |||
| "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" | |||
| "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" // same result as the scalar !XCONJ branch | |||
| "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t" | |||
| #else | |||
| "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" // swap re/im of the alpha_r products | |||
| "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" | |||
| "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" | |||
| "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t" | |||
| "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" // same result as the scalar XCONJ branch | |||
| "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" | |||
| #endif | |||
| "addq $16, %0 \n\t" // index already points at the next group ... | |||
| "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" | |||
| "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" | |||
| "subq $8 , %1 \n\t" // sets ZF for the jnz below (the stores leave flags alone) | |||
| "vmovups %%ymm12,-64(%3,%0,4) \n\t" // ... so store 64 bytes back: 4 complex values to dest | |||
| "vmovups %%ymm13,-32(%3,%0,4) \n\t" | |||
| "jnz 1b \n\t" | |||
| "2: \n\t" | |||
| "cmpq $4, %6 \n\t" // 4-element tail requested? | |||
| "jne 3f \n\t" | |||
| "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src | |||
| "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // src * alpha_r | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // src * alpha_i | |||
| "vmovups (%3,%0,4), %%ymm10 \n\t" // 4 complex values from dest | |||
| #if !defined(XCONJ) | |||
| "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" | |||
| "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" | |||
| #else | |||
| "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" | |||
| "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" | |||
| "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" | |||
| #endif | |||
| "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" | |||
| "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to dest | |||
| "3: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n1) // 1 | |||
| : | |||
| "r" (src), // 2 | |||
| "r" (dest), // 3 | |||
| "r" (&alpha_r), // 4 | |||
| "r" (&alpha_i), // 5 | |||
| "r" (n2) // 6 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| return; | |||
| } | |||
| @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(HASWELL) | |||
| #include "cgemv_t_microk_haswell-4.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #include "cgemv_t_microk_bulldozer-4.c" | |||
| #endif | |||
| #define NBMAX 2048 | |||
| @@ -0,0 +1,541 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| /* | |||
| * cgemv_kernel_4x4 - y[k] += alpha * sum_j( a_k[j] * x[j] ), k = 0..3 | |||
| * | |||
| * Four complex dot products of the columns ap[0]..ap[3] with x are built in | |||
| * ymm8..ymm15, reduced horizontally, scaled by alpha and added to y. All data | |||
| * is accessed as interleaved (real, imag) pairs of packed 32-bit floats; the | |||
| * index register counts floats and is scaled by 4 bytes. | |||
| * | |||
| * n : complex elements per column. When bit 2 of n is set, one group of | |||
| * four is consumed first; the rest is consumed eight per iteration. | |||
| * The main loop exits only when the remaining count reaches exactly | |||
| * zero, so n is expected to be a multiple of 4. | |||
| * ap : ap[0]..ap[3] are the four column pointers. | |||
| * x : input vector, n complex values. | |||
| * y : four complex results (byte offsets 0, 8, 16, 24), updated in place. | |||
| * alpha : alpha[0] is the real part, alpha[1] the imaginary part. | |||
| * | |||
| * CONJ / XCONJ choose how the per-element products are merged: | |||
| * both or neither defined: re = ar*xr - ai*xi, im = ai*xr + ar*xi | |||
| * exactly one defined : re = ar*xr + ai*xi, im = ar*xi - ai*xr | |||
| * XCONJ alone chooses how the sum t is scaled by alpha: | |||
| * not defined: re = tr*alpha_r - ti*alpha_i, im = ti*alpha_r + tr*alpha_i | |||
| * defined : re = tr*alpha_r + ti*alpha_i, im = tr*alpha_i - ti*alpha_r | |||
| * | |||
| * The asm advances the index (%0) and counts n (%1) down, so both are | |||
| * declared as read-write ("+r") operands. Declaring them input-only would let | |||
| * the compiler assume the registers still hold the original values. | |||
| */ | |||
| static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; // float index into the columns and x, advanced by the asm | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // accumulator: a0 * x_r | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // accumulator: a0 * x_i | |||
| "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // accumulator: a1 * x_r | |||
| "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // accumulator: a1 * x_i | |||
| "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" // accumulator: a2 * x_r | |||
| "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" // accumulator: a2 * x_i | |||
| "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" // accumulator: a3 * x_r | |||
| "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" // accumulator: a3 * x_i | |||
| "testq $0x04, %1 \n\t" // odd group of four present? | |||
| "jz 2f \n\t" | |||
| "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 | |||
| "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 | |||
| "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x | |||
| "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imag parts | |||
| "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts | |||
| "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts | |||
| "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 | |||
| "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 | |||
| "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // += a0 * x_r | |||
| "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // += a0 * x_i | |||
| "vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // += a1 * x_r | |||
| "vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // += a1 * x_i | |||
| "vfmaddps %%ymm12, %%ymm6 , %%ymm0, %%ymm12 \n\t" // += a2 * x_r | |||
| "vfmaddps %%ymm13, %%ymm6 , %%ymm1, %%ymm13 \n\t" // += a2 * x_i | |||
| "vfmaddps %%ymm14, %%ymm7 , %%ymm0, %%ymm14 \n\t" // += a3 * x_r | |||
| "vfmaddps %%ymm15, %%ymm7 , %%ymm1, %%ymm15 \n\t" // += a3 * x_i | |||
| "addq $8 , %0 \n\t" // 4 complex values = 8 floats | |||
| "subq $4 , %1 \n\t" | |||
| "2: \n\t" | |||
| "cmpq $0, %1 \n\t" // nothing left for the 8-wide loop? | |||
| "je 3f \n\t" | |||
| // ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 384(%4,%0,4) \n\t" | |||
| "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 | |||
| "prefetcht0 384(%5,%0,4) \n\t" | |||
| "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 | |||
| "prefetcht0 384(%2,%0,4) \n\t" | |||
| "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x | |||
| "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imag parts | |||
| "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts | |||
| "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts | |||
| "prefetcht0 384(%6,%0,4) \n\t" | |||
| "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 | |||
| "prefetcht0 384(%7,%0,4) \n\t" | |||
| "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 | |||
| "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // += a0 * x_r | |||
| "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // += a0 * x_i | |||
| "vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // += a1 * x_r | |||
| "vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // += a1 * x_i | |||
| "vfmaddps %%ymm12, %%ymm6 , %%ymm0, %%ymm12 \n\t" // += a2 * x_r | |||
| "vfmaddps %%ymm13, %%ymm6 , %%ymm1, %%ymm13 \n\t" // += a2 * x_i | |||
| "vfmaddps %%ymm14, %%ymm7 , %%ymm0, %%ymm14 \n\t" // += a3 * x_r | |||
| "vfmaddps %%ymm15, %%ymm7 , %%ymm1, %%ymm15 \n\t" // += a3 * x_i | |||
| "vmovups 32(%4,%0,4), %%ymm4 \n\t" // next 4 complex values from a0 | |||
| "vmovups 32(%5,%0,4), %%ymm5 \n\t" // next 4 complex values from a1 | |||
| "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // next 4 complex values from x | |||
| "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imag parts | |||
| "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts | |||
| "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts | |||
| "vmovups 32(%6,%0,4), %%ymm6 \n\t" // next 4 complex values from a2 | |||
| "vmovups 32(%7,%0,4), %%ymm7 \n\t" // next 4 complex values from a3 | |||
| "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // += a0 * x_r | |||
| "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // += a0 * x_i | |||
| "vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // += a1 * x_r | |||
| "vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // += a1 * x_i | |||
| "vfmaddps %%ymm12, %%ymm6 , %%ymm0, %%ymm12 \n\t" // += a2 * x_r | |||
| "vfmaddps %%ymm13, %%ymm6 , %%ymm1, %%ymm13 \n\t" // += a2 * x_i | |||
| "vfmaddps %%ymm14, %%ymm7 , %%ymm0, %%ymm14 \n\t" // += a3 * x_r | |||
| "vfmaddps %%ymm15, %%ymm7 , %%ymm1, %%ymm15 \n\t" // += a3 * x_i | |||
| "addq $16 , %0 \n\t" // 8 complex values = 16 floats | |||
| "subq $8 , %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "3: \n\t" | |||
| "vbroadcastss (%8) , %%xmm0 \n\t" // alpha_r | |||
| "vbroadcastss 4(%8) , %%xmm1 \n\t" // alpha_i | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" // swap re/im of the x_i sums | |||
| "vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t" | |||
| "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" | |||
| "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" | |||
| "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" // re = ar*xr - ai*xi, im = ai*xr + ar*xi | |||
| "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t" | |||
| "vaddsubps %%ymm13, %%ymm12, %%ymm12 \n\t" | |||
| "vaddsubps %%ymm15, %%ymm14, %%ymm14 \n\t" | |||
| #else | |||
| "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" // swap re/im of the x_r sums | |||
| "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" | |||
| "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" | |||
| "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" | |||
| "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" | |||
| "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t" | |||
| "vaddsubps %%ymm12, %%ymm13, %%ymm12 \n\t" | |||
| "vaddsubps %%ymm14, %%ymm15, %%ymm14 \n\t" | |||
| "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" // re = ar*xr + ai*xi, im = ar*xi - ai*xr | |||
| "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" | |||
| "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" | |||
| "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" | |||
| #endif | |||
| "vmovsd (%3), %%xmm4 \n\t" // read y | |||
| "vmovsd 8(%3), %%xmm5 \n\t" | |||
| "vmovsd 16(%3), %%xmm6 \n\t" | |||
| "vmovsd 24(%3), %%xmm7 \n\t" | |||
| "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" // horizontal sum, step 1: upper 128 bits ... | |||
| "vextractf128 $1, %%ymm10, %%xmm11 \n\t" | |||
| "vextractf128 $1, %%ymm12, %%xmm13 \n\t" | |||
| "vextractf128 $1, %%ymm14, %%xmm15 \n\t" | |||
| "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" // ... added to the lower 128 bits | |||
| "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" | |||
| "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" | |||
| "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" | |||
| "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" // step 2: swap the two 64-bit halves ... | |||
| "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t" | |||
| "vshufpd $0x1, %%xmm12, %%xmm12, %%xmm13 \n\t" | |||
| "vshufpd $0x1, %%xmm14, %%xmm14, %%xmm15 \n\t" | |||
| "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" // ... and add: low 64 bits hold the complex sum t | |||
| "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" | |||
| "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" | |||
| "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" | |||
| "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i | |||
| "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r | |||
| "vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i | |||
| "vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r | |||
| "vmulps %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i | |||
| "vmulps %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r | |||
| "vmulps %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i | |||
| "vmulps %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r | |||
| #if !defined(XCONJ) | |||
| "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t" | |||
| "vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t" | |||
| "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" | |||
| "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" | |||
| "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" // re = tr*alpha_r - ti*alpha_i, im = ti*alpha_r + tr*alpha_i | |||
| "vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t" | |||
| "vaddsubps %%xmm13, %%xmm12, %%xmm12 \n\t" | |||
| "vaddsubps %%xmm15, %%xmm14, %%xmm14 \n\t" | |||
| #else | |||
| "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" | |||
| "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" | |||
| "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" | |||
| "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" | |||
| "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" | |||
| "vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" | |||
| "vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t" | |||
| "vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t" | |||
| "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // re = tr*alpha_r + ti*alpha_i, im = tr*alpha_i - ti*alpha_r | |||
| "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" | |||
| "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" | |||
| "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" | |||
| #endif | |||
| "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" // add the old y values | |||
| "vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t" | |||
| "vaddps %%xmm12, %%xmm6 , %%xmm12 \n\t" | |||
| "vaddps %%xmm14, %%xmm7 , %%xmm14 \n\t" | |||
| "vmovsd %%xmm8 , (%3) \n\t" // write y (one complex value each) | |||
| "vmovsd %%xmm10, 8(%3) \n\t" | |||
| "vmovsd %%xmm12, 16(%3) \n\t" | |||
| "vmovsd %%xmm14, 24(%3) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_4x2 1 | |||
| static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp | |||
| "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp | |||
| "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp | |||
| "testq $0x04, %1 \n\t" | |||
| "jz 2f \n\t" | |||
| "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 | |||
| "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 | |||
| "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x | |||
| "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts | |||
| "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts | |||
| "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts | |||
| "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 | |||
| "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 | |||
| "vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 | |||
| "vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 | |||
| "addq $8 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "2: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je 3f \n\t" | |||
| // ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 384(%4,%0,4) \n\t" | |||
| "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 | |||
| "prefetcht0 384(%5,%0,4) \n\t" | |||
| "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 | |||
| "prefetcht0 384(%2,%0,4) \n\t" | |||
| "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x | |||
| "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts | |||
| "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts | |||
| "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts | |||
| "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 | |||
| "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 | |||
| "vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 | |||
| "vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 | |||
| "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 | |||
| "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 | |||
| "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x | |||
| "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts | |||
| "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts | |||
| "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts | |||
| "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 | |||
| "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 | |||
| "vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 | |||
| "vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 | |||
| "addq $16 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "3: \n\t" | |||
| "vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha | |||
| "vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" | |||
| "vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t" | |||
| "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" | |||
| "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t" | |||
| #else | |||
| "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" | |||
| "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" | |||
| "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" | |||
| "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t" | |||
| "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" | |||
| "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" | |||
| #endif | |||
| "vmovsd (%3), %%xmm4 \n\t" // read y | |||
| "vmovsd 8(%3), %%xmm5 \n\t" | |||
| "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" | |||
| "vextractf128 $1, %%ymm10, %%xmm11 \n\t" | |||
| "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" | |||
| "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" | |||
| "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" | |||
| "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t" | |||
| "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" | |||
| "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" | |||
| "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i | |||
| "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r | |||
| "vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i | |||
| "vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r | |||
| #if !defined(XCONJ) | |||
| "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t" | |||
| "vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t" | |||
| "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" | |||
| "vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t" | |||
| #else | |||
| "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" | |||
| "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" | |||
| "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" | |||
| "vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" | |||
| "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" | |||
| "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" | |||
| #endif | |||
| "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" | |||
| "vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t" | |||
| "vmovsd %%xmm8 , (%3) \n\t" | |||
| "vmovsd %%xmm10, 8(%3) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (alpha) // 6 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_4x1 1 | |||
| static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp | |||
| "testq $0x04, %1 \n\t" | |||
| "jz 2f \n\t" | |||
| "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 | |||
| "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x | |||
| "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts | |||
| "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts | |||
| "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts | |||
| "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 | |||
| "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 | |||
| "addq $8 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "2: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je 3f \n\t" | |||
| // ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 384(%4,%0,4) \n\t" | |||
| "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 | |||
| "prefetcht0 384(%2,%0,4) \n\t" | |||
| "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x | |||
| "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts | |||
| "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts | |||
| "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts | |||
| "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 | |||
| "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 | |||
| "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 | |||
| "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x | |||
| "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts | |||
| "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts | |||
| "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts | |||
| "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 | |||
| "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 | |||
| "addq $16 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "3: \n\t" | |||
| "vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha | |||
| "vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" | |||
| "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" | |||
| #else | |||
| "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" | |||
| "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" | |||
| "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" | |||
| #endif | |||
| "vmovsd (%3), %%xmm4 \n\t" // read y | |||
| "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" | |||
| "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" | |||
| "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" | |||
| "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" | |||
| "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i | |||
| "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r | |||
| #if !defined(XCONJ) | |||
| "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t" | |||
| "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" | |||
| #else | |||
| "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" | |||
| "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" | |||
| "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" | |||
| #endif | |||
| "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" | |||
| "vmovsd %%xmm8 , (%3) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap), // 4 | |||
| "r" (alpha) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -34,9 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "zgemv_n_microk_haswell-4.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "zgemv_n_microk_sandy-4.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #include "zgemv_n_microk_bulldozer-4.c" | |||
| #endif | |||
| #define NBMAX 1024 | |||
| #ifndef HAVE_KERNEL_4x4 | |||
| @@ -0,0 +1,514 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| if ( n > 384 ) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 | |||
| "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 | |||
| "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 | |||
| "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 | |||
| "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2 | |||
| "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2 | |||
| "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3 | |||
| "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3 | |||
| ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 512(%4,%0,8) \n\t" | |||
| "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 | |||
| "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 | |||
| "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r | |||
| "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i | |||
| "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 | |||
| "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 | |||
| "prefetcht0 512(%5,%0,8) \n\t" | |||
| "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r | |||
| "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i | |||
| "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r | |||
| "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i | |||
| "prefetcht0 512(%6,%0,8) \n\t" | |||
| "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r | |||
| "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i | |||
| "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2 | |||
| "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2 | |||
| "vfmaddpd %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r | |||
| "vfmaddpd %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i | |||
| "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3 | |||
| "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3 | |||
| "vfmaddpd %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r | |||
| "vfmaddpd %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i | |||
| "prefetcht0 512(%7,%0,8) \n\t" | |||
| "vfmaddpd %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r | |||
| "vfmaddpd %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i | |||
| "vfmaddpd %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r | |||
| "vfmaddpd %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i | |||
| "vmovups (%3,%0,8), %%ymm10 \n\t" | |||
| "vmovups 32(%3,%0,8), %%ymm11 \n\t" | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" | |||
| "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" | |||
| "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" | |||
| "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" | |||
| #else | |||
| "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" | |||
| "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" | |||
| "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" | |||
| "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" | |||
| "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" | |||
| "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" | |||
| #endif | |||
| "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" | |||
| "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" | |||
| "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y | |||
| "vmovups %%ymm13, 32(%3,%0,8) \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "2: \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| else | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 | |||
| "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 | |||
| "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 | |||
| "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 | |||
| "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2 | |||
| "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2 | |||
| "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3 | |||
| "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3 | |||
| ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 | |||
| "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 | |||
| "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r | |||
| "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i | |||
| "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 | |||
| "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 | |||
| "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r | |||
| "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i | |||
| "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r | |||
| "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i | |||
| "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r | |||
| "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i | |||
| "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2 | |||
| "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2 | |||
| "vfmaddpd %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r | |||
| "vfmaddpd %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i | |||
| "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3 | |||
| "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3 | |||
| "vfmaddpd %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r | |||
| "vfmaddpd %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i | |||
| "vfmaddpd %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r | |||
| "vfmaddpd %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i | |||
| "vfmaddpd %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r | |||
| "vfmaddpd %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i | |||
| "vmovups (%3,%0,8), %%ymm10 \n\t" | |||
| "vmovups 32(%3,%0,8), %%ymm11 \n\t" | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" | |||
| "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" | |||
| "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" | |||
| "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" | |||
| #else | |||
| "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" | |||
| "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" | |||
| "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" | |||
| "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" | |||
| "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" | |||
| "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" | |||
| #endif | |||
| "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" | |||
| "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" | |||
| "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y | |||
| "vmovups %%ymm13, 32(%3,%0,8) \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "2: \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| } | |||
| #define HAVE_KERNEL_4x2 1 | |||
| static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 | |||
| "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 | |||
| "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 | |||
| "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 | |||
| // ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 | |||
| "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 | |||
| "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 | |||
| "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 | |||
| "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r | |||
| "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i | |||
| "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r | |||
| "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i | |||
| "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r | |||
| "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i | |||
| "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r | |||
| "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i | |||
| "vmovups (%3,%0,8), %%ymm10 \n\t" | |||
| "vmovups 32(%3,%0,8), %%ymm11 \n\t" | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" | |||
| "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" | |||
| "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" | |||
| "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" | |||
| #else | |||
| "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" | |||
| "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" | |||
| "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" | |||
| "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" | |||
| "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" | |||
| "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" | |||
| #endif | |||
| "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" | |||
| "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" | |||
| "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y | |||
| "vmovups %%ymm13, 32(%3,%0,8) \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_4x1 1 | |||
| static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 | |||
| "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 | |||
| // ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 | |||
| "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 | |||
| "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r | |||
| "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i | |||
| "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r | |||
| "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i | |||
| "vmovups (%3,%0,8), %%ymm10 \n\t" | |||
| "vmovups 32(%3,%0,8), %%ymm11 \n\t" | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" | |||
| "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" | |||
| "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" | |||
| "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" | |||
| #else | |||
| "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" | |||
| "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" | |||
| "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" | |||
| "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" | |||
| "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" | |||
| "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" | |||
| #endif | |||
| "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" | |||
| "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" | |||
| "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y | |||
| "vmovups %%ymm13, 32(%3,%0,8) \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap) // 4 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_ADDY 1 | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) | |||
| { | |||
| BLASLONG i; | |||
| if ( inc_dest != 2 ) | |||
| { | |||
| FLOAT temp_r; | |||
| FLOAT temp_i; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| #if !defined(XCONJ) | |||
| temp_r = alpha_r * src[0] - alpha_i * src[1]; | |||
| temp_i = alpha_r * src[1] + alpha_i * src[0]; | |||
| #else | |||
| temp_r = alpha_r * src[0] + alpha_i * src[1]; | |||
| temp_i = -alpha_r * src[1] + alpha_i * src[0]; | |||
| #endif | |||
| *dest += temp_r; | |||
| *(dest+1) += temp_i; | |||
| src+=2; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| i=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastsd (%4), %%ymm0 \n\t" // alpha_r | |||
| "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i | |||
| // ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src | |||
| "vmovups 32(%2,%0,8), %%ymm9 \n\t" | |||
| "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r | |||
| "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i | |||
| "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r | |||
| "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i | |||
| "vmovups (%3,%0,8), %%ymm10 \n\t" // 2 complex values from dest | |||
| "vmovups 32(%3,%0,8), %%ymm11 \n\t" | |||
| #if !defined(XCONJ) | |||
| "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" | |||
| "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" | |||
| "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" | |||
| "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" | |||
| #else | |||
| "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" | |||
| "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" | |||
| "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" | |||
| "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" | |||
| "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" | |||
| "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" | |||
| #endif | |||
| "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" | |||
| "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" | |||
| "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y | |||
| "vmovups %%ymm13, 32(%3,%0,8) \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (src), // 2 | |||
| "r" (dest), // 3 | |||
| "r" (&alpha_r), // 4 | |||
| "r" (&alpha_i) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| return; | |||
| } | |||
| @@ -51,8 +51,7 @@ float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag, | |||
| } | |||
| #endif | |||
| /* Allocate memory for working array(s) */ | |||
| if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || | |||
| LAPACKE_lsame( norm, 'O' ) ) { | |||
| if( LAPACKE_lsame( norm, 'i' ) ) { | |||
| work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,MAX(m,n)) ); | |||
| if( work == NULL ) { | |||
| info = LAPACK_WORK_MEMORY_ERROR; | |||
| @@ -63,8 +62,7 @@ float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag, | |||
| res = LAPACKE_clantr_work( matrix_layout, norm, uplo, diag, m, n, a, lda, | |||
| work ); | |||
| /* Release memory and exit */ | |||
| if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || | |||
| LAPACKE_lsame( norm, 'O' ) ) { | |||
| if( LAPACKE_lsame( norm, 'i' ) ) { | |||
| LAPACKE_free( work ); | |||
| } | |||
| exit_level_0: | |||
| @@ -51,8 +51,7 @@ double LAPACKE_dlantr( int matrix_layout, char norm, char uplo, char diag, | |||
| } | |||
| #endif | |||
| /* Allocate memory for working array(s) */ | |||
| if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || | |||
| LAPACKE_lsame( norm, 'O' ) ) { | |||
| if( LAPACKE_lsame( norm, 'i' ) ) { | |||
| work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,MAX(m,n)) ); | |||
| if( work == NULL ) { | |||
| info = LAPACK_WORK_MEMORY_ERROR; | |||
| @@ -63,8 +62,7 @@ double LAPACKE_dlantr( int matrix_layout, char norm, char uplo, char diag, | |||
| res = LAPACKE_dlantr_work( matrix_layout, norm, uplo, diag, m, n, a, lda, | |||
| work ); | |||
| /* Release memory and exit */ | |||
| if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || | |||
| LAPACKE_lsame( norm, 'O' ) ) { | |||
| if( LAPACKE_lsame( norm, 'i' ) ) { | |||
| LAPACKE_free( work ); | |||
| } | |||
| exit_level_0: | |||
| @@ -38,10 +38,10 @@ double LAPACKE_dlantr_work( int matrix_layout, char norm, char uplo, | |||
| const double* a, lapack_int lda, double* work ) | |||
| { | |||
| lapack_int info = 0; | |||
| double res = 0.; | |||
| double res = 0.; | |||
| if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
| /* Call LAPACK function and adjust info */ | |||
| LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); | |||
| res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); | |||
| if( info < 0 ) { | |||
| info = info - 1; | |||
| } | |||
| @@ -74,11 +74,10 @@ lapack_int LAPACKE_dormbr_work( int matrix_layout, char vect, char side, | |||
| } | |||
| /* Allocate memory for temporary array(s) */ | |||
| if( LAPACKE_lsame( vect, 'q' ) ) { | |||
| a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * k ); | |||
| a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,k) ); | |||
| } else { | |||
| a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * nq ); | |||
| a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,nq) ); | |||
| } | |||
| if( a_t == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_0; | |||
| @@ -89,11 +88,7 @@ lapack_int LAPACKE_dormbr_work( int matrix_layout, char vect, char side, | |||
| goto exit_level_1; | |||
| } | |||
| /* Transpose input matrices */ | |||
| if( LAPACKE_lsame( vect, 'q' ) ) { | |||
| LAPACKE_dge_trans( matrix_layout, nq, k, a, lda, a_t, lda_t ); | |||
| } else { | |||
| LAPACKE_dge_trans( matrix_layout, k, nq, a, lda, a_t, lda_t ); | |||
| } | |||
| LAPACKE_dge_trans( matrix_layout, r, MIN(nq,k), a, lda, a_t, lda_t ); | |||
| LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); | |||
| /* Call LAPACK function and adjust info */ | |||
| LAPACK_dormbr( &vect, &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, | |||
| @@ -87,12 +87,7 @@ lapack_int LAPACKE_dormlq_work( int matrix_layout, char side, char trans, | |||
| goto exit_level_1; | |||
| } | |||
| /* Transpose input matrices */ | |||
| if( LAPACKE_lsame( side, 'l' ) ){ | |||
| LAPACKE_dge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); | |||
| } else { | |||
| LAPACKE_dge_trans( matrix_layout, k, n, a, lda, a_t, lda_t ); | |||
| } | |||
| LAPACKE_dge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); | |||
| LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); | |||
| /* Call LAPACK function and adjust info */ | |||
| LAPACK_dormlq( &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, &ldc_t, | |||
| @@ -51,8 +51,7 @@ float LAPACKE_slantr( int matrix_layout, char norm, char uplo, char diag, | |||
| } | |||
| #endif | |||
| /* Allocate memory for working array(s) */ | |||
| if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || | |||
| LAPACKE_lsame( norm, 'O' ) ) { | |||
| if( LAPACKE_lsame( norm, 'i' ) ) { | |||
| work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,MAX(m,n)) ); | |||
| if( work == NULL ) { | |||
| info = LAPACK_WORK_MEMORY_ERROR; | |||
| @@ -63,8 +62,7 @@ float LAPACKE_slantr( int matrix_layout, char norm, char uplo, char diag, | |||
| res = LAPACKE_slantr_work( matrix_layout, norm, uplo, diag, m, n, a, lda, | |||
| work ); | |||
| /* Release memory and exit */ | |||
| if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || | |||
| LAPACKE_lsame( norm, 'O' ) ) { | |||
| if( LAPACKE_lsame( norm, 'i' ) ) { | |||
| LAPACKE_free( work ); | |||
| } | |||
| exit_level_0: | |||
| @@ -41,7 +41,7 @@ float LAPACKE_slantr_work( int matrix_layout, char norm, char uplo, | |||
| float res = 0.; | |||
| if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
| /* Call LAPACK function and adjust info */ | |||
| LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); | |||
| res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); | |||
| if( info < 0 ) { | |||
| info = info - 1; | |||
| } | |||
| @@ -73,8 +73,11 @@ lapack_int LAPACKE_sormbr_work( int matrix_layout, char vect, char side, | |||
| return (info < 0) ? (info - 1) : info; | |||
| } | |||
| /* Allocate memory for temporary array(s) */ | |||
| a_t = (float*) | |||
| LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,MIN(nq,k)) ); | |||
| if( LAPACKE_lsame( vect, 'q' ) ) { | |||
| a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,k) ); | |||
| } else { | |||
| a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,nq) ); | |||
| } | |||
| if( a_t == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_0; | |||
| @@ -72,7 +72,11 @@ lapack_int LAPACKE_sormlq_work( int matrix_layout, char side, char trans, | |||
| return (info < 0) ? (info - 1) : info; | |||
| } | |||
| /* Allocate memory for temporary array(s) */ | |||
| a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); | |||
| if( LAPACKE_lsame( side, 'l' ) ) { | |||
| a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); | |||
| } else { | |||
| a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); | |||
| } | |||
| if( a_t == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_0; | |||
| @@ -51,8 +51,7 @@ double LAPACKE_zlantr( int matrix_layout, char norm, char uplo, char diag, | |||
| } | |||
| #endif | |||
| /* Allocate memory for working array(s) */ | |||
| if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || | |||
| LAPACKE_lsame( norm, 'O' ) ) { | |||
| if( LAPACKE_lsame( norm, 'i' ) ) { | |||
| work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,MAX(m,n)) ); | |||
| if( work == NULL ) { | |||
| info = LAPACK_WORK_MEMORY_ERROR; | |||
| @@ -63,8 +62,7 @@ double LAPACKE_zlantr( int matrix_layout, char norm, char uplo, char diag, | |||
| res = LAPACKE_zlantr_work( matrix_layout, norm, uplo, diag, m, n, a, lda, | |||
| work ); | |||
| /* Release memory and exit */ | |||
| if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || | |||
| LAPACKE_lsame( norm, 'O' ) ) { | |||
| if( LAPACKE_lsame( norm, 'i' ) ) { | |||
| LAPACKE_free( work ); | |||
| } | |||
| exit_level_0: | |||
| @@ -39,7 +39,7 @@ double LAPACKE_zlantr_work( int matrix_layout, char norm, char uplo, | |||
| double* work ) | |||
| { | |||
| lapack_int info = 0; | |||
| double res = 0.; | |||
| double res = 0.; | |||
| if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
| /* Call LAPACK function and adjust info */ | |||
| res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); | |||
| @@ -405,9 +405,9 @@ | |||
| $ WORK( IWRK ), LWORK-IWRK+1, INFO ) | |||
| END IF | |||
| * | |||
| * If INFO > 0 from CHSEQR, then quit | |||
| * If INFO .NE. 0 from CHSEQR, then quit | |||
| * | |||
| IF( INFO.GT.0 ) | |||
| IF( INFO.NE.0 ) | |||
| $ GO TO 50 | |||
| * | |||
| IF( WANTVL .OR. WANTVR ) THEN | |||
| @@ -145,15 +145,33 @@ | |||
| INTRINSIC ABS, CMPLX, MAX | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| INFO = 0 | |||
| * | |||
| * Quick return if possible | |||
| * | |||
| IF( N.EQ.0 ) | |||
| $ RETURN | |||
| * | |||
| * Set constants to control overflow | |||
| * | |||
| INFO = 0 | |||
| EPS = SLAMCH( 'P' ) | |||
| SMLNUM = SLAMCH( 'S' ) / EPS | |||
| BIGNUM = ONE / SMLNUM | |||
| CALL SLABAD( SMLNUM, BIGNUM ) | |||
| * | |||
| * Handle the case N=1 by itself | |||
| * | |||
| IF( N.EQ.1 ) THEN | |||
| IPIV( 1 ) = 1 | |||
| JPIV( 1 ) = 1 | |||
| IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN | |||
| INFO = 1 | |||
| A( 1, 1 ) = CMPLX( SMLNUM, ZERO ) | |||
| END IF | |||
| RETURN | |||
| END IF | |||
| * | |||
| * Factorize A using complete pivoting. | |||
| * Set pivots less than SMIN to SMIN | |||
| * | |||
| @@ -339,16 +339,16 @@ | |||
| $ LDVL, VR, LDVR, WORK, -1, IERR ) | |||
| LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) | |||
| CALL CHGEQZ( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, | |||
| $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, | |||
| $ -1, WORK, IERR ) | |||
| $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1, | |||
| $ RWORK, IERR ) | |||
| LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) | |||
| ELSE | |||
| CALL CGGHD3( 'N', 'N', N, 1, N, A, LDA, B, LDB, VL, LDVL, | |||
| $ VR, LDVR, WORK, -1, IERR ) | |||
| LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) | |||
| CALL CHGEQZ( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, | |||
| $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, | |||
| $ -1, WORK, IERR ) | |||
| $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1, | |||
| $ RWORK, IERR ) | |||
| LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) | |||
| END IF | |||
| WORK( 1 ) = CMPLX( LWKOPT ) | |||
| @@ -418,9 +418,9 @@ | |||
| $ WORK( IWRK ), LWORK-IWRK+1, INFO ) | |||
| END IF | |||
| * | |||
| * If INFO > 0 from DHSEQR, then quit | |||
| * If INFO .NE. 0 from DHSEQR, then quit | |||
| * | |||
| IF( INFO.GT.0 ) | |||
| IF( INFO.NE.0 ) | |||
| $ GO TO 50 | |||
| * | |||
| IF( WANTVL .OR. WANTVR ) THEN | |||
| @@ -145,15 +145,33 @@ | |||
| INTRINSIC ABS, MAX | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| INFO = 0 | |||
| * | |||
| * Quick return if possible | |||
| * | |||
| IF( N.EQ.0 ) | |||
| $ RETURN | |||
| * | |||
| * Set constants to control overflow | |||
| * | |||
| INFO = 0 | |||
| EPS = DLAMCH( 'P' ) | |||
| SMLNUM = DLAMCH( 'S' ) / EPS | |||
| BIGNUM = ONE / SMLNUM | |||
| CALL DLABAD( SMLNUM, BIGNUM ) | |||
| * | |||
| * Handle the case N=1 by itself | |||
| * | |||
| IF( N.EQ.1 ) THEN | |||
| IPIV( 1 ) = 1 | |||
| JPIV( 1 ) = 1 | |||
| IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN | |||
| INFO = 1 | |||
| A( 1, 1 ) = SMLNUM | |||
| END IF | |||
| RETURN | |||
| END IF | |||
| * | |||
| * Factorize A using complete pivoting. | |||
| * Set pivots less than SMIN to SMIN. | |||
| * | |||
| @@ -418,9 +418,9 @@ | |||
| $ WORK( IWRK ), LWORK-IWRK+1, INFO ) | |||
| END IF | |||
| * | |||
| * If INFO > 0 from SHSEQR, then quit | |||
| * If INFO .NE. 0 from SHSEQR, then quit | |||
| * | |||
| IF( INFO.GT.0 ) | |||
| IF( INFO.NE.0 ) | |||
| $ GO TO 50 | |||
| * | |||
| IF( WANTVL .OR. WANTVR ) THEN | |||
| @@ -145,15 +145,33 @@ | |||
| INTRINSIC ABS, MAX | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| INFO = 0 | |||
| * | |||
| * Quick return if possible | |||
| * | |||
| IF( N.EQ.0 ) | |||
| $ RETURN | |||
| * | |||
| * Set constants to control overflow | |||
| * | |||
| INFO = 0 | |||
| EPS = SLAMCH( 'P' ) | |||
| SMLNUM = SLAMCH( 'S' ) / EPS | |||
| BIGNUM = ONE / SMLNUM | |||
| CALL SLABAD( SMLNUM, BIGNUM ) | |||
| * | |||
| * Handle the case N=1 by itself | |||
| * | |||
| IF( N.EQ.1 ) THEN | |||
| IPIV( 1 ) = 1 | |||
| JPIV( 1 ) = 1 | |||
| IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN | |||
| INFO = 1 | |||
| A( 1, 1 ) = SMLNUM | |||
| END IF | |||
| RETURN | |||
| END IF | |||
| * | |||
| * Factorize A using complete pivoting. | |||
| * Set pivots less than SMIN to SMIN. | |||
| * | |||
| @@ -404,9 +404,9 @@ | |||
| $ WORK( IWRK ), LWORK-IWRK+1, INFO ) | |||
| END IF | |||
| * | |||
| * If INFO > 0 from ZHSEQR, then quit | |||
| * If INFO .NE. 0 from ZHSEQR, then quit | |||
| * | |||
| IF( INFO.GT.0 ) | |||
| IF( INFO.NE.0 ) | |||
| $ GO TO 50 | |||
| * | |||
| IF( WANTVL .OR. WANTVR ) THEN | |||
| @@ -145,15 +145,33 @@ | |||
| INTRINSIC ABS, DCMPLX, MAX | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| INFO = 0 | |||
| * | |||
| * Quick return if possible | |||
| * | |||
| IF( N.EQ.0 ) | |||
| $ RETURN | |||
| * | |||
| * Set constants to control overflow | |||
| * | |||
| INFO = 0 | |||
| EPS = DLAMCH( 'P' ) | |||
| SMLNUM = DLAMCH( 'S' ) / EPS | |||
| BIGNUM = ONE / SMLNUM | |||
| CALL DLABAD( SMLNUM, BIGNUM ) | |||
| * | |||
| * Handle the case N=1 by itself | |||
| * | |||
| IF( N.EQ.1 ) THEN | |||
| IPIV( 1 ) = 1 | |||
| JPIV( 1 ) = 1 | |||
| IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN | |||
| INFO = 1 | |||
| A( 1, 1 ) = DCMPLX( SMLNUM, ZERO ) | |||
| END IF | |||
| RETURN | |||
| END IF | |||
| * | |||
| * Factorize A using complete pivoting. | |||
| * Set pivots less than SMIN to SMIN | |||
| * | |||
| @@ -340,7 +340,7 @@ | |||
| LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) | |||
| CALL ZHGEQZ( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, | |||
| $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1, | |||
| $ WORK, IERR ) | |||
| $ RWORK, IERR ) | |||
| LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) | |||
| ELSE | |||
| CALL ZGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, | |||
| @@ -348,7 +348,7 @@ | |||
| LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) | |||
| CALL ZHGEQZ( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, | |||
| $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1, | |||
| $ WORK, IERR ) | |||
| $ RWORK, IERR ) | |||
| LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) | |||
| END IF | |||
| WORK( 1 ) = DCMPLX( LWKOPT ) | |||
| @@ -10,7 +10,7 @@ NEP: Data file for testing Nonsymmetric Eigenvalue Problem routines | |||
| 0 5 7 3 200 Values of INIBL (nibble crossover point) | |||
| 1 2 4 2 1 Values of ISHFTS (number of simultaneous shifts) | |||
| 0 1 2 0 1 Values of IACC22 (select structured matrix multiply: 0, 1 or 2) | |||
| 30.0 Threshold value | |||
| 40.0 Threshold value | |||
| T Put T to test the error exits | |||
| 1 Code to interpret the seed | |||
| NEP 21 | |||
| @@ -1959,6 +1959,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) | |||
| #define SNUMOPT 4 | |||
| #define DNUMOPT 8 | |||
| #define GEMM_DEFAULT_OFFSET_A 384 | |||
| #define GEMM_DEFAULT_OFFSET_B 1024 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 16 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 8 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define SGEMM_DEFAULT_P 992 | |||
| #define DGEMM_DEFAULT_P 480 | |||
| #define CGEMM_DEFAULT_P 488 | |||
| #define ZGEMM_DEFAULT_P 240 | |||
| #define SGEMM_DEFAULT_Q 504 | |||
| #define DGEMM_DEFAULT_Q 720 | |||
| #define CGEMM_DEFAULT_Q 400 | |||
| #define ZGEMM_DEFAULT_Q 360 | |||
| #define SGEMM_DEFAULT_R 28800 | |||
| #define DGEMM_DEFAULT_R 14400 | |||
| #define ZGEMM_DEFAULT_R 7200 | |||
| #define SYMV_P 8 | |||
| #endif | |||
| #if defined(SPARC) && defined(V7) | |||
| #define SNUMOPT 4 | |||
| @@ -5,6 +5,13 @@ set(OpenBLAS_utest_src | |||
| test_amax.c | |||
| ) | |||
| if (NOT NO_LAPACK) | |||
| set(OpenBLAS_utest_src | |||
| ${OpenBLAS_utest_src} | |||
| test_potrs.c | |||
| ) | |||
| endif() | |||
| set(OpenBLAS_utest_bin openblas_utest) | |||
| add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src}) | |||
| @@ -11,6 +11,10 @@ include $(TOPDIR)/Makefile.system | |||
| OBJS=utest_main.o test_amax.o | |||
| #test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o test_fork.o | |||
| ifneq ($(NO_LAPACK), 1) | |||
| OBJS += test_potrs.o | |||
| endif | |||
| all : run_test | |||
| $(UTESTBIN): $(OBJS) | |||
| @@ -1,4 +1,4 @@ | |||
| /* Copyright 2011-2015 Bas van den Berg | |||
| /* Copyright 2011-2016 Bas van den Berg | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -58,6 +58,10 @@ struct ctest { | |||
| #define __CTEST_APPLE | |||
| #endif | |||
| #ifdef __MINGW32__ | |||
| #undef CTEST_SEGFAULT | |||
| #endif | |||
| #if defined(_WIN32) && defined(_MSC_VER) | |||
| #define __CTEST_MSVC | |||
| #endif | |||
| @@ -212,6 +216,9 @@ void assert_not_equal(intmax_t exp, intmax_t real, const char* caller, int line) | |||
| void assert_not_equal_u(uintmax_t exp, uintmax_t real, const char* caller, int line); | |||
| #define ASSERT_NOT_EQUAL_U(exp, real) assert_not_equal_u(exp, real, __FILE__, __LINE__) | |||
| void assert_interval(intmax_t exp1, intmax_t exp2, intmax_t real, const char* caller, int line); | |||
| #define ASSERT_INTERVAL(exp1, exp2, real) assert_interval(exp1, exp2, real, __FILE__, __LINE__) | |||
| void assert_null(void* real, const char* caller, int line); | |||
| #define ASSERT_NULL(real) assert_null((void*)real, __FILE__, __LINE__) | |||
| @@ -511,6 +518,12 @@ void assert_not_equal_u(uintmax_t exp, uintmax_t real, const char* caller, int l | |||
| } | |||
| } | |||
| void assert_interval(intmax_t exp1, intmax_t exp2, intmax_t real, const char* caller, int line) { | |||
| if (real < exp1 || real > exp2) { | |||
| CTEST_ERR("%s:%d expected %" PRIdMAX "-%" PRIdMAX ", got %" PRIdMAX, caller, line, exp1, exp2, real); | |||
| } | |||
| } | |||
| void assert_dbl_near(double exp, double real, double tol, const char* caller, int line) { | |||
| double diff = exp - real; | |||
| double absdiff = diff; | |||
| @@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "ctest.h" | |||
| #include <common.h> | |||
| #include <math.h> | |||
| #define SINGLE_EPS 1e-04 | |||
| #define DOUBLE_EPS 1e-13 | |||