| @@ -1,4 +1,22 @@ | |||||
| OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
| ==================================================================== | |||||
| Version 0.1.0 | |||||
| 23-Mar-2012 | |||||
| common: | |||||
| * Set soname of shared library on Linux. | |||||
| * Added LIBNAMESUFFIX flag in Makefile.rule. The user can use | |||||
| this flag to control the library name, e.g. libopenblas.a, | |||||
| libopenblas_ifort.a or libopenblas_omp.a. | |||||
| * Added GEMM_MULTITHREAD_THRESHOLD flag in Makefile.rule. | |||||
| The library uses a single thread in GEMM functions for small matrices. | |||||
| x86/x86_64: | |||||
| * Used GEMV SSE/SSE2 kernels on x86 32-bit. | |||||
| * Exported CBLAS functions in Windows DLL. | |||||
| MIPS64: | |||||
| * Completed Level-3 BLAS optimization on Loongson 3A CPU. | |||||
| * Improved GEMV performance on Loongson 3A CPU. | |||||
| * Improved Level-3 BLAS performance on Loongson 3B CPU. (EXPERIMENTAL) | |||||
| ==================================================================== | ==================================================================== | ||||
| Version 0.1 alpha2.5 | Version 0.1 alpha2.5 | ||||
| 19-Feb-2012 | 19-Feb-2012 | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.1alpha2.5 | |||||
| VERSION = 0.1.0 | |||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
| @@ -279,7 +279,12 @@ endif | |||||
| BINARY_DEFINED = 1 | BINARY_DEFINED = 1 | ||||
| endif | endif | ||||
| ifeq ($(CORE), LOONGSON3A) | |||||
| ifeq ($(CORE), LOONGSON3A) | |||||
| CCOMMON_OPT += -march=mips64 | |||||
| FCOMMON_OPT += -march=mips64 | |||||
| endif | |||||
| ifeq ($(CORE), LOONGSON3B) | |||||
| CCOMMON_OPT += -march=mips64 | CCOMMON_OPT += -march=mips64 | ||||
| FCOMMON_OPT += -march=mips64 | FCOMMON_OPT += -march=mips64 | ||||
| endif | endif | ||||
| @@ -534,8 +539,10 @@ ifdef SMP | |||||
| CCOMMON_OPT += -DSMP_SERVER | CCOMMON_OPT += -DSMP_SERVER | ||||
| ifeq ($(ARCH), mips64) | ifeq ($(ARCH), mips64) | ||||
| ifneq ($(CORE), LOONGSON3B) | |||||
| USE_SIMPLE_THREADED_LEVEL3 = 1 | USE_SIMPLE_THREADED_LEVEL3 = 1 | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | # USE_SIMPLE_THREADED_LEVEL3 = 1 | ||||
| @@ -600,9 +607,11 @@ endif | |||||
| ifneq ($(ARCH), x86_64) | ifneq ($(ARCH), x86_64) | ||||
| ifneq ($(ARCH), x86) | ifneq ($(ARCH), x86) | ||||
| ifneq ($(CORE), LOONGSON3B) | |||||
| NO_AFFINITY = 1 | NO_AFFINITY = 1 | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifdef NO_AFFINITY | ifdef NO_AFFINITY | ||||
| CCOMMON_OPT += -DNO_AFFINITY | CCOMMON_OPT += -DNO_AFFINITY | ||||
| @@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve | |||||
| 9.Known Issues | 9.Known Issues | ||||
| * The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit | * The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit | ||||
| is 64. On 32 bits, it is 32. | is 64. On 32 bits, it is 32. | ||||
| * On Loongson 3A, "make test" may fail with a pthread_create error (error code EAGAIN). However, the same test case passes when it is run directly from the shell, so this does not appear to be a bug in OpenBLAS. | |||||
| 10. Specification of Git Branches | 10. Specification of Git Branches | ||||
| We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | ||||
| @@ -68,9 +68,17 @@ extern long int syscall (long int __sysno, ...); | |||||
| static inline int my_mbind(void *addr, unsigned long len, int mode, | static inline int my_mbind(void *addr, unsigned long len, int mode, | ||||
| unsigned long *nodemask, unsigned long maxnode, | unsigned long *nodemask, unsigned long maxnode, | ||||
| unsigned flags) { | unsigned flags) { | ||||
| #if defined (LOONGSON3B) | |||||
| #if defined (__64BIT__) | |||||
| return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | |||||
| #else | |||||
| return 0; //NULL Implementation on Loongson 3B 32bit. | |||||
| #endif | |||||
| #else | |||||
| //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 | //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 | ||||
| unsigned long null_nodemask=0; | |||||
| return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags); | |||||
| // unsigned long null_nodemask=0; | |||||
| return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | |||||
| #endif | |||||
| } | } | ||||
| static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { | static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { | ||||
| @@ -2127,7 +2127,9 @@ | |||||
| #endif | #endif | ||||
| #ifndef ASSEMBLER | #ifndef ASSEMBLER | ||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) | |||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) | |||||
| extern BLASLONG gemm_offset_a; | |||||
| extern BLASLONG gemm_offset_b; | |||||
| extern BLASLONG sgemm_p; | extern BLASLONG sgemm_p; | ||||
| extern BLASLONG sgemm_q; | extern BLASLONG sgemm_q; | ||||
| extern BLASLONG sgemm_r; | extern BLASLONG sgemm_r; | ||||
| @@ -101,10 +101,15 @@ static void INLINE blas_lock(volatile unsigned long *address){ | |||||
| static inline unsigned int rpcc(void){ | static inline unsigned int rpcc(void){ | ||||
| unsigned long ret; | unsigned long ret; | ||||
| #if defined(LOONGSON3A) | |||||
| unsigned long long tmp; | |||||
| __asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); | |||||
| ret=tmp; | |||||
| #if defined(LOONGSON3A) || defined(LOONGSON3B) | |||||
| // unsigned long long tmp; | |||||
| //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); | |||||
| //ret=tmp; | |||||
| __asm__ __volatile__(".set push \n" | |||||
| ".set mips32r2\n" | |||||
| "rdhwr %0, $2\n" | |||||
| ".set pop": "=r"(ret):: "memory"); | |||||
| #else | #else | ||||
| __asm__ __volatile__(".set push \n" | __asm__ __volatile__(".set push \n" | ||||
| ".set mips32r2\n" | ".set mips32r2\n" | ||||
| @@ -114,6 +119,21 @@ static inline unsigned int rpcc(void){ | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| #if defined(LOONGSON3A) || defined(LOONGSON3B) | |||||
| #ifndef NO_AFFINITY | |||||
| #define WHEREAMI /* marker macro; presumably checked elsewhere to learn that WhereAmI() is available -- confirm */ | |||||
| static inline int WhereAmI(void){ /* Returns the value of hardware register $0 as read by the RDHWR instruction; presumably the number of the CPU the caller runs on -- TODO confirm against the MIPS32r2 manual. */ | |||||
| int ret=0; | |||||
| __asm__ __volatile__(".set push \n" /* save the assembler's current option state */ | |||||
| ".set mips32r2\n" /* allow the mips32r2 instruction set for the next instruction */ | |||||
| "rdhwr %0, $0\n" /* read hardware register 0 into ret */ | |||||
| ".set pop": "=r"(ret):: "memory"); /* restore assembler options; the "memory" clobber keeps the compiler from reordering memory accesses across the read */ | |||||
| return ret; | |||||
| } | |||||
| #endif | |||||
| #endif | |||||
| static inline int blas_quickdivide(blasint x, blasint y){ | static inline int blas_quickdivide(blasint x, blasint y){ | ||||
| return x / y; | return x / y; | ||||
| } | } | ||||
| @@ -152,6 +172,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| #define CMPEQ c.eq.d | #define CMPEQ c.eq.d | ||||
| #define CMPLE c.le.d | #define CMPLE c.le.d | ||||
| #define CMPLT c.lt.d | #define CMPLT c.lt.d | ||||
| #define NEG neg.d | |||||
| #else | #else | ||||
| #define LD lwc1 | #define LD lwc1 | ||||
| #define ST swc1 | #define ST swc1 | ||||
| @@ -170,6 +191,14 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| #define CMPEQ c.eq.s | #define CMPEQ c.eq.s | ||||
| #define CMPLE c.le.s | #define CMPLE c.le.s | ||||
| #define CMPLT c.lt.s | #define CMPLT c.lt.s | ||||
| #define PLU plu.ps | |||||
| #define PLL pll.ps | |||||
| #define PUU puu.ps | |||||
| #define PUL pul.ps | |||||
| #define MADPS madd.ps | |||||
| #define CVTU cvt.s.pu | |||||
| #define CVTL cvt.s.pl | |||||
| #define NEG neg.s | |||||
| #endif | #endif | ||||
| #if defined(__64BIT__) && defined(USE64BITINT) | #if defined(__64BIT__) && defined(USE64BITINT) | ||||
| @@ -218,13 +247,18 @@ REALNAME: ;\ | |||||
| #define SEEK_ADDRESS | #define SEEK_ADDRESS | ||||
| #define BUFFER_SIZE ( 8 << 20) | |||||
| #define BUFFER_SIZE ( 32 << 20) | |||||
| #if defined(LOONGSON3A) | #if defined(LOONGSON3A) | ||||
| #define PAGESIZE (16UL << 10) | #define PAGESIZE (16UL << 10) | ||||
| #define FIXED_PAGESIZE (16UL << 10) | #define FIXED_PAGESIZE (16UL << 10) | ||||
| #endif | #endif | ||||
| #if defined(LOONGSON3B) | |||||
| #define PAGESIZE (32UL << 10) | |||||
| #define FIXED_PAGESIZE (32UL << 10) | |||||
| #endif | |||||
| #ifndef PAGESIZE | #ifndef PAGESIZE | ||||
| #define PAGESIZE (64UL << 10) | #define PAGESIZE (64UL << 10) | ||||
| #endif | #endif | ||||
| @@ -236,7 +270,7 @@ REALNAME: ;\ | |||||
| #define MAP_ANONYMOUS MAP_ANON | #define MAP_ANONYMOUS MAP_ANON | ||||
| #endif | #endif | ||||
| #if defined(LOONGSON3A) | |||||
| #if defined(LOONGSON3A) || defined(LOONGSON3B) | |||||
| #define PREFETCHD_(x) ld $0, x | #define PREFETCHD_(x) ld $0, x | ||||
| #define PREFETCHD(x) PREFETCHD_(x) | #define PREFETCHD(x) PREFETCHD_(x) | ||||
| #else | #else | ||||
| @@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CPU_UNKNOWN 0 | #define CPU_UNKNOWN 0 | ||||
| #define CPU_SICORTEX 1 | #define CPU_SICORTEX 1 | ||||
| #define CPU_LOONGSON3A 2 | #define CPU_LOONGSON3A 2 | ||||
| #define CPU_LOONGSON3B 3 | |||||
| static char *cpuname[] = { | static char *cpuname[] = { | ||||
| "UNKOWN", | "UNKOWN", | ||||
| "SICORTEX", | "SICORTEX", | ||||
| "LOONGSON3A" | |||||
| "LOONGSON3A", | |||||
| "LOONGSON3B" | |||||
| }; | }; | ||||
| int detect(void){ | int detect(void){ | ||||
| @@ -101,6 +103,8 @@ int detect(void){ | |||||
| if (strstr(p, "Loongson-3A")){ | if (strstr(p, "Loongson-3A")){ | ||||
| return CPU_LOONGSON3A; | return CPU_LOONGSON3A; | ||||
| }else if(strstr(p, "Loongson-3B")){ | |||||
| return CPU_LOONGSON3B; | |||||
| }else if (strstr(p, "Loongson-3")){ | }else if (strstr(p, "Loongson-3")){ | ||||
| infile = fopen("/proc/cpuinfo", "r"); | infile = fopen("/proc/cpuinfo", "r"); | ||||
| while (fgets(buffer, sizeof(buffer), infile)){ | while (fgets(buffer, sizeof(buffer), infile)){ | ||||
| @@ -130,6 +134,8 @@ void get_architecture(void){ | |||||
| void get_subarchitecture(void){ | void get_subarchitecture(void){ | ||||
| if(detect()==CPU_LOONGSON3A) { | if(detect()==CPU_LOONGSON3A) { | ||||
| printf("LOONGSON3A"); | printf("LOONGSON3A"); | ||||
| }else if(detect()==CPU_LOONGSON3B){ | |||||
| printf("LOONGSON3B"); | |||||
| }else{ | }else{ | ||||
| printf("SICORTEX"); | printf("SICORTEX"); | ||||
| } | } | ||||
| @@ -149,6 +155,15 @@ void get_cpuconfig(void){ | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | printf("#define DTB_DEFAULT_ENTRIES 64\n"); | ||||
| printf("#define DTB_SIZE 4096\n"); | printf("#define DTB_SIZE 4096\n"); | ||||
| printf("#define L2_ASSOCIATIVE 4\n"); | printf("#define L2_ASSOCIATIVE 4\n"); | ||||
| }else if(detect()==CPU_LOONGSON3B){ | |||||
| printf("#define LOONGSON3B\n"); | |||||
| printf("#define L1_DATA_SIZE 65536\n"); | |||||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||||
| printf("#define L2_SIZE 512488\n"); | |||||
| printf("#define L2_LINESIZE 32\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||||
| }else{ | }else{ | ||||
| printf("#define SICORTEX\n"); | printf("#define SICORTEX\n"); | ||||
| printf("#define L1_DATA_SIZE 32768\n"); | printf("#define L1_DATA_SIZE 32768\n"); | ||||
| @@ -164,6 +179,8 @@ void get_cpuconfig(void){ | |||||
| void get_libname(void){ | void get_libname(void){ | ||||
| if(detect()==CPU_LOONGSON3A) { | if(detect()==CPU_LOONGSON3A) { | ||||
| printf("loongson3a\n"); | printf("loongson3a\n"); | ||||
| }else if(detect()==CPU_LOONGSON3B) { | |||||
| printf("loongson3b\n"); | |||||
| }else{ | }else{ | ||||
| #ifdef __mips64 | #ifdef __mips64 | ||||
| printf("mips64\n"); | printf("mips64\n"); | ||||
| @@ -77,8 +77,8 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||||
| range_M[0] = 0; | range_M[0] = 0; | ||||
| i = arg -> m; | i = arg -> m; | ||||
| } else { | } else { | ||||
| range_M[0] = range_M[0]; | |||||
| i = range_M[1] - range_M[0]; | |||||
| range_M[0] = range_m[0]; | |||||
| i = range_m[1] - range_m[0]; | |||||
| } | } | ||||
| num_cpu_m = 0; | num_cpu_m = 0; | ||||
| @@ -71,16 +71,25 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||||
| queue[num_cpu].args = arg; | queue[num_cpu].args = arg; | ||||
| queue[num_cpu].range_m = range_m; | queue[num_cpu].range_m = range_m; | ||||
| queue[num_cpu].range_n = &range[num_cpu]; | queue[num_cpu].range_n = &range[num_cpu]; | ||||
| queue[num_cpu].sa = NULL; | |||||
| #if defined(LOONGSON3A) | |||||
| queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; | |||||
| queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; | |||||
| #else | |||||
| queue[num_cpu].sa = NULL; | |||||
| queue[num_cpu].sb = NULL; | queue[num_cpu].sb = NULL; | ||||
| #endif | |||||
| queue[num_cpu].next = &queue[num_cpu + 1]; | queue[num_cpu].next = &queue[num_cpu + 1]; | ||||
| num_cpu ++; | num_cpu ++; | ||||
| } | } | ||||
| if (num_cpu) { | if (num_cpu) { | ||||
| #if defined(LOONGSON3A) | |||||
| queue[0].sa = sa; | queue[0].sa = sa; | ||||
| queue[0].sb = sb; | |||||
| queue[0].sb = sa + GEMM_OFFSET_A1 * 5; | |||||
| #else | |||||
| queue[0].sa = sa; | |||||
| queue[0].sb = sb; | |||||
| #endif | |||||
| queue[num_cpu - 1].next = NULL; | queue[num_cpu - 1].next = NULL; | ||||
| exec_blas(num_cpu, | exec_blas(num_cpu, | ||||
| @@ -55,8 +55,8 @@ int CNAME(int mode, | |||||
| range_M[0] = 0; | range_M[0] = 0; | ||||
| i = arg -> m; | i = arg -> m; | ||||
| } else { | } else { | ||||
| range_M[0] = range_M[0]; | |||||
| i = range_M[1] - range_M[0]; | |||||
| range_M[0] = range_m[0]; | |||||
| i = range_m[1] - range_m[0]; | |||||
| } | } | ||||
| num_cpu_m = 0; | num_cpu_m = 0; | ||||
| @@ -500,6 +500,7 @@ static int blas_monitor(void *arg){ | |||||
| /* Initializing routine */ | /* Initializing routine */ | ||||
| int blas_thread_init(void){ | int blas_thread_init(void){ | ||||
| BLASLONG i; | BLASLONG i; | ||||
| int ret; | |||||
| #ifdef NEED_STACKATTR | #ifdef NEED_STACKATTR | ||||
| pthread_attr_t attr; | pthread_attr_t attr; | ||||
| #endif | #endif | ||||
| @@ -545,12 +546,16 @@ int blas_thread_init(void){ | |||||
| pthread_cond_init (&thread_status[i].wakeup, NULL); | pthread_cond_init (&thread_status[i].wakeup, NULL); | ||||
| #ifdef NEED_STACKATTR | #ifdef NEED_STACKATTR | ||||
| pthread_create(&blas_threads[i], &attr, | |||||
| ret=pthread_create(&blas_threads[i], &attr, | |||||
| (void *)&blas_thread_server, (void *)i); | (void *)&blas_thread_server, (void *)i); | ||||
| #else | #else | ||||
| pthread_create(&blas_threads[i], NULL, | |||||
| ret=pthread_create(&blas_threads[i], NULL, | |||||
| (void *)&blas_thread_server, (void *)i); | (void *)&blas_thread_server, (void *)i); | ||||
| #endif | #endif | ||||
| if(ret!=0){ | |||||
| fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret); | |||||
| exit(1); | |||||
| } | |||||
| } | } | ||||
| #ifdef MONITOR | #ifdef MONITOR | ||||
| @@ -797,6 +802,11 @@ void goto_set_num_threads(int num_threads) { | |||||
| blas_cpu_number = num_threads; | blas_cpu_number = num_threads; | ||||
| #if defined(ARCH_MIPS64) | |||||
| //set parameters for different number of threads. | |||||
| blas_set_parameter(); | |||||
| #endif | |||||
| } | } | ||||
| void openblas_set_num_threads(int num_threads) { | void openblas_set_num_threads(int num_threads) { | ||||
| @@ -63,6 +63,11 @@ void goto_set_num_threads(int num_threads) { | |||||
| omp_set_num_threads(blas_cpu_number); | omp_set_num_threads(blas_cpu_number); | ||||
| #if defined(ARCH_MIPS64) | |||||
| //set parameters for different number of threads. | |||||
| blas_set_parameter(); | |||||
| #endif | |||||
| } | } | ||||
| void openblas_set_num_threads(int num_threads) { | void openblas_set_num_threads(int num_threads) { | ||||
| @@ -390,11 +390,11 @@ static void *alloc_mmap(void *address){ | |||||
| #ifdef OS_LINUX | #ifdef OS_LINUX | ||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| int ret; | |||||
| int ret=0; | |||||
| ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); | ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); | ||||
| if(ret==-1){ | if(ret==-1){ | ||||
| int errsv=errno; | int errsv=errno; | ||||
| perror("alloc_mmap:"); | |||||
| perror("OpenBLAS alloc_mmap:"); | |||||
| printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); | printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); | ||||
| } | } | ||||
| @@ -884,7 +884,7 @@ void *blas_memory_alloc(int procpos){ | |||||
| if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); | if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); | ||||
| #endif | #endif | ||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) | |||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) | |||||
| #ifndef DYNAMIC_ARCH | #ifndef DYNAMIC_ARCH | ||||
| blas_set_parameter(); | blas_set_parameter(); | ||||
| #endif | #endif | ||||
| @@ -45,8 +45,22 @@ int get_L2_size(void); | |||||
| #define DEFAULT_GEMM_P 128 | #define DEFAULT_GEMM_P 128 | ||||
| #define DEFAULT_GEMM_Q 128 | #define DEFAULT_GEMM_Q 128 | ||||
| #define DEFAULT_GEMM_R 128 | #define DEFAULT_GEMM_R 128 | ||||
| #define DEFAULT_GEMM_OFFSET_A 0 | |||||
| #define DEFAULT_GEMM_OFFSET_B 0 | |||||
| /* Global Parameter */ | /* Global Parameter */ | ||||
| #if GEMM_OFFSET_A == gemm_offset_a | |||||
| BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A; | |||||
| #else | |||||
| BLASLONG gemm_offset_a = GEMM_OFFSET_A; | |||||
| #endif | |||||
| #if GEMM_OFFSET_B == gemm_offset_b | |||||
| BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; | |||||
| #else | |||||
| BLASLONG gemm_offset_b = GEMM_OFFSET_B; | |||||
| #endif | |||||
| #if SGEMM_P == sgemm_p | #if SGEMM_P == sgemm_p | ||||
| BLASLONG sgemm_p = DEFAULT_GEMM_P; | BLASLONG sgemm_p = DEFAULT_GEMM_P; | ||||
| #else | #else | ||||
| @@ -666,3 +680,36 @@ void blas_set_parameter(void){ | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #if defined(ARCH_MIPS64) | |||||
| /* Selects the dgemm_r blocking value for Loongson 3A/3B from the configured thread count. */ | |||||
| /* In non-SMP builds the low-thread-count value is always used; on other cores nothing is changed. */ | |||||
| void blas_set_parameter(void){ | |||||
| #if defined(LOONGSON3A) | |||||
| #ifdef SMP | |||||
|   /* Loongson 3A: 1024 for exactly one thread, 200 for any other thread count. */ | |||||
|   dgemm_r = (blas_num_threads == 1) ? 1024 : 200; | |||||
| #else | |||||
|   dgemm_r = 1024; | |||||
| #endif | |||||
| #endif | |||||
| | |||||
| #if defined(LOONGSON3B) | |||||
| #ifdef SMP | |||||
|   /* Loongson 3B: 640 for one or two threads, 160 for any other thread count. */ | |||||
|   dgemm_r = (blas_num_threads == 1 || blas_num_threads == 2) ? 640 : 160; | |||||
| #else | |||||
|   dgemm_r = 640; | |||||
| #endif | |||||
| #endif | |||||
| } | |||||
| #endif | |||||
| @@ -117,6 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /* #define FORCE_CELL */ | /* #define FORCE_CELL */ | ||||
| /* #define FORCE_SICORTEX */ | /* #define FORCE_SICORTEX */ | ||||
| /* #define FORCE_LOONGSON3A */ | /* #define FORCE_LOONGSON3A */ | ||||
| /* #define FORCE_LOONGSON3B */ | |||||
| /* #define FORCE_ITANIUM2 */ | /* #define FORCE_ITANIUM2 */ | ||||
| /* #define FORCE_GENERIC */ | /* #define FORCE_GENERIC */ | ||||
| /* #define FORCE_SPARC */ | /* #define FORCE_SPARC */ | ||||
| @@ -548,6 +549,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| #endif | #endif | ||||
| #ifdef FORCE_LOONGSON3B /* Settings selected when FORCE_LOONGSON3B is defined. */ | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "MIPS" | |||||
| #define SUBARCHITECTURE "LOONGSON3B" | |||||
| #define SUBDIRNAME "mips64" /* NOTE(review): ARCHCONFIG sets L2_SIZE=512488, which is not 512 KiB (524288) -- looks like transposed digits; confirm before changing. */ | |||||
| #define ARCHCONFIG "-DLOONGSON3B " \ | |||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||||
| #define LIBNAME "loongson3b" | |||||
| #define CORENAME "LOONGSON3B" | |||||
| #else /* empty branch: nothing is defined when FORCE_LOONGSON3B is absent */ | |||||
| #endif | |||||
| #ifdef FORCE_ITANIUM2 | #ifdef FORCE_ITANIUM2 | ||||
| #define FORCE | #define FORCE | ||||
| #define ARCHITECTURE "IA64" | #define ARCHITECTURE "IA64" | ||||
| @@ -136,6 +136,7 @@ void NAME(char *SIDE, char *UPLO, | |||||
| FLOAT *sa, *sb; | FLOAT *sa, *sb; | ||||
| #ifdef SMP | #ifdef SMP | ||||
| #ifndef COMPLEX | |||||
| #ifdef XDOUBLE | #ifdef XDOUBLE | ||||
| int mode = BLAS_XDOUBLE | BLAS_REAL; | int mode = BLAS_XDOUBLE | BLAS_REAL; | ||||
| #elif defined(DOUBLE) | #elif defined(DOUBLE) | ||||
| @@ -143,6 +144,15 @@ void NAME(char *SIDE, char *UPLO, | |||||
| #else | #else | ||||
| int mode = BLAS_SINGLE | BLAS_REAL; | int mode = BLAS_SINGLE | BLAS_REAL; | ||||
| #endif | #endif | ||||
| #else | |||||
| #ifdef XDOUBLE | |||||
| int mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||||
| #elif defined(DOUBLE) | |||||
| int mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||||
| #else | |||||
| int mode = BLAS_SINGLE | BLAS_COMPLEX; | |||||
| #endif | |||||
| #endif | |||||
| #endif | #endif | ||||
| #if defined(SMP) && !defined(NO_AFFINITY) | #if defined(SMP) && !defined(NO_AFFINITY) | ||||
| @@ -237,6 +247,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, | |||||
| FLOAT *sa, *sb; | FLOAT *sa, *sb; | ||||
| #ifdef SMP | #ifdef SMP | ||||
| #ifndef COMPLEX | |||||
| #ifdef XDOUBLE | #ifdef XDOUBLE | ||||
| int mode = BLAS_XDOUBLE | BLAS_REAL; | int mode = BLAS_XDOUBLE | BLAS_REAL; | ||||
| #elif defined(DOUBLE) | #elif defined(DOUBLE) | ||||
| @@ -244,6 +255,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, | |||||
| #else | #else | ||||
| int mode = BLAS_SINGLE | BLAS_REAL; | int mode = BLAS_SINGLE | BLAS_REAL; | ||||
| #endif | #endif | ||||
| #else | |||||
| #ifdef XDOUBLE | |||||
| int mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||||
| #elif defined(DOUBLE) | |||||
| int mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||||
| #else | |||||
| int mode = BLAS_SINGLE | BLAS_COMPLEX; | |||||
| #endif | |||||
| #endif | |||||
| #endif | #endif | ||||
| #if defined(SMP) && !defined(NO_AFFINITY) | #if defined(SMP) && !defined(NO_AFFINITY) | ||||
| @@ -498,6 +498,91 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD | |||||
| $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) | $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) | ||||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ | $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ | ||||
| ifeq ($(TARGET), LOONGSON3B) | |||||
| $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | |||||
| $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | |||||
| $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | |||||
| $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | |||||
| $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | |||||
| $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | |||||
| $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | |||||
| $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | |||||
| $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | |||||
| $(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | |||||
| $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | |||||
| $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | |||||
| else | |||||
| $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | ||||
| @@ -581,6 +666,7 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | |||||
| $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | ||||
| endif | |||||
| $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) | $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | ||||
| @@ -0,0 +1,157 @@ | |||||
| #include "common.h" | |||||
| /* Generic C GEMM micro-kernel with 2x2 blocking: C += alpha * (A panel) * (B panel). */ | |||||
| /* bm, bn, bk : rows of C, columns of C, and shared depth handled by this call. */ | |||||
| /* alpha      : scale applied to every accumulated dot product before it is added to C. */ | |||||
| /* ba         : A panel; read as 2 values per depth step for each row pair, 1 per step for an odd last row. */ | |||||
| /* bb         : B panel; read as 2 values per depth step for each column pair, 1 per step for an odd last column. */ | |||||
| /* C, ldc     : output; consecutive columns are ldc elements apart, rows within a column are adjacent. */ | |||||
| /* offset     : present only under TRMMKERNEL; this body never reads it. */ | |||||
| /* Always returns 0. */ | |||||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc | |||||
| #ifdef TRMMKERNEL | |||||
| ,BLASLONG offset | |||||
| #endif | |||||
| ) | |||||
| { | |||||
|   BLASLONG col, row, depth; | |||||
|   FLOAT *c_lo, *c_hi, *a_ptr, *b_ptr; | |||||
|   FLOAT acc00, acc10, acc01, acc11; | |||||
| | |||||
|   /* Full pairs of columns of C. */ | |||||
|   for (col = bn / 2; col > 0; col--) { | |||||
|     c_lo = C; | |||||
|     c_hi = C + ldc; | |||||
|     a_ptr = ba; | |||||
| | |||||
|     /* Full pairs of rows: a 2x2 tile of C per iteration. */ | |||||
|     for (row = bm / 2; row > 0; row--) { | |||||
|       b_ptr = bb; | |||||
|       acc00 = 0; | |||||
|       acc10 = 0; | |||||
|       acc01 = 0; | |||||
|       acc11 = 0; | |||||
| | |||||
|       /* Depth loop unrolled by four. */ | |||||
|       for (depth = bk / 4; depth > 0; depth--) { | |||||
|         acc00 += a_ptr[0] * b_ptr[0]; | |||||
|         acc10 += a_ptr[1] * b_ptr[0]; | |||||
|         acc01 += a_ptr[0] * b_ptr[1]; | |||||
|         acc11 += a_ptr[1] * b_ptr[1]; | |||||
| | |||||
|         acc00 += a_ptr[2] * b_ptr[2]; | |||||
|         acc10 += a_ptr[3] * b_ptr[2]; | |||||
|         acc01 += a_ptr[2] * b_ptr[3]; | |||||
|         acc11 += a_ptr[3] * b_ptr[3]; | |||||
| | |||||
|         acc00 += a_ptr[4] * b_ptr[4]; | |||||
|         acc10 += a_ptr[5] * b_ptr[4]; | |||||
|         acc01 += a_ptr[4] * b_ptr[5]; | |||||
|         acc11 += a_ptr[5] * b_ptr[5]; | |||||
| | |||||
|         acc00 += a_ptr[6] * b_ptr[6]; | |||||
|         acc10 += a_ptr[7] * b_ptr[6]; | |||||
|         acc01 += a_ptr[6] * b_ptr[7]; | |||||
|         acc11 += a_ptr[7] * b_ptr[7]; | |||||
| | |||||
|         a_ptr += 8; | |||||
|         b_ptr += 8; | |||||
|       } | |||||
| | |||||
|       /* Remaining depth steps (bk mod 4). */ | |||||
|       for (depth = bk & 3; depth > 0; depth--) { | |||||
|         acc00 += a_ptr[0] * b_ptr[0]; | |||||
|         acc10 += a_ptr[1] * b_ptr[0]; | |||||
|         acc01 += a_ptr[0] * b_ptr[1]; | |||||
|         acc11 += a_ptr[1] * b_ptr[1]; | |||||
|         a_ptr += 2; | |||||
|         b_ptr += 2; | |||||
|       } | |||||
| | |||||
|       acc00 *= alpha; | |||||
|       c_lo[0] += acc00; | |||||
|       acc10 *= alpha; | |||||
|       c_lo[1] += acc10; | |||||
|       acc01 *= alpha; | |||||
|       c_hi[0] += acc01; | |||||
|       acc11 *= alpha; | |||||
|       c_hi[1] += acc11; | |||||
| | |||||
|       c_lo += 2; | |||||
|       c_hi += 2; | |||||
|     } | |||||
| | |||||
|     /* Odd last row: a 1x2 strip of C. */ | |||||
|     if (bm & 1) { | |||||
|       b_ptr = bb; | |||||
|       acc00 = 0; | |||||
|       acc01 = 0; | |||||
|       for (depth = bk; depth > 0; depth--) { | |||||
|         acc00 += a_ptr[0] * b_ptr[0]; | |||||
|         acc01 += a_ptr[0] * b_ptr[1]; | |||||
|         a_ptr += 1; | |||||
|         b_ptr += 2; | |||||
|       } | |||||
|       acc00 *= alpha; | |||||
|       c_lo[0] += acc00; | |||||
|       acc01 *= alpha; | |||||
|       c_hi[0] += acc01; | |||||
|     } | |||||
| | |||||
|     /* Step to the next column pair of B and C. */ | |||||
|     bb += 2 * bk; | |||||
|     C += 2 * ldc; | |||||
|   } | |||||
| | |||||
|   /* Odd last column of C. */ | |||||
|   if (bn & 1) { | |||||
|     c_lo = C; | |||||
|     a_ptr = ba; | |||||
| | |||||
|     /* Full pairs of rows: a 2x1 strip of C per iteration. */ | |||||
|     for (row = bm / 2; row > 0; row--) { | |||||
|       b_ptr = bb; | |||||
|       acc00 = 0; | |||||
|       acc10 = 0; | |||||
|       for (depth = bk; depth > 0; depth--) { | |||||
|         acc00 += a_ptr[0] * b_ptr[0]; | |||||
|         acc10 += a_ptr[1] * b_ptr[0]; | |||||
|         a_ptr += 2; | |||||
|         b_ptr += 1; | |||||
|       } | |||||
|       acc00 *= alpha; | |||||
|       c_lo[0] += acc00; | |||||
|       acc10 *= alpha; | |||||
|       c_lo[1] += acc10; | |||||
|       c_lo += 2; | |||||
|     } | |||||
| | |||||
|     /* Final single element when both bm and bn are odd. */ | |||||
|     if (bm & 1) { | |||||
|       b_ptr = bb; | |||||
|       acc00 = 0; | |||||
|       for (depth = bk; depth > 0; depth--) { | |||||
|         acc00 += a_ptr[0] * b_ptr[0]; | |||||
|         a_ptr += 1; | |||||
|         b_ptr += 1; | |||||
|       } | |||||
|       acc00 *= alpha; | |||||
|       c_lo[0] += acc00; | |||||
|     } | |||||
|   } | |||||
| | |||||
|   return 0; | |||||
| } | |||||
| @@ -0,0 +1,280 @@ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc | |||||
| #ifdef TRMMKERNEL | |||||
| ,BLASLONG offset | |||||
| #endif | |||||
| ) | |||||
| { | |||||
| BLASLONG i,j,k; | |||||
| FLOAT *C0,*C1,*ptrba,*ptrbb; | |||||
| FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7; | |||||
| BLASLONG off, temp; | |||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||||
| off = -offset; | |||||
| #endif | |||||
| for (j=0; j<bn/2; j+=1) | |||||
| { | |||||
| C0 = C; | |||||
| C1 = C0+ldc; | |||||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||||
| off = offset; | |||||
| #endif | |||||
| ptrba = ba; | |||||
| for (i=0; i<bm/2; i+=1) | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || \ | |||||
| (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*2; | |||||
| ptrbb = bb + off*2; | |||||
| #endif | |||||
| res0 = 0; | |||||
| res1 = 0; | |||||
| res2 = 0; | |||||
| res3 = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || \ | |||||
| (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #elif defined(LEFT) | |||||
| temp = off+2; | |||||
| #else | |||||
| temp = off+2; | |||||
| #endif | |||||
| for (k=0; k<temp/4; k+=1) | |||||
| { | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res2 = res2+load0*load3; | |||||
| res3 = res3+load2*load3; | |||||
| load4 = ptrba[2*1+0]; | |||||
| load5 = ptrbb[2*1+0]; | |||||
| res0 = res0+load4*load5; | |||||
| load6 = ptrba[2*1+1]; | |||||
| res1 = res1+load6*load5; | |||||
| load7 = ptrbb[2*1+1]; | |||||
| res2 = res2+load4*load7; | |||||
| res3 = res3+load6*load7; | |||||
| load0 = ptrba[2*2+0]; | |||||
| load1 = ptrbb[2*2+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*2+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[2*2+1]; | |||||
| res2 = res2+load0*load3; | |||||
| res3 = res3+load2*load3; | |||||
| load4 = ptrba[2*3+0]; | |||||
| load5 = ptrbb[2*3+0]; | |||||
| res0 = res0+load4*load5; | |||||
| load6 = ptrba[2*3+1]; | |||||
| res1 = res1+load6*load5; | |||||
| load7 = ptrbb[2*3+1]; | |||||
| res2 = res2+load4*load7; | |||||
| res3 = res3+load6*load7; | |||||
| ptrba = ptrba+8; | |||||
| ptrbb = ptrbb+8; | |||||
| } | |||||
| for (k=0; k<(temp&3); k+=1) | |||||
| { | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res2 = res2+load0*load3; | |||||
| res3 = res3+load2*load3; | |||||
| ptrba = ptrba+2; | |||||
| ptrbb = ptrbb+2; | |||||
| } | |||||
| res0 = res0*alpha; | |||||
| C0[0] = res0; | |||||
| res1 = res1*alpha; | |||||
| C0[1] = res1; | |||||
| res2 = res2*alpha; | |||||
| C1[0] = res2; | |||||
| res3 = res3*alpha; | |||||
| C1[1] = res3; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||||
| (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #ifdef LEFT | |||||
| temp -= 2; | |||||
| #else | |||||
| temp -= 2; | |||||
| #endif | |||||
| ptrba += temp*2; | |||||
| ptrbb += temp*2; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 2; | |||||
| #endif | |||||
| C0 = C0+2; | |||||
| C1 = C1+2; | |||||
| } | |||||
| for (i=0; i<(bm&1); i+=1) | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off; | |||||
| ptrbb = bb+off*2; | |||||
| #endif | |||||
| res0 = 0; | |||||
| res1 = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #elif defined(LEFT) | |||||
| temp = off+1; | |||||
| #else | |||||
| temp = off+2; | |||||
| #endif | |||||
| for (k=0; k<temp; k+=1) | |||||
| { | |||||
| load0 = ptrba[0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrbb[2*0+1]; | |||||
| res1 = res1+load0*load2; | |||||
| ptrba = ptrba+1; | |||||
| ptrbb = ptrbb+2; | |||||
| } | |||||
| res0 = res0*alpha; | |||||
| C0[0] = res0; | |||||
| res1 = res1*alpha; | |||||
| C1[0] = res1; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #ifdef LEFT | |||||
| temp -= 1; | |||||
| #else | |||||
| temp -= 2; | |||||
| #endif | |||||
| ptrba += temp; | |||||
| ptrbb += temp*2; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 1; | |||||
| #endif | |||||
| C0 = C0+1; | |||||
| C1 = C1+1; | |||||
| } | |||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||||
| off += 2; | |||||
| #endif | |||||
| k = (bk<<1); | |||||
| bb = bb+k; | |||||
| i = (ldc<<1); | |||||
| C = C+i; | |||||
| } | |||||
| for (j=0; j<(bn&1); j+=1) | |||||
| { | |||||
| C0 = C; | |||||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||||
| off = offset; | |||||
| #endif | |||||
| ptrba = ba; | |||||
| for (i=0; i<bm/2; i+=1) | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || \ | |||||
| (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*2; | |||||
| ptrbb = bb + off; | |||||
| #endif | |||||
| res0 = 0; | |||||
| res1 = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || \ | |||||
| (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #elif defined(LEFT) | |||||
| temp = off+2; | |||||
| #else | |||||
| temp = off+1; | |||||
| #endif | |||||
| for (k=0; k<temp; k+=1) | |||||
| { | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| ptrba = ptrba+2; | |||||
| ptrbb = ptrbb+1; | |||||
| } | |||||
| res0 = res0*alpha; | |||||
| C0[0] = res0; | |||||
| res1 = res1*alpha; | |||||
| C0[1] = res1; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||||
| (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #ifdef LEFT | |||||
| temp -= 2; | |||||
| #else | |||||
| temp -= 1; | |||||
| #endif | |||||
| ptrba += temp*2; | |||||
| ptrbb += temp; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 2; | |||||
| #endif | |||||
| C0 = C0+2; | |||||
| } | |||||
| for (i=0; i<(bm&1); i+=1) | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off; | |||||
| ptrbb = bb+off; | |||||
| #endif | |||||
| res0 = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #elif defined(LEFT) | |||||
| temp = off + 1; | |||||
| #else | |||||
| temp = off + 1; | |||||
| #endif | |||||
| for (k=0; k<temp; k+=1) | |||||
| { | |||||
| load0 = ptrba[0+0]; | |||||
| load1 = ptrbb[0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| ptrba = ptrba+1; | |||||
| ptrbb = ptrbb+1; | |||||
| } | |||||
| res0 = res0*alpha; | |||||
| C0[0] = res0; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #ifdef LEFT | |||||
| temp -= 1; | |||||
| #else | |||||
| temp -= 1; | |||||
| #endif | |||||
| ptrba += temp; | |||||
| ptrbb += temp; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 1; | |||||
| #endif | |||||
| C0 = C0+1; | |||||
| } | |||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||||
| off += 1; | |||||
| #endif | |||||
| k = (bk<<0); | |||||
| bb = bb+k; | |||||
| C = C+ldc; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,838 @@ | |||||
| #include "common.h" | |||||
| /******************************** | |||||
| ADD1 a*c | |||||
| ADD2 b*c | |||||
| ADD3 a*d | |||||
| ADD4 b*d | |||||
| *********************************/ | |||||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc | |||||
| #ifdef TRMMKERNEL | |||||
| , BLASLONG offset | |||||
| #endif | |||||
| ) | |||||
| { | |||||
| BLASLONG i,j,k; | |||||
| FLOAT *C0,*C1,*ptrba,*ptrbb; | |||||
| FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15; | |||||
| for (j=0; j<bn/2; j+=1) | |||||
| { | |||||
| C0 = C; | |||||
| C1 = C0+2*ldc; | |||||
| ptrba = ba; | |||||
| for (i=0; i<bm/2; i+=1) | |||||
| { | |||||
| ptrbb = bb; | |||||
| res0 = 0; | |||||
| res1 = 0; | |||||
| res2 = 0; | |||||
| res3 = 0; | |||||
| res4 = 0; | |||||
| res5 = 0; | |||||
| res6 = 0; | |||||
| res7 = 0; | |||||
| for (k=0; k<bk/4; k+=1) | |||||
| { | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5+load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4-load2*load7; | |||||
| res5 = res5+load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7+load5*load6; | |||||
| res6 = res6-load5*load7; | |||||
| res7 = res7+load4*load7; | |||||
| load8 = ptrba[4*1+0]; | |||||
| load9 = ptrbb[4*1+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*1+1]; | |||||
| res1 = res1+load10*load9; | |||||
| load11 = ptrbb[4*1+1]; | |||||
| res0 = res0-load10*load11; | |||||
| res1 = res1+load8*load11; | |||||
| load12 = ptrba[4*1+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*1+3]; | |||||
| res3 = res3+load13*load9; | |||||
| res2 = res2-load13*load11; | |||||
| res3 = res3+load12*load11; | |||||
| load14 = ptrbb[4*1+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5+load10*load14; | |||||
| load15 = ptrbb[4*1+3]; | |||||
| res4 = res4-load10*load15; | |||||
| res5 = res5+load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7+load13*load14; | |||||
| res6 = res6-load13*load15; | |||||
| res7 = res7+load12*load15; | |||||
| load0 = ptrba[4*2+0]; | |||||
| load1 = ptrbb[4*2+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*2+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*2+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*2+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*2+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| load6 = ptrbb[4*2+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5+load2*load6; | |||||
| load7 = ptrbb[4*2+3]; | |||||
| res4 = res4-load2*load7; | |||||
| res5 = res5+load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7+load5*load6; | |||||
| res6 = res6-load5*load7; | |||||
| res7 = res7+load4*load7; | |||||
| load8 = ptrba[4*3+0]; | |||||
| load9 = ptrbb[4*3+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*3+1]; | |||||
| res1 = res1+load10*load9; | |||||
| load11 = ptrbb[4*3+1]; | |||||
| res0 = res0-load10*load11; | |||||
| res1 = res1+load8*load11; | |||||
| load12 = ptrba[4*3+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*3+3]; | |||||
| res3 = res3+load13*load9; | |||||
| res2 = res2-load13*load11; | |||||
| res3 = res3+load12*load11; | |||||
| load14 = ptrbb[4*3+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5+load10*load14; | |||||
| load15 = ptrbb[4*3+3]; | |||||
| res4 = res4-load10*load15; | |||||
| res5 = res5+load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7+load13*load14; | |||||
| res6 = res6-load13*load15; | |||||
| res7 = res7+load12*load15; | |||||
| #endif | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5+load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4+load2*load7; | |||||
| res5 = res5-load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7+load5*load6; | |||||
| res6 = res6+load5*load7; | |||||
| res7 = res7-load4*load7; | |||||
| load8 = ptrba[4*1+0]; | |||||
| load9 = ptrbb[4*1+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*1+1]; | |||||
| res1 = res1+load10*load9; | |||||
| load11 = ptrbb[4*1+1]; | |||||
| res0 = res0+load10*load11; | |||||
| res1 = res1-load8*load11; | |||||
| load12 = ptrba[4*1+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*1+3]; | |||||
| res3 = res3+load13*load9; | |||||
| res2 = res2+load13*load11; | |||||
| res3 = res3-load12*load11; | |||||
| load14 = ptrbb[4*1+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5+load10*load14; | |||||
| load15 = ptrbb[4*1+3]; | |||||
| res4 = res4+load10*load15; | |||||
| res5 = res5-load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7+load13*load14; | |||||
| res6 = res6+load13*load15; | |||||
| res7 = res7-load12*load15; | |||||
| load0 = ptrba[4*2+0]; | |||||
| load1 = ptrbb[4*2+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*2+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*2+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*2+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*2+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| load6 = ptrbb[4*2+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5+load2*load6; | |||||
| load7 = ptrbb[4*2+3]; | |||||
| res4 = res4+load2*load7; | |||||
| res5 = res5-load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7+load5*load6; | |||||
| res6 = res6+load5*load7; | |||||
| res7 = res7-load4*load7; | |||||
| load8 = ptrba[4*3+0]; | |||||
| load9 = ptrbb[4*3+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*3+1]; | |||||
| res1 = res1+load10*load9; | |||||
| load11 = ptrbb[4*3+1]; | |||||
| res0 = res0+load10*load11; | |||||
| res1 = res1-load8*load11; | |||||
| load12 = ptrba[4*3+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*3+3]; | |||||
| res3 = res3+load13*load9; | |||||
| res2 = res2+load13*load11; | |||||
| res3 = res3-load12*load11; | |||||
| load14 = ptrbb[4*3+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5+load10*load14; | |||||
| load15 = ptrbb[4*3+3]; | |||||
| res4 = res4+load10*load15; | |||||
| res5 = res5-load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7+load13*load14; | |||||
| res6 = res6+load13*load15; | |||||
| res7 = res7-load12*load15; | |||||
| #endif | |||||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5-load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4+load2*load7; | |||||
| res5 = res5+load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7-load5*load6; | |||||
| res6 = res6+load5*load7; | |||||
| res7 = res7+load4*load7; | |||||
| load8 = ptrba[4*1+0]; | |||||
| load9 = ptrbb[4*1+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*1+1]; | |||||
| res1 = res1-load10*load9; | |||||
| load11 = ptrbb[4*1+1]; | |||||
| res0 = res0+load10*load11; | |||||
| res1 = res1+load8*load11; | |||||
| load12 = ptrba[4*1+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*1+3]; | |||||
| res3 = res3-load13*load9; | |||||
| res2 = res2+load13*load11; | |||||
| res3 = res3+load12*load11; | |||||
| load14 = ptrbb[4*1+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5-load10*load14; | |||||
| load15 = ptrbb[4*1+3]; | |||||
| res4 = res4+load10*load15; | |||||
| res5 = res5+load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7-load13*load14; | |||||
| res6 = res6+load13*load15; | |||||
| res7 = res7+load12*load15; | |||||
| load0 = ptrba[4*2+0]; | |||||
| load1 = ptrbb[4*2+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*2+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*2+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*2+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*2+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| load6 = ptrbb[4*2+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5-load2*load6; | |||||
| load7 = ptrbb[4*2+3]; | |||||
| res4 = res4+load2*load7; | |||||
| res5 = res5+load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7-load5*load6; | |||||
| res6 = res6+load5*load7; | |||||
| res7 = res7+load4*load7; | |||||
| load8 = ptrba[4*3+0]; | |||||
| load9 = ptrbb[4*3+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*3+1]; | |||||
| res1 = res1-load10*load9; | |||||
| load11 = ptrbb[4*3+1]; | |||||
| res0 = res0+load10*load11; | |||||
| res1 = res1+load8*load11; | |||||
| load12 = ptrba[4*3+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*3+3]; | |||||
| res3 = res3-load13*load9; | |||||
| res2 = res2+load13*load11; | |||||
| res3 = res3+load12*load11; | |||||
| load14 = ptrbb[4*3+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5-load10*load14; | |||||
| load15 = ptrbb[4*3+3]; | |||||
| res4 = res4+load10*load15; | |||||
| res5 = res5+load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7-load13*load14; | |||||
| res6 = res6+load13*load15; | |||||
| res7 = res7+load12*load15; | |||||
| #endif | |||||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5-load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4-load2*load7; | |||||
| res5 = res5-load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7-load5*load6; | |||||
| res6 = res6-load5*load7; | |||||
| res7 = res7-load4*load7; | |||||
| load8 = ptrba[4*1+0]; | |||||
| load9 = ptrbb[4*1+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*1+1]; | |||||
| res1 = res1-load10*load9; | |||||
| load11 = ptrbb[4*1+1]; | |||||
| res0 = res0-load10*load11; | |||||
| res1 = res1-load8*load11; | |||||
| load12 = ptrba[4*1+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*1+3]; | |||||
| res3 = res3-load13*load9; | |||||
| res2 = res2-load13*load11; | |||||
| res3 = res3-load12*load11; | |||||
| load14 = ptrbb[4*1+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5-load10*load14; | |||||
| load15 = ptrbb[4*1+3]; | |||||
| res4 = res4-load10*load15; | |||||
| res5 = res5-load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7-load13*load14; | |||||
| res6 = res6-load13*load15; | |||||
| res7 = res7-load12*load15; | |||||
| load0 = ptrba[4*2+0]; | |||||
| load1 = ptrbb[4*2+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*2+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*2+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*2+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*2+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| load6 = ptrbb[4*2+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5-load2*load6; | |||||
| load7 = ptrbb[4*2+3]; | |||||
| res4 = res4-load2*load7; | |||||
| res5 = res5-load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7-load5*load6; | |||||
| res6 = res6-load5*load7; | |||||
| res7 = res7-load4*load7; | |||||
| load8 = ptrba[4*3+0]; | |||||
| load9 = ptrbb[4*3+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*3+1]; | |||||
| res1 = res1-load10*load9; | |||||
| load11 = ptrbb[4*3+1]; | |||||
| res0 = res0-load10*load11; | |||||
| res1 = res1-load8*load11; | |||||
| load12 = ptrba[4*3+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*3+3]; | |||||
| res3 = res3-load13*load9; | |||||
| res2 = res2-load13*load11; | |||||
| res3 = res3-load12*load11; | |||||
| load14 = ptrbb[4*3+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5-load10*load14; | |||||
| load15 = ptrbb[4*3+3]; | |||||
| res4 = res4-load10*load15; | |||||
| res5 = res5-load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7-load13*load14; | |||||
| res6 = res6-load13*load15; | |||||
| res7 = res7-load12*load15; | |||||
| #endif | |||||
| ptrba = ptrba+16; | |||||
| ptrbb = ptrbb+16; | |||||
| } | |||||
| for (k=0; k<(bk&3); k+=1) | |||||
| { | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5+load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4-load2*load7; | |||||
| res5 = res5+load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7+load5*load6; | |||||
| res6 = res6-load5*load7; | |||||
| res7 = res7+load4*load7; | |||||
| #endif | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5+load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4+load2*load7; | |||||
| res5 = res5-load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7+load5*load6; | |||||
| res6 = res6+load5*load7; | |||||
| res7 = res7-load4*load7; | |||||
| #endif | |||||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5-load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4+load2*load7; | |||||
| res5 = res5+load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7-load5*load6; | |||||
| res6 = res6+load5*load7; | |||||
| res7 = res7+load4*load7; | |||||
| #endif | |||||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5-load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4-load2*load7; | |||||
| res5 = res5-load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7-load5*load6; | |||||
| res6 = res6-load5*load7; | |||||
| res7 = res7-load4*load7; | |||||
| #endif | |||||
| ptrba = ptrba+4; | |||||
| ptrbb = ptrbb+4; | |||||
| } | |||||
| load0 = res0*alphar; | |||||
| C0[0] = C0[0]+load0; | |||||
| load1 = res1*alphar; | |||||
| C0[1] = C0[1]+load1; | |||||
| load0 = res1*alphai; | |||||
| C0[0] = C0[0]-load0; | |||||
| load1 = res0*alphai; | |||||
| C0[1] = C0[1]+load1; | |||||
| load2 = res2*alphar; | |||||
| C0[2] = C0[2]+load2; | |||||
| load3 = res3*alphar; | |||||
| C0[3] = C0[3]+load3; | |||||
| load2 = res3*alphai; | |||||
| C0[2] = C0[2]-load2; | |||||
| load3 = res2*alphai; | |||||
| C0[3] = C0[3]+load3; | |||||
| load4 = res4*alphar; | |||||
| C1[0] = C1[0]+load4; | |||||
| load5 = res5*alphar; | |||||
| C1[1] = C1[1]+load5; | |||||
| load4 = res5*alphai; | |||||
| C1[0] = C1[0]-load4; | |||||
| load5 = res4*alphai; | |||||
| C1[1] = C1[1]+load5; | |||||
| load6 = res6*alphar; | |||||
| C1[2] = C1[2]+load6; | |||||
| load7 = res7*alphar; | |||||
| C1[3] = C1[3]+load7; | |||||
| load6 = res7*alphai; | |||||
| C1[2] = C1[2]-load6; | |||||
| load7 = res6*alphai; | |||||
| C1[3] = C1[3]+load7; | |||||
| C0 = C0+4; | |||||
| C1 = C1+4; | |||||
| } | |||||
| for (i=0; i<(bm&1); i+=1) | |||||
| { | |||||
| ptrbb = bb; | |||||
| res0 = 0; | |||||
| res1 = 0; | |||||
| res2 = 0; | |||||
| res3 = 0; | |||||
| for (k=0; k<bk; k+=1) | |||||
| { | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrbb[4*0+2]; | |||||
| res2 = res2+load0*load4; | |||||
| res3 = res3+load2*load4; | |||||
| load5 = ptrbb[4*0+3]; | |||||
| res2 = res2-load2*load5; | |||||
| res3 = res3+load0*load5; | |||||
| #endif | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrbb[4*0+2]; | |||||
| res2 = res2+load0*load4; | |||||
| res3 = res3+load2*load4; | |||||
| load5 = ptrbb[4*0+3]; | |||||
| res2 = res2+load2*load5; | |||||
| res3 = res3-load0*load5; | |||||
| #endif | |||||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrbb[4*0+2]; | |||||
| res2 = res2+load0*load4; | |||||
| res3 = res3-load2*load4; | |||||
| load5 = ptrbb[4*0+3]; | |||||
| res2 = res2+load2*load5; | |||||
| res3 = res3+load0*load5; | |||||
| #endif | |||||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrbb[4*0+2]; | |||||
| res2 = res2+load0*load4; | |||||
| res3 = res3-load2*load4; | |||||
| load5 = ptrbb[4*0+3]; | |||||
| res2 = res2-load2*load5; | |||||
| res3 = res3-load0*load5; | |||||
| #endif | |||||
| ptrba = ptrba+2; | |||||
| ptrbb = ptrbb+4; | |||||
| } | |||||
| load0 = res0*alphar; | |||||
| C0[0] = C0[0]+load0; | |||||
| load1 = res1*alphar; | |||||
| C0[1] = C0[1]+load1; | |||||
| load0 = res1*alphai; | |||||
| C0[0] = C0[0]-load0; | |||||
| load1 = res0*alphai; | |||||
| C0[1] = C0[1]+load1; | |||||
| load2 = res2*alphar; | |||||
| C1[0] = C1[0]+load2; | |||||
| load3 = res3*alphar; | |||||
| C1[1] = C1[1]+load3; | |||||
| load2 = res3*alphai; | |||||
| C1[0] = C1[0]-load2; | |||||
| load3 = res2*alphai; | |||||
| C1[1] = C1[1]+load3; | |||||
| C0 = C0+2; | |||||
| C1 = C1+2; | |||||
| } | |||||
| k = (bk<<2); | |||||
| bb = bb+k; | |||||
| i = (ldc<<2); | |||||
| C = C+i; | |||||
| } | |||||
| for (j=0; j<(bn&1); j+=1) | |||||
| { | |||||
| C0 = C; | |||||
| ptrba = ba; | |||||
| for (i=0; i<bm/2; i+=1) | |||||
| { | |||||
| ptrbb = bb; | |||||
| res0 = 0; | |||||
| res1 = 0; | |||||
| res2 = 0; | |||||
| res3 = 0; | |||||
| for (k=0; k<bk; k+=1) | |||||
| { | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| #endif | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| #endif | |||||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| #endif | |||||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| #endif | |||||
| ptrba = ptrba+4; | |||||
| ptrbb = ptrbb+2; | |||||
| } | |||||
| load0 = res0*alphar; | |||||
| C0[0] = C0[0]+load0; | |||||
| load1 = res1*alphar; | |||||
| C0[1] = C0[1]+load1; | |||||
| load0 = res1*alphai; | |||||
| C0[0] = C0[0]-load0; | |||||
| load1 = res0*alphai; | |||||
| C0[1] = C0[1]+load1; | |||||
| load2 = res2*alphar; | |||||
| C0[2] = C0[2]+load2; | |||||
| load3 = res3*alphar; | |||||
| C0[3] = C0[3]+load3; | |||||
| load2 = res3*alphai; | |||||
| C0[2] = C0[2]-load2; | |||||
| load3 = res2*alphai; | |||||
| C0[3] = C0[3]+load3; | |||||
| C0 = C0+4; | |||||
| } | |||||
| for (i=0; i<(bm&1); i+=1) | |||||
| { | |||||
| ptrbb = bb; | |||||
| res0 = 0; | |||||
| res1 = 0; | |||||
| for (k=0; k<bk; k+=1) | |||||
| { | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| #endif | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| #endif | |||||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| #endif | |||||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| #endif | |||||
| ptrba = ptrba+2; | |||||
| ptrbb = ptrbb+2; | |||||
| } | |||||
| load0 = res0*alphar; | |||||
| C0[0] = C0[0]+load0; | |||||
| load1 = res1*alphar; | |||||
| C0[1] = C0[1]+load1; | |||||
| load0 = res1*alphai; | |||||
| C0[0] = C0[0]-load0; | |||||
| load1 = res0*alphai; | |||||
| C0[1] = C0[1]+load1; | |||||
| C0 = C0+2; | |||||
| } | |||||
| k = (bk<<1); | |||||
| bb = bb+k; | |||||
| i = (ldc<<1); | |||||
| C = C+i; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,923 @@ | |||||
| #include "common.h" | |||||
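| /******************************** | |||||
| Partial products of the complex multiply (a + b*i) * (c + d*i), | |||||
| where a, b are the real/imaginary parts read through ptrba and | |||||
| c, d are the real/imaginary parts read through ptrbb: | |||||
| ADD1 a*c | |||||
| ADD2 b*c | |||||
| ADD3 a*d | |||||
| ADD4 b*d | |||||
| a*c is always added to the real accumulator; b*d also goes to the | |||||
| real accumulator and b*c, a*d go to the imaginary accumulator, with | |||||
| signs that depend on the conjugation variant (the NN/NR/RN/RR | |||||
| preprocessor groups) compiled in below. | |||||
| *********************************/ | |||||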
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb, | |||||
| FLOAT* C,BLASLONG ldc, BLASLONG offset) | |||||
| { | |||||
| BLASLONG i,j,k; | |||||
| FLOAT *C0,*C1,*ptrba,*ptrbb; | |||||
| FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15; | |||||
| BLASLONG off, temp; | |||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||||
| off = -offset; | |||||
| #endif | |||||
| for (j=0; j<bn/2; j+=1) | |||||
| { | |||||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||||
| off = offset; | |||||
| #endif | |||||
| C0 = C; | |||||
| C1 = C0+2*ldc; | |||||
| ptrba = ba; | |||||
| for (i=0; i<bm/2; i+=1) | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*2*2; | |||||
| ptrbb = bb+off*2*2; | |||||
| #endif | |||||
| res0 = 0; | |||||
| res1 = 0; | |||||
| res2 = 0; | |||||
| res3 = 0; | |||||
| res4 = 0; | |||||
| res5 = 0; | |||||
| res6 = 0; | |||||
| res7 = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #elif defined(LEFT) | |||||
| temp = off + 2; | |||||
| #else | |||||
| temp = off + 2; | |||||
| #endif | |||||
| for (k=0; k<temp/4; k+=1) | |||||
| { | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5+load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4-load2*load7; | |||||
| res5 = res5+load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7+load5*load6; | |||||
| res6 = res6-load5*load7; | |||||
| res7 = res7+load4*load7; | |||||
| load8 = ptrba[4*1+0]; | |||||
| load9 = ptrbb[4*1+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*1+1]; | |||||
| res1 = res1+load10*load9; | |||||
| load11 = ptrbb[4*1+1]; | |||||
| res0 = res0-load10*load11; | |||||
| res1 = res1+load8*load11; | |||||
| load12 = ptrba[4*1+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*1+3]; | |||||
| res3 = res3+load13*load9; | |||||
| res2 = res2-load13*load11; | |||||
| res3 = res3+load12*load11; | |||||
| load14 = ptrbb[4*1+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5+load10*load14; | |||||
| load15 = ptrbb[4*1+3]; | |||||
| res4 = res4-load10*load15; | |||||
| res5 = res5+load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7+load13*load14; | |||||
| res6 = res6-load13*load15; | |||||
| res7 = res7+load12*load15; | |||||
| load0 = ptrba[4*2+0]; | |||||
| load1 = ptrbb[4*2+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*2+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*2+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*2+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*2+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| load6 = ptrbb[4*2+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5+load2*load6; | |||||
| load7 = ptrbb[4*2+3]; | |||||
| res4 = res4-load2*load7; | |||||
| res5 = res5+load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7+load5*load6; | |||||
| res6 = res6-load5*load7; | |||||
| res7 = res7+load4*load7; | |||||
| load8 = ptrba[4*3+0]; | |||||
| load9 = ptrbb[4*3+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*3+1]; | |||||
| res1 = res1+load10*load9; | |||||
| load11 = ptrbb[4*3+1]; | |||||
| res0 = res0-load10*load11; | |||||
| res1 = res1+load8*load11; | |||||
| load12 = ptrba[4*3+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*3+3]; | |||||
| res3 = res3+load13*load9; | |||||
| res2 = res2-load13*load11; | |||||
| res3 = res3+load12*load11; | |||||
| load14 = ptrbb[4*3+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5+load10*load14; | |||||
| load15 = ptrbb[4*3+3]; | |||||
| res4 = res4-load10*load15; | |||||
| res5 = res5+load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7+load13*load14; | |||||
| res6 = res6-load13*load15; | |||||
| res7 = res7+load12*load15; | |||||
| #endif | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5+load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4+load2*load7; | |||||
| res5 = res5-load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7+load5*load6; | |||||
| res6 = res6+load5*load7; | |||||
| res7 = res7-load4*load7; | |||||
| load8 = ptrba[4*1+0]; | |||||
| load9 = ptrbb[4*1+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*1+1]; | |||||
| res1 = res1+load10*load9; | |||||
| load11 = ptrbb[4*1+1]; | |||||
| res0 = res0+load10*load11; | |||||
| res1 = res1-load8*load11; | |||||
| load12 = ptrba[4*1+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*1+3]; | |||||
| res3 = res3+load13*load9; | |||||
| res2 = res2+load13*load11; | |||||
| res3 = res3-load12*load11; | |||||
| load14 = ptrbb[4*1+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5+load10*load14; | |||||
| load15 = ptrbb[4*1+3]; | |||||
| res4 = res4+load10*load15; | |||||
| res5 = res5-load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7+load13*load14; | |||||
| res6 = res6+load13*load15; | |||||
| res7 = res7-load12*load15; | |||||
| load0 = ptrba[4*2+0]; | |||||
| load1 = ptrbb[4*2+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*2+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*2+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*2+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*2+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| load6 = ptrbb[4*2+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5+load2*load6; | |||||
| load7 = ptrbb[4*2+3]; | |||||
| res4 = res4+load2*load7; | |||||
| res5 = res5-load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7+load5*load6; | |||||
| res6 = res6+load5*load7; | |||||
| res7 = res7-load4*load7; | |||||
| load8 = ptrba[4*3+0]; | |||||
| load9 = ptrbb[4*3+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*3+1]; | |||||
| res1 = res1+load10*load9; | |||||
| load11 = ptrbb[4*3+1]; | |||||
| res0 = res0+load10*load11; | |||||
| res1 = res1-load8*load11; | |||||
| load12 = ptrba[4*3+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*3+3]; | |||||
| res3 = res3+load13*load9; | |||||
| res2 = res2+load13*load11; | |||||
| res3 = res3-load12*load11; | |||||
| load14 = ptrbb[4*3+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5+load10*load14; | |||||
| load15 = ptrbb[4*3+3]; | |||||
| res4 = res4+load10*load15; | |||||
| res5 = res5-load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7+load13*load14; | |||||
| res6 = res6+load13*load15; | |||||
| res7 = res7-load12*load15; | |||||
| #endif | |||||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5-load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4+load2*load7; | |||||
| res5 = res5+load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7-load5*load6; | |||||
| res6 = res6+load5*load7; | |||||
| res7 = res7+load4*load7; | |||||
| load8 = ptrba[4*1+0]; | |||||
| load9 = ptrbb[4*1+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*1+1]; | |||||
| res1 = res1-load10*load9; | |||||
| load11 = ptrbb[4*1+1]; | |||||
| res0 = res0+load10*load11; | |||||
| res1 = res1+load8*load11; | |||||
| load12 = ptrba[4*1+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*1+3]; | |||||
| res3 = res3-load13*load9; | |||||
| res2 = res2+load13*load11; | |||||
| res3 = res3+load12*load11; | |||||
| load14 = ptrbb[4*1+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5-load10*load14; | |||||
| load15 = ptrbb[4*1+3]; | |||||
| res4 = res4+load10*load15; | |||||
| res5 = res5+load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7-load13*load14; | |||||
| res6 = res6+load13*load15; | |||||
| res7 = res7+load12*load15; | |||||
| load0 = ptrba[4*2+0]; | |||||
| load1 = ptrbb[4*2+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*2+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*2+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*2+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*2+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| load6 = ptrbb[4*2+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5-load2*load6; | |||||
| load7 = ptrbb[4*2+3]; | |||||
| res4 = res4+load2*load7; | |||||
| res5 = res5+load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7-load5*load6; | |||||
| res6 = res6+load5*load7; | |||||
| res7 = res7+load4*load7; | |||||
| load8 = ptrba[4*3+0]; | |||||
| load9 = ptrbb[4*3+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*3+1]; | |||||
| res1 = res1-load10*load9; | |||||
| load11 = ptrbb[4*3+1]; | |||||
| res0 = res0+load10*load11; | |||||
| res1 = res1+load8*load11; | |||||
| load12 = ptrba[4*3+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*3+3]; | |||||
| res3 = res3-load13*load9; | |||||
| res2 = res2+load13*load11; | |||||
| res3 = res3+load12*load11; | |||||
| load14 = ptrbb[4*3+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5-load10*load14; | |||||
| load15 = ptrbb[4*3+3]; | |||||
| res4 = res4+load10*load15; | |||||
| res5 = res5+load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7-load13*load14; | |||||
| res6 = res6+load13*load15; | |||||
| res7 = res7+load12*load15; | |||||
| #endif | |||||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5-load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4-load2*load7; | |||||
| res5 = res5-load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7-load5*load6; | |||||
| res6 = res6-load5*load7; | |||||
| res7 = res7-load4*load7; | |||||
| load8 = ptrba[4*1+0]; | |||||
| load9 = ptrbb[4*1+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*1+1]; | |||||
| res1 = res1-load10*load9; | |||||
| load11 = ptrbb[4*1+1]; | |||||
| res0 = res0-load10*load11; | |||||
| res1 = res1-load8*load11; | |||||
| load12 = ptrba[4*1+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*1+3]; | |||||
| res3 = res3-load13*load9; | |||||
| res2 = res2-load13*load11; | |||||
| res3 = res3-load12*load11; | |||||
| load14 = ptrbb[4*1+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5-load10*load14; | |||||
| load15 = ptrbb[4*1+3]; | |||||
| res4 = res4-load10*load15; | |||||
| res5 = res5-load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7-load13*load14; | |||||
| res6 = res6-load13*load15; | |||||
| res7 = res7-load12*load15; | |||||
| load0 = ptrba[4*2+0]; | |||||
| load1 = ptrbb[4*2+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*2+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*2+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*2+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*2+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| load6 = ptrbb[4*2+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5-load2*load6; | |||||
| load7 = ptrbb[4*2+3]; | |||||
| res4 = res4-load2*load7; | |||||
| res5 = res5-load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7-load5*load6; | |||||
| res6 = res6-load5*load7; | |||||
| res7 = res7-load4*load7; | |||||
| load8 = ptrba[4*3+0]; | |||||
| load9 = ptrbb[4*3+0]; | |||||
| res0 = res0+load8*load9; | |||||
| load10 = ptrba[4*3+1]; | |||||
| res1 = res1-load10*load9; | |||||
| load11 = ptrbb[4*3+1]; | |||||
| res0 = res0-load10*load11; | |||||
| res1 = res1-load8*load11; | |||||
| load12 = ptrba[4*3+2]; | |||||
| res2 = res2+load12*load9; | |||||
| load13 = ptrba[4*3+3]; | |||||
| res3 = res3-load13*load9; | |||||
| res2 = res2-load13*load11; | |||||
| res3 = res3-load12*load11; | |||||
| load14 = ptrbb[4*3+2]; | |||||
| res4 = res4+load8*load14; | |||||
| res5 = res5-load10*load14; | |||||
| load15 = ptrbb[4*3+3]; | |||||
| res4 = res4-load10*load15; | |||||
| res5 = res5-load8*load15; | |||||
| res6 = res6+load12*load14; | |||||
| res7 = res7-load13*load14; | |||||
| res6 = res6-load13*load15; | |||||
| res7 = res7-load12*load15; | |||||
| #endif | |||||
| ptrba = ptrba+16; | |||||
| ptrbb = ptrbb+16; | |||||
| } | |||||
| for (k=0; k<(temp&3); k+=1) | |||||
| { | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5+load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4-load2*load7; | |||||
| res5 = res5+load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7+load5*load6; | |||||
| res6 = res6-load5*load7; | |||||
| res7 = res7+load4*load7; | |||||
| #endif | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5+load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4+load2*load7; | |||||
| res5 = res5-load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7+load5*load6; | |||||
| res6 = res6+load5*load7; | |||||
| res7 = res7-load4*load7; | |||||
| #endif | |||||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5-load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4+load2*load7; | |||||
| res5 = res5+load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7-load5*load6; | |||||
| res6 = res6+load5*load7; | |||||
| res7 = res7+load4*load7; | |||||
| #endif | |||||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| load6 = ptrbb[4*0+2]; | |||||
| res4 = res4+load0*load6; | |||||
| res5 = res5-load2*load6; | |||||
| load7 = ptrbb[4*0+3]; | |||||
| res4 = res4-load2*load7; | |||||
| res5 = res5-load0*load7; | |||||
| res6 = res6+load4*load6; | |||||
| res7 = res7-load5*load6; | |||||
| res6 = res6-load5*load7; | |||||
| res7 = res7-load4*load7; | |||||
| #endif | |||||
| ptrba = ptrba+4; | |||||
| ptrbb = ptrbb+4; | |||||
| } | |||||
| load0 = res0*alphar-res1*alphai; | |||||
| load1 = res1*alphar+res0*alphai; | |||||
| C0[0] = load0; | |||||
| C0[1] = load1; | |||||
| load2 = res2*alphar-res3*alphai; | |||||
| load3 = res3*alphar+res2*alphai; | |||||
| C0[2] = load2; | |||||
| C0[3] = load3; | |||||
| load4 = res4*alphar-res5*alphai; | |||||
| load5 = res5*alphar+res4*alphai; | |||||
| C1[0] = load4; | |||||
| C1[1] = load5; | |||||
| load6 = res6*alphar-res7*alphai; | |||||
| load7 = res7*alphar+res6*alphai; | |||||
| C1[2] = load6; | |||||
| C1[3] = load7; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||||
| (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #ifdef LEFT | |||||
| temp -= 2; | |||||
| #else | |||||
| temp -= 2; | |||||
| #endif | |||||
| ptrba += temp*2*2; | |||||
| ptrbb += temp*2*2; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 2; | |||||
| #endif | |||||
| C0 = C0+4; | |||||
| C1 = C1+4; | |||||
| } | |||||
| for (i=0; i<(bm&1); i+=1) | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*2; | |||||
| ptrbb = bb + off*2*2; | |||||
| #endif | |||||
| res0 = 0; | |||||
| res1 = 0; | |||||
| res2 = 0; | |||||
| res3 = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #elif defined(LEFT) | |||||
| temp = off+1; | |||||
| #else | |||||
| temp = off+2; | |||||
| #endif | |||||
| for (k=0; k<temp; k+=1) | |||||
| { | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrbb[4*0+2]; | |||||
| res2 = res2+load0*load4; | |||||
| res3 = res3+load2*load4; | |||||
| load5 = ptrbb[4*0+3]; | |||||
| res2 = res2-load2*load5; | |||||
| res3 = res3+load0*load5; | |||||
| #endif | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrbb[4*0+2]; | |||||
| res2 = res2+load0*load4; | |||||
| res3 = res3+load2*load4; | |||||
| load5 = ptrbb[4*0+3]; | |||||
| res2 = res2+load2*load5; | |||||
| res3 = res3-load0*load5; | |||||
| #endif | |||||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrbb[4*0+2]; | |||||
| res2 = res2+load0*load4; | |||||
| res3 = res3-load2*load4; | |||||
| load5 = ptrbb[4*0+3]; | |||||
| res2 = res2+load2*load5; | |||||
| res3 = res3+load0*load5; | |||||
| #endif | |||||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[4*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[4*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrbb[4*0+2]; | |||||
| res2 = res2+load0*load4; | |||||
| res3 = res3-load2*load4; | |||||
| load5 = ptrbb[4*0+3]; | |||||
| res2 = res2-load2*load5; | |||||
| res3 = res3-load0*load5; | |||||
| #endif | |||||
| ptrba = ptrba+2; | |||||
| ptrbb = ptrbb+4; | |||||
| } | |||||
| load0 = res0*alphar-res1*alphai; | |||||
| load1 = res1*alphar+res0*alphai; | |||||
| C0[0] = load0; | |||||
| C0[1] = load1; | |||||
| load2 = res2*alphar-res3*alphai; | |||||
| load3 = res3*alphar+res2*alphai; | |||||
| C1[0] = load2; | |||||
| C1[1] = load3; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||||
| (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #ifdef LEFT | |||||
| temp -= 1; | |||||
| #else | |||||
| temp -= 2; | |||||
| #endif | |||||
| ptrba += temp*2; | |||||
| ptrbb += temp*2*2; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 1; | |||||
| #endif | |||||
| C0 = C0+2; | |||||
| C1 = C1+2; | |||||
| } | |||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||||
| off += 2; | |||||
| #endif | |||||
| k = (bk<<2); | |||||
| bb = bb+k; | |||||
| i = (ldc<<2); | |||||
| C = C+i; | |||||
| } | |||||
| for (j=0; j<(bn&1); j+=1) | |||||
| { | |||||
| C0 = C; | |||||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||||
| off = offset; | |||||
| #endif | |||||
| ptrba = ba; | |||||
| for (i=0; i<bm/2; i+=1) | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*2*2; | |||||
| ptrbb = bb+off*2; | |||||
| #endif | |||||
| res0 = 0; | |||||
| res1 = 0; | |||||
| res2 = 0; | |||||
| res3 = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #elif defined(LEFT) | |||||
| temp = off + 2; | |||||
| #else | |||||
| temp = off + 1; | |||||
| #endif | |||||
| for (k=0; k<temp; k+=1) | |||||
| { | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| #endif | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3+load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| #endif | |||||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2+load5*load3; | |||||
| res3 = res3+load4*load3; | |||||
| #endif | |||||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| load0 = ptrba[4*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[4*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| load4 = ptrba[4*0+2]; | |||||
| res2 = res2+load4*load1; | |||||
| load5 = ptrba[4*0+3]; | |||||
| res3 = res3-load5*load1; | |||||
| res2 = res2-load5*load3; | |||||
| res3 = res3-load4*load3; | |||||
| #endif | |||||
| ptrba = ptrba+4; | |||||
| ptrbb = ptrbb+2; | |||||
| } | |||||
| load0 = res0*alphar-res1*alphai; | |||||
| load1 = res1*alphar+res0*alphai; | |||||
| C0[0] = load0; | |||||
| C0[1] = load1; | |||||
| load2 = res2*alphar-res3*alphai; | |||||
| load3 = res3*alphar+res2*alphai; | |||||
| C0[2] = load2; | |||||
| C0[3] = load3; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||||
| (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #ifdef LEFT | |||||
| temp -= 2; | |||||
| #else | |||||
| temp -= 1; | |||||
| #endif | |||||
| ptrba += temp*2*2; | |||||
| ptrbb += temp*2; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 2; | |||||
| #endif | |||||
| C0 = C0+4; | |||||
| } | |||||
| for (i=0; i<(bm&1); i+=1) | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*2; | |||||
| ptrbb = bb + off*2; | |||||
| #endif | |||||
| res0 = 0; | |||||
| res1 = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #elif defined(LEFT) | |||||
| temp = off + 1; | |||||
| #else | |||||
| temp = off + 1; | |||||
| #endif | |||||
| for (k=0; k<temp; k+=1) | |||||
| { | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| #endif | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1+load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| #endif | |||||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0+load2*load3; | |||||
| res1 = res1+load0*load3; | |||||
| #endif | |||||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| load0 = ptrba[2*0+0]; | |||||
| load1 = ptrbb[2*0+0]; | |||||
| res0 = res0+load0*load1; | |||||
| load2 = ptrba[2*0+1]; | |||||
| res1 = res1-load2*load1; | |||||
| load3 = ptrbb[2*0+1]; | |||||
| res0 = res0-load2*load3; | |||||
| res1 = res1-load0*load3; | |||||
| #endif | |||||
| ptrba = ptrba+2; | |||||
| ptrbb = ptrbb+2; | |||||
| } | |||||
| load0 = res0*alphar-res1*alphai; | |||||
| load1 = res1*alphar+res0*alphai; | |||||
| C0[0] = load0; | |||||
| C0[1] = load1; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||||
| (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #ifdef LEFT | |||||
| temp -= 1; | |||||
| #else | |||||
| temp -= 1; | |||||
| #endif | |||||
| ptrba += temp*2; | |||||
| ptrbb += temp*2; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 1; | |||||
| #endif | |||||
| C0 = C0+2; | |||||
| } | |||||
| k = (bk<<1); | |||||
| bb = bb+k; | |||||
| i = (ldc<<1); | |||||
| C = C+i; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -123,15 +123,37 @@ ifndef DTRSMKERNEL_RT | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT.S | DTRSMKERNEL_RT = trsm_kernel_RT.S | ||||
| endif | endif | ||||
| ifndef CTRSMKERNEL_LN | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LT.S | CTRSMKERNEL_LN = ztrsm_kernel_LT.S | ||||
| endif | |||||
| ifndef CTRSMKERNEL_LT | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT.S | CTRSMKERNEL_LT = ztrsm_kernel_LT.S | ||||
| endif | |||||
| ifndef CTRSMKERNEL_RN | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT.S | CTRSMKERNEL_RN = ztrsm_kernel_LT.S | ||||
| endif | |||||
| ifndef CTRSMKERNEL_RT | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT.S | CTRSMKERNEL_RT = ztrsm_kernel_RT.S | ||||
| endif | |||||
| ifndef ZTRSMKERNEL_LN | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT.S | ZTRSMKERNEL_LN = ztrsm_kernel_LT.S | ||||
| endif | |||||
| ifndef ZTRSMKERNEL_LT | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT.S | ZTRSMKERNEL_LT = ztrsm_kernel_LT.S | ||||
| endif | |||||
| ifndef ZTRSMKERNEL_RN | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT.S | ZTRSMKERNEL_RN = ztrsm_kernel_LT.S | ||||
| endif | |||||
| ifndef ZTRSMKERNEL_RT | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT.S | ZTRSMKERNEL_RT = ztrsm_kernel_RT.S | ||||
| endif | |||||
| CGEMM3MKERNEL = zgemm3m_kernel.S | CGEMM3MKERNEL = zgemm3m_kernel.S | ||||
| ZGEMM3MKERNEL = zgemm3m_kernel.S | ZGEMM3MKERNEL = zgemm3m_kernel.S | ||||
| @@ -1,18 +1,48 @@ | |||||
| SAXPYKERNEL=axpy_loongson3a.S | SAXPYKERNEL=axpy_loongson3a.S | ||||
| DAXPYKERNEL=daxpy_loongson3a_simd.S | DAXPYKERNEL=daxpy_loongson3a_simd.S | ||||
| SGEMMKERNEL = sgemm_kernel_loongson3a.S | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMVNKERNEL = gemv_n_loongson3a.c | |||||
| SGEMVTKERNEL = gemv_t_loongson3a.c | |||||
| DGEMVNKERNEL = gemv_n_loongson3a.c | |||||
| DGEMVTKERNEL = gemv_t_loongson3a.c | |||||
| CGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
| CGEMVTKERNEL = zgemv_t_loongson3a.c | |||||
| ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
| ZGEMVTKERNEL = zgemv_t_loongson3a.c | |||||
| SGEMMKERNEL = sgemm_kernel_8x4_ps.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | SGEMMONCOPYOBJ = sgemm_oncopy.o | ||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | SGEMMOTCOPYOBJ = sgemm_otcopy.o | ||||
| DGEMMKERNEL = gemm_kernel_loongson3a.S | |||||
| DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | DGEMMONCOPY = ../generic/gemm_ncopy_4.c | ||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | ||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | DGEMMONCOPYOBJ = dgemm_oncopy.o | ||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | DGEMMOTCOPYOBJ = dgemm_otcopy.o | ||||
| CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy.o | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy.o | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
| ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| @@ -22,3 +52,17 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| @@ -0,0 +1,64 @@ | |||||
| SAXPYKERNEL=axpy_loongson3a.S | |||||
| DAXPYKERNEL=daxpy_loongson3a_simd.S | |||||
| SGEMVNKERNEL = gemv_n_loongson3a.c | |||||
| SGEMVTKERNEL = gemv_t_loongson3a.c | |||||
| DGEMVNKERNEL = gemv_n_loongson3a.c | |||||
| DGEMVTKERNEL = gemv_t_loongson3a.c | |||||
| CGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
| CGEMVTKERNEL = zgemv_t_loongson3a.c | |||||
| ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
| ZGEMVTKERNEL = zgemv_t_loongson3a.c | |||||
| STRMMKERNEL = ../generic/trmmkernel_2x2.c | |||||
| DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||||
| ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| @@ -0,0 +1,101 @@ | |||||
| #include "common.h" | |||||
| //These are auto-tuning codes on Loongson-3A platform. | |||||
| //#define prefetch(x) __builtin_prefetch(x) | |||||
| //#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) | |||||
/* Emits an inline-asm `ld` of the memory operand x into register $0.
 * NOTE(review): presumably a software prefetch (the loaded value is discarded);
 * confirm against the Loongson-3A manual. */
| #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) | |||||
/* Branch-probability hints built on __builtin_expect. */
| #define likely(x) __builtin_expect(!!(x), 1) | |||||
| #define unlikely(x) __builtin_expect(!!(x), 0) | |||||
/* Loop bodies used by CNAME below. They are not hygienic: they read and
 * modify the caller's locals i, j, k (and h) and use A, X, Y, LDA, ALPHA and
 * INCY by name.
 *   spec_*   : Y indexed by i (used when INCY == 1).
 *   norm_*   : Y indexed by h, which is advanced by INCY each step.
 *   *_alpha1 : same update without the multiplication by ALPHA. */
| #define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0) | |||||
| #define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0) | |||||
| #define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0) | |||||
| #define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0) | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) | |||||
| { | |||||
| BLASLONG kx=0, ky=0; | |||||
| if(!ALPHA) | |||||
| return 0; | |||||
| //if(INCX < 0) | |||||
| // kx = (1-N) * INCX; | |||||
| // INCX = -INCX; | |||||
| //if(INCY < 0) | |||||
| // ky = (1-M) * INCY; | |||||
| // INCY = -INCY; | |||||
| BLASLONG fahead = 30; | |||||
| BLASLONG spec_unroll = 4; | |||||
| BLASLONG tMQ = M - M % spec_unroll; | |||||
| BLASLONG j = 0, k = 0; | |||||
| if(ALPHA == 1) { | |||||
| if(INCY == 1) { | |||||
| for(k=kx; likely(j < N); j++, k += INCX) { | |||||
| BLASLONG i = 0; | |||||
| for(; likely(i < tMQ);) { | |||||
| prefetch(A[LDA * j + i + fahead]); | |||||
| prefetch(Y[i + fahead]); | |||||
| /*loop_mark*/ spec_loop_alpha1; | |||||
| /*loop_mark*/ spec_loop_alpha1; | |||||
| /*loop_mark*/ spec_loop_alpha1; | |||||
| /*loop_mark*/ spec_loop_alpha1; | |||||
| } | |||||
| for(; likely(i < M);) { | |||||
| spec_loop_alpha1; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| for(k=kx; likely(j < N); j++, k += INCX) { | |||||
| BLASLONG i = 0, h = ky; | |||||
| for(; likely(i < tMQ);) { | |||||
| prefetch(A[LDA * j + i + fahead]); | |||||
| prefetch(Y[h + fahead]); | |||||
| /*loop_mark*/ norm_loop_alpha1; | |||||
| /*loop_mark*/ norm_loop_alpha1; | |||||
| /*loop_mark*/ norm_loop_alpha1; | |||||
| /*loop_mark*/ norm_loop_alpha1; | |||||
| } | |||||
| for(; likely(i < M);) { | |||||
| norm_loop_alpha1; | |||||
| } | |||||
| } | |||||
| } | |||||
| } else { | |||||
| if(INCY == 1) { | |||||
| for(k=kx; likely(j < N); j++, k += INCX) { | |||||
| BLASLONG i = 0; | |||||
| for(; likely(i < tMQ);) { | |||||
| prefetch(A[LDA * j + i + fahead]); | |||||
| prefetch(Y[i + fahead]); | |||||
| /*loop_mark*/ spec_loop; | |||||
| /*loop_mark*/ spec_loop; | |||||
| /*loop_mark*/ spec_loop; | |||||
| /*loop_mark*/ spec_loop; | |||||
| } | |||||
| for(; likely(i < M);) { | |||||
| spec_loop; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| for(k=kx; likely(j < N); j++, k += INCX) { | |||||
| BLASLONG i = 0, h = ky; | |||||
| for(; likely(i < tMQ);) { | |||||
| prefetch(A[LDA * j + i + fahead]); | |||||
| prefetch(Y[h + fahead]); | |||||
| /*loop_mark*/ norm_loop; | |||||
| /*loop_mark*/ norm_loop; | |||||
| /*loop_mark*/ norm_loop; | |||||
| /*loop_mark*/ norm_loop; | |||||
| } | |||||
| for(; likely(i < M);) { | |||||
| norm_loop; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,93 @@ | |||||
| #include "common.h" | |||||
| //These are auto-tuning codes on Loongson-3A platform. | |||||
| //#define prefetch(x) __builtin_prefetch(x) | |||||
| //#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) | |||||
/* Emits an inline-asm `ld` of the memory operand x into register $0.
 * NOTE(review): presumably a software prefetch (the loaded value is discarded);
 * confirm against the Loongson-3A manual. */
| #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) | |||||
/* Branch-probability hints built on __builtin_expect. */
| #define likely(x) __builtin_expect(!!(x), 1) | |||||
| #define unlikely(x) __builtin_expect(!!(x), 0) | |||||
/* Loop bodies used by CNAME below. They are not hygienic: they read and
 * modify the caller's locals i, j, k (and h) and use A, X, Y, LDA, ALPHA and
 * INCX by name.  Every variant accumulates into the single element Y[k].
 *   spec_*   : X indexed by i (used when INCX == 1).
 *   norm_*   : X indexed by h, which is advanced by INCX each step.
 *   *_alpha1 : same update without the multiplication by ALPHA. */
| #define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0) | |||||
| #define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0) | |||||
| #define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0) | |||||
| #define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0) | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { | |||||
| if(!ALPHA) | |||||
| return 0; | |||||
| // if(INCX < 0) | |||||
| // INCX = -INCX; | |||||
| // if(INCY < 0) | |||||
| // INCY = -INCY; | |||||
| BLASLONG fahead = 30; | |||||
| BLASLONG spec_unroll = 3; | |||||
| BLASLONG tMQ = M - M % spec_unroll; | |||||
| BLASLONG j = 0, k = 0; | |||||
| if(ALPHA == 1) { | |||||
| if(INCX == 1) { | |||||
| for(; likely(j < N); j++, k += INCY) { | |||||
| BLASLONG i = 0; | |||||
| for(; likely(i < tMQ);) { | |||||
| prefetch(A[LDA * j + i + fahead]); | |||||
| prefetch(X[i + fahead]); | |||||
| /*loop_mark*/ spec_loop_alpha1; | |||||
| /*loop_mark*/ spec_loop_alpha1; | |||||
| /*loop_mark*/ spec_loop_alpha1; | |||||
| } | |||||
| for(; likely(i < M);) { | |||||
| spec_loop_alpha1; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| for(; likely(j < N); j++, k += INCY) { | |||||
| BLASLONG i = 0, h = 0; | |||||
| for(; likely(i < tMQ);) { | |||||
| prefetch(A[LDA * j + i + fahead]); | |||||
| prefetch(X[h + fahead]); | |||||
| /*loop_mark*/ norm_loop_alpha1; | |||||
| /*loop_mark*/ norm_loop_alpha1; | |||||
| /*loop_mark*/ norm_loop_alpha1; | |||||
| } | |||||
| for(; likely(i < M);) { | |||||
| norm_loop_alpha1; | |||||
| } | |||||
| } | |||||
| } | |||||
| } else { | |||||
| if(INCX == 1) { | |||||
| for(; likely(j < N); j++, k += INCY) { | |||||
| BLASLONG i = 0; | |||||
| for(; likely(i < tMQ);) { | |||||
| prefetch(A[LDA * j + i + fahead]); | |||||
| prefetch(X[i + fahead]); | |||||
| /*loop_mark*/ spec_loop; | |||||
| /*loop_mark*/ spec_loop; | |||||
| /*loop_mark*/ spec_loop; | |||||
| } | |||||
| for(; likely(i < M);) { | |||||
| spec_loop; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| for(; likely(j < N); j++, k += INCY) { | |||||
| BLASLONG i = 0, h = 0; | |||||
| for(; likely(i < tMQ);) { | |||||
| prefetch(A[LDA * j + i + fahead]); | |||||
| prefetch(X[h + fahead]); | |||||
| /*loop_mark*/ norm_loop; | |||||
| /*loop_mark*/ norm_loop; | |||||
| /*loop_mark*/ norm_loop; | |||||
| } | |||||
| for(; likely(i < M);) { | |||||
| norm_loop; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,139 @@ | |||||
| #include "common.h" | |||||
| //typedef int BLASLONG; | |||||
| //typedef double FLOAT; | |||||
/* Emits an inline-asm `ld` of the memory operand x into register $0.
 * NOTE(review): presumably a software prefetch (the loaded value is discarded);
 * confirm against the Loongson-3A manual. */
| #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) | |||||
/* Branch-probability hints built on __builtin_expect. */
| #define likely(x) __builtin_expect(!!(x), 1) | |||||
| #define unlikely(x) __builtin_expect(!!(x), 0) | |||||
/* Map the generic loop-body names onto one of four sign variants:
 *   _0 : neither CONJ nor XCONJ defined
 *   _1 : CONJ only
 *   _2 : XCONJ only
 *   _3 : both CONJ and XCONJ */
| #if !defined(CONJ) && !defined(XCONJ) | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_0 | |||||
| #define spec_loop spec_loop_0 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_0 | |||||
| #define norm_loop norm_loop_0 | |||||
| #endif | |||||
| #if defined(CONJ) && !defined(XCONJ) | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_1 | |||||
| #define spec_loop spec_loop_1 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_1 | |||||
| #define norm_loop norm_loop_1 | |||||
| #endif | |||||
| #if !defined(CONJ) && defined(XCONJ) | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_2 | |||||
| #define spec_loop spec_loop_2 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_2 | |||||
| #define norm_loop norm_loop_2 | |||||
| #endif | |||||
| #if defined(CONJ) && defined(XCONJ) | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_3 | |||||
| #define spec_loop spec_loop_3 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_3 | |||||
| #define norm_loop norm_loop_3 | |||||
| #endif | |||||
/* All loop bodies below are non-hygienic: they use the caller's locals ii,
 * iii, jj, k (and rTmp / iTmp) plus A, X, Y, INCY, rALPHA, iALPHA by name.
 * A[jj + ii] / A[jj + ii + 1] and X[k] / X[k + 1] are treated as (real,
 * imaginary) pairs; each body advances ii by 2. */
/* spec_loop_alpha1_*: Y indexed by ii; the four partial products are added
 * straight into Y[ii] and Y[ii + 1] with no alpha scaling. */
| #define spec_loop_alpha1_0 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) | |||||
| #define spec_loop_alpha1_1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) | |||||
| #define spec_loop_alpha1_2 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) | |||||
| #define spec_loop_alpha1_3 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) | |||||
/* spec_loop_*: Y indexed by ii; the product is formed in rTmp / iTmp and then
 * multiplied by (rALPHA, iALPHA) before being added to Y. */
| #define spec_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||||
| #define spec_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||||
| #define spec_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||||
| #define spec_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||||
/* norm_loop_alpha1_*: Y indexed by iii, which is advanced by INCY * 2 each
 * step; no alpha scaling. */
| #define norm_loop_alpha1_0 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) | |||||
| #define norm_loop_alpha1_1 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) | |||||
| #define norm_loop_alpha1_2 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) | |||||
| #define norm_loop_alpha1_3 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) | |||||
/* norm_loop_*: Y indexed by iii (advanced by INCY * 2); product formed in
 * rTmp / iTmp and multiplied by (rALPHA, iALPHA). */
| #define norm_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) | |||||
| #define norm_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) | |||||
| #define norm_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) | |||||
| #define norm_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { | |||||
| if(!rALPHA && iALPHA) | |||||
| return 0; | |||||
| BLASLONG fahead = 60; | |||||
| BLASLONG spec_unroll = 2; | |||||
| BLASLONG tMQ = M - M % spec_unroll; | |||||
| BLASLONG j = 0, k = 0, jj = 0; | |||||
| if(rALPHA == 1 && iALPHA == 0) { | |||||
| if(INCY == 1) { | |||||
| for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { | |||||
| BLASLONG i = 0, ii = 0; | |||||
| for(; likely(i < tMQ); i += spec_unroll) { | |||||
| prefetch(A[jj + ii + fahead]); | |||||
| prefetch(Y[ii + fahead]); | |||||
| /*loop_mark*/ spec_loop_alpha1; | |||||
| /*loop_mark*/ spec_loop_alpha1; | |||||
| } | |||||
| for(; likely(i < M); i++) { | |||||
| spec_loop_alpha1; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { | |||||
| BLASLONG i = 0, ii = 0, iii = 0; | |||||
| for(; likely(i < tMQ); i += spec_unroll) { | |||||
| prefetch(A[jj + ii + fahead]); | |||||
| prefetch(Y[iii + fahead]); | |||||
| /*loop_mark*/ norm_loop_alpha1; | |||||
| /*loop_mark*/ norm_loop_alpha1; | |||||
| } | |||||
| for(; likely(i < M); i++) { | |||||
| norm_loop_alpha1; | |||||
| } | |||||
| } | |||||
| } | |||||
| } else { | |||||
| FLOAT rTmp, iTmp; | |||||
| if(INCY == 1) { | |||||
| for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { | |||||
| BLASLONG i = 0, ii = 0; | |||||
| for(; likely(i < tMQ); i += spec_unroll) { | |||||
| prefetch(A[jj + ii + fahead]); | |||||
| prefetch(Y[ii + fahead]); | |||||
| /*loop_mark*/ spec_loop; | |||||
| /*loop_mark*/ spec_loop; | |||||
| } | |||||
| for(; likely(i < M); i++) { | |||||
| spec_loop; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { | |||||
| BLASLONG i = 0, ii = 0, iii = 0; | |||||
| for(; likely(i < tMQ); i += spec_unroll) { | |||||
| prefetch(A[jj + ii + fahead]); | |||||
| prefetch(Y[iii + fahead]); | |||||
| /*loop_mark*/ norm_loop; | |||||
| /*loop_mark*/ norm_loop; | |||||
| } | |||||
| for(; likely(i < M); i++) { | |||||
| norm_loop; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,125 @@ | |||||
| #include "common.h" | |||||
/* Emits an inline-asm `ld` of the memory operand x into register $0.
 * NOTE(review): presumably a software prefetch (the loaded value is discarded);
 * confirm against the Loongson-3A manual. */
| #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) | |||||
/* Branch-probability hints built on __builtin_expect. */
| #define likely(x) __builtin_expect(!!(x), 1) | |||||
| #define unlikely(x) __builtin_expect(!!(x), 0) | |||||
/* Map the generic loop-body names onto one of four sign variants:
 *   _0 : neither CONJ nor XCONJ defined
 *   _1 : CONJ only
 *   _2 : XCONJ only
 *   _3 : both CONJ and XCONJ */
| #if !defined(CONJ) && !defined(XCONJ) | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_0 | |||||
| #define spec_loop spec_loop_0 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_0 | |||||
| #define norm_loop norm_loop_0 | |||||
| #endif | |||||
| #if defined(CONJ) && !defined(XCONJ) | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_1 | |||||
| #define spec_loop spec_loop_1 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_1 | |||||
| #define norm_loop norm_loop_1 | |||||
| #endif | |||||
| #if !defined(CONJ) && defined(XCONJ) | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_2 | |||||
| #define spec_loop spec_loop_2 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_2 | |||||
| #define norm_loop norm_loop_2 | |||||
| #endif | |||||
| #if defined(CONJ) && defined(XCONJ) | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_3 | |||||
| #define spec_loop spec_loop_3 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_3 | |||||
| #define norm_loop norm_loop_3 | |||||
| #endif | |||||
/* All loop bodies below are non-hygienic: they use the caller's locals ii,
 * iii, jj, k (and rTmp / iTmp) plus A, X, Y, INCX, rALPHA, iALPHA by name.
 * A[jj + ii] / A[jj + ii + 1] are treated as a (real, imaginary) pair, and
 * every body accumulates into Y[k] / Y[k + 1] and advances ii by 2. */
/* spec_loop_alpha1_*: X indexed by ii; the four partial products are added
 * straight into Y[k] and Y[k + 1] with no alpha scaling. */
| #define spec_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) | |||||
| #define spec_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) | |||||
| #define spec_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) | |||||
| #define spec_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) | |||||
/* spec_loop_*: X indexed by ii; the product is formed in rTmp / iTmp and then
 * multiplied by (rALPHA, iALPHA) before being added to Y. */
| #define spec_loop_0 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||||
| #define spec_loop_1 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||||
| #define spec_loop_2 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||||
| #define spec_loop_3 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||||
/* norm_loop_alpha1_*: X indexed by iii, which is advanced by INCX * 2 each
 * step; no alpha scaling. */
| #define norm_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) | |||||
| #define norm_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) | |||||
| #define norm_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) | |||||
| #define norm_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) | |||||
/* norm_loop_*: X indexed by iii (advanced by INCX * 2); product formed in
 * rTmp / iTmp and multiplied by (rALPHA, iALPHA). */
| #define norm_loop_0 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) | |||||
| #define norm_loop_1 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) | |||||
| #define norm_loop_2 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) | |||||
| #define norm_loop_3 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { | |||||
| if(!rALPHA && iALPHA) | |||||
| return 0; | |||||
| BLASLONG fahead = 30; | |||||
| BLASLONG spec_unroll = 2; | |||||
| BLASLONG tMQ = M - M % spec_unroll; | |||||
| BLASLONG j = 0, k = 0, jj = 0; | |||||
| if(rALPHA == 1 && iALPHA == 0) { | |||||
| if(INCX == 1) { | |||||
| for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { | |||||
| BLASLONG i = 0, ii = 0; | |||||
| for(; likely(i < tMQ); i += spec_unroll) { | |||||
| prefetch(A[jj + ii + fahead]); | |||||
| prefetch(X[ii + fahead]); | |||||
| /*loop_mark*/ spec_loop_alpha1; | |||||
| /*loop_mark*/ spec_loop_alpha1; | |||||
| } | |||||
| for(; likely(i < M); i++) { | |||||
| spec_loop_alpha1; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { | |||||
| BLASLONG i = 0, ii = 0, iii = 0; | |||||
| for(; likely(i < tMQ); i += spec_unroll) { | |||||
| prefetch(A[jj + ii + fahead]); | |||||
| prefetch(X[iii + fahead]); | |||||
| /*loop_mark*/ norm_loop_alpha1; | |||||
| /*loop_mark*/ norm_loop_alpha1; | |||||
| } | |||||
| for(; likely(i < M); i++) { | |||||
| norm_loop_alpha1; | |||||
| } | |||||
| } | |||||
| } | |||||
| } else { | |||||
| FLOAT rTmp, iTmp; | |||||
| if(INCX == 1) { | |||||
| for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { | |||||
| BLASLONG i = 0, ii = 0; | |||||
| for(; likely(i < tMQ); i += spec_unroll) { | |||||
| prefetch(A[jj + ii + fahead]); | |||||
| prefetch(X[ii + fahead]); | |||||
| /*loop_mark*/ spec_loop; | |||||
| /*loop_mark*/ spec_loop; | |||||
| } | |||||
| for(; likely(i < M); i++) { | |||||
| spec_loop; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { | |||||
| BLASLONG i = 0, ii = 0, iii = 0; | |||||
| for(; likely(i < tMQ); i += spec_unroll) { | |||||
| prefetch(A[jj + ii + fahead]); | |||||
| prefetch(X[iii + fahead]); | |||||
| /*loop_mark*/ norm_loop; | |||||
| /*loop_mark*/ norm_loop; | |||||
| } | |||||
| for(; likely(i < M); i++) { | |||||
| norm_loop; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -1480,31 +1480,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | #define GEMM_DEFAULT_ALIGN 0x03fffUL | ||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define DGEMM_DEFAULT_UNROLL_M 4 | #define DGEMM_DEFAULT_UNROLL_M 4 | ||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | #define DGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define CGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define SGEMM_DEFAULT_P 32 | |||||
| #define DGEMM_DEFAULT_P 32 | |||||
| #define CGEMM_DEFAULT_P 108 | |||||
| #define ZGEMM_DEFAULT_P 112 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define SGEMM_DEFAULT_Q 116 | |||||
| #define DGEMM_DEFAULT_Q 116 | |||||
| #define CGEMM_DEFAULT_Q 144 | |||||
| #define ZGEMM_DEFAULT_Q 72 | |||||
| #define SGEMM_DEFAULT_P 64 | |||||
| #define DGEMM_DEFAULT_P 44 | |||||
| #define CGEMM_DEFAULT_P 64 | |||||
| #define ZGEMM_DEFAULT_P 32 | |||||
| #define SGEMM_DEFAULT_R 1000 | |||||
| #define DGEMM_DEFAULT_R 1000 | |||||
| #define CGEMM_DEFAULT_R 2000 | |||||
| #define ZGEMM_DEFAULT_R 2000 | |||||
| #define SGEMM_DEFAULT_Q 192 | |||||
| #define DGEMM_DEFAULT_Q 92 | |||||
| #define CGEMM_DEFAULT_Q 128 | |||||
| #define ZGEMM_DEFAULT_Q 80 | |||||
| #define SGEMM_DEFAULT_R 640 | |||||
| #define DGEMM_DEFAULT_R dgemm_r | |||||
| #define CGEMM_DEFAULT_R 640 | |||||
| #define ZGEMM_DEFAULT_R 640 | |||||
| #define GEMM_OFFSET_A1 0x10000 | |||||
| #define GEMM_OFFSET_B1 0x100000 | |||||
| #define SYMV_P 16 | |||||
| #endif | |||||
| #ifdef LOONGSON3B /* Build-time constants used only when LOONGSON3B is defined; the 0.1.0 ChangeLog entry labels Loongson 3B Level-3 work "(EXPERIMENT)". */ | |||||
| #define SNUMOPT 2 /* NOTE(review): meaning not visible in this chunk - confirm where SNUMOPT is consumed. */ | |||||
| #define DNUMOPT 2 /* NOTE(review): same value as SNUMOPT; semantics to be confirmed. */ | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL /* 0x3fff == 16384 - 1; presumably an alignment mask - TODO confirm. */ | |||||
| #define SGEMM_DEFAULT_UNROLL_M 2 /* All eight UNROLL_M/UNROLL_N values in this block are 2. */ | |||||
| #define SGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define SGEMM_DEFAULT_P 64 /* P/Q/R below are presumably GEMM blocking sizes per precision (S/D/C/Z) - verify against the GEMM driver. */ | |||||
| #define DGEMM_DEFAULT_P 24 | |||||
| #define CGEMM_DEFAULT_P 24 | |||||
| #define ZGEMM_DEFAULT_P 20 | |||||
| #define SGEMM_DEFAULT_Q 192 | |||||
| #define DGEMM_DEFAULT_Q 128 | |||||
| #define CGEMM_DEFAULT_Q 128 | |||||
| #define ZGEMM_DEFAULT_Q 64 | |||||
| #define SGEMM_DEFAULT_R 512 /* R is a fixed 512 for every precision in this block. */ | |||||
| #define DGEMM_DEFAULT_R 512 | |||||
| #define CGEMM_DEFAULT_R 512 | |||||
| #define ZGEMM_DEFAULT_R 512 | |||||
| #define GEMM_OFFSET_A1 0x10000 /* 0x10000 == 65536 (64 KiB); usage not visible here. */ | |||||
| #define GEMM_OFFSET_B1 0x100000 /* 0x100000 == 1048576 (1 MiB); usage not visible here. */ | |||||
| #define SYMV_P 16 | #define SYMV_P 16 | ||||
| #endif | #endif | ||||
| @@ -1301,6 +1301,8 @@ | |||||
| NC = 0 | NC = 0 | ||||
| RESET = .TRUE. | RESET = .TRUE. | ||||
| ERRMAX = RZERO | ERRMAX = RZERO | ||||
| RALS = RONE | |||||
| RBETS = RONE | |||||
| * | * | ||||
| DO 100 IN = 1, NIDIM | DO 100 IN = 1, NIDIM | ||||
| N = IDIM( IN ) | N = IDIM( IN ) | ||||
| @@ -1303,6 +1303,8 @@ | |||||
| NC = 0 | NC = 0 | ||||
| RESET = .TRUE. | RESET = .TRUE. | ||||
| ERRMAX = RZERO | ERRMAX = RZERO | ||||
| RALS = RONE | |||||
| RBETS = RONE | |||||
| * | * | ||||
| DO 100 IN = 1, NIDIM | DO 100 IN = 1, NIDIM | ||||
| N = IDIM( IN ) | N = IDIM( IN ) | ||||