| @@ -279,7 +279,12 @@ endif | |||
| BINARY_DEFINED = 1 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3A) | |||
| ifeq ($(CORE), LOONGSON3A) | |||
| CCOMMON_OPT += -march=mips64 | |||
| FCOMMON_OPT += -march=mips64 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3B) | |||
| CCOMMON_OPT += -march=mips64 | |||
| FCOMMON_OPT += -march=mips64 | |||
| endif | |||
| @@ -534,8 +539,10 @@ ifdef SMP | |||
| CCOMMON_OPT += -DSMP_SERVER | |||
| ifeq ($(ARCH), mips64) | |||
| ifneq ($(CORE), LOONGSON3B) | |||
| USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| endif | |||
| endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| @@ -600,9 +607,11 @@ endif | |||
| ifneq ($(ARCH), x86_64) | |||
| ifneq ($(ARCH), x86) | |||
| ifneq ($(CORE), LOONGSON3B) | |||
| NO_AFFINITY = 1 | |||
| endif | |||
| endif | |||
| endif | |||
| ifdef NO_AFFINITY | |||
| CCOMMON_OPT += -DNO_AFFINITY | |||
| @@ -68,9 +68,17 @@ extern long int syscall (long int __sysno, ...); | |||
| /* | |||
|  * Wrapper around the mbind system call (SYS_mbind) with per-target handling: | |||
|  *   - LOONGSON3B, 64-bit build: the arguments are forwarded unchanged. | |||
|  *   - LOONGSON3B, 32-bit build: nothing is done and 0 is returned. | |||
|  *   - every other target: the caller's nodemask is replaced by the address | |||
|  *     of a zeroed local word, because passing nodemask==NULL caused random | |||
|  *     segfaults on kernels newer than Linux 2.6.34. | |||
|  * Returns the syscall result converted to int (or 0 for the stub case). | |||
|  */ | |||
| static inline int my_mbind(void *addr, unsigned long len, int mode, | |||
|                            unsigned long *nodemask, unsigned long maxnode, | |||
|                            unsigned flags) { | |||
| #if defined (LOONGSON3B) && defined (__64BIT__) | |||
|   return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | |||
| #elif defined (LOONGSON3B) | |||
|   /* Null implementation on 32-bit Loongson 3B. */ | |||
|   return 0; | |||
| #else | |||
|   /* Never hand a NULL mask to the kernel; use an empty one instead. */ | |||
|   unsigned long empty_mask = 0; | |||
|   return syscall(SYS_mbind, addr, len, mode, &empty_mask, maxnode, flags); | |||
| #endif | |||
| } | |||
| static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { | |||
| @@ -101,10 +101,15 @@ static void INLINE blas_lock(volatile unsigned long *address){ | |||
| static inline unsigned int rpcc(void){ | |||
| unsigned long ret; | |||
| #if defined(LOONGSON3A) | |||
| unsigned long long tmp; | |||
| __asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); | |||
| ret=tmp; | |||
| #if defined(LOONGSON3A) || defined(LOONGSON3B) | |||
| // unsigned long long tmp; | |||
| //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); | |||
| //ret=tmp; | |||
| __asm__ __volatile__(".set push \n" | |||
| ".set mips32r2\n" | |||
| "rdhwr %0, $2\n" | |||
| ".set pop": "=r"(ret):: "memory"); | |||
| #else | |||
| __asm__ __volatile__(".set push \n" | |||
| ".set mips32r2\n" | |||
| @@ -114,6 +119,21 @@ static inline unsigned int rpcc(void){ | |||
| return ret; | |||
| } | |||
/*
 * CPU-location helper, compiled only for LOONGSON3A / LOONGSON3B builds and
 * only when NO_AFFINITY is not defined. Defining WHEREAMI advertises that
 * WhereAmI() is available on this target.
 */
| #if defined(LOONGSON3A) || defined(LOONGSON3B) | |||
| #ifndef NO_AFFINITY | |||
| #define WHEREAMI | |||
/*
 * Returns the value obtained by executing `rdhwr` on hardware register $0.
 * NOTE(review): in the MIPS32 Release 2 ISA, RDHWR register 0 is CPUNum (the
 * number of the CPU executing the instruction) -- confirm that Loongson 3A/3B
 * follow this definition.
 */
| static inline int WhereAmI(void){ | |||
| int ret=0; | |||
/*
 * `.set push` / `.set pop` bracket the `.set mips32r2` directive so the
 * assembler ISA level is changed only around the rdhwr instruction.
 */
| __asm__ __volatile__(".set push \n" | |||
| ".set mips32r2\n" | |||
| "rdhwr %0, $0\n" | |||
| ".set pop": "=r"(ret):: "memory"); | |||
| return ret; | |||
| } | |||
| #endif | |||
| #endif | |||
| /* | |||
|  * Plain integer division x / y (no target-specific fast path here). | |||
|  * The quotient is converted to the int return type. y must be non-zero. | |||
|  */ | |||
| static inline int blas_quickdivide(blasint x, blasint y){ | |||
|   return (int)(x / y); | |||
| } | |||
| @@ -234,6 +254,11 @@ REALNAME: ;\ | |||
| #define FIXED_PAGESIZE (16UL << 10) | |||
| #endif | |||
| #if defined(LOONGSON3B) | |||
| #define PAGESIZE (32UL << 10) | |||
| #define FIXED_PAGESIZE (32UL << 10) | |||
| #endif | |||
| #ifndef PAGESIZE | |||
| #define PAGESIZE (64UL << 10) | |||
| #endif | |||
| @@ -245,7 +270,7 @@ REALNAME: ;\ | |||
| #define MAP_ANONYMOUS MAP_ANON | |||
| #endif | |||
| #if defined(LOONGSON3A) | |||
| #if defined(LOONGSON3A) || defined(LOONGSON3B) | |||
| #define PREFETCHD_(x) ld $0, x | |||
| #define PREFETCHD(x) PREFETCHD_(x) | |||
| #else | |||
| @@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_SICORTEX 1 | |||
| #define CPU_LOONGSON3A 2 | |||
| #define CPU_LOONGSON3B 3 | |||
| static char *cpuname[] = { | |||
| "UNKOWN", | |||
| "SICORTEX", | |||
| "LOONGSON3A" | |||
| "LOONGSON3A", | |||
| "LOONGSON3B" | |||
| }; | |||
| int detect(void){ | |||
| @@ -101,6 +103,8 @@ int detect(void){ | |||
| if (strstr(p, "Loongson-3A")){ | |||
| return CPU_LOONGSON3A; | |||
| }else if(strstr(p, "Loongson-3B")){ | |||
| return CPU_LOONGSON3B; | |||
| }else if (strstr(p, "Loongson-3")){ | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| @@ -130,6 +134,8 @@ void get_architecture(void){ | |||
| void get_subarchitecture(void){ | |||
| if(detect()==CPU_LOONGSON3A) { | |||
| printf("LOONGSON3A"); | |||
| }else if(detect()==CPU_LOONGSON3B){ | |||
| printf("LOONGSON3B"); | |||
| }else{ | |||
| printf("SICORTEX"); | |||
| } | |||
| @@ -149,6 +155,15 @@ void get_cpuconfig(void){ | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| }else if(detect()==CPU_LOONGSON3B){ | |||
| printf("#define LOONGSON3B\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 512488\n"); | |||
| printf("#define L2_LINESIZE 32\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| }else{ | |||
| printf("#define SICORTEX\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| @@ -164,6 +179,8 @@ void get_cpuconfig(void){ | |||
| void get_libname(void){ | |||
| if(detect()==CPU_LOONGSON3A) { | |||
| printf("loongson3a\n"); | |||
| }else if(detect()==CPU_LOONGSON3B) { | |||
| printf("loongson3b\n"); | |||
| }else{ | |||
| #ifdef __mips64 | |||
| printf("mips64\n"); | |||
| @@ -77,8 +77,8 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||
| range_M[0] = 0; | |||
| i = arg -> m; | |||
| } else { | |||
| range_M[0] = range_M[0]; | |||
| i = range_M[1] - range_M[0]; | |||
| range_M[0] = range_m[0]; | |||
| i = range_m[1] - range_m[0]; | |||
| } | |||
| num_cpu_m = 0; | |||
| @@ -55,8 +55,8 @@ int CNAME(int mode, | |||
| range_M[0] = 0; | |||
| i = arg -> m; | |||
| } else { | |||
| range_M[0] = range_M[0]; | |||
| i = range_M[1] - range_M[0]; | |||
| range_M[0] = range_m[0]; | |||
| i = range_m[1] - range_m[0]; | |||
| } | |||
| num_cpu_m = 0; | |||
| @@ -389,12 +389,13 @@ static void *alloc_mmap(void *address){ | |||
| if (map_address != (void *)-1) { | |||
| #ifdef OS_LINUX | |||
| #ifdef DEBUG | |||
| int ret; | |||
| #if 1 | |||
| //#ifdef DEBUG | |||
| int ret=0; | |||
| ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); | |||
| if(ret==-1){ | |||
| int errsv=errno; | |||
| perror("alloc_mmap:"); | |||
| perror("OpenBLAS alloc_mmap:"); | |||
| printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); | |||
| } | |||
| @@ -696,5 +696,20 @@ void blas_set_parameter(void){ | |||
| } | |||
| #endif | |||
| #endif | |||
| #if defined(LOONGSON3B) | |||
| #ifdef SMP | |||
| if(blas_num_threads == 1 || blas_num_threads == 2){ | |||
| #endif | |||
| //single thread | |||
| dgemm_r = 640; | |||
| #ifdef SMP | |||
| }else{ | |||
| //multi thread | |||
| dgemm_r = 160; | |||
| } | |||
| #endif | |||
| #endif | |||
| } | |||
| #endif | |||
| @@ -117,6 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* #define FORCE_CELL */ | |||
| /* #define FORCE_SICORTEX */ | |||
| /* #define FORCE_LOONGSON3A */ | |||
| /* #define FORCE_LOONGSON3B */ | |||
| /* #define FORCE_ITANIUM2 */ | |||
| /* #define FORCE_GENERIC */ | |||
| /* #define FORCE_SPARC */ | |||
| @@ -548,6 +549,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #endif | |||
/*
 * Forced-target description for Loongson 3B. When FORCE_LOONGSON3B is defined
 * this block defines FORCE together with the architecture / sub-architecture
 * names, the kernel sub-directory name ("mips64"), the library and core names,
 * and ARCHCONFIG, a string of -D options carrying the cache and TLB parameters.
 * The #else branch is intentionally empty.
 */
| #ifdef FORCE_LOONGSON3B | |||
| #define FORCE | |||
| #define ARCHITECTURE "MIPS" | |||
| #define SUBARCHITECTURE "LOONGSON3B" | |||
| #define SUBDIRNAME "mips64" | |||
/*
 * NOTE(review): L2_SIZE=512488 is not a power of two; 512 KiB would be
 * 524288, so this looks like transposed digits. The cpuid-based configuration
 * for LOONGSON3B emits the same value ("#define L2_SIZE 512488"), so if this
 * is confirmed to be a typo both places should be changed together.
 */
| #define ARCHCONFIG "-DLOONGSON3B " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "loongson3b" | |||
| #define CORENAME "LOONGSON3B" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_ITANIUM2 | |||
| #define FORCE | |||
| #define ARCHITECTURE "IA64" | |||
| @@ -498,6 +498,91 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD | |||
| $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) | |||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ | |||
# TRMM kernel objects for TARGET == LOONGSON3B.
# In this branch the s/d/c/z TRMM objects are compiled from dedicated sources
# ($(STRMMKERNEL), $(DTRMMKERNEL), $(CTRMMKERNEL), $(ZTRMMKERNEL)); the else
# branch that follows builds them from the corresponding *GEMMKERNEL sources.
# Every rule passes -DTRMMKERNEL. The two-letter suffix of the object name maps
# to the flags as follows:
#   first letter  L / R     -> -DLEFT / -ULEFT
#   second letter N, R / T, C -> -UTRANSA / -DTRANSA
#   complex only: N, T -> -UCONJ -DNN;  R, C -> -DCONJ with -DCN (left) or -DNC (right)
# NOTE(review): this condition tests $(TARGET) while other conditionals in this
# patch test $(CORE); confirm TARGET is set when the core is auto-detected.
| ifeq ($(TARGET), LOONGSON3B) | |||
# Single precision real (-UDOUBLE -UCOMPLEX).
| $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | |||
| $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | |||
| $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | |||
| $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | |||
# Double precision real (-DDOUBLE -UCOMPLEX).
| $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | |||
| $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | |||
| $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | |||
| $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | |||
# Extended precision real (-DXDOUBLE): these rules still compile $(QGEMMKERNEL),
# not a dedicated TRMM source.
| $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | |||
| $(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | |||
| $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | |||
| $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | |||
# Single precision complex (-UDOUBLE -DCOMPLEX).
| $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||
| $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||
| $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ | |||
| $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ | |||
| $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||
| $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||
| $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ | |||
| $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | |||
# Double precision complex (-DDOUBLE -DCOMPLEX).
| $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||
| $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||
| $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ | |||
| $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ | |||
| $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||
| $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||
| $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ | |||
| $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | |||
| else | |||
| $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | |||
| @@ -581,6 +666,7 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | |||
| $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | |||
| endif | |||
| $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||
| @@ -0,0 +1,157 @@ | |||
| #include "common.h" | |||
/*
 * Real GEMM micro-kernel written in plain C with 2x2 blocking:
 * for each tile, four scalar accumulators collect sum_k a[k]*b[k], the sums
 * are multiplied by alpha and ADDED to the existing contents of C.
 *
 *   bm    - extent along the contiguous direction of C (handled as bm/2
 *           pairs plus an optional single leftover when bm is odd)
 *   bn    - extent along the ldc-strided direction of C (bn/2 pairs plus an
 *           optional single leftover)
 *   bk    - length of the inner (summation) dimension
 *   alpha - scale factor applied to every accumulated sum
 *   ba    - first operand, read sequentially: 2 values per k step for a pair,
 *           1 value per k step for the leftover; re-read from the start for
 *           every j iteration
 *   bb    - second operand, read sequentially in the same way; advanced by
 *           2*bk (or bk for the leftover) after each j iteration
 *   C     - output, updated in place
 *   ldc   - element stride between the two C pointers of a pair (C1 = C0+ldc)
 *   offset (only when TRMMKERNEL is defined) - accepted but not referenced in
 *           this body
 *
 * Always returns 0.
 */
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc | |||
| #ifdef TRMMKERNEL | |||
| ,BLASLONG offset | |||
| #endif | |||
| ) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*ptrba,*ptrbb; | |||
| FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7; | |||
/* Main part: process C two ldc-strided lines (C0, C1) at a time. */
| for (j=0; j<bn/2; j+=1) | |||
| { | |||
| C0 = C; | |||
| C1 = C0+ldc; | |||
| ptrba = ba; | |||
/* 2x2 tiles: res0/res1 go to C0[0..1], res2/res3 go to C1[0..1]. */
| for (i=0; i<bm/2; i+=1) | |||
| { | |||
| ptrbb = bb; | |||
| res0 = 0; | |||
| res1 = 0; | |||
| res2 = 0; | |||
| res3 = 0; | |||
/*
 * Inner loop unrolled four times over k. Each accumulator is still updated
 * in increasing k order, alternating between load0..3 and load4..7.
 */
| for (k=0; k<bk/4; k+=1) | |||
| { | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res2 = res2+load0*load3; | |||
| res3 = res3+load2*load3; | |||
| load4 = ptrba[2*1+0]; | |||
| load5 = ptrbb[2*1+0]; | |||
| res0 = res0+load4*load5; | |||
| load6 = ptrba[2*1+1]; | |||
| res1 = res1+load6*load5; | |||
| load7 = ptrbb[2*1+1]; | |||
| res2 = res2+load4*load7; | |||
| res3 = res3+load6*load7; | |||
| load0 = ptrba[2*2+0]; | |||
| load1 = ptrbb[2*2+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*2+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[2*2+1]; | |||
| res2 = res2+load0*load3; | |||
| res3 = res3+load2*load3; | |||
| load4 = ptrba[2*3+0]; | |||
| load5 = ptrbb[2*3+0]; | |||
| res0 = res0+load4*load5; | |||
| load6 = ptrba[2*3+1]; | |||
| res1 = res1+load6*load5; | |||
| load7 = ptrbb[2*3+1]; | |||
| res2 = res2+load4*load7; | |||
| res3 = res3+load6*load7; | |||
| ptrba = ptrba+8; | |||
| ptrbb = ptrbb+8; | |||
| } | |||
/* Remaining bk%4 steps, one k at a time. */
| for (k=0; k<(bk&3); k+=1) | |||
| { | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res2 = res2+load0*load3; | |||
| res3 = res3+load2*load3; | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
/* Scale by alpha and accumulate into the 2x2 tile of C. */
| res0 = res0*alpha; | |||
| C0[0] = C0[0]+res0; | |||
| res1 = res1*alpha; | |||
| C0[1] = C0[1]+res1; | |||
| res2 = res2*alpha; | |||
| C1[0] = C1[0]+res2; | |||
| res3 = res3*alpha; | |||
| C1[1] = C1[1]+res3; | |||
| C0 = C0+2; | |||
| C1 = C1+2; | |||
| } | |||
/* Leftover single element along bm (bm odd): a 1x2 tile, no unrolling. */
| for (i=0; i<(bm&1); i+=1) | |||
| { | |||
| ptrbb = bb; | |||
| res0 = 0; | |||
| res1 = 0; | |||
| for (k=0; k<bk; k+=1) | |||
| { | |||
| load0 = ptrba[0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrbb[2*0+1]; | |||
| res1 = res1+load0*load2; | |||
| ptrba = ptrba+1; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| res0 = res0*alpha; | |||
| C0[0] = C0[0]+res0; | |||
| res1 = res1*alpha; | |||
| C1[0] = C1[0]+res1; | |||
| C0 = C0+1; | |||
| C1 = C1+1; | |||
| } | |||
/* Advance bb past the 2*bk values just consumed and C by two strides. */
| k = (bk<<1); | |||
| bb = bb+k; | |||
| i = (ldc<<1); | |||
| C = C+i; | |||
| } | |||
/* Leftover single line along bn (bn odd): only C0 is written. */
| for (j=0; j<(bn&1); j+=1) | |||
| { | |||
| C0 = C; | |||
| ptrba = ba; | |||
| for (i=0; i<bm/2; i+=1) | |||
| { | |||
| ptrbb = bb; | |||
| res0 = 0; | |||
| res1 = 0; | |||
| for (k=0; k<bk; k+=1) | |||
| { | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| res0 = res0*alpha; | |||
| C0[0] = C0[0]+res0; | |||
| res1 = res1*alpha; | |||
| C0[1] = C0[1]+res1; | |||
| C0 = C0+2; | |||
| } | |||
/* Final 1x1 corner when both bm and bn are odd. */
| for (i=0; i<(bm&1); i+=1) | |||
| { | |||
| ptrbb = bb; | |||
| res0 = 0; | |||
| for (k=0; k<bk; k+=1) | |||
| { | |||
| load0 = ptrba[0+0]; | |||
| load1 = ptrbb[0+0]; | |||
| res0 = res0+load0*load1; | |||
| ptrba = ptrba+1; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| res0 = res0*alpha; | |||
| C0[0] = C0[0]+res0; | |||
| C0 = C0+1; | |||
| } | |||
| k = (bk<<0); | |||
| bb = bb+k; | |||
| C = C+ldc; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,280 @@ | |||
| #include "common.h" | |||
/*
 * Real TRMM micro-kernel written in plain C with 2x2 blocking.
 *
 * The loop structure mirrors a 2x2 GEMM kernel, with two differences visible
 * in this body:
 *   - the scaled sums are STORED to C (C0[0] = res0, ...), not added to it;
 *   - only part of the k range is summed for each tile. `off` tracks the
 *     current position and `temp` the number of k steps; how both are derived
 *     depends on the LEFT and TRANSA macros.
 *
 *   bm, bn, bk - tile extents (bm/2 and bn/2 pairs plus odd leftovers) and
 *                the full inner dimension
 *   alpha      - scale factor applied to every sum
 *   ba, bb     - operands, read sequentially (2 values per k step for a
 *                pair, 1 for a leftover)
 *   C, ldc     - output and the element stride between C0 and C1
 *   offset     - (TRMMKERNEL only) starting value for `off`: used as
 *                -offset when LEFT is not defined, as offset when it is
 *
 * NOTE(review): `off` is assigned only inside `defined(TRMMKERNEL)` guards,
 * while the code that reads it is not guarded; the file therefore appears to
 * be meant for compilation with TRMMKERNEL defined only -- confirm.
 *
 * Always returns 0.
 */
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc | |||
| #ifdef TRMMKERNEL | |||
| ,BLASLONG offset | |||
| #endif | |||
| ) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*ptrba,*ptrbb; | |||
| FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7; | |||
| BLASLONG off, temp; | |||
/* Without LEFT, off starts at -offset once and grows after each j iteration. */
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off = -offset; | |||
| #endif | |||
| for (j=0; j<bn/2; j+=1) | |||
| { | |||
| C0 = C; | |||
| C1 = C0+ldc; | |||
/* With LEFT, off restarts at offset for every j iteration and grows per tile. */
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| ptrba = ba; | |||
| for (i=0; i<bm/2; i+=1) | |||
| { | |||
/*
 * Starting point of the summation: either the beginning of bb, or both
 * operands skipped forward by `off` k steps (2 values per step each).
 */
| #if (defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2; | |||
| ptrbb = bb + off*2; | |||
| #endif | |||
| res0 = 0; | |||
| res1 = 0; | |||
| res2 = 0; | |||
| res3 = 0; | |||
/*
 * Number of k steps to sum: the tail bk-off, or the head off+2. The #elif
 * and #else branches are identical here because the tile is 2 wide in both
 * directions.
 */
| #if (defined(LEFT) && !defined(TRANSA)) || \ | |||
| (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+2; | |||
| #else | |||
| temp = off+2; | |||
| #endif | |||
/* Inner loop unrolled four times over k; accumulation order stays sequential. */
| for (k=0; k<temp/4; k+=1) | |||
| { | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res2 = res2+load0*load3; | |||
| res3 = res3+load2*load3; | |||
| load4 = ptrba[2*1+0]; | |||
| load5 = ptrbb[2*1+0]; | |||
| res0 = res0+load4*load5; | |||
| load6 = ptrba[2*1+1]; | |||
| res1 = res1+load6*load5; | |||
| load7 = ptrbb[2*1+1]; | |||
| res2 = res2+load4*load7; | |||
| res3 = res3+load6*load7; | |||
| load0 = ptrba[2*2+0]; | |||
| load1 = ptrbb[2*2+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*2+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[2*2+1]; | |||
| res2 = res2+load0*load3; | |||
| res3 = res3+load2*load3; | |||
| load4 = ptrba[2*3+0]; | |||
| load5 = ptrbb[2*3+0]; | |||
| res0 = res0+load4*load5; | |||
| load6 = ptrba[2*3+1]; | |||
| res1 = res1+load6*load5; | |||
| load7 = ptrbb[2*3+1]; | |||
| res2 = res2+load4*load7; | |||
| res3 = res3+load6*load7; | |||
| ptrba = ptrba+8; | |||
| ptrbb = ptrbb+8; | |||
| } | |||
/* Remaining temp%4 steps. */
| for (k=0; k<(temp&3); k+=1) | |||
| { | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res2 = res2+load0*load3; | |||
| res3 = res3+load2*load3; | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
/* Store (not accumulate) the scaled sums into the 2x2 tile of C. */
| res0 = res0*alpha; | |||
| C0[0] = res0; | |||
| res1 = res1*alpha; | |||
| C0[1] = res1; | |||
| res2 = res2*alpha; | |||
| C1[0] = res2; | |||
| res3 = res3*alpha; | |||
| C1[1] = res3; | |||
/*
 * When only the head of the k range was summed, step both pointers over the
 * unused remainder (bk-off-2 steps) so ptrba ends up at the next tile's data.
 */
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 2; | |||
| #else | |||
| temp -= 2; | |||
| #endif | |||
| ptrba += temp*2; | |||
| ptrbb += temp*2; | |||
| #endif | |||
/* With LEFT, off advances by the tile height after each tile. */
| #ifdef LEFT | |||
| off += 2; | |||
| #endif | |||
| C0 = C0+2; | |||
| C1 = C1+2; | |||
| } | |||
/* Leftover single element along bm (bm odd): 1x2 tile, ba read 1 per step. */
| for (i=0; i<(bm&1); i+=1) | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off; | |||
| ptrbb = bb+off*2; | |||
| #endif | |||
| res0 = 0; | |||
| res1 = 0; | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+1; | |||
| #else | |||
| temp = off+2; | |||
| #endif | |||
| for (k=0; k<temp; k+=1) | |||
| { | |||
| load0 = ptrba[0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrbb[2*0+1]; | |||
| res1 = res1+load0*load2; | |||
| ptrba = ptrba+1; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| res0 = res0*alpha; | |||
| C0[0] = res0; | |||
| res1 = res1*alpha; | |||
| C1[0] = res1; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk-off; | |||
| #ifdef LEFT | |||
| temp -= 1; | |||
| #else | |||
| temp -= 2; | |||
| #endif | |||
| ptrba += temp; | |||
| ptrbb += temp*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 1; | |||
| #endif | |||
| C0 = C0+1; | |||
| C1 = C1+1; | |||
| } | |||
/* Without LEFT, off advances by 2 after each pair of C lines. */
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 2; | |||
| #endif | |||
| k = (bk<<1); | |||
| bb = bb+k; | |||
| i = (ldc<<1); | |||
| C = C+i; | |||
| } | |||
/* Leftover single line along bn (bn odd): bb is read 1 value per k step. */
| for (j=0; j<(bn&1); j+=1) | |||
| { | |||
| C0 = C; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| ptrba = ba; | |||
| for (i=0; i<bm/2; i+=1) | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2; | |||
| ptrbb = bb + off; | |||
| #endif | |||
| res0 = 0; | |||
| res1 = 0; | |||
| #if (defined(LEFT) && !defined(TRANSA)) || \ | |||
| (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+2; | |||
| #else | |||
| temp = off+1; | |||
| #endif | |||
| for (k=0; k<temp; k+=1) | |||
| { | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| res0 = res0*alpha; | |||
| C0[0] = res0; | |||
| res1 = res1*alpha; | |||
| C0[1] = res1; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 2; | |||
| #else | |||
| temp -= 1; | |||
| #endif | |||
| ptrba += temp*2; | |||
| ptrbb += temp; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 2; | |||
| #endif | |||
| C0 = C0+2; | |||
| } | |||
/* Final 1x1 corner when both bm and bn are odd. */
| for (i=0; i<(bm&1); i+=1) | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off; | |||
| ptrbb = bb+off; | |||
| #endif | |||
| res0 = 0; | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off + 1; | |||
| #else | |||
| temp = off + 1; | |||
| #endif | |||
| for (k=0; k<temp; k+=1) | |||
| { | |||
| load0 = ptrba[0+0]; | |||
| load1 = ptrbb[0+0]; | |||
| res0 = res0+load0*load1; | |||
| ptrba = ptrba+1; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| res0 = res0*alpha; | |||
| C0[0] = res0; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk-off; | |||
| #ifdef LEFT | |||
| temp -= 1; | |||
| #else | |||
| temp -= 1; | |||
| #endif | |||
| ptrba += temp; | |||
| ptrbb += temp; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 1; | |||
| #endif | |||
| C0 = C0+1; | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 1; | |||
| #endif | |||
| k = (bk<<0); | |||
| bb = bb+k; | |||
| C = C+ldc; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,838 @@ | |||
| #include "common.h" | |||
| /******************************** | |||
| ADD1 a*c | |||
| ADD2 b*c | |||
| ADD3 a*d | |||
| ADD4 b*d | |||
| *********************************/ | |||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc | |||
| #ifdef TRMMKERNEL | |||
| , BLASLONG offset | |||
| #endif | |||
| ) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*ptrba,*ptrbb; | |||
| FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15; | |||
| for (j=0; j<bn/2; j+=1) | |||
| { | |||
| C0 = C; | |||
| C1 = C0+2*ldc; | |||
| ptrba = ba; | |||
| for (i=0; i<bm/2; i+=1) | |||
| { | |||
| ptrbb = bb; | |||
| res0 = 0; | |||
| res1 = 0; | |||
| res2 = 0; | |||
| res3 = 0; | |||
| res4 = 0; | |||
| res5 = 0; | |||
| res6 = 0; | |||
| res7 = 0; | |||
| for (k=0; k<bk/4; k+=1) | |||
| { | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3+load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5+load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4-load2*load7; | |||
| res5 = res5+load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7+load5*load6; | |||
| res6 = res6-load5*load7; | |||
| res7 = res7+load4*load7; | |||
| load8 = ptrba[4*1+0]; | |||
| load9 = ptrbb[4*1+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*1+1]; | |||
| res1 = res1+load10*load9; | |||
| load11 = ptrbb[4*1+1]; | |||
| res0 = res0-load10*load11; | |||
| res1 = res1+load8*load11; | |||
| load12 = ptrba[4*1+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*1+3]; | |||
| res3 = res3+load13*load9; | |||
| res2 = res2-load13*load11; | |||
| res3 = res3+load12*load11; | |||
| load14 = ptrbb[4*1+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5+load10*load14; | |||
| load15 = ptrbb[4*1+3]; | |||
| res4 = res4-load10*load15; | |||
| res5 = res5+load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7+load13*load14; | |||
| res6 = res6-load13*load15; | |||
| res7 = res7+load12*load15; | |||
| load0 = ptrba[4*2+0]; | |||
| load1 = ptrbb[4*2+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*2+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*2+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*2+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*2+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3+load4*load3; | |||
| load6 = ptrbb[4*2+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5+load2*load6; | |||
| load7 = ptrbb[4*2+3]; | |||
| res4 = res4-load2*load7; | |||
| res5 = res5+load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7+load5*load6; | |||
| res6 = res6-load5*load7; | |||
| res7 = res7+load4*load7; | |||
| load8 = ptrba[4*3+0]; | |||
| load9 = ptrbb[4*3+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*3+1]; | |||
| res1 = res1+load10*load9; | |||
| load11 = ptrbb[4*3+1]; | |||
| res0 = res0-load10*load11; | |||
| res1 = res1+load8*load11; | |||
| load12 = ptrba[4*3+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*3+3]; | |||
| res3 = res3+load13*load9; | |||
| res2 = res2-load13*load11; | |||
| res3 = res3+load12*load11; | |||
| load14 = ptrbb[4*3+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5+load10*load14; | |||
| load15 = ptrbb[4*3+3]; | |||
| res4 = res4-load10*load15; | |||
| res5 = res5+load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7+load13*load14; | |||
| res6 = res6-load13*load15; | |||
| res7 = res7+load12*load15; | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3-load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5+load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4+load2*load7; | |||
| res5 = res5-load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7+load5*load6; | |||
| res6 = res6+load5*load7; | |||
| res7 = res7-load4*load7; | |||
| load8 = ptrba[4*1+0]; | |||
| load9 = ptrbb[4*1+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*1+1]; | |||
| res1 = res1+load10*load9; | |||
| load11 = ptrbb[4*1+1]; | |||
| res0 = res0+load10*load11; | |||
| res1 = res1-load8*load11; | |||
| load12 = ptrba[4*1+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*1+3]; | |||
| res3 = res3+load13*load9; | |||
| res2 = res2+load13*load11; | |||
| res3 = res3-load12*load11; | |||
| load14 = ptrbb[4*1+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5+load10*load14; | |||
| load15 = ptrbb[4*1+3]; | |||
| res4 = res4+load10*load15; | |||
| res5 = res5-load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7+load13*load14; | |||
| res6 = res6+load13*load15; | |||
| res7 = res7-load12*load15; | |||
| load0 = ptrba[4*2+0]; | |||
| load1 = ptrbb[4*2+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*2+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*2+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*2+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*2+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3-load4*load3; | |||
| load6 = ptrbb[4*2+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5+load2*load6; | |||
| load7 = ptrbb[4*2+3]; | |||
| res4 = res4+load2*load7; | |||
| res5 = res5-load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7+load5*load6; | |||
| res6 = res6+load5*load7; | |||
| res7 = res7-load4*load7; | |||
| load8 = ptrba[4*3+0]; | |||
| load9 = ptrbb[4*3+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*3+1]; | |||
| res1 = res1+load10*load9; | |||
| load11 = ptrbb[4*3+1]; | |||
| res0 = res0+load10*load11; | |||
| res1 = res1-load8*load11; | |||
| load12 = ptrba[4*3+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*3+3]; | |||
| res3 = res3+load13*load9; | |||
| res2 = res2+load13*load11; | |||
| res3 = res3-load12*load11; | |||
| load14 = ptrbb[4*3+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5+load10*load14; | |||
| load15 = ptrbb[4*3+3]; | |||
| res4 = res4+load10*load15; | |||
| res5 = res5-load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7+load13*load14; | |||
| res6 = res6+load13*load15; | |||
| res7 = res7-load12*load15; | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3+load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5-load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4+load2*load7; | |||
| res5 = res5+load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7-load5*load6; | |||
| res6 = res6+load5*load7; | |||
| res7 = res7+load4*load7; | |||
| load8 = ptrba[4*1+0]; | |||
| load9 = ptrbb[4*1+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*1+1]; | |||
| res1 = res1-load10*load9; | |||
| load11 = ptrbb[4*1+1]; | |||
| res0 = res0+load10*load11; | |||
| res1 = res1+load8*load11; | |||
| load12 = ptrba[4*1+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*1+3]; | |||
| res3 = res3-load13*load9; | |||
| res2 = res2+load13*load11; | |||
| res3 = res3+load12*load11; | |||
| load14 = ptrbb[4*1+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5-load10*load14; | |||
| load15 = ptrbb[4*1+3]; | |||
| res4 = res4+load10*load15; | |||
| res5 = res5+load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7-load13*load14; | |||
| res6 = res6+load13*load15; | |||
| res7 = res7+load12*load15; | |||
| load0 = ptrba[4*2+0]; | |||
| load1 = ptrbb[4*2+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*2+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*2+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*2+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*2+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3+load4*load3; | |||
| load6 = ptrbb[4*2+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5-load2*load6; | |||
| load7 = ptrbb[4*2+3]; | |||
| res4 = res4+load2*load7; | |||
| res5 = res5+load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7-load5*load6; | |||
| res6 = res6+load5*load7; | |||
| res7 = res7+load4*load7; | |||
| load8 = ptrba[4*3+0]; | |||
| load9 = ptrbb[4*3+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*3+1]; | |||
| res1 = res1-load10*load9; | |||
| load11 = ptrbb[4*3+1]; | |||
| res0 = res0+load10*load11; | |||
| res1 = res1+load8*load11; | |||
| load12 = ptrba[4*3+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*3+3]; | |||
| res3 = res3-load13*load9; | |||
| res2 = res2+load13*load11; | |||
| res3 = res3+load12*load11; | |||
| load14 = ptrbb[4*3+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5-load10*load14; | |||
| load15 = ptrbb[4*3+3]; | |||
| res4 = res4+load10*load15; | |||
| res5 = res5+load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7-load13*load14; | |||
| res6 = res6+load13*load15; | |||
| res7 = res7+load12*load15; | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3-load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5-load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4-load2*load7; | |||
| res5 = res5-load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7-load5*load6; | |||
| res6 = res6-load5*load7; | |||
| res7 = res7-load4*load7; | |||
| load8 = ptrba[4*1+0]; | |||
| load9 = ptrbb[4*1+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*1+1]; | |||
| res1 = res1-load10*load9; | |||
| load11 = ptrbb[4*1+1]; | |||
| res0 = res0-load10*load11; | |||
| res1 = res1-load8*load11; | |||
| load12 = ptrba[4*1+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*1+3]; | |||
| res3 = res3-load13*load9; | |||
| res2 = res2-load13*load11; | |||
| res3 = res3-load12*load11; | |||
| load14 = ptrbb[4*1+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5-load10*load14; | |||
| load15 = ptrbb[4*1+3]; | |||
| res4 = res4-load10*load15; | |||
| res5 = res5-load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7-load13*load14; | |||
| res6 = res6-load13*load15; | |||
| res7 = res7-load12*load15; | |||
| load0 = ptrba[4*2+0]; | |||
| load1 = ptrbb[4*2+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*2+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*2+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*2+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*2+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3-load4*load3; | |||
| load6 = ptrbb[4*2+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5-load2*load6; | |||
| load7 = ptrbb[4*2+3]; | |||
| res4 = res4-load2*load7; | |||
| res5 = res5-load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7-load5*load6; | |||
| res6 = res6-load5*load7; | |||
| res7 = res7-load4*load7; | |||
| load8 = ptrba[4*3+0]; | |||
| load9 = ptrbb[4*3+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*3+1]; | |||
| res1 = res1-load10*load9; | |||
| load11 = ptrbb[4*3+1]; | |||
| res0 = res0-load10*load11; | |||
| res1 = res1-load8*load11; | |||
| load12 = ptrba[4*3+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*3+3]; | |||
| res3 = res3-load13*load9; | |||
| res2 = res2-load13*load11; | |||
| res3 = res3-load12*load11; | |||
| load14 = ptrbb[4*3+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5-load10*load14; | |||
| load15 = ptrbb[4*3+3]; | |||
| res4 = res4-load10*load15; | |||
| res5 = res5-load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7-load13*load14; | |||
| res6 = res6-load13*load15; | |||
| res7 = res7-load12*load15; | |||
| #endif | |||
| ptrba = ptrba+16; | |||
| ptrbb = ptrbb+16; | |||
| } | |||
| for (k=0; k<(bk&3); k+=1) | |||
| { | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3+load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5+load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4-load2*load7; | |||
| res5 = res5+load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7+load5*load6; | |||
| res6 = res6-load5*load7; | |||
| res7 = res7+load4*load7; | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3-load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5+load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4+load2*load7; | |||
| res5 = res5-load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7+load5*load6; | |||
| res6 = res6+load5*load7; | |||
| res7 = res7-load4*load7; | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3+load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5-load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4+load2*load7; | |||
| res5 = res5+load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7-load5*load6; | |||
| res6 = res6+load5*load7; | |||
| res7 = res7+load4*load7; | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3-load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5-load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4-load2*load7; | |||
| res5 = res5-load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7-load5*load6; | |||
| res6 = res6-load5*load7; | |||
| res7 = res7-load4*load7; | |||
| #endif | |||
| ptrba = ptrba+4; | |||
| ptrbb = ptrbb+4; | |||
| } | |||
| load0 = res0*alphar; | |||
| C0[0] = C0[0]+load0; | |||
| load1 = res1*alphar; | |||
| C0[1] = C0[1]+load1; | |||
| load0 = res1*alphai; | |||
| C0[0] = C0[0]-load0; | |||
| load1 = res0*alphai; | |||
| C0[1] = C0[1]+load1; | |||
| load2 = res2*alphar; | |||
| C0[2] = C0[2]+load2; | |||
| load3 = res3*alphar; | |||
| C0[3] = C0[3]+load3; | |||
| load2 = res3*alphai; | |||
| C0[2] = C0[2]-load2; | |||
| load3 = res2*alphai; | |||
| C0[3] = C0[3]+load3; | |||
| load4 = res4*alphar; | |||
| C1[0] = C1[0]+load4; | |||
| load5 = res5*alphar; | |||
| C1[1] = C1[1]+load5; | |||
| load4 = res5*alphai; | |||
| C1[0] = C1[0]-load4; | |||
| load5 = res4*alphai; | |||
| C1[1] = C1[1]+load5; | |||
| load6 = res6*alphar; | |||
| C1[2] = C1[2]+load6; | |||
| load7 = res7*alphar; | |||
| C1[3] = C1[3]+load7; | |||
| load6 = res7*alphai; | |||
| C1[2] = C1[2]-load6; | |||
| load7 = res6*alphai; | |||
| C1[3] = C1[3]+load7; | |||
| C0 = C0+4; | |||
| C1 = C1+4; | |||
| } | |||
| for (i=0; i<(bm&1); i+=1) | |||
| { | |||
| ptrbb = bb; | |||
| res0 = 0; | |||
| res1 = 0; | |||
| res2 = 0; | |||
| res3 = 0; | |||
| for (k=0; k<bk; k+=1) | |||
| { | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrbb[4*0+2]; | |||
| res2 = res2+load0*load4; | |||
| res3 = res3+load2*load4; | |||
| load5 = ptrbb[4*0+3]; | |||
| res2 = res2-load2*load5; | |||
| res3 = res3+load0*load5; | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrbb[4*0+2]; | |||
| res2 = res2+load0*load4; | |||
| res3 = res3+load2*load4; | |||
| load5 = ptrbb[4*0+3]; | |||
| res2 = res2+load2*load5; | |||
| res3 = res3-load0*load5; | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrbb[4*0+2]; | |||
| res2 = res2+load0*load4; | |||
| res3 = res3-load2*load4; | |||
| load5 = ptrbb[4*0+3]; | |||
| res2 = res2+load2*load5; | |||
| res3 = res3+load0*load5; | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrbb[4*0+2]; | |||
| res2 = res2+load0*load4; | |||
| res3 = res3-load2*load4; | |||
| load5 = ptrbb[4*0+3]; | |||
| res2 = res2-load2*load5; | |||
| res3 = res3-load0*load5; | |||
| #endif | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+4; | |||
| } | |||
| load0 = res0*alphar; | |||
| C0[0] = C0[0]+load0; | |||
| load1 = res1*alphar; | |||
| C0[1] = C0[1]+load1; | |||
| load0 = res1*alphai; | |||
| C0[0] = C0[0]-load0; | |||
| load1 = res0*alphai; | |||
| C0[1] = C0[1]+load1; | |||
| load2 = res2*alphar; | |||
| C1[0] = C1[0]+load2; | |||
| load3 = res3*alphar; | |||
| C1[1] = C1[1]+load3; | |||
| load2 = res3*alphai; | |||
| C1[0] = C1[0]-load2; | |||
| load3 = res2*alphai; | |||
| C1[1] = C1[1]+load3; | |||
| C0 = C0+2; | |||
| C1 = C1+2; | |||
| } | |||
| k = (bk<<2); | |||
| bb = bb+k; | |||
| i = (ldc<<2); | |||
| C = C+i; | |||
| } | |||
| for (j=0; j<(bn&1); j+=1) | |||
| { | |||
| C0 = C; | |||
| ptrba = ba; | |||
| for (i=0; i<bm/2; i+=1) | |||
| { | |||
| ptrbb = bb; | |||
| res0 = 0; | |||
| res1 = 0; | |||
| res2 = 0; | |||
| res3 = 0; | |||
| for (k=0; k<bk; k+=1) | |||
| { | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3+load4*load3; | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3-load4*load3; | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3+load4*load3; | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3-load4*load3; | |||
| #endif | |||
| ptrba = ptrba+4; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| load0 = res0*alphar; | |||
| C0[0] = C0[0]+load0; | |||
| load1 = res1*alphar; | |||
| C0[1] = C0[1]+load1; | |||
| load0 = res1*alphai; | |||
| C0[0] = C0[0]-load0; | |||
| load1 = res0*alphai; | |||
| C0[1] = C0[1]+load1; | |||
| load2 = res2*alphar; | |||
| C0[2] = C0[2]+load2; | |||
| load3 = res3*alphar; | |||
| C0[3] = C0[3]+load3; | |||
| load2 = res3*alphai; | |||
| C0[2] = C0[2]-load2; | |||
| load3 = res2*alphai; | |||
| C0[3] = C0[3]+load3; | |||
| C0 = C0+4; | |||
| } | |||
| for (i=0; i<(bm&1); i+=1) | |||
| { | |||
| ptrbb = bb; | |||
| res0 = 0; | |||
| res1 = 0; | |||
| for (k=0; k<bk; k+=1) | |||
| { | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1+load0*load3; | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1-load0*load3; | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1+load0*load3; | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1-load0*load3; | |||
| #endif | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| load0 = res0*alphar; | |||
| C0[0] = C0[0]+load0; | |||
| load1 = res1*alphar; | |||
| C0[1] = C0[1]+load1; | |||
| load0 = res1*alphai; | |||
| C0[0] = C0[0]-load0; | |||
| load1 = res0*alphai; | |||
| C0[1] = C0[1]+load1; | |||
| C0 = C0+2; | |||
| } | |||
| k = (bk<<1); | |||
| bb = bb+k; | |||
| i = (ldc<<1); | |||
| C = C+i; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,923 @@ | |||
| #include "common.h" | |||
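| /******************************** | |||
| Partial products of one complex multiply (a+bi)*(c+di), | |||
| where a,b are the real/imaginary parts read from ba and | |||
| c,d are the real/imaginary parts read from bb. The sign with | |||
| which each product is accumulated depends on the variant | |||
| selected below (NN/NR/RN/RR and their aliases): | |||
| ADD1 a*c | |||
| ADD2 b*c | |||
| ADD3 a*d | |||
| ADD4 b*d | |||
| *********************************/ | |||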
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb, | |||
| FLOAT* C,BLASLONG ldc, BLASLONG offset) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*ptrba,*ptrbb; | |||
| FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15; | |||
| BLASLONG off, temp; | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off = -offset; | |||
| #endif | |||
| for (j=0; j<bn/2; j+=1) | |||
| { | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| C0 = C; | |||
| C1 = C0+2*ldc; | |||
| ptrba = ba; | |||
| for (i=0; i<bm/2; i+=1) | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2*2; | |||
| ptrbb = bb+off*2*2; | |||
| #endif | |||
| res0 = 0; | |||
| res1 = 0; | |||
| res2 = 0; | |||
| res3 = 0; | |||
| res4 = 0; | |||
| res5 = 0; | |||
| res6 = 0; | |||
| res7 = 0; | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk - off; | |||
| #elif defined(LEFT) | |||
| temp = off + 2; | |||
| #else | |||
| temp = off + 2; | |||
| #endif | |||
| for (k=0; k<temp/4; k+=1) | |||
| { | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3+load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5+load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4-load2*load7; | |||
| res5 = res5+load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7+load5*load6; | |||
| res6 = res6-load5*load7; | |||
| res7 = res7+load4*load7; | |||
| load8 = ptrba[4*1+0]; | |||
| load9 = ptrbb[4*1+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*1+1]; | |||
| res1 = res1+load10*load9; | |||
| load11 = ptrbb[4*1+1]; | |||
| res0 = res0-load10*load11; | |||
| res1 = res1+load8*load11; | |||
| load12 = ptrba[4*1+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*1+3]; | |||
| res3 = res3+load13*load9; | |||
| res2 = res2-load13*load11; | |||
| res3 = res3+load12*load11; | |||
| load14 = ptrbb[4*1+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5+load10*load14; | |||
| load15 = ptrbb[4*1+3]; | |||
| res4 = res4-load10*load15; | |||
| res5 = res5+load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7+load13*load14; | |||
| res6 = res6-load13*load15; | |||
| res7 = res7+load12*load15; | |||
| load0 = ptrba[4*2+0]; | |||
| load1 = ptrbb[4*2+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*2+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*2+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*2+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*2+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3+load4*load3; | |||
| load6 = ptrbb[4*2+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5+load2*load6; | |||
| load7 = ptrbb[4*2+3]; | |||
| res4 = res4-load2*load7; | |||
| res5 = res5+load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7+load5*load6; | |||
| res6 = res6-load5*load7; | |||
| res7 = res7+load4*load7; | |||
| load8 = ptrba[4*3+0]; | |||
| load9 = ptrbb[4*3+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*3+1]; | |||
| res1 = res1+load10*load9; | |||
| load11 = ptrbb[4*3+1]; | |||
| res0 = res0-load10*load11; | |||
| res1 = res1+load8*load11; | |||
| load12 = ptrba[4*3+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*3+3]; | |||
| res3 = res3+load13*load9; | |||
| res2 = res2-load13*load11; | |||
| res3 = res3+load12*load11; | |||
| load14 = ptrbb[4*3+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5+load10*load14; | |||
| load15 = ptrbb[4*3+3]; | |||
| res4 = res4-load10*load15; | |||
| res5 = res5+load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7+load13*load14; | |||
| res6 = res6-load13*load15; | |||
| res7 = res7+load12*load15; | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3-load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5+load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4+load2*load7; | |||
| res5 = res5-load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7+load5*load6; | |||
| res6 = res6+load5*load7; | |||
| res7 = res7-load4*load7; | |||
| load8 = ptrba[4*1+0]; | |||
| load9 = ptrbb[4*1+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*1+1]; | |||
| res1 = res1+load10*load9; | |||
| load11 = ptrbb[4*1+1]; | |||
| res0 = res0+load10*load11; | |||
| res1 = res1-load8*load11; | |||
| load12 = ptrba[4*1+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*1+3]; | |||
| res3 = res3+load13*load9; | |||
| res2 = res2+load13*load11; | |||
| res3 = res3-load12*load11; | |||
| load14 = ptrbb[4*1+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5+load10*load14; | |||
| load15 = ptrbb[4*1+3]; | |||
| res4 = res4+load10*load15; | |||
| res5 = res5-load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7+load13*load14; | |||
| res6 = res6+load13*load15; | |||
| res7 = res7-load12*load15; | |||
| load0 = ptrba[4*2+0]; | |||
| load1 = ptrbb[4*2+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*2+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*2+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*2+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*2+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3-load4*load3; | |||
| load6 = ptrbb[4*2+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5+load2*load6; | |||
| load7 = ptrbb[4*2+3]; | |||
| res4 = res4+load2*load7; | |||
| res5 = res5-load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7+load5*load6; | |||
| res6 = res6+load5*load7; | |||
| res7 = res7-load4*load7; | |||
| load8 = ptrba[4*3+0]; | |||
| load9 = ptrbb[4*3+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*3+1]; | |||
| res1 = res1+load10*load9; | |||
| load11 = ptrbb[4*3+1]; | |||
| res0 = res0+load10*load11; | |||
| res1 = res1-load8*load11; | |||
| load12 = ptrba[4*3+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*3+3]; | |||
| res3 = res3+load13*load9; | |||
| res2 = res2+load13*load11; | |||
| res3 = res3-load12*load11; | |||
| load14 = ptrbb[4*3+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5+load10*load14; | |||
| load15 = ptrbb[4*3+3]; | |||
| res4 = res4+load10*load15; | |||
| res5 = res5-load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7+load13*load14; | |||
| res6 = res6+load13*load15; | |||
| res7 = res7-load12*load15; | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3+load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5-load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4+load2*load7; | |||
| res5 = res5+load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7-load5*load6; | |||
| res6 = res6+load5*load7; | |||
| res7 = res7+load4*load7; | |||
| load8 = ptrba[4*1+0]; | |||
| load9 = ptrbb[4*1+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*1+1]; | |||
| res1 = res1-load10*load9; | |||
| load11 = ptrbb[4*1+1]; | |||
| res0 = res0+load10*load11; | |||
| res1 = res1+load8*load11; | |||
| load12 = ptrba[4*1+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*1+3]; | |||
| res3 = res3-load13*load9; | |||
| res2 = res2+load13*load11; | |||
| res3 = res3+load12*load11; | |||
| load14 = ptrbb[4*1+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5-load10*load14; | |||
| load15 = ptrbb[4*1+3]; | |||
| res4 = res4+load10*load15; | |||
| res5 = res5+load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7-load13*load14; | |||
| res6 = res6+load13*load15; | |||
| res7 = res7+load12*load15; | |||
| load0 = ptrba[4*2+0]; | |||
| load1 = ptrbb[4*2+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*2+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*2+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*2+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*2+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3+load4*load3; | |||
| load6 = ptrbb[4*2+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5-load2*load6; | |||
| load7 = ptrbb[4*2+3]; | |||
| res4 = res4+load2*load7; | |||
| res5 = res5+load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7-load5*load6; | |||
| res6 = res6+load5*load7; | |||
| res7 = res7+load4*load7; | |||
| load8 = ptrba[4*3+0]; | |||
| load9 = ptrbb[4*3+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*3+1]; | |||
| res1 = res1-load10*load9; | |||
| load11 = ptrbb[4*3+1]; | |||
| res0 = res0+load10*load11; | |||
| res1 = res1+load8*load11; | |||
| load12 = ptrba[4*3+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*3+3]; | |||
| res3 = res3-load13*load9; | |||
| res2 = res2+load13*load11; | |||
| res3 = res3+load12*load11; | |||
| load14 = ptrbb[4*3+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5-load10*load14; | |||
| load15 = ptrbb[4*3+3]; | |||
| res4 = res4+load10*load15; | |||
| res5 = res5+load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7-load13*load14; | |||
| res6 = res6+load13*load15; | |||
| res7 = res7+load12*load15; | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3-load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5-load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4-load2*load7; | |||
| res5 = res5-load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7-load5*load6; | |||
| res6 = res6-load5*load7; | |||
| res7 = res7-load4*load7; | |||
| load8 = ptrba[4*1+0]; | |||
| load9 = ptrbb[4*1+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*1+1]; | |||
| res1 = res1-load10*load9; | |||
| load11 = ptrbb[4*1+1]; | |||
| res0 = res0-load10*load11; | |||
| res1 = res1-load8*load11; | |||
| load12 = ptrba[4*1+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*1+3]; | |||
| res3 = res3-load13*load9; | |||
| res2 = res2-load13*load11; | |||
| res3 = res3-load12*load11; | |||
| load14 = ptrbb[4*1+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5-load10*load14; | |||
| load15 = ptrbb[4*1+3]; | |||
| res4 = res4-load10*load15; | |||
| res5 = res5-load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7-load13*load14; | |||
| res6 = res6-load13*load15; | |||
| res7 = res7-load12*load15; | |||
| load0 = ptrba[4*2+0]; | |||
| load1 = ptrbb[4*2+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*2+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*2+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*2+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*2+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3-load4*load3; | |||
| load6 = ptrbb[4*2+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5-load2*load6; | |||
| load7 = ptrbb[4*2+3]; | |||
| res4 = res4-load2*load7; | |||
| res5 = res5-load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7-load5*load6; | |||
| res6 = res6-load5*load7; | |||
| res7 = res7-load4*load7; | |||
| load8 = ptrba[4*3+0]; | |||
| load9 = ptrbb[4*3+0]; | |||
| res0 = res0+load8*load9; | |||
| load10 = ptrba[4*3+1]; | |||
| res1 = res1-load10*load9; | |||
| load11 = ptrbb[4*3+1]; | |||
| res0 = res0-load10*load11; | |||
| res1 = res1-load8*load11; | |||
| load12 = ptrba[4*3+2]; | |||
| res2 = res2+load12*load9; | |||
| load13 = ptrba[4*3+3]; | |||
| res3 = res3-load13*load9; | |||
| res2 = res2-load13*load11; | |||
| res3 = res3-load12*load11; | |||
| load14 = ptrbb[4*3+2]; | |||
| res4 = res4+load8*load14; | |||
| res5 = res5-load10*load14; | |||
| load15 = ptrbb[4*3+3]; | |||
| res4 = res4-load10*load15; | |||
| res5 = res5-load8*load15; | |||
| res6 = res6+load12*load14; | |||
| res7 = res7-load13*load14; | |||
| res6 = res6-load13*load15; | |||
| res7 = res7-load12*load15; | |||
| #endif | |||
| ptrba = ptrba+16; | |||
| ptrbb = ptrbb+16; | |||
| } | |||
| for (k=0; k<(temp&3); k+=1) | |||
| { | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3+load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5+load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4-load2*load7; | |||
| res5 = res5+load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7+load5*load6; | |||
| res6 = res6-load5*load7; | |||
| res7 = res7+load4*load7; | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3-load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5+load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4+load2*load7; | |||
| res5 = res5-load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7+load5*load6; | |||
| res6 = res6+load5*load7; | |||
| res7 = res7-load4*load7; | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3+load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5-load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4+load2*load7; | |||
| res5 = res5+load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7-load5*load6; | |||
| res6 = res6+load5*load7; | |||
| res7 = res7+load4*load7; | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3-load4*load3; | |||
| load6 = ptrbb[4*0+2]; | |||
| res4 = res4+load0*load6; | |||
| res5 = res5-load2*load6; | |||
| load7 = ptrbb[4*0+3]; | |||
| res4 = res4-load2*load7; | |||
| res5 = res5-load0*load7; | |||
| res6 = res6+load4*load6; | |||
| res7 = res7-load5*load6; | |||
| res6 = res6-load5*load7; | |||
| res7 = res7-load4*load7; | |||
| #endif | |||
| ptrba = ptrba+4; | |||
| ptrbb = ptrbb+4; | |||
| } | |||
| load0 = res0*alphar-res1*alphai; | |||
| load1 = res1*alphar+res0*alphai; | |||
| C0[0] = load0; | |||
| C0[1] = load1; | |||
| load2 = res2*alphar-res3*alphai; | |||
| load3 = res3*alphar+res2*alphai; | |||
| C0[2] = load2; | |||
| C0[3] = load3; | |||
| load4 = res4*alphar-res5*alphai; | |||
| load5 = res5*alphar+res4*alphai; | |||
| C1[0] = load4; | |||
| C1[1] = load5; | |||
| load6 = res6*alphar-res7*alphai; | |||
| load7 = res7*alphar+res6*alphai; | |||
| C1[2] = load6; | |||
| C1[3] = load7; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 2; | |||
| #else | |||
| temp -= 2; | |||
| #endif | |||
| ptrba += temp*2*2; | |||
| ptrbb += temp*2*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 2; | |||
| #endif | |||
| C0 = C0+4; | |||
| C1 = C1+4; | |||
| } | |||
| for (i=0; i<(bm&1); i+=1) | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2; | |||
| ptrbb = bb + off*2*2; | |||
| #endif | |||
| res0 = 0; | |||
| res1 = 0; | |||
| res2 = 0; | |||
| res3 = 0; | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk - off; | |||
| #elif defined(LEFT) | |||
| temp = off+1; | |||
| #else | |||
| temp = off+2; | |||
| #endif | |||
| for (k=0; k<temp; k+=1) | |||
| { | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrbb[4*0+2]; | |||
| res2 = res2+load0*load4; | |||
| res3 = res3+load2*load4; | |||
| load5 = ptrbb[4*0+3]; | |||
| res2 = res2-load2*load5; | |||
| res3 = res3+load0*load5; | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrbb[4*0+2]; | |||
| res2 = res2+load0*load4; | |||
| res3 = res3+load2*load4; | |||
| load5 = ptrbb[4*0+3]; | |||
| res2 = res2+load2*load5; | |||
| res3 = res3-load0*load5; | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrbb[4*0+2]; | |||
| res2 = res2+load0*load4; | |||
| res3 = res3-load2*load4; | |||
| load5 = ptrbb[4*0+3]; | |||
| res2 = res2+load2*load5; | |||
| res3 = res3+load0*load5; | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[4*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[4*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrbb[4*0+2]; | |||
| res2 = res2+load0*load4; | |||
| res3 = res3-load2*load4; | |||
| load5 = ptrbb[4*0+3]; | |||
| res2 = res2-load2*load5; | |||
| res3 = res3-load0*load5; | |||
| #endif | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+4; | |||
| } | |||
| load0 = res0*alphar-res1*alphai; | |||
| load1 = res1*alphar+res0*alphai; | |||
| C0[0] = load0; | |||
| C0[1] = load1; | |||
| load2 = res2*alphar-res3*alphai; | |||
| load3 = res3*alphar+res2*alphai; | |||
| C1[0] = load2; | |||
| C1[1] = load3; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 1; | |||
| #else | |||
| temp -= 2; | |||
| #endif | |||
| ptrba += temp*2; | |||
| ptrbb += temp*2*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 1; | |||
| #endif | |||
| C0 = C0+2; | |||
| C1 = C1+2; | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 2; | |||
| #endif | |||
| k = (bk<<2); | |||
| bb = bb+k; | |||
| i = (ldc<<2); | |||
| C = C+i; | |||
| } | |||
| for (j=0; j<(bn&1); j+=1) | |||
| { | |||
| C0 = C; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| ptrba = ba; | |||
| for (i=0; i<bm/2; i+=1) | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2*2; | |||
| ptrbb = bb+off*2; | |||
| #endif | |||
| res0 = 0; | |||
| res1 = 0; | |||
| res2 = 0; | |||
| res3 = 0; | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk - off; | |||
| #elif defined(LEFT) | |||
| temp = off + 2; | |||
| #else | |||
| temp = off + 1; | |||
| #endif | |||
| for (k=0; k<temp; k+=1) | |||
| { | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3+load4*load3; | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3+load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3-load4*load3; | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1+load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2+load5*load3; | |||
| res3 = res3+load4*load3; | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| load0 = ptrba[4*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[4*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1-load0*load3; | |||
| load4 = ptrba[4*0+2]; | |||
| res2 = res2+load4*load1; | |||
| load5 = ptrba[4*0+3]; | |||
| res3 = res3-load5*load1; | |||
| res2 = res2-load5*load3; | |||
| res3 = res3-load4*load3; | |||
| #endif | |||
| ptrba = ptrba+4; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| load0 = res0*alphar-res1*alphai; | |||
| load1 = res1*alphar+res0*alphai; | |||
| C0[0] = load0; | |||
| C0[1] = load1; | |||
| load2 = res2*alphar-res3*alphai; | |||
| load3 = res3*alphar+res2*alphai; | |||
| C0[2] = load2; | |||
| C0[3] = load3; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk-off; | |||
| #ifdef LEFT | |||
| temp -= 2; | |||
| #else | |||
| temp -= 1; | |||
| #endif | |||
| ptrba += temp*2*2; | |||
| ptrbb += temp*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 2; | |||
| #endif | |||
| C0 = C0+4; | |||
| } | |||
| for (i=0; i<(bm&1); i+=1) | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2; | |||
| ptrbb = bb + off*2; | |||
| #endif | |||
| res0 = 0; | |||
| res1 = 0; | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off + 1; | |||
| #else | |||
| temp = off + 1; | |||
| #endif | |||
| for (k=0; k<temp; k+=1) | |||
| { | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1+load0*load3; | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1-load0*load3; | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0+load2*load3; | |||
| res1 = res1+load0*load3; | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1-load2*load1; | |||
| load3 = ptrbb[2*0+1]; | |||
| res0 = res0-load2*load3; | |||
| res1 = res1-load0*load3; | |||
| #endif | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| load0 = res0*alphar-res1*alphai; | |||
| load1 = res1*alphar+res0*alphai; | |||
| C0[0] = load0; | |||
| C0[1] = load1; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 1; | |||
| #else | |||
| temp -= 1; | |||
| #endif | |||
| ptrba += temp*2; | |||
| ptrbb += temp*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 1; | |||
| #endif | |||
| C0 = C0+2; | |||
| } | |||
| k = (bk<<1); | |||
| bb = bb+k; | |||
| i = (ldc<<1); | |||
| C = C+i; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,64 @@ | |||
| # Kernel source selection: each variable below names the source file used for one routine. | |||
| # NOTE(review): the S/D/C/Z prefixes presumably follow BLAS precision naming (single, double, complex, double complex) - confirm. | |||
| # AXPY: both variables point at loongson3a .S sources; the double-precision one is the "_simd" variant. | |||
| SAXPYKERNEL=axpy_loongson3a.S | |||
| DAXPYKERNEL=daxpy_loongson3a_simd.S | |||
| # GEMV (N and T variants): S and D share the gemv_*_loongson3a.c files, C and Z share the zgemv_*_loongson3a.c files. | |||
| SGEMVNKERNEL = gemv_n_loongson3a.c | |||
| SGEMVTKERNEL = gemv_t_loongson3a.c | |||
| DGEMVNKERNEL = gemv_n_loongson3a.c | |||
| DGEMVTKERNEL = gemv_t_loongson3a.c | |||
| CGEMVNKERNEL = zgemv_n_loongson3a.c | |||
| CGEMVTKERNEL = zgemv_t_loongson3a.c | |||
| ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||
| ZGEMVTKERNEL = zgemv_t_loongson3a.c | |||
| # TRMM: all four variables use the 2x2 C sources from ../generic (trmmkernel for S/D, ztrmmkernel for C/Z). | |||
| STRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| # GEMM: per prefix, the 2x2 kernel from ../generic, the ncopy/tcopy sources (suffix _2), and the object file names for the two copy routines. | |||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| # C and Z GEMM use the z-prefixed sources from ../generic. | |||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| # TRSM: the same four ../generic sources (LN, LT, RN, RT) are assigned for every prefix, including C and Z. | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| @@ -1502,10 +1502,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CGEMM_DEFAULT_Q 128 | |||
| #define ZGEMM_DEFAULT_Q 80 | |||
| #define SGEMM_DEFAULT_R 1024 | |||
| #define SGEMM_DEFAULT_R 640 | |||
| #define DGEMM_DEFAULT_R dgemm_r | |||
| #define CGEMM_DEFAULT_R 1024 | |||
| #define ZGEMM_DEFAULT_R 1024 | |||
| #define CGEMM_DEFAULT_R 640 | |||
| #define ZGEMM_DEFAULT_R 640 | |||
| #define GEMM_OFFSET_A1 0x10000 | |||
| #define GEMM_OFFSET_B1 0x100000 | |||
| #define SYMV_P 16 | |||
| #endif | |||
| #ifdef LOONGSON3B | |||
| #define SNUMOPT 2 | |||
| #define DNUMOPT 2 | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 2 | |||
| #define SGEMM_DEFAULT_UNROLL_N 2 | |||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||
| #define DGEMM_DEFAULT_UNROLL_N 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define SGEMM_DEFAULT_P 64 | |||
| #define DGEMM_DEFAULT_P 24 | |||
| #define CGEMM_DEFAULT_P 24 | |||
| #define ZGEMM_DEFAULT_P 20 | |||
| #define SGEMM_DEFAULT_Q 192 | |||
| #define DGEMM_DEFAULT_Q 128 | |||
| #define CGEMM_DEFAULT_Q 128 | |||
| #define ZGEMM_DEFAULT_Q 64 | |||
| #define SGEMM_DEFAULT_R 512 | |||
| #define DGEMM_DEFAULT_R 512 | |||
| #define CGEMM_DEFAULT_R 512 | |||
| #define ZGEMM_DEFAULT_R 512 | |||
| #define GEMM_OFFSET_A1 0x10000 | |||
| #define GEMM_OFFSET_B1 0x100000 | |||