1. Use the LOONGSON3R3 and LOONGSON3R4 cores for Loongson. 2. Add DYNAMIC_ARCH support for Loongson. Change-Id: I1c6b54dbeca3a0cc31d1222af36a7e9bd6ab54c1 (tags/v0.3.13^2)
| @@ -625,6 +625,10 @@ DYNAMIC_CORE += EMAG8180 | |||||
| DYNAMIC_CORE += THUNDERX3T110 | DYNAMIC_CORE += THUNDERX3T110 | ||||
| endif | endif | ||||
| ifeq ($(ARCH), mips64) | |||||
| DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 | |||||
| endif | |||||
| ifeq ($(ARCH), zarch) | ifeq ($(ARCH), zarch) | ||||
| DYNAMIC_CORE = ZARCH_GENERIC | DYNAMIC_CORE = ZARCH_GENERIC | ||||
| @@ -787,14 +791,9 @@ CCOMMON_OPT += -mabi=32 | |||||
| BINARY_DEFINED = 1 | BINARY_DEFINED = 1 | ||||
| endif | endif | ||||
| ifeq ($(CORE), LOONGSON3A) | |||||
| CCOMMON_OPT += -march=mips64 | |||||
| FCOMMON_OPT += -march=mips64 | |||||
| endif | |||||
| ifeq ($(CORE), LOONGSON3B) | |||||
| CCOMMON_OPT += -march=mips64 | |||||
| FCOMMON_OPT += -march=mips64 | |||||
| ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) | |||||
| CCOMMON_OPT += -march=loongson3a | |||||
| FCOMMON_OPT += -march=loongson3a | |||||
| endif | endif | ||||
| ifeq ($(CORE), MIPS24K) | ifeq ($(CORE), MIPS24K) | ||||
| @@ -1078,11 +1077,11 @@ FCOMMON_OPT += -n32 | |||||
| else | else | ||||
| FCOMMON_OPT += -n64 | FCOMMON_OPT += -n64 | ||||
| endif | endif | ||||
| ifeq ($(CORE), LOONGSON3A) | |||||
| ifeq ($(CORE), LOONGSON3R3) | |||||
| FCOMMON_OPT += -loongson3 -static | FCOMMON_OPT += -loongson3 -static | ||||
| endif | endif | ||||
| ifeq ($(CORE), LOONGSON3B) | |||||
| ifeq ($(CORE), LOONGSON3R4) | |||||
| FCOMMON_OPT += -loongson3 -static | FCOMMON_OPT += -loongson3 -static | ||||
| endif | endif | ||||
| @@ -1108,11 +1107,11 @@ CCOMMON_OPT += -n32 | |||||
| else | else | ||||
| CCOMMON_OPT += -n64 | CCOMMON_OPT += -n64 | ||||
| endif | endif | ||||
| ifeq ($(CORE), LOONGSON3A) | |||||
| ifeq ($(CORE), LOONGSON3R3) | |||||
| CCOMMON_OPT += -loongson3 -static | CCOMMON_OPT += -loongson3 -static | ||||
| endif | endif | ||||
| ifeq ($(CORE), LOONGSON3B) | |||||
| ifeq ($(CORE), LOONGSON3R4) | |||||
| CCOMMON_OPT += -loongson3 -static | CCOMMON_OPT += -loongson3 -static | ||||
| endif | endif | ||||
| @@ -1223,10 +1222,8 @@ ifdef SMP | |||||
| CCOMMON_OPT += -DSMP_SERVER | CCOMMON_OPT += -DSMP_SERVER | ||||
| ifeq ($(ARCH), mips64) | ifeq ($(ARCH), mips64) | ||||
| ifneq ($(CORE), LOONGSON3B) | |||||
| USE_SIMPLE_THREADED_LEVEL3 = 1 | USE_SIMPLE_THREADED_LEVEL3 = 1 | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | # USE_SIMPLE_THREADED_LEVEL3 = 1 | ||||
| @@ -1342,11 +1339,9 @@ endif | |||||
| ifneq ($(ARCH), x86_64) | ifneq ($(ARCH), x86_64) | ||||
| ifneq ($(ARCH), x86) | ifneq ($(ARCH), x86) | ||||
| ifneq ($(CORE), LOONGSON3B) | |||||
| NO_AFFINITY = 1 | NO_AFFINITY = 1 | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifdef NO_AFFINITY | ifdef NO_AFFINITY | ||||
| ifeq ($(NO_AFFINITY), 0) | ifeq ($(NO_AFFINITY), 0) | ||||
| @@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, | |||||
| // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 | // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 | ||||
| return 0; | return 0; | ||||
| #else | #else | ||||
| #if defined (LOONGSON3B) | |||||
| #if defined (__64BIT__) | |||||
| return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | |||||
| #else | |||||
| return 0; //NULL Implementation on Loongson 3B 32bit. | |||||
| #endif | |||||
| #else | |||||
| //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 | //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 | ||||
| // unsigned long null_nodemask=0; | // unsigned long null_nodemask=0; | ||||
| return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | ||||
| #endif | #endif | ||||
| #endif | |||||
| } | } | ||||
| static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { | static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { | ||||
| @@ -229,12 +229,7 @@ REALNAME: ;\ | |||||
| #define BUFFER_SIZE ( 32 << 21) | #define BUFFER_SIZE ( 32 << 21) | ||||
| #if defined(LOONGSON3A) | |||||
| #define PAGESIZE (16UL << 10) | |||||
| #define FIXED_PAGESIZE (16UL << 10) | |||||
| #endif | |||||
| #if defined(LOONGSON3B) | |||||
| #if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||||
| #define PAGESIZE (16UL << 10) | #define PAGESIZE (16UL << 10) | ||||
| #define FIXED_PAGESIZE (16UL << 10) | #define FIXED_PAGESIZE (16UL << 10) | ||||
| #endif | #endif | ||||
| @@ -250,7 +245,7 @@ REALNAME: ;\ | |||||
| #define MAP_ANONYMOUS MAP_ANON | #define MAP_ANONYMOUS MAP_ANON | ||||
| #endif | #endif | ||||
| #if defined(LOONGSON3A) || defined(LOONGSON3B) | |||||
| #if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||||
| #define PREFETCHD_(x) ld $0, x | #define PREFETCHD_(x) ld $0, x | ||||
| #define PREFETCHD(x) PREFETCHD_(x) | #define PREFETCHD(x) PREFETCHD_(x) | ||||
| #else | #else | ||||
| @@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /* or implied, of The University of Texas at Austin. */ | /* or implied, of The University of Texas at Austin. */ | ||||
| /*********************************************************************/ | /*********************************************************************/ | ||||
| #define CPU_UNKNOWN 0 | |||||
| #define CPU_SICORTEX 1 | |||||
| #define CPU_LOONGSON3A 2 | |||||
| #define CPU_LOONGSON3B 3 | |||||
| #define CPU_I6400 4 | |||||
| #define CPU_P6600 5 | |||||
| #define CPU_I6500 6 | |||||
| #define CPU_UNKNOWN 0 | |||||
| #define CPU_SICORTEX 1 | |||||
| #define CPU_LOONGSON3R3 2 | |||||
| #define CPU_LOONGSON3R4 3 | |||||
| #define CPU_I6400 4 | |||||
| #define CPU_P6600 5 | |||||
| #define CPU_I6500 6 | |||||
| static char *cpuname[] = { | static char *cpuname[] = { | ||||
| "UNKNOWN", | "UNKNOWN", | ||||
| "SICORTEX", | "SICORTEX", | ||||
| "LOONGSON3A", | |||||
| "LOONGSON3B", | |||||
| "LOONGSON3R3", | |||||
| "LOONGSON3R4", | |||||
| "I6400", | "I6400", | ||||
| "P6600", | "P6600", | ||||
| "I6500" | "I6500" | ||||
| @@ -90,48 +90,13 @@ static char *cpuname[] = { | |||||
| int detect(void){ | int detect(void){ | ||||
| #ifdef __linux | |||||
| #ifdef linux | |||||
| FILE *infile; | FILE *infile; | ||||
| char buffer[512], *p; | char buffer[512], *p; | ||||
| p = (char *)NULL; | p = (char *)NULL; | ||||
| infile = fopen("/proc/cpuinfo", "r"); | |||||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||||
| if (!strncmp("cpu", buffer, 3)){ | |||||
| p = strchr(buffer, ':') + 2; | |||||
| #if 0 | |||||
| fprintf(stderr, "%s\n", p); | |||||
| #endif | |||||
| break; | |||||
| } | |||||
| } | |||||
| fclose(infile); | |||||
| if(p != NULL){ | |||||
| if (strstr(p, "Loongson-3A")){ | |||||
| return CPU_LOONGSON3A; | |||||
| }else if(strstr(p, "Loongson-3B")){ | |||||
| return CPU_LOONGSON3B; | |||||
| }else if (strstr(p, "Loongson-3")){ | |||||
| infile = fopen("/proc/cpuinfo", "r"); | |||||
| p = (char *)NULL; | |||||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||||
| if (!strncmp("system type", buffer, 11)){ | |||||
| p = strchr(buffer, ':') + 2; | |||||
| break; | |||||
| } | |||||
| } | |||||
| fclose(infile); | |||||
| if (strstr(p, "loongson3a")) | |||||
| return CPU_LOONGSON3A; | |||||
| }else{ | |||||
| return CPU_SICORTEX; | |||||
| } | |||||
| } | |||||
| //Check model name for Loongson3 | //Check model name for Loongson3 | ||||
| infile = fopen("/proc/cpuinfo", "r"); | infile = fopen("/proc/cpuinfo", "r"); | ||||
| p = (char *)NULL; | |||||
| while (fgets(buffer, sizeof(buffer), infile)){ | while (fgets(buffer, sizeof(buffer), infile)){ | ||||
| if (!strncmp("model name", buffer, 10)){ | if (!strncmp("model name", buffer, 10)){ | ||||
| p = strchr(buffer, ':') + 2; | p = strchr(buffer, ':') + 2; | ||||
| @@ -140,14 +105,16 @@ int detect(void){ | |||||
| } | } | ||||
| fclose(infile); | fclose(infile); | ||||
| if(p != NULL){ | if(p != NULL){ | ||||
| if (strstr(p, "Loongson-3A")){ | |||||
| return CPU_LOONGSON3A; | |||||
| }else if(strstr(p, "Loongson-3B")){ | |||||
| return CPU_LOONGSON3B; | |||||
| } | |||||
| if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ | |||||
| return CPU_LOONGSON3R3; | |||||
| }else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ | |||||
| return CPU_LOONGSON3R4; | |||||
| } else{ | |||||
| return CPU_SICORTEX; | |||||
| } | } | ||||
| #endif | #endif | ||||
| return CPU_UNKNOWN; | return CPU_UNKNOWN; | ||||
| } | |||||
| } | } | ||||
| char *get_corename(void){ | char *get_corename(void){ | ||||
| @@ -159,10 +126,10 @@ void get_architecture(void){ | |||||
| } | } | ||||
| void get_subarchitecture(void){ | void get_subarchitecture(void){ | ||||
| if(detect()==CPU_LOONGSON3A) { | |||||
| printf("LOONGSON3A"); | |||||
| }else if(detect()==CPU_LOONGSON3B){ | |||||
| printf("LOONGSON3B"); | |||||
| if(detect()==CPU_LOONGSON3R3) { | |||||
| printf("LOONGSON3R3"); | |||||
| }else if(detect()==CPU_LOONGSON3R4){ | |||||
| printf("LOONGSON3R4"); | |||||
| }else if(detect()==CPU_I6400){ | }else if(detect()==CPU_I6400){ | ||||
| printf("I6400"); | printf("I6400"); | ||||
| }else if(detect()==CPU_P6600){ | }else if(detect()==CPU_P6600){ | ||||
| @@ -179,8 +146,8 @@ void get_subdirname(void){ | |||||
| } | } | ||||
| void get_cpuconfig(void){ | void get_cpuconfig(void){ | ||||
| if(detect()==CPU_LOONGSON3A) { | |||||
| printf("#define LOONGSON3A\n"); | |||||
| if(detect()==CPU_LOONGSON3R3) { | |||||
| printf("#define LOONGSON3R3\n"); | |||||
| printf("#define L1_DATA_SIZE 65536\n"); | printf("#define L1_DATA_SIZE 65536\n"); | ||||
| printf("#define L1_DATA_LINESIZE 32\n"); | printf("#define L1_DATA_LINESIZE 32\n"); | ||||
| printf("#define L2_SIZE 512488\n"); | printf("#define L2_SIZE 512488\n"); | ||||
| @@ -188,8 +155,8 @@ void get_cpuconfig(void){ | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | printf("#define DTB_DEFAULT_ENTRIES 64\n"); | ||||
| printf("#define DTB_SIZE 4096\n"); | printf("#define DTB_SIZE 4096\n"); | ||||
| printf("#define L2_ASSOCIATIVE 4\n"); | printf("#define L2_ASSOCIATIVE 4\n"); | ||||
| }else if(detect()==CPU_LOONGSON3B){ | |||||
| printf("#define LOONGSON3B\n"); | |||||
| }else if(detect()==CPU_LOONGSON3R4){ | |||||
| printf("#define LOONGSON3R4\n"); | |||||
| printf("#define L1_DATA_SIZE 65536\n"); | printf("#define L1_DATA_SIZE 65536\n"); | ||||
| printf("#define L1_DATA_LINESIZE 32\n"); | printf("#define L1_DATA_LINESIZE 32\n"); | ||||
| printf("#define L2_SIZE 512488\n"); | printf("#define L2_SIZE 512488\n"); | ||||
| @@ -237,10 +204,10 @@ void get_cpuconfig(void){ | |||||
| } | } | ||||
| void get_libname(void){ | void get_libname(void){ | ||||
| if(detect()==CPU_LOONGSON3A) { | |||||
| printf("loongson3a\n"); | |||||
| }else if(detect()==CPU_LOONGSON3B) { | |||||
| printf("loongson3b\n"); | |||||
| if(detect()==CPU_LOONGSON3R3) { | |||||
| printf("loongson3r3\n"); | |||||
| }else if(detect()==CPU_LOONGSON3R4) { | |||||
| printf("loongson3r4\n"); | |||||
| }else if(detect()==CPU_I6400) { | }else if(detect()==CPU_I6400) { | ||||
| printf("i6400\n"); | printf("i6400\n"); | ||||
| }else if(detect()==CPU_P6600) { | }else if(detect()==CPU_P6600) { | ||||
| @@ -24,10 +24,14 @@ else | |||||
| ifeq ($(ARCH),zarch) | ifeq ($(ARCH),zarch) | ||||
| COMMONOBJS += dynamic_zarch.$(SUFFIX) | COMMONOBJS += dynamic_zarch.$(SUFFIX) | ||||
| else | else | ||||
| ifeq ($(ARCH),mips64) | |||||
| COMMONOBJS += dynamic_mips64.$(SUFFIX) | |||||
| else | |||||
| COMMONOBJS += dynamic.$(SUFFIX) | COMMONOBJS += dynamic.$(SUFFIX) | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| else | else | ||||
| COMMONOBJS += parameter.$(SUFFIX) | COMMONOBJS += parameter.$(SUFFIX) | ||||
| endif | endif | ||||
| @@ -92,10 +96,14 @@ else | |||||
| ifeq ($(ARCH),zarch) | ifeq ($(ARCH),zarch) | ||||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) | HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) | ||||
| else | else | ||||
| ifeq ($(ARCH),mips64) | |||||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) | |||||
| else | |||||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| else | else | ||||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | ||||
| endif | endif | ||||
| @@ -967,9 +967,11 @@ void goto_set_num_threads(int num_threads) { | |||||
| blas_cpu_number = num_threads; | blas_cpu_number = num_threads; | ||||
| #if defined(ARCH_MIPS64) | #if defined(ARCH_MIPS64) | ||||
| #ifndef DYNAMIC_ARCH | |||||
| //set parameters for different number of threads. | //set parameters for different number of threads. | ||||
| blas_set_parameter(); | blas_set_parameter(); | ||||
| #endif | #endif | ||||
| #endif | |||||
| } | } | ||||
| @@ -0,0 +1,230 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <unistd.h>
#include "common.h"
| extern gotoblas_t gotoblas_LOONGSON3R3; | |||||
| extern gotoblas_t gotoblas_LOONGSON3R4; | |||||
| extern void openblas_warning(int verbose, const char * msg); | |||||
| #define NUM_CORETYPES 2 | |||||
| static char *corename[] = { | |||||
| "loongson3r3", | |||||
| "loongson3r4", | |||||
| "UNKNOWN" | |||||
| }; | |||||
| char *gotoblas_corename(void) { | |||||
| if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0]; | |||||
| if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1]; | |||||
| return corename[NUM_CORETYPES]; | |||||
| } | |||||
| static gotoblas_t *force_coretype(char *coretype) { | |||||
| int i; | |||||
| int found = -1; | |||||
| char message[128]; | |||||
| for ( i=0 ; i < NUM_CORETYPES; i++) | |||||
| { | |||||
| if (!strncasecmp(coretype, corename[i], 20)) | |||||
| { | |||||
| found = i; | |||||
| break; | |||||
| } | |||||
| } | |||||
| switch (found) | |||||
| { | |||||
| case 0: return (&gotoblas_LOONGSON3R3); | |||||
| case 1: return (&gotoblas_LOONGSON3R4); | |||||
| } | |||||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||||
| openblas_warning(1, message); | |||||
| return NULL; | |||||
| } | |||||
/* Bits tested in the word read back by get_coretype_from_cpucfg():
 * MSA_MASK selects the LOONGSON3R4 table, MMI_MASK the LOONGSON3R3 one. */
#define MMI_MASK    0x00000010
#define MSA_MASK    0x00000020

/* Pipe used by cpucfg_test() to pass the probe result from the forked
 * child back to the parent, and the probe result itself. */
int fd[2];
int support_cpucfg;

/* SIGILL handler installed in the probe child created by cpucfg_test().
 *
 * Closes the child's write end of the pipe and terminates the child with
 * status 1. _exit() is used rather than exit(): exit() is not
 * async-signal-safe, and in a forked child it would also run atexit
 * handlers and flush stdio buffers inherited from the parent, duplicating
 * any output the parent still has pending.
 */
static void handler(int signum)
{
  (void)signum;
  close(fd[1]);
  _exit(1);
}
| /* Brief : Function to check if cpucfg supported on loongson | |||||
| * Return: 1 supported | |||||
| * 0 not supported | |||||
| */ | |||||
| static int cpucfg_test(void) { | |||||
| pid_t pid; | |||||
| int status = 0; | |||||
| support_cpucfg = 0; | |||||
| pipe(fd); | |||||
| pid = fork(); | |||||
| if (pid == 0) { /* Subprocess */ | |||||
| struct sigaction act; | |||||
| close(fd[0]); | |||||
| /* Set signal action for SIGILL. */ | |||||
| act.sa_handler = handler; | |||||
| sigaction(SIGILL,&act,NULL); | |||||
| /* Execute cpucfg in subprocess. */ | |||||
| __asm__ volatile( | |||||
| ".insn \n\t" | |||||
| ".word (0xc8080118) \n\t" | |||||
| ::: | |||||
| ); | |||||
| support_cpucfg = 1; | |||||
| write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); | |||||
| close(fd[1]); | |||||
| exit(0); | |||||
| } else if (pid > 0){ /* Parent process*/ | |||||
| close(fd[1]); | |||||
| if ((waitpid(pid,&status,0) <= 0) || | |||||
| (read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) | |||||
| support_cpucfg = 0; | |||||
| close(fd[0]); | |||||
| } else { | |||||
| support_cpucfg = 0; | |||||
| } | |||||
| return support_cpucfg; | |||||
| } | |||||
| static gotoblas_t *get_coretype_from_cpucfg(void) { | |||||
| int flag = 0; | |||||
| __asm__ volatile( | |||||
| ".insn \n\t" | |||||
| "dli $8, 0x01 \n\t" | |||||
| ".word (0xc9084918) \n\t" | |||||
| "usw $9, 0x00(%0) \n\t" | |||||
| : | |||||
| : "r"(&flag) | |||||
| : "memory" | |||||
| ); | |||||
| if (flag & MSA_MASK) | |||||
| return (&gotoblas_LOONGSON3R4); | |||||
| if (flag & MMI_MASK) | |||||
| return (&gotoblas_LOONGSON3R3); | |||||
| return NULL; | |||||
| } | |||||
| static gotoblas_t *get_coretype_from_cpuinfo(void) { | |||||
| #ifdef linux | |||||
| FILE *infile; | |||||
| char buffer[512], *p; | |||||
| p = (char *)NULL; | |||||
| //Check model name for Loongson3 | |||||
| infile = fopen("/proc/cpuinfo", "r"); | |||||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||||
| if (!strncmp("model name", buffer, 10)){ | |||||
| p = strchr(buffer, ':') + 2; | |||||
| break; | |||||
| } | |||||
| } | |||||
| fclose(infile); | |||||
| if(p != NULL){ | |||||
| if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")) | |||||
| return (&gotoblas_LOONGSON3R3); | |||||
| else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")) | |||||
| return (&gotoblas_LOONGSON3R4); | |||||
| else | |||||
| return NULL; | |||||
| } | |||||
| #endif | |||||
| return NULL; | |||||
| } | |||||
| static gotoblas_t *get_coretype(void) { | |||||
| int ret = 0; | |||||
| ret = cpucfg_test(); | |||||
| if (ret == 1) | |||||
| return get_coretype_from_cpucfg(); | |||||
| else | |||||
| return get_coretype_from_cpuinfo(); | |||||
| } | |||||
| void gotoblas_dynamic_init(void) { | |||||
| char coremsg[128]; | |||||
| char coren[22]; | |||||
| char *p; | |||||
| if (gotoblas) return; | |||||
| p = getenv("OPENBLAS_CORETYPE"); | |||||
| if ( p ) | |||||
| { | |||||
| gotoblas = force_coretype(p); | |||||
| } | |||||
| else | |||||
| { | |||||
| gotoblas = get_coretype(); | |||||
| } | |||||
| if (gotoblas == NULL) | |||||
| { | |||||
| snprintf(coremsg, 128, "Falling back to loongson3r3 core\n"); | |||||
| openblas_warning(1, coremsg); | |||||
| gotoblas = &gotoblas_LOONGSON3R3; | |||||
| } | |||||
| if (gotoblas && gotoblas->init) { | |||||
| strncpy(coren, gotoblas_corename(), 20); | |||||
| sprintf(coremsg, "Core: %s\n", coren); | |||||
| openblas_warning(2, coremsg); | |||||
| gotoblas -> init(); | |||||
| } else { | |||||
| openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||||
| exit(1); | |||||
| } | |||||
| } | |||||
| void gotoblas_dynamic_quit(void) { | |||||
| gotoblas = NULL; | |||||
| } | |||||
| @@ -717,7 +717,7 @@ void blas_set_parameter(void){ | |||||
| #if defined(ARCH_MIPS64) | #if defined(ARCH_MIPS64) | ||||
| void blas_set_parameter(void){ | void blas_set_parameter(void){ | ||||
| #if defined(LOONGSON3A) | |||||
| #if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||||
| #ifdef SMP | #ifdef SMP | ||||
| if(blas_num_threads == 1){ | if(blas_num_threads == 1){ | ||||
| #endif | #endif | ||||
| @@ -731,20 +731,6 @@ void blas_set_parameter(void){ | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #if defined(LOONGSON3B) | |||||
| #ifdef SMP | |||||
| if(blas_num_threads == 1 || blas_num_threads == 2){ | |||||
| #endif | |||||
| //single thread | |||||
| dgemm_r = 640; | |||||
| #ifdef SMP | |||||
| }else{ | |||||
| //multi thread | |||||
| dgemm_r = 160; | |||||
| } | |||||
| #endif | |||||
| #endif | |||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -140,8 +140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /* #define FORCE_PPC440FP2 */ | /* #define FORCE_PPC440FP2 */ | ||||
| /* #define FORCE_CELL */ | /* #define FORCE_CELL */ | ||||
| /* #define FORCE_SICORTEX */ | /* #define FORCE_SICORTEX */ | ||||
| /* #define FORCE_LOONGSON3A */ | |||||
| /* #define FORCE_LOONGSON3B */ | |||||
| /* #define FORCE_LOONGSON3R3 */ | |||||
| /* #define FORCE_LOONGSON3R4 */ | |||||
| /* #define FORCE_I6400 */ | /* #define FORCE_I6400 */ | ||||
| /* #define FORCE_P6600 */ | /* #define FORCE_P6600 */ | ||||
| /* #define FORCE_P5600 */ | /* #define FORCE_P5600 */ | ||||
| @@ -814,31 +814,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #ifdef FORCE_LOONGSON3A | |||||
| #ifdef FORCE_LOONGSON3R3 | |||||
| #define FORCE | #define FORCE | ||||
| #define ARCHITECTURE "MIPS" | #define ARCHITECTURE "MIPS" | ||||
| #define SUBARCHITECTURE "LOONGSON3A" | |||||
| #define SUBARCHITECTURE "LOONGSON3R3" | |||||
| #define SUBDIRNAME "mips64" | #define SUBDIRNAME "mips64" | ||||
| #define ARCHCONFIG "-DLOONGSON3A " \ | |||||
| #define ARCHCONFIG "-DLOONGSON3R3 " \ | |||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | ||||
| "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | ||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | ||||
| #define LIBNAME "loongson3a" | |||||
| #define CORENAME "LOONGSON3A" | |||||
| #define LIBNAME "loongson3r3" | |||||
| #define CORENAME "LOONGSON3R3" | |||||
| #else | #else | ||||
| #endif | #endif | ||||
| #ifdef FORCE_LOONGSON3B | |||||
| #ifdef FORCE_LOONGSON3R4 | |||||
| #define FORCE | #define FORCE | ||||
| #define ARCHITECTURE "MIPS" | #define ARCHITECTURE "MIPS" | ||||
| #define SUBARCHITECTURE "LOONGSON3B" | |||||
| #define SUBARCHITECTURE "LOONGSON3R4" | |||||
| #define SUBDIRNAME "mips64" | #define SUBDIRNAME "mips64" | ||||
| #define ARCHCONFIG "-DLOONGSON3B " \ | |||||
| #define ARCHCONFIG "-DLOONGSON3R4 " \ | |||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | ||||
| "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | ||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | ||||
| #define LIBNAME "loongson3b" | |||||
| #define CORENAME "LOONGSON3B" | |||||
| #define LIBNAME "loongson3r4" | |||||
| #define CORENAME "LOONGSON3R4" | |||||
| #else | #else | ||||
| #endif | #endif | ||||
| @@ -58,6 +58,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) | |||||
| endif | endif | ||||
| else ifeq ($(TARGET_CORE), HASWELL) | else ifeq ($(TARGET_CORE), HASWELL) | ||||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) | override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) | ||||
| else ifeq ($(TARGET_CORE), LOONGSON3R4) | |||||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) | |||||
| else | else | ||||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | ||||
| endif | endif | ||||
| @@ -68,6 +70,9 @@ else | |||||
| TARGET_CORE = $(CORE) | TARGET_CORE = $(CORE) | ||||
| KDIR = | KDIR = | ||||
| TSUFFIX = | TSUFFIX = | ||||
| ifeq ($(TARGET_CORE), LOONGSON3R4) | |||||
| override CFLAGS += $(MSA_FLAGS) | |||||
| endif | |||||
| endif | endif | ||||
| -include $(KERNELDIR)/KERNEL.$(TARGET_CORE) | -include $(KERNELDIR)/KERNEL.$(TARGET_CORE) | ||||
| @@ -29,10 +29,6 @@ ifeq ($(ARCH), riscv64) | |||||
| USE_TRMM = 1 | USE_TRMM = 1 | ||||
| endif | endif | ||||
| ifeq ($(TARGET), LOONGSON3B) | |||||
| USE_TRMM = 1 | |||||
| endif | |||||
| ifneq ($(DYNAMIC_ARCH), 1) | ifneq ($(DYNAMIC_ARCH), 1) | ||||
| ifeq ($(TARGET), GENERIC) | ifeq ($(TARGET), GENERIC) | ||||
| USE_TRMM = 1 | USE_TRMM = 1 | ||||
| @@ -121,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ | #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ | ||||
| { \ | { \ | ||||
| LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ | LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ | ||||
| src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ | |||||
| src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ | |||||
| SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ | SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ | ||||
| \ | \ | ||||
| PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ | PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ | ||||
| @@ -200,7 +200,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ | #define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ | ||||
| { \ | { \ | ||||
| LD_SP2_INC(pa0, 4, src_a0, src_a1); \ | LD_SP2_INC(pa0, 4, src_a0, src_a1); \ | ||||
| src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ | |||||
| src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ | |||||
| SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ | SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ | ||||
| \ | \ | ||||
| PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ | PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ | ||||
| @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||||
| { | { | ||||
| if ((0 == c) && (0 == s)) | if ((0 == c) && (0 == s)) | ||||
| { | { | ||||
| v4f32 zero = __msa_cast_to_vector_float(0); | |||||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); | |||||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); | |||||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); | |||||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); | |||||
| v4f32 zero = {0.0, 0.0, 0.0, 0.0}; | |||||
| /* process 2 elements */ | /* process 2 elements */ | ||||
| for (j = (n >> 1); j--;) | for (j = (n >> 1); j--;) | ||||
| @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| { | { | ||||
| if ((0.0 == da_r) && (0.0 == da_i)) | if ((0.0 == da_r) && (0.0 == da_i)) | ||||
| { | { | ||||
| v4f32 zero_v = __msa_cast_to_vector_float(0); | |||||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); | |||||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); | |||||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); | |||||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); | |||||
| v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; | |||||
| for (i = (n >> 5); i--;) | for (i = (n >> 5); i--;) | ||||
| { | { | ||||
| @@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||||
| { | { | ||||
| if (0.0 == da) | if (0.0 == da) | ||||
| { | { | ||||
| v2f64 zero_v = __msa_cast_to_vector_double(0); | |||||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||||
| v2f64 zero_v = {0.0, 0.0}; | |||||
| for (i = (n >> 5); i--;) | for (i = (n >> 5); i--;) | ||||
| { | { | ||||
| @@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
| ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); | ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); | ||||
| ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); | ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); | ||||
| src_a54 = __msa_cast_to_vector_double(*(a + 54)); | |||||
| src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | |||||
| src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54)); | |||||
| src_a62 = LD_DP(a + 62); | src_a62 = LD_DP(a + 62); | ||||
| src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); | src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); | ||||
| src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); | src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); | ||||
| @@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
| src_a44 = LD_DP(a + 44); | src_a44 = LD_DP(a + 44); | ||||
| src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); | src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); | ||||
| src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); | src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); | ||||
| src_a36 = __msa_cast_to_vector_double(*(a + 36)); | |||||
| src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); | |||||
| src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36)); | |||||
| res_c7 *= src_a63; | res_c7 *= src_a63; | ||||
| res_c6 -= res_c7 * src_a62; | res_c6 -= res_c7 * src_a62; | ||||
| @@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
| src_a26 = LD_DP(a + 26); | src_a26 = LD_DP(a + 26); | ||||
| src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); | src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); | ||||
| src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); | src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); | ||||
| src_a18 = __msa_cast_to_vector_double(*(a + 18)); | |||||
| src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); | |||||
| src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18)); | |||||
| res_c3 -= res_c7 * src_a59; | res_c3 -= res_c7 * src_a59; | ||||
| res_c2 -= res_c7 * src_a58; | res_c2 -= res_c7 * src_a58; | ||||
| @@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
| src_a8 = LD_DP(a + 8); | src_a8 = LD_DP(a + 8); | ||||
| src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); | src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); | ||||
| src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | ||||
| src_a0 = __msa_cast_to_vector_double(*(a + 0)); | |||||
| src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||||
| src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); | |||||
| res_c1 -= res_c2 * src_a17; | res_c1 -= res_c2 * src_a17; | ||||
| res_c1 *= src_a9; | res_c1 *= src_a9; | ||||
| @@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| src_a52 = LD_DP(a - 12); | src_a52 = LD_DP(a - 12); | ||||
| src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); | src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); | ||||
| src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); | src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); | ||||
| src_a54 = __msa_cast_to_vector_double(*(a - 10)); | |||||
| src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | |||||
| src_a54 = COPY_DOUBLE_TO_VECTOR(*(a -10)); | |||||
| src_a40 = LD_DP(a - 24); | src_a40 = LD_DP(a - 24); | ||||
| src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); | src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); | ||||
| @@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| src_a34 = LD_DP(a - 30); | src_a34 = LD_DP(a - 30); | ||||
| src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); | src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); | ||||
| src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); | src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); | ||||
| src_a36 = __msa_cast_to_vector_double(*(a - 28)); | |||||
| src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); | |||||
| src_a36 = COPY_DOUBLE_TO_VECTOR(*(a -28)); | |||||
| res_c4 *= src_a36; | res_c4 *= src_a36; | ||||
| res_c3 -= res_c4 * src_a35; | res_c3 -= res_c4 * src_a35; | ||||
| @@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| src_a16 = LD_DP(a - 48); | src_a16 = LD_DP(a - 48); | ||||
| src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); | src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); | ||||
| src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); | src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); | ||||
| src_a18 = __msa_cast_to_vector_double(*(a - 46)); | |||||
| src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); | |||||
| src_a0 = __msa_cast_to_vector_double(*(a - 64)); | |||||
| src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||||
| src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46)); | |||||
| src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64)); | |||||
| src_a8 = LD_DP(a - 56); | src_a8 = LD_DP(a - 56); | ||||
| src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); | src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); | ||||
| src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | ||||
| @@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); | src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); | ||||
| src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | ||||
| src_a8 = __msa_cast_to_vector_double(*(a + 8)); | |||||
| src_a0 = __msa_cast_to_vector_double(*(a + 0)); | |||||
| src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | |||||
| src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||||
| src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); | |||||
| src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); | |||||
| src_a4 = LD_DP(a + 4); | src_a4 = LD_DP(a + 4); | ||||
| src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); | src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); | ||||
| @@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); | src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); | ||||
| src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | ||||
| src_a8 = __msa_cast_to_vector_double(*(a + 8)); | |||||
| src_a0 = __msa_cast_to_vector_double(*(a + 0)); | |||||
| src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | |||||
| src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||||
| src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); | |||||
| src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); | |||||
| src_a4 = LD_DP(a + 4); | src_a4 = LD_DP(a + 4); | ||||
| src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); | src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); | ||||
| @@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
| res_c14 -= res_c8 * src_a6; | res_c14 -= res_c8 * src_a6; | ||||
| res_c15 -= res_c8 * src_a7; | res_c15 -= res_c8 * src_a7; | ||||
| src_a9 = __msa_cast_to_vector_double(*(a + 9)); | |||||
| src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | |||||
| src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); | |||||
| src_a10 = LD_DP(a + 10); | src_a10 = LD_DP(a + 10); | ||||
| src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | ||||
| src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | ||||
| @@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
| res_c14 -= res_c10 * src_a22; | res_c14 -= res_c10 * src_a22; | ||||
| res_c15 -= res_c10 * src_a23; | res_c15 -= res_c10 * src_a23; | ||||
| src_a27 = __msa_cast_to_vector_double(*(a + 27)); | |||||
| src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); | |||||
| src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); | |||||
| src_a28 = LD_DP(a + 28); | src_a28 = LD_DP(a + 28); | ||||
| src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); | src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); | ||||
| src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); | src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); | ||||
| @@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
| res_c14 -= res_c12 * src_a38; | res_c14 -= res_c12 * src_a38; | ||||
| res_c15 -= res_c12 * src_a39; | res_c15 -= res_c12 * src_a39; | ||||
| src_a45 = __msa_cast_to_vector_double(*(a + 45)); | |||||
| src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); | |||||
| src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); | |||||
| src_a46 = LD_DP(a + 46); | src_a46 = LD_DP(a + 46); | ||||
| src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); | src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); | ||||
| src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); | src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); | ||||
| @@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
| ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); | ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); | ||||
| ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); | ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); | ||||
| src_a63 = __msa_cast_to_vector_double(*(a + 63)); | |||||
| src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); | |||||
| src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); | |||||
| src_a54 = LD_DP(a + 54); | src_a54 = LD_DP(a + 54); | ||||
| src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); | src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); | ||||
| src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | ||||
| @@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| res_c6 -= res_c0 * src_a6; | res_c6 -= res_c0 * src_a6; | ||||
| res_c7 -= res_c0 * src_a7; | res_c7 -= res_c0 * src_a7; | ||||
| src_a9 = __msa_cast_to_vector_double(*(a + 9)); | |||||
| src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | |||||
| src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); | |||||
| src_a10 = LD_DP(a + 10); | src_a10 = LD_DP(a + 10); | ||||
| src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | ||||
| src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | ||||
| @@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| res_c6 -= res_c2 * src_a22; | res_c6 -= res_c2 * src_a22; | ||||
| res_c7 -= res_c2 * src_a23; | res_c7 -= res_c2 * src_a23; | ||||
| src_a27 = __msa_cast_to_vector_double(*(a + 27)); | |||||
| src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); | |||||
| src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); | |||||
| src_a28 = LD_DP(a + 28); | src_a28 = LD_DP(a + 28); | ||||
| src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); | src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); | ||||
| src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); | src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); | ||||
| @@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| res_c6 -= res_c4 * src_a38; | res_c6 -= res_c4 * src_a38; | ||||
| res_c7 -= res_c4 * src_a39; | res_c7 -= res_c4 * src_a39; | ||||
| src_a45 = __msa_cast_to_vector_double(*(a + 45)); | |||||
| src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); | |||||
| src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); | |||||
| src_a46 = LD_DP(a + 46); | src_a46 = LD_DP(a + 46); | ||||
| src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); | src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); | ||||
| src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); | src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); | ||||
| @@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| res_c6 -= res_c5 * src_a46; | res_c6 -= res_c5 * src_a46; | ||||
| res_c7 -= res_c5 * src_a47; | res_c7 -= res_c5 * src_a47; | ||||
| src_a63 = __msa_cast_to_vector_double(*(a + 63)); | |||||
| src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); | |||||
| src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); | |||||
| src_a54 = LD_DP(a + 54); | src_a54 = LD_DP(a + 54); | ||||
| src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); | src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); | ||||
| src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | ||||
| @@ -786,8 +778,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| res_c6 -= res_c4 * src_a2; | res_c6 -= res_c4 * src_a2; | ||||
| res_c7 -= res_c4 * src_a3; | res_c7 -= res_c4 * src_a3; | ||||
| src_a5 = __msa_cast_to_vector_double(*(a + 5)); | |||||
| src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); | |||||
| src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); | |||||
| src_a6 = LD_DP(a + 6); | src_a6 = LD_DP(a + 6); | ||||
| src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); | src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); | ||||
| src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); | src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); | ||||
| @@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| src_a10 = LD_DP(a + 10); | src_a10 = LD_DP(a + 10); | ||||
| src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | ||||
| src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | ||||
| src_a15 = __msa_cast_to_vector_double(*(a + 15)); | |||||
| src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); | |||||
| src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); | |||||
| res_c2 *= src_a10; | res_c2 *= src_a10; | ||||
| res_c3 -= res_c2 * src_a11; | res_c3 -= res_c2 * src_a11; | ||||
| @@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| res_c2 -= res_c0 * src_a2; | res_c2 -= res_c0 * src_a2; | ||||
| res_c3 -= res_c0 * src_a3; | res_c3 -= res_c0 * src_a3; | ||||
| src_a5 = __msa_cast_to_vector_double(*(a + 5)); | |||||
| src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); | |||||
| src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); | |||||
| src_a6 = LD_DP(a + 6); | src_a6 = LD_DP(a + 6); | ||||
| src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); | src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); | ||||
| src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); | src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); | ||||
| @@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| src_a10 = LD_DP(a + 10); | src_a10 = LD_DP(a + 10); | ||||
| src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | ||||
| src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | ||||
| src_a15 = __msa_cast_to_vector_double(*(a + 15)); | |||||
| src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); | |||||
| src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); | |||||
| res_c2 *= src_a10; | res_c2 *= src_a10; | ||||
| res_c3 -= res_c2 * src_a11; | res_c3 -= res_c2 * src_a11; | ||||
| @@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
| src_b2 = LD_DP(b + 2); | src_b2 = LD_DP(b + 2); | ||||
| src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | ||||
| src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | ||||
| src_b5 = __msa_cast_to_vector_double(*(b + 5)); | |||||
| src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); | |||||
| src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); | |||||
| src_b6 = LD_DP(b + 6); | src_b6 = LD_DP(b + 6); | ||||
| src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); | src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); | ||||
| src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); | src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); | ||||
| src_b10 = LD_DP(b + 10); | src_b10 = LD_DP(b + 10); | ||||
| src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); | src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); | ||||
| src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | ||||
| src_b15 = __msa_cast_to_vector_double(*(b + 15)); | |||||
| src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); | |||||
| src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); | |||||
| src_c0 *= src_b0; | src_c0 *= src_b0; | ||||
| src_c1 *= src_b0; | src_c1 *= src_b0; | ||||
| @@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| src_b0 = LD_DP(b + 0); | src_b0 = LD_DP(b + 0); | ||||
| src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); | src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); | ||||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | ||||
| src_b3 = __msa_cast_to_vector_double(*(b + 3)); | |||||
| src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); | |||||
| src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); | |||||
| src_c0 *= src_b0; | src_c0 *= src_b0; | ||||
| src_c1 *= src_b0; | src_c1 *= src_b0; | ||||
| @@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||||
| } | } | ||||
| } | } | ||||
| src_b0 = __msa_cast_to_vector_double(*b); | |||||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||||
| src_b0 = COPY_DOUBLE_TO_VECTOR(*b); | |||||
| src_c0 *= src_b0; | src_c0 *= src_b0; | ||||
| src_c1 *= src_b0; | src_c1 *= src_b0; | ||||
| @@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| src_b2 = LD_DP(b + 2); | src_b2 = LD_DP(b + 2); | ||||
| src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | ||||
| src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | ||||
| src_b5 = __msa_cast_to_vector_double(*(b + 5)); | |||||
| src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); | |||||
| src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); | |||||
| src_b6 = LD_DP(b + 6); | src_b6 = LD_DP(b + 6); | ||||
| src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); | src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); | ||||
| src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); | src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); | ||||
| src_b10 = LD_DP(b + 10); | src_b10 = LD_DP(b + 10); | ||||
| src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); | src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); | ||||
| src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | ||||
| src_b15 = __msa_cast_to_vector_double(*(b + 15)); | |||||
| src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); | |||||
| src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); | |||||
| src_c0 *= src_b0; | src_c0 *= src_b0; | ||||
| src_c1 *= src_b0; | src_c1 *= src_b0; | ||||
| @@ -490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| src_b0 = LD_DP(b + 0); | src_b0 = LD_DP(b + 0); | ||||
| src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); | src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); | ||||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | ||||
| src_b3 = __msa_cast_to_vector_double(*(b + 3)); | |||||
| src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); | |||||
| src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); | |||||
| src_c0 *= src_b0; | src_c0 *= src_b0; | ||||
| src_c1 *= src_b0; | src_c1 *= src_b0; | ||||
| @@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||||
| src_b8 = LD_DP(b + 8); | src_b8 = LD_DP(b + 8); | ||||
| src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); | src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); | ||||
| src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); | src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); | ||||
| src_b10 = __msa_cast_to_vector_double(*(b + 10)); | |||||
| src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | |||||
| src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); | |||||
| src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||||
| src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||||
| src_b4 = LD_DP(b + 4); | src_b4 = LD_DP(b + 4); | ||||
| src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); | src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); | ||||
| src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); | src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); | ||||
| @@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| a -= 16; | a -= 16; | ||||
| b -= 4; | b -= 4; | ||||
| src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||||
| src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||||
| src_b2 = LD_DP(b + 2); | src_b2 = LD_DP(b + 2); | ||||
| src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | ||||
| src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | ||||
| @@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||||
| a -= 8; | a -= 8; | ||||
| b -= 1; | b -= 1; | ||||
| src_b0 = __msa_cast_to_vector_double(*b); | |||||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||||
| src_b0 = COPY_DOUBLE_TO_VECTOR(*b); | |||||
| src_c0 *= src_b0; | src_c0 *= src_b0; | ||||
| src_c1 *= src_b0; | src_c1 *= src_b0; | ||||
| @@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| src_b8 = LD_DP(b + 8); | src_b8 = LD_DP(b + 8); | ||||
| src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); | src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); | ||||
| src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); | src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); | ||||
| src_b10 = __msa_cast_to_vector_double(*(b + 10)); | |||||
| src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | |||||
| src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); | |||||
| src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||||
| src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||||
| src_b4 = LD_DP(b + 4); | src_b4 = LD_DP(b + 4); | ||||
| src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); | src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); | ||||
| src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); | src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); | ||||
| @@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||||
| a -= 8; | a -= 8; | ||||
| b -= 4; | b -= 4; | ||||
| src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||||
| src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||||
| src_b2 = LD_DP(b + 2); | src_b2 = LD_DP(b + 2); | ||||
| src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | ||||
| src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | ||||
| @@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src) | |||||
| #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) | #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) | ||||
| #define COPY_FLOAT_TO_VECTOR(a) ( { \ | #define COPY_FLOAT_TO_VECTOR(a) ( { \ | ||||
| v4f32 out; \ | |||||
| out = __msa_cast_to_vector_float(a); \ | |||||
| out = (v4f32) __msa_splati_w((v4i32) out, 0); \ | |||||
| v4f32 out = {a, a, a, a}; \ | |||||
| out; \ | out; \ | ||||
| } ) | } ) | ||||
| #define COPY_DOUBLE_TO_VECTOR(a) ( { \ | #define COPY_DOUBLE_TO_VECTOR(a) ( { \ | ||||
| v2f64 out; \ | |||||
| out = __msa_cast_to_vector_double(a); \ | |||||
| out = (v2f64) __msa_splati_d((v2i64) out, 0); \ | |||||
| v2f64 out = {a, a}; \ | |||||
| out; \ | out; \ | ||||
| } ) | } ) | ||||
| @@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||||
| { | { | ||||
| if ((0 == c) && (0 == s)) | if ((0 == c) && (0 == s)) | ||||
| { | { | ||||
| v4f32 zero = __msa_cast_to_vector_float(0); | |||||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); | |||||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); | |||||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); | |||||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); | |||||
| v4f32 zero = {0.0, 0.0, 0.0, 0.0}; | |||||
| /* process 4 floats */ | /* process 4 floats */ | ||||
| for (j = (n >> 2); j--;) | for (j = (n >> 2); j--;) | ||||
| @@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||||
| { | { | ||||
| if (0.0 == da) | if (0.0 == da) | ||||
| { | { | ||||
| v4f32 zero_v = __msa_cast_to_vector_float(0); | |||||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); | |||||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); | |||||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); | |||||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); | |||||
| v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; | |||||
| for (i = (n >> 6); i--;) | for (i = (n >> 6); i--;) | ||||
| { | { | ||||
| @@ -49,9 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| { | { | ||||
| if ((0.0 == da_r) && (0.0 == da_i)) | if ((0.0 == da_r) && (0.0 == da_i)) | ||||
| { | { | ||||
| v2f64 zero_v = __msa_cast_to_vector_double(0); | |||||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||||
| v2f64 zero_v = {0.0, 0.0}; | |||||
| for (i = (n >> 4); i--;) | for (i = (n >> 4); i--;) | ||||
| { | { | ||||
| @@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| if ((0.0 == da_r) && (0.0 == da_i)) | if ((0.0 == da_r) && (0.0 == da_i)) | ||||
| { | { | ||||
| v2f64 zero_v = __msa_cast_to_vector_double(0); | |||||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||||
| v2f64 zero_v = {0.0, 0.0}; | |||||
| for (i = (n >> 4); i--;) | for (i = (n >> 4); i--;) | ||||
| { | { | ||||
| @@ -1,64 +0,0 @@ | |||||
| SAXPYKERNEL=axpy_loongson3a.S | |||||
| DAXPYKERNEL=daxpy_loongson3a_simd.S | |||||
| SGEMVNKERNEL = gemv_n_loongson3a.c | |||||
| SGEMVTKERNEL = gemv_t_loongson3a.c | |||||
| DGEMVNKERNEL = gemv_n_loongson3a.c | |||||
| DGEMVTKERNEL = gemv_t_loongson3a.c | |||||
| CGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
| CGEMVTKERNEL = zgemv_t_loongson3a.c | |||||
| ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
| ZGEMVTKERNEL = zgemv_t_loongson3a.c | |||||
| STRMMKERNEL = ../generic/trmmkernel_2x2.c | |||||
| DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||||
| ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| @@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | SGEMMITCOPY = ../generic/gemm_tcopy_8.c | ||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | SGEMMONCOPY = ../generic/gemm_ncopy_4.c | ||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | ||||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S | DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S | ||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | DGEMMONCOPY = ../generic/gemm_ncopy_4.c | ||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | ||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S | CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S | ||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | ||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | ||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | ||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | ||||
| CGEMMINCOPYOBJ = cgemm_incopy.o | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy.o | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S | ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S | ||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | ||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | ||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ||||
| @@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| DSDOTKERNEL = ../mips/dot.c | DSDOTKERNEL = ../mips/dot.c | ||||
| @@ -0,0 +1,192 @@ | |||||
| ifdef HAVE_MSA | |||||
| SAXPYKERNEL = ../mips/saxpy_msa.c | |||||
| DAXPYKERNEL = ../mips/daxpy_msa.c | |||||
| CAXPYKERNEL = ../mips/caxpy_msa.c | |||||
| ZAXPYKERNEL = ../mips/zaxpy_msa.c | |||||
| else | |||||
| SAXPYKERNEL = axpy_loongson3a.S | |||||
| DAXPYKERNEL = daxpy_loongson3a_simd.S | |||||
| endif | |||||
| ifdef HAVE_MSA | |||||
| SCOPYKERNEL = ../mips/scopy_msa.c | |||||
| DCOPYKERNEL = ../mips/dcopy_msa.c | |||||
| CCOPYKERNEL = ../mips/ccopy_msa.c | |||||
| ZCOPYKERNEL = ../mips/zcopy_msa.c | |||||
| endif | |||||
# Level-1 / level-2 kernel selection.
# Dot-product kernels: the MSA sources are chosen only when HAVE_MSA is
# defined; this conditional has no else branch.
| ifdef HAVE_MSA | |||||
| SDOTKERNEL = ../mips/sdot_msa.c | |||||
| DDOTKERNEL = ../mips/ddot_msa.c | |||||
| CDOTKERNEL = ../mips/cdot_msa.c | |||||
| ZDOTKERNEL = ../mips/zdot_msa.c | |||||
| endif | |||||
# DSDOTKERNEL is assigned unconditionally (not guarded by HAVE_MSA).
| DSDOTKERNEL = ../mips/dot.c | |||||
# Rotation kernels: MSA sources only, no else branch.
| ifdef HAVE_MSA | |||||
| SROTKERNEL = ../mips/srot_msa.c | |||||
| DROTKERNEL = ../mips/drot_msa.c | |||||
| CROTKERNEL = ../mips/crot_msa.c | |||||
| ZROTKERNEL = ../mips/zrot_msa.c | |||||
| endif | |||||
# Scaling kernels: MSA sources only, no else branch.
| ifdef HAVE_MSA | |||||
| SSCALKERNEL = ../mips/sscal_msa.c | |||||
| DSCALKERNEL = ../mips/dscal_msa.c | |||||
| CSCALKERNEL = ../mips/cscal_msa.c | |||||
| ZSCALKERNEL = ../mips/zscal_msa.c | |||||
| endif | |||||
# GEMV kernels: one MSA source per precision and transpose variant when
# HAVE_MSA is defined; otherwise the loongson3a sources are used.
| ifdef HAVE_MSA | |||||
| SGEMVNKERNEL = ../mips/sgemv_n_msa.c | |||||
| DGEMVNKERNEL = ../mips/dgemv_n_msa.c | |||||
| SGEMVTKERNEL = ../mips/sgemv_t_msa.c | |||||
| DGEMVTKERNEL = ../mips/dgemv_t_msa.c | |||||
| CGEMVNKERNEL = ../mips/cgemv_n_msa.c | |||||
| CGEMVTKERNEL = ../mips/cgemv_t_msa.c | |||||
| ZGEMVNKERNEL = ../mips/zgemv_n_msa.c | |||||
| ZGEMVTKERNEL = ../mips/zgemv_t_msa.c | |||||
| else | |||||
# In this branch S and D share gemv_{n,t}_loongson3a.c, and C and Z share
# zgemv_{n,t}_loongson3a.c.
| SGEMVNKERNEL = gemv_n_loongson3a.c | |||||
| SGEMVTKERNEL = gemv_t_loongson3a.c | |||||
| DGEMVNKERNEL = gemv_n_loongson3a.c | |||||
| DGEMVTKERNEL = gemv_t_loongson3a.c | |||||
| CGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
| CGEMVTKERNEL = zgemv_t_loongson3a.c | |||||
| ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
| ZGEMVTKERNEL = zgemv_t_loongson3a.c | |||||
| endif | |||||
# Absolute-sum kernels: MSA sources only, no else branch.
| ifdef HAVE_MSA | |||||
| SASUMKERNEL = ../mips/sasum_msa.c | |||||
| DASUMKERNEL = ../mips/dasum_msa.c | |||||
| CASUMKERNEL = ../mips/casum_msa.c | |||||
| ZASUMKERNEL = ../mips/zasum_msa.c | |||||
| endif | |||||
# Swap kernels: MSA sources only, no else branch.
| ifdef HAVE_MSA | |||||
| SSWAPKERNEL = ../mips/sswap_msa.c | |||||
| DSWAPKERNEL = ../mips/dswap_msa.c | |||||
| CSWAPKERNEL = ../mips/cswap_msa.c | |||||
| ZSWAPKERNEL = ../mips/zswap_msa.c | |||||
| endif | |||||
# GEMM kernel and packing-routine selection, one conditional per precision.
# SGEMM: the MSA branch uses an 8x8 kernel and sets only the ONCOPY/OTCOPY
# routines; the fallback uses an 8x4 kernel and sets INCOPY/ITCOPY (width 8)
# as well as ONCOPY/OTCOPY (width 4).
# NOTE(review): the MSA branch presumably omits INCOPY/ITCOPY because the 8x8
# kernel uses the same unroll in both dimensions -- confirm.
| ifdef HAVE_MSA | |||||
| SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c | |||||
| SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c | |||||
| SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| else | |||||
| SGEMMKERNEL = sgemm_kernel_8x4_ps.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
# DGEMM: the MSA branch uses an 8x4 kernel with INCOPY/ITCOPY (width 8) and
# ONCOPY/OTCOPY (width 4); the fallback uses the loongson3a 4x4 assembly
# kernel and sets only ONCOPY/OTCOPY.
| ifdef HAVE_MSA | |||||
| DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c | |||||
| DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c | |||||
| DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c | |||||
| DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c | |||||
| DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| else | |||||
| DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
# CGEMM: both branches set all four copy routines. The MSA branch pairs an
# 8x4 kernel with width-8/width-4 copies; the fallback pairs a 4x2 kernel
# with the generic zgemm width-4/width-2 copies.
| ifdef HAVE_MSA | |||||
| CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c | |||||
| CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c | |||||
| CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c | |||||
| CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c | |||||
| CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| else | |||||
| CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
# ZGEMM: both branches set only ONCOPY/OTCOPY. The MSA branch uses a 4x4
# kernel with width-4 copies; the fallback uses the loongson3a 2x2 assembly
# kernel with generic width-2 copies.
| ifdef HAVE_MSA | |||||
| ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c | |||||
| ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c | |||||
| ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| else | |||||
| ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
# TRSM kernel selection (LN/LT/RN/RT variants), one conditional per precision.
# STRSM: 8x8 MSA sources when HAVE_MSA is defined, generic sources otherwise.
| ifdef HAVE_MSA | |||||
| STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c | |||||
| STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c | |||||
| STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c | |||||
| STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c | |||||
| else | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| endif | |||||
# DTRSM: 8x4 MSA sources when HAVE_MSA is defined, generic sources otherwise.
| ifdef HAVE_MSA | |||||
| DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c | |||||
| DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c | |||||
| DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c | |||||
| DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c | |||||
| else | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| endif | |||||
# CTRSM: NOTE(review) both branches assign the same generic sources, so the
# HAVE_MSA test currently has no effect on these four variables.
| ifdef HAVE_MSA | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| else | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| endif | |||||
# ZTRSM: NOTE(review) as with CTRSM, both branches are identical (generic
# sources), so the conditional is redundant as written.
| ifdef HAVE_MSA | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| else | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| endif | |||||
| @@ -933,6 +933,77 @@ static void init_parameter(void) { | |||||
| } | } | ||||
| #else // (ARCH_ARM64) | #else // (ARCH_ARM64) | ||||
| #if defined(ARCH_MIPS64) | |||||
/*
 * init_parameter (variant compiled under ARCH_MIPS64).
 *
 * Fills the GEMM blocking fields (p, q, r) of TABLE_NAME from the
 * compile-time *_DEFAULT_* macros. Takes no arguments and returns nothing;
 * its only visible effect is the assignments to TABLE_NAME members below.
 */
| static void init_parameter(void) { | |||||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||||
| TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||||
| TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; | |||||
| TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; | |||||
| TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; | |||||
| TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; | |||||
/* dgemm_r is the only field set from a literal instead of its *_DEFAULT_R
 * macro. NOTE(review): presumably because DGEMM_DEFAULT_R can be defined as
 * the run-time variable dgemm_r rather than a constant for these cores --
 * confirm against the parameter header. */
| TABLE_NAME.dgemm_r = 640; | |||||
| TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; | |||||
| TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; | |||||
/* The qgemm_* / xgemm_* fields are set only when EXPRECISION is defined. */
| #ifdef EXPRECISION | |||||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||||
| TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; | |||||
| TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; | |||||
| TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R; | |||||
| TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; | |||||
| #endif | |||||
/* GEMM3M fields (only when USE_GEMM3M is defined): each one takes its own
 * *GEMM3M_DEFAULT_* macro when that macro exists; otherwise cgemm3m_* reuses
 * the sgemm_* value and zgemm3m_* reuses the dgemm_* value assigned above. */
| #if defined(USE_GEMM3M) | |||||
| #ifdef CGEMM3M_DEFAULT_P | |||||
| TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; | |||||
| #else | |||||
| TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; | |||||
| #endif | |||||
| #ifdef ZGEMM3M_DEFAULT_P | |||||
| TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; | |||||
| #else | |||||
| TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; | |||||
| #endif | |||||
| #ifdef CGEMM3M_DEFAULT_Q | |||||
| TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; | |||||
| #else | |||||
| TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q; | |||||
| #endif | |||||
| #ifdef ZGEMM3M_DEFAULT_Q | |||||
| TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; | |||||
| #else | |||||
| TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q; | |||||
| #endif | |||||
| #ifdef CGEMM3M_DEFAULT_R | |||||
| TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R; | |||||
| #else | |||||
| TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r; | |||||
| #endif | |||||
| #ifdef ZGEMM3M_DEFAULT_R | |||||
| TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R; | |||||
| #else | |||||
/* Fallback picks up the literal 640 stored in dgemm_r above. */
| TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r; | |||||
| #endif | |||||
/* xgemm3m_* has no dedicated default macro here; it copies the qgemm_*
 * values, and only when EXPRECISION is defined. */
| #ifdef EXPRECISION | |||||
| TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; | |||||
| TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q; | |||||
| TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; | |||||
| #endif | |||||
| #endif | |||||
| } | |||||
| #else // (ARCH_MIPS64) | |||||
| #if (ARCH_POWER) | #if (ARCH_POWER) | ||||
| static void init_parameter(void) { | static void init_parameter(void) { | ||||
| @@ -1780,4 +1851,5 @@ static void init_parameter(void) { | |||||
| } | } | ||||
| #endif //POWER | #endif //POWER | ||||
| #endif //ZARCH | #endif //ZARCH | ||||
| #endif //(ARCH_MIPS64) | |||||
| #endif //(ARCH_ARM64) | #endif //(ARCH_ARM64) | ||||
| @@ -2570,8 +2570,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define SYMV_P 16 | #define SYMV_P 16 | ||||
| #endif | #endif | ||||
| #ifdef LOONGSON3A | |||||
| /*Copy from SICORTEX*/ | |||||
| #if defined(LOONGSON3R4) | |||||
| #define SNUMOPT 2 | #define SNUMOPT 2 | ||||
| #define DNUMOPT 2 | #define DNUMOPT 2 | ||||
| @@ -2579,6 +2578,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | #define GEMM_DEFAULT_ALIGN 0x03fffUL | ||||
| #ifdef HAVE_MSA | |||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
| #else | |||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | #define SGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -2590,6 +2602,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | #define ZGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | #define ZGEMM_DEFAULT_UNROLL_N 2 | ||||
| #endif | |||||
| #define SGEMM_DEFAULT_P 64 | #define SGEMM_DEFAULT_P 64 | ||||
| #define DGEMM_DEFAULT_P 44 | #define DGEMM_DEFAULT_P 44 | ||||
| @@ -2612,7 +2625,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define SYMV_P 16 | #define SYMV_P 16 | ||||
| #endif | #endif | ||||
| #ifdef LOONGSON3B | |||||
| #if defined(LOONGSON3R3) | |||||
| ////Copy from SICORTEX | |||||
| #define SNUMOPT 2 | #define SNUMOPT 2 | ||||
| #define DNUMOPT 2 | #define DNUMOPT 2 | ||||
| @@ -2620,32 +2634,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | #define GEMM_DEFAULT_ALIGN 0x03fffUL | ||||
| #define SGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 2 | #define CGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | #define ZGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | #define ZGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define SGEMM_DEFAULT_P 64 | #define SGEMM_DEFAULT_P 64 | ||||
| #define DGEMM_DEFAULT_P 24 | |||||
| #define CGEMM_DEFAULT_P 24 | |||||
| #define ZGEMM_DEFAULT_P 20 | |||||
| #define DGEMM_DEFAULT_P 44 | |||||
| #define CGEMM_DEFAULT_P 64 | |||||
| #define ZGEMM_DEFAULT_P 32 | |||||
| #define SGEMM_DEFAULT_Q 192 | #define SGEMM_DEFAULT_Q 192 | ||||
| #define DGEMM_DEFAULT_Q 128 | |||||
| #define DGEMM_DEFAULT_Q 92 | |||||
| #define CGEMM_DEFAULT_Q 128 | #define CGEMM_DEFAULT_Q 128 | ||||
| #define ZGEMM_DEFAULT_Q 64 | |||||
| #define ZGEMM_DEFAULT_Q 80 | |||||
| #define SGEMM_DEFAULT_R 512 | |||||
| #define DGEMM_DEFAULT_R 512 | |||||
| #define CGEMM_DEFAULT_R 512 | |||||
| #define ZGEMM_DEFAULT_R 512 | |||||
| #define SGEMM_DEFAULT_R 640 | |||||
| #define DGEMM_DEFAULT_R dgemm_r | |||||
| #define CGEMM_DEFAULT_R 640 | |||||
| #define ZGEMM_DEFAULT_R 640 | |||||
| #define GEMM_OFFSET_A1 0x10000 | #define GEMM_OFFSET_A1 0x10000 | ||||
| #define GEMM_OFFSET_B1 0x100000 | #define GEMM_OFFSET_B1 0x100000 | ||||