1. Use the LOONGSON3R3 and LOONGSON3R4 cores for Loongson. 2. Add DYNAMIC_ARCH support for Loongson. Change-Id: I1c6b54dbeca3a0cc31d1222af36a7e9bd6ab54c1 tags/v0.3.13^2
| @@ -625,6 +625,10 @@ DYNAMIC_CORE += EMAG8180 | |||
| DYNAMIC_CORE += THUNDERX3T110 | |||
| endif | |||
| ifeq ($(ARCH), mips64) | |||
| DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 | |||
| endif | |||
| ifeq ($(ARCH), zarch) | |||
| DYNAMIC_CORE = ZARCH_GENERIC | |||
| @@ -787,14 +791,9 @@ CCOMMON_OPT += -mabi=32 | |||
| BINARY_DEFINED = 1 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3A) | |||
| CCOMMON_OPT += -march=mips64 | |||
| FCOMMON_OPT += -march=mips64 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3B) | |||
| CCOMMON_OPT += -march=mips64 | |||
| FCOMMON_OPT += -march=mips64 | |||
| ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) | |||
| CCOMMON_OPT += -march=loongson3a | |||
| FCOMMON_OPT += -march=loongson3a | |||
| endif | |||
| ifeq ($(CORE), MIPS24K) | |||
| @@ -1078,11 +1077,11 @@ FCOMMON_OPT += -n32 | |||
| else | |||
| FCOMMON_OPT += -n64 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3A) | |||
| ifeq ($(CORE), LOONGSON3R3) | |||
| FCOMMON_OPT += -loongson3 -static | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3B) | |||
| ifeq ($(CORE), LOONGSON3R4) | |||
| FCOMMON_OPT += -loongson3 -static | |||
| endif | |||
| @@ -1108,11 +1107,11 @@ CCOMMON_OPT += -n32 | |||
| else | |||
| CCOMMON_OPT += -n64 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3A) | |||
| ifeq ($(CORE), LOONGSON3R3) | |||
| CCOMMON_OPT += -loongson3 -static | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3B) | |||
| ifeq ($(CORE), LOONGSON3R4) | |||
| CCOMMON_OPT += -loongson3 -static | |||
| endif | |||
| @@ -1223,10 +1222,8 @@ ifdef SMP | |||
| CCOMMON_OPT += -DSMP_SERVER | |||
| ifeq ($(ARCH), mips64) | |||
| ifneq ($(CORE), LOONGSON3B) | |||
| USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| endif | |||
| endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| @@ -1342,11 +1339,9 @@ endif | |||
| ifneq ($(ARCH), x86_64) | |||
| ifneq ($(ARCH), x86) | |||
| ifneq ($(CORE), LOONGSON3B) | |||
| NO_AFFINITY = 1 | |||
| endif | |||
| endif | |||
| endif | |||
| ifdef NO_AFFINITY | |||
| ifeq ($(NO_AFFINITY), 0) | |||
| @@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, | |||
| // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 | |||
| return 0; | |||
| #else | |||
| #if defined (LOONGSON3B) | |||
| #if defined (__64BIT__) | |||
| return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | |||
| #else | |||
| return 0; //NULL Implementation on Loongson 3B 32bit. | |||
| #endif | |||
| #else | |||
| //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 | |||
| // unsigned long null_nodemask=0; | |||
| return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | |||
| #endif | |||
| #endif | |||
| } | |||
| static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { | |||
| @@ -229,12 +229,7 @@ REALNAME: ;\ | |||
| #define BUFFER_SIZE ( 32 << 21) | |||
| #if defined(LOONGSON3A) | |||
| #define PAGESIZE (16UL << 10) | |||
| #define FIXED_PAGESIZE (16UL << 10) | |||
| #endif | |||
| #if defined(LOONGSON3B) | |||
| #if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||
| #define PAGESIZE (16UL << 10) | |||
| #define FIXED_PAGESIZE (16UL << 10) | |||
| #endif | |||
| @@ -250,7 +245,7 @@ REALNAME: ;\ | |||
| #define MAP_ANONYMOUS MAP_ANON | |||
| #endif | |||
| #if defined(LOONGSON3A) || defined(LOONGSON3B) | |||
| #if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||
| #define PREFETCHD_(x) ld $0, x | |||
| #define PREFETCHD(x) PREFETCHD_(x) | |||
| #else | |||
| @@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_SICORTEX 1 | |||
| #define CPU_LOONGSON3A 2 | |||
| #define CPU_LOONGSON3B 3 | |||
| #define CPU_I6400 4 | |||
| #define CPU_P6600 5 | |||
| #define CPU_I6500 6 | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_SICORTEX 1 | |||
| #define CPU_LOONGSON3R3 2 | |||
| #define CPU_LOONGSON3R4 3 | |||
| #define CPU_I6400 4 | |||
| #define CPU_P6600 5 | |||
| #define CPU_I6500 6 | |||
| static char *cpuname[] = { | |||
| "UNKNOWN", | |||
| "SICORTEX", | |||
| "LOONGSON3A", | |||
| "LOONGSON3B", | |||
| "LOONGSON3R3", | |||
| "LOONGSON3R4", | |||
| "I6400", | |||
| "P6600", | |||
| "I6500" | |||
| @@ -90,48 +90,13 @@ static char *cpuname[] = { | |||
| int detect(void){ | |||
| #ifdef __linux | |||
| #ifdef linux | |||
| FILE *infile; | |||
| char buffer[512], *p; | |||
| p = (char *)NULL; | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("cpu", buffer, 3)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| #if 0 | |||
| fprintf(stderr, "%s\n", p); | |||
| #endif | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if(p != NULL){ | |||
| if (strstr(p, "Loongson-3A")){ | |||
| return CPU_LOONGSON3A; | |||
| }else if(strstr(p, "Loongson-3B")){ | |||
| return CPU_LOONGSON3B; | |||
| }else if (strstr(p, "Loongson-3")){ | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| p = (char *)NULL; | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("system type", buffer, 11)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if (strstr(p, "loongson3a")) | |||
| return CPU_LOONGSON3A; | |||
| }else{ | |||
| return CPU_SICORTEX; | |||
| } | |||
| } | |||
| //Check model name for Loongson3 | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| p = (char *)NULL; | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("model name", buffer, 10)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| @@ -140,14 +105,16 @@ int detect(void){ | |||
| } | |||
| fclose(infile); | |||
| if(p != NULL){ | |||
| if (strstr(p, "Loongson-3A")){ | |||
| return CPU_LOONGSON3A; | |||
| }else if(strstr(p, "Loongson-3B")){ | |||
| return CPU_LOONGSON3B; | |||
| } | |||
| if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ | |||
| return CPU_LOONGSON3R3; | |||
| }else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ | |||
| return CPU_LOONGSON3R4; | |||
| } else{ | |||
| return CPU_SICORTEX; | |||
| } | |||
| #endif | |||
| return CPU_UNKNOWN; | |||
| } | |||
| } | |||
| char *get_corename(void){ | |||
| @@ -159,10 +126,10 @@ void get_architecture(void){ | |||
| } | |||
| void get_subarchitecture(void){ | |||
| if(detect()==CPU_LOONGSON3A) { | |||
| printf("LOONGSON3A"); | |||
| }else if(detect()==CPU_LOONGSON3B){ | |||
| printf("LOONGSON3B"); | |||
| if(detect()==CPU_LOONGSON3R3) { | |||
| printf("LOONGSON3R3"); | |||
| }else if(detect()==CPU_LOONGSON3R4){ | |||
| printf("LOONGSON3R4"); | |||
| }else if(detect()==CPU_I6400){ | |||
| printf("I6400"); | |||
| }else if(detect()==CPU_P6600){ | |||
| @@ -179,8 +146,8 @@ void get_subdirname(void){ | |||
| } | |||
| void get_cpuconfig(void){ | |||
| if(detect()==CPU_LOONGSON3A) { | |||
| printf("#define LOONGSON3A\n"); | |||
| if(detect()==CPU_LOONGSON3R3) { | |||
| printf("#define LOONGSON3R3\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 512488\n"); | |||
| @@ -188,8 +155,8 @@ void get_cpuconfig(void){ | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| }else if(detect()==CPU_LOONGSON3B){ | |||
| printf("#define LOONGSON3B\n"); | |||
| }else if(detect()==CPU_LOONGSON3R4){ | |||
| printf("#define LOONGSON3R4\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 512488\n"); | |||
| @@ -237,10 +204,10 @@ void get_cpuconfig(void){ | |||
| } | |||
| void get_libname(void){ | |||
| if(detect()==CPU_LOONGSON3A) { | |||
| printf("loongson3a\n"); | |||
| }else if(detect()==CPU_LOONGSON3B) { | |||
| printf("loongson3b\n"); | |||
| if(detect()==CPU_LOONGSON3R3) { | |||
| printf("loongson3r3\n"); | |||
| }else if(detect()==CPU_LOONGSON3R4) { | |||
| printf("loongson3r4\n"); | |||
| }else if(detect()==CPU_I6400) { | |||
| printf("i6400\n"); | |||
| }else if(detect()==CPU_P6600) { | |||
| @@ -24,10 +24,14 @@ else | |||
| ifeq ($(ARCH),zarch) | |||
| COMMONOBJS += dynamic_zarch.$(SUFFIX) | |||
| else | |||
| ifeq ($(ARCH),mips64) | |||
| COMMONOBJS += dynamic_mips64.$(SUFFIX) | |||
| else | |||
| COMMONOBJS += dynamic.$(SUFFIX) | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| COMMONOBJS += parameter.$(SUFFIX) | |||
| endif | |||
| @@ -92,10 +96,14 @@ else | |||
| ifeq ($(ARCH),zarch) | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) | |||
| else | |||
| ifeq ($(ARCH),mips64) | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | |||
| endif | |||
| @@ -967,9 +967,11 @@ void goto_set_num_threads(int num_threads) { | |||
| blas_cpu_number = num_threads; | |||
| #if defined(ARCH_MIPS64) | |||
| #ifndef DYNAMIC_ARCH | |||
| //set parameters for different number of threads. | |||
| blas_set_parameter(); | |||
| #endif | |||
| #endif | |||
| } | |||
| @@ -0,0 +1,230 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <sys/wait.h> | |||
| #include <stdio.h> | |||
| #include <unistd.h> | |||
| #include <stdlib.h> | |||
| #include <string.h> | |||
| #include <sys/resource.h> | |||
| #include "common.h" | |||
| extern gotoblas_t gotoblas_LOONGSON3R3; | |||
| extern gotoblas_t gotoblas_LOONGSON3R4; | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define NUM_CORETYPES 2 | |||
| static char *corename[] = { | |||
| "loongson3r3", | |||
| "loongson3r4", | |||
| "UNKNOWN" | |||
| }; | |||
| char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0]; | |||
| if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| static gotoblas_t *force_coretype(char *coretype) { | |||
| int i; | |||
| int found = -1; | |||
| char message[128]; | |||
| for ( i=0 ; i < NUM_CORETYPES; i++) | |||
| { | |||
| if (!strncasecmp(coretype, corename[i], 20)) | |||
| { | |||
| found = i; | |||
| break; | |||
| } | |||
| } | |||
| switch (found) | |||
| { | |||
| case 0: return (&gotoblas_LOONGSON3R3); | |||
| case 1: return (&gotoblas_LOONGSON3R4); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| return NULL; | |||
| } | |||
| #define MMI_MASK 0x00000010 | |||
| #define MSA_MASK 0x00000020 | |||
| int fd[2]; | |||
| int support_cpucfg; | |||
| static void handler(int signum) | |||
| { | |||
| close(fd[1]); | |||
| exit(1); | |||
| } | |||
| /* Brief : Function to check if cpucfg supported on loongson | |||
| * Return: 1 supported | |||
| * 0 not supported | |||
| */ | |||
| static int cpucfg_test(void) { | |||
| pid_t pid; | |||
| int status = 0; | |||
| support_cpucfg = 0; | |||
| pipe(fd); | |||
| pid = fork(); | |||
| if (pid == 0) { /* Subprocess */ | |||
| struct sigaction act; | |||
| close(fd[0]); | |||
| /* Set signal action for SIGILL. */ | |||
| act.sa_handler = handler; | |||
| sigaction(SIGILL,&act,NULL); | |||
| /* Execute cpucfg in subprocess. */ | |||
| __asm__ volatile( | |||
| ".insn \n\t" | |||
| ".word (0xc8080118) \n\t" | |||
| ::: | |||
| ); | |||
| support_cpucfg = 1; | |||
| write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); | |||
| close(fd[1]); | |||
| exit(0); | |||
| } else if (pid > 0){ /* Parent process*/ | |||
| close(fd[1]); | |||
| if ((waitpid(pid,&status,0) <= 0) || | |||
| (read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) | |||
| support_cpucfg = 0; | |||
| close(fd[0]); | |||
| } else { | |||
| support_cpucfg = 0; | |||
| } | |||
| return support_cpucfg; | |||
| } | |||
| static gotoblas_t *get_coretype_from_cpucfg(void) { | |||
| int flag = 0; | |||
| __asm__ volatile( | |||
| ".insn \n\t" | |||
| "dli $8, 0x01 \n\t" | |||
| ".word (0xc9084918) \n\t" | |||
| "usw $9, 0x00(%0) \n\t" | |||
| : | |||
| : "r"(&flag) | |||
| : "memory" | |||
| ); | |||
| if (flag & MSA_MASK) | |||
| return (&gotoblas_LOONGSON3R4); | |||
| if (flag & MMI_MASK) | |||
| return (&gotoblas_LOONGSON3R3); | |||
| return NULL; | |||
| } | |||
| static gotoblas_t *get_coretype_from_cpuinfo(void) { | |||
| #ifdef linux | |||
| FILE *infile; | |||
| char buffer[512], *p; | |||
| p = (char *)NULL; | |||
| //Check model name for Loongson3 | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("model name", buffer, 10)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if(p != NULL){ | |||
| if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")) | |||
| return (&gotoblas_LOONGSON3R3); | |||
| else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")) | |||
| return (&gotoblas_LOONGSON3R4); | |||
| else | |||
| return NULL; | |||
| } | |||
| #endif | |||
| return NULL; | |||
| } | |||
| static gotoblas_t *get_coretype(void) { | |||
| int ret = 0; | |||
| ret = cpucfg_test(); | |||
| if (ret == 1) | |||
| return get_coretype_from_cpucfg(); | |||
| else | |||
| return get_coretype_from_cpuinfo(); | |||
| } | |||
| void gotoblas_dynamic_init(void) { | |||
| char coremsg[128]; | |||
| char coren[22]; | |||
| char *p; | |||
| if (gotoblas) return; | |||
| p = getenv("OPENBLAS_CORETYPE"); | |||
| if ( p ) | |||
| { | |||
| gotoblas = force_coretype(p); | |||
| } | |||
| else | |||
| { | |||
| gotoblas = get_coretype(); | |||
| } | |||
| if (gotoblas == NULL) | |||
| { | |||
| snprintf(coremsg, 128, "Falling back to loongson3r3 core\n"); | |||
| openblas_warning(1, coremsg); | |||
| gotoblas = &gotoblas_LOONGSON3R3; | |||
| } | |||
| if (gotoblas && gotoblas->init) { | |||
| strncpy(coren, gotoblas_corename(), 20); | |||
| sprintf(coremsg, "Core: %s\n", coren); | |||
| openblas_warning(2, coremsg); | |||
| gotoblas -> init(); | |||
| } else { | |||
| openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||
| exit(1); | |||
| } | |||
| } | |||
| void gotoblas_dynamic_quit(void) { | |||
| gotoblas = NULL; | |||
| } | |||
| @@ -717,7 +717,7 @@ void blas_set_parameter(void){ | |||
| #if defined(ARCH_MIPS64) | |||
| void blas_set_parameter(void){ | |||
| #if defined(LOONGSON3A) | |||
| #if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||
| #ifdef SMP | |||
| if(blas_num_threads == 1){ | |||
| #endif | |||
| @@ -731,20 +731,6 @@ void blas_set_parameter(void){ | |||
| #endif | |||
| #endif | |||
| #if defined(LOONGSON3B) | |||
| #ifdef SMP | |||
| if(blas_num_threads == 1 || blas_num_threads == 2){ | |||
| #endif | |||
| //single thread | |||
| dgemm_r = 640; | |||
| #ifdef SMP | |||
| }else{ | |||
| //multi thread | |||
| dgemm_r = 160; | |||
| } | |||
| #endif | |||
| #endif | |||
| } | |||
| #endif | |||
| @@ -140,8 +140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* #define FORCE_PPC440FP2 */ | |||
| /* #define FORCE_CELL */ | |||
| /* #define FORCE_SICORTEX */ | |||
| /* #define FORCE_LOONGSON3A */ | |||
| /* #define FORCE_LOONGSON3B */ | |||
| /* #define FORCE_LOONGSON3R3 */ | |||
| /* #define FORCE_LOONGSON3R4 */ | |||
| /* #define FORCE_I6400 */ | |||
| /* #define FORCE_P6600 */ | |||
| /* #define FORCE_P5600 */ | |||
| @@ -814,31 +814,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #ifdef FORCE_LOONGSON3A | |||
| #ifdef FORCE_LOONGSON3R3 | |||
| #define FORCE | |||
| #define ARCHITECTURE "MIPS" | |||
| #define SUBARCHITECTURE "LOONGSON3A" | |||
| #define SUBARCHITECTURE "LOONGSON3R3" | |||
| #define SUBDIRNAME "mips64" | |||
| #define ARCHCONFIG "-DLOONGSON3A " \ | |||
| #define ARCHCONFIG "-DLOONGSON3R3 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "loongson3a" | |||
| #define CORENAME "LOONGSON3A" | |||
| #define LIBNAME "loongson3r3" | |||
| #define CORENAME "LOONGSON3R3" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_LOONGSON3B | |||
| #ifdef FORCE_LOONGSON3R4 | |||
| #define FORCE | |||
| #define ARCHITECTURE "MIPS" | |||
| #define SUBARCHITECTURE "LOONGSON3B" | |||
| #define SUBARCHITECTURE "LOONGSON3R4" | |||
| #define SUBDIRNAME "mips64" | |||
| #define ARCHCONFIG "-DLOONGSON3B " \ | |||
| #define ARCHCONFIG "-DLOONGSON3R4 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "loongson3b" | |||
| #define CORENAME "LOONGSON3B" | |||
| #define LIBNAME "loongson3r4" | |||
| #define CORENAME "LOONGSON3R4" | |||
| #else | |||
| #endif | |||
| @@ -58,6 +58,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) | |||
| endif | |||
| else ifeq ($(TARGET_CORE), HASWELL) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) | |||
| else ifeq ($(TARGET_CORE), LOONGSON3R4) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) | |||
| else | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| endif | |||
| @@ -68,6 +70,9 @@ else | |||
| TARGET_CORE = $(CORE) | |||
| KDIR = | |||
| TSUFFIX = | |||
| ifeq ($(TARGET_CORE), LOONGSON3R4) | |||
| override CFLAGS += $(MSA_FLAGS) | |||
| endif | |||
| endif | |||
| -include $(KERNELDIR)/KERNEL.$(TARGET_CORE) | |||
| @@ -29,10 +29,6 @@ ifeq ($(ARCH), riscv64) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(TARGET), LOONGSON3B) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifneq ($(DYNAMIC_ARCH), 1) | |||
| ifeq ($(TARGET), GENERIC) | |||
| USE_TRMM = 1 | |||
| @@ -121,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ | |||
| { \ | |||
| LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ | |||
| src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ | |||
| src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ | |||
| SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ | |||
| \ | |||
| PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ | |||
| @@ -200,7 +200,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ | |||
| { \ | |||
| LD_SP2_INC(pa0, 4, src_a0, src_a1); \ | |||
| src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ | |||
| src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ | |||
| SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ | |||
| \ | |||
| PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ | |||
| @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
| { | |||
| if ((0 == c) && (0 == s)) | |||
| { | |||
| v4f32 zero = __msa_cast_to_vector_float(0); | |||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); | |||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); | |||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); | |||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); | |||
| v4f32 zero = {0.0, 0.0, 0.0, 0.0}; | |||
| /* process 2 elements */ | |||
| for (j = (n >> 1); j--;) | |||
| @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| { | |||
| if ((0.0 == da_r) && (0.0 == da_i)) | |||
| { | |||
| v4f32 zero_v = __msa_cast_to_vector_float(0); | |||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); | |||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); | |||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); | |||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); | |||
| v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; | |||
| for (i = (n >> 5); i--;) | |||
| { | |||
| @@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| { | |||
| if (0.0 == da) | |||
| { | |||
| v2f64 zero_v = __msa_cast_to_vector_double(0); | |||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||
| v2f64 zero_v = {0.0, 0.0}; | |||
| for (i = (n >> 5); i--;) | |||
| { | |||
| @@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
| ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); | |||
| ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); | |||
| src_a54 = __msa_cast_to_vector_double(*(a + 54)); | |||
| src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | |||
| src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54)); | |||
| src_a62 = LD_DP(a + 62); | |||
| src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); | |||
| src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); | |||
| @@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
| src_a44 = LD_DP(a + 44); | |||
| src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); | |||
| src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); | |||
| src_a36 = __msa_cast_to_vector_double(*(a + 36)); | |||
| src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); | |||
| src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36)); | |||
| res_c7 *= src_a63; | |||
| res_c6 -= res_c7 * src_a62; | |||
| @@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
| src_a26 = LD_DP(a + 26); | |||
| src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); | |||
| src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); | |||
| src_a18 = __msa_cast_to_vector_double(*(a + 18)); | |||
| src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); | |||
| src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18)); | |||
| res_c3 -= res_c7 * src_a59; | |||
| res_c2 -= res_c7 * src_a58; | |||
| @@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
| src_a8 = LD_DP(a + 8); | |||
| src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); | |||
| src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | |||
| src_a0 = __msa_cast_to_vector_double(*(a + 0)); | |||
| src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||
| src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); | |||
| res_c1 -= res_c2 * src_a17; | |||
| res_c1 *= src_a9; | |||
| @@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a52 = LD_DP(a - 12); | |||
| src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); | |||
| src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); | |||
| src_a54 = __msa_cast_to_vector_double(*(a - 10)); | |||
| src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | |||
| src_a54 = COPY_DOUBLE_TO_VECTOR(*(a -10)); | |||
| src_a40 = LD_DP(a - 24); | |||
| src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); | |||
| @@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a34 = LD_DP(a - 30); | |||
| src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); | |||
| src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); | |||
| src_a36 = __msa_cast_to_vector_double(*(a - 28)); | |||
| src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); | |||
| src_a36 = COPY_DOUBLE_TO_VECTOR(*(a -28)); | |||
| res_c4 *= src_a36; | |||
| res_c3 -= res_c4 * src_a35; | |||
| @@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a16 = LD_DP(a - 48); | |||
| src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); | |||
| src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); | |||
| src_a18 = __msa_cast_to_vector_double(*(a - 46)); | |||
| src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); | |||
| src_a0 = __msa_cast_to_vector_double(*(a - 64)); | |||
| src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||
| src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46)); | |||
| src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64)); | |||
| src_a8 = LD_DP(a - 56); | |||
| src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); | |||
| src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | |||
| @@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); | |||
| src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | |||
| src_a8 = __msa_cast_to_vector_double(*(a + 8)); | |||
| src_a0 = __msa_cast_to_vector_double(*(a + 0)); | |||
| src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | |||
| src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||
| src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); | |||
| src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); | |||
| src_a4 = LD_DP(a + 4); | |||
| src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); | |||
| @@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); | |||
| src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | |||
| src_a8 = __msa_cast_to_vector_double(*(a + 8)); | |||
| src_a0 = __msa_cast_to_vector_double(*(a + 0)); | |||
| src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); | |||
| src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); | |||
| src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); | |||
| src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); | |||
| src_a4 = LD_DP(a + 4); | |||
| src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); | |||
| @@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
| res_c14 -= res_c8 * src_a6; | |||
| res_c15 -= res_c8 * src_a7; | |||
| src_a9 = __msa_cast_to_vector_double(*(a + 9)); | |||
| src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | |||
| src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); | |||
| src_a10 = LD_DP(a + 10); | |||
| src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | |||
| src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | |||
| @@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
| res_c14 -= res_c10 * src_a22; | |||
| res_c15 -= res_c10 * src_a23; | |||
| src_a27 = __msa_cast_to_vector_double(*(a + 27)); | |||
| src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); | |||
| src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); | |||
| src_a28 = LD_DP(a + 28); | |||
| src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); | |||
| src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); | |||
| @@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
| res_c14 -= res_c12 * src_a38; | |||
| res_c15 -= res_c12 * src_a39; | |||
| src_a45 = __msa_cast_to_vector_double(*(a + 45)); | |||
| src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); | |||
| src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); | |||
| src_a46 = LD_DP(a + 46); | |||
| src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); | |||
| src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); | |||
| @@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
| ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); | |||
| ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); | |||
| src_a63 = __msa_cast_to_vector_double(*(a + 63)); | |||
| src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); | |||
| src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); | |||
| src_a54 = LD_DP(a + 54); | |||
| src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); | |||
| src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | |||
| @@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| res_c6 -= res_c0 * src_a6; | |||
| res_c7 -= res_c0 * src_a7; | |||
| src_a9 = __msa_cast_to_vector_double(*(a + 9)); | |||
| src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); | |||
| src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); | |||
| src_a10 = LD_DP(a + 10); | |||
| src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | |||
| src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | |||
| @@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| res_c6 -= res_c2 * src_a22; | |||
| res_c7 -= res_c2 * src_a23; | |||
| src_a27 = __msa_cast_to_vector_double(*(a + 27)); | |||
| src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); | |||
| src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); | |||
| src_a28 = LD_DP(a + 28); | |||
| src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); | |||
| src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); | |||
| @@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| res_c6 -= res_c4 * src_a38; | |||
| res_c7 -= res_c4 * src_a39; | |||
| src_a45 = __msa_cast_to_vector_double(*(a + 45)); | |||
| src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); | |||
| src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); | |||
| src_a46 = LD_DP(a + 46); | |||
| src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); | |||
| src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); | |||
| @@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| res_c6 -= res_c5 * src_a46; | |||
| res_c7 -= res_c5 * src_a47; | |||
| src_a63 = __msa_cast_to_vector_double(*(a + 63)); | |||
| src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); | |||
| src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); | |||
| src_a54 = LD_DP(a + 54); | |||
| src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); | |||
| src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); | |||
| @@ -786,8 +778,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| res_c6 -= res_c4 * src_a2; | |||
| res_c7 -= res_c4 * src_a3; | |||
| src_a5 = __msa_cast_to_vector_double(*(a + 5)); | |||
| src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); | |||
| src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); | |||
| src_a6 = LD_DP(a + 6); | |||
| src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); | |||
| src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); | |||
| @@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a10 = LD_DP(a + 10); | |||
| src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | |||
| src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | |||
| src_a15 = __msa_cast_to_vector_double(*(a + 15)); | |||
| src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); | |||
| src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); | |||
| res_c2 *= src_a10; | |||
| res_c3 -= res_c2 * src_a11; | |||
| @@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| res_c2 -= res_c0 * src_a2; | |||
| res_c3 -= res_c0 * src_a3; | |||
| src_a5 = __msa_cast_to_vector_double(*(a + 5)); | |||
| src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); | |||
| src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); | |||
| src_a6 = LD_DP(a + 6); | |||
| src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); | |||
| src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); | |||
| @@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a10 = LD_DP(a + 10); | |||
| src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); | |||
| src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); | |||
| src_a15 = __msa_cast_to_vector_double(*(a + 15)); | |||
| src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); | |||
| src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); | |||
| res_c2 *= src_a10; | |||
| res_c3 -= res_c2 * src_a11; | |||
| @@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
| src_b2 = LD_DP(b + 2); | |||
| src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | |||
| src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | |||
| src_b5 = __msa_cast_to_vector_double(*(b + 5)); | |||
| src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); | |||
| src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); | |||
| src_b6 = LD_DP(b + 6); | |||
| src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); | |||
| src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); | |||
| src_b10 = LD_DP(b + 10); | |||
| src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); | |||
| src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | |||
| src_b15 = __msa_cast_to_vector_double(*(b + 15)); | |||
| src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); | |||
| src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); | |||
| src_c0 *= src_b0; | |||
| src_c1 *= src_b0; | |||
| @@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b0 = LD_DP(b + 0); | |||
| src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); | |||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
| src_b3 = __msa_cast_to_vector_double(*(b + 3)); | |||
| src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); | |||
| src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); | |||
| src_c0 *= src_b0; | |||
| src_c1 *= src_b0; | |||
| @@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||
| } | |||
| } | |||
| src_b0 = __msa_cast_to_vector_double(*b); | |||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
| src_b0 = COPY_DOUBLE_TO_VECTOR(*b); | |||
| src_c0 *= src_b0; | |||
| src_c1 *= src_b0; | |||
| @@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b2 = LD_DP(b + 2); | |||
| src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | |||
| src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | |||
| src_b5 = __msa_cast_to_vector_double(*(b + 5)); | |||
| src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); | |||
| src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); | |||
| src_b6 = LD_DP(b + 6); | |||
| src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); | |||
| src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); | |||
| src_b10 = LD_DP(b + 10); | |||
| src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); | |||
| src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | |||
| src_b15 = __msa_cast_to_vector_double(*(b + 15)); | |||
| src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); | |||
| src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); | |||
| src_c0 *= src_b0; | |||
| src_c1 *= src_b0; | |||
| @@ -490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b0 = LD_DP(b + 0); | |||
| src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); | |||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
| src_b3 = __msa_cast_to_vector_double(*(b + 3)); | |||
| src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); | |||
| src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); | |||
| src_c0 *= src_b0; | |||
| src_c1 *= src_b0; | |||
| @@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) | |||
| src_b8 = LD_DP(b + 8); | |||
| src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); | |||
| src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); | |||
| src_b10 = __msa_cast_to_vector_double(*(b + 10)); | |||
| src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | |||
| src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); | |||
| src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
| src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||
| src_b4 = LD_DP(b + 4); | |||
| src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); | |||
| src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); | |||
| @@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| a -= 16; | |||
| b -= 4; | |||
| src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
| src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||
| src_b2 = LD_DP(b + 2); | |||
| src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | |||
| src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | |||
| @@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||
| a -= 8; | |||
| b -= 1; | |||
| src_b0 = __msa_cast_to_vector_double(*b); | |||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
| src_b0 = COPY_DOUBLE_TO_VECTOR(*b); | |||
| src_c0 *= src_b0; | |||
| src_c1 *= src_b0; | |||
| @@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b8 = LD_DP(b + 8); | |||
| src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); | |||
| src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); | |||
| src_b10 = __msa_cast_to_vector_double(*(b + 10)); | |||
| src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); | |||
| src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); | |||
| src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
| src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||
| src_b4 = LD_DP(b + 4); | |||
| src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); | |||
| src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); | |||
| @@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| a -= 8; | |||
| b -= 4; | |||
| src_b0 = __msa_cast_to_vector_double(*(b + 0)); | |||
| src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); | |||
| src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); | |||
| src_b2 = LD_DP(b + 2); | |||
| src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); | |||
| src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); | |||
| @@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src) | |||
| #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) | |||
| #define COPY_FLOAT_TO_VECTOR(a) ( { /* statement expression whose value is a v4f32 holding a in every lane */ \ | |||
| v4f32 out; /* NOTE(review): this declaration and the initialized declaration of out below cannot both be compiled; this view appears to list the removed and the added lines of the hunk together - confirm */ \ | |||
| out = __msa_cast_to_vector_float(a); \ | |||
| out = (v4f32) __msa_splati_w((v4i32) out, 0); /* replicate element 0 into the other lanes */ \ | |||
| v4f32 out = {a, a, a, a}; /* initializer form: a is expanded once per lane, so an argument with side effects is evaluated four times */ \ | |||
| out; /* last expression is the value of the macro */ \ | |||
| } ) | |||
| #define COPY_DOUBLE_TO_VECTOR(a) ( { /* statement expression whose value is a v2f64 holding a in both lanes */ \ | |||
| v2f64 out; /* NOTE(review): this declaration and the initialized declaration of out below cannot both be compiled; this view appears to list the removed and the added lines of the hunk together - confirm */ \ | |||
| out = __msa_cast_to_vector_double(a); \ | |||
| out = (v2f64) __msa_splati_d((v2i64) out, 0); /* replicate element 0 into the other lane */ \ | |||
| v2f64 out = {a, a}; /* initializer form: a is expanded once per lane, so an argument with side effects is evaluated twice */ \ | |||
| out; /* last expression is the value of the macro */ \ | |||
| } ) | |||
| @@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
| { | |||
| if ((0 == c) && (0 == s)) | |||
| { | |||
| v4f32 zero = __msa_cast_to_vector_float(0); | |||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); | |||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); | |||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); | |||
| zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); | |||
| v4f32 zero = {0.0, 0.0, 0.0, 0.0}; | |||
| /* process 4 floats */ | |||
| for (j = (n >> 2); j--;) | |||
| @@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| { | |||
| if (0.0 == da) | |||
| { | |||
| v4f32 zero_v = __msa_cast_to_vector_float(0); | |||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); | |||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); | |||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); | |||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); | |||
| v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; | |||
| for (i = (n >> 6); i--;) | |||
| { | |||
| @@ -49,9 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| { | |||
| if ((0.0 == da_r) && (0.0 == da_i)) | |||
| { | |||
| v2f64 zero_v = __msa_cast_to_vector_double(0); | |||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||
| v2f64 zero_v = {0.0, 0.0}; | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| @@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| if ((0.0 == da_r) && (0.0 == da_i)) | |||
| { | |||
| v2f64 zero_v = __msa_cast_to_vector_double(0); | |||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||
| v2f64 zero_v = {0.0, 0.0}; | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| @@ -1,64 +0,0 @@ | |||
| SAXPYKERNEL=axpy_loongson3a.S | |||
| DAXPYKERNEL=daxpy_loongson3a_simd.S | |||
| SGEMVNKERNEL = gemv_n_loongson3a.c | |||
| SGEMVTKERNEL = gemv_t_loongson3a.c | |||
| DGEMVNKERNEL = gemv_n_loongson3a.c | |||
| DGEMVTKERNEL = gemv_t_loongson3a.c | |||
| CGEMVNKERNEL = zgemv_n_loongson3a.c | |||
| CGEMVTKERNEL = zgemv_t_loongson3a.c | |||
| ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||
| ZGEMVTKERNEL = zgemv_t_loongson3a.c | |||
| STRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| @@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy.o | |||
| CGEMMITCOPYOBJ = cgemm_itcopy.o | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| @@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DSDOTKERNEL = ../mips/dot.c | |||
| @@ -0,0 +1,192 @@ | |||
| ifdef HAVE_MSA | |||
| SAXPYKERNEL = ../mips/saxpy_msa.c | |||
| DAXPYKERNEL = ../mips/daxpy_msa.c | |||
| CAXPYKERNEL = ../mips/caxpy_msa.c | |||
| ZAXPYKERNEL = ../mips/zaxpy_msa.c | |||
| else | |||
| SAXPYKERNEL = axpy_loongson3a.S | |||
| DAXPYKERNEL = daxpy_loongson3a_simd.S | |||
| endif | |||
| ifdef HAVE_MSA | |||
| SCOPYKERNEL = ../mips/scopy_msa.c | |||
| DCOPYKERNEL = ../mips/dcopy_msa.c | |||
| CCOPYKERNEL = ../mips/ccopy_msa.c | |||
| ZCOPYKERNEL = ../mips/zcopy_msa.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| SDOTKERNEL = ../mips/sdot_msa.c | |||
| DDOTKERNEL = ../mips/ddot_msa.c | |||
| CDOTKERNEL = ../mips/cdot_msa.c | |||
| ZDOTKERNEL = ../mips/zdot_msa.c | |||
| endif | |||
| DSDOTKERNEL = ../mips/dot.c | |||
| ifdef HAVE_MSA | |||
| SROTKERNEL = ../mips/srot_msa.c | |||
| DROTKERNEL = ../mips/drot_msa.c | |||
| CROTKERNEL = ../mips/crot_msa.c | |||
| ZROTKERNEL = ../mips/zrot_msa.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| SSCALKERNEL = ../mips/sscal_msa.c | |||
| DSCALKERNEL = ../mips/dscal_msa.c | |||
| CSCALKERNEL = ../mips/cscal_msa.c | |||
| ZSCALKERNEL = ../mips/zscal_msa.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| SGEMVNKERNEL = ../mips/sgemv_n_msa.c | |||
| DGEMVNKERNEL = ../mips/dgemv_n_msa.c | |||
| SGEMVTKERNEL = ../mips/sgemv_t_msa.c | |||
| DGEMVTKERNEL = ../mips/dgemv_t_msa.c | |||
| CGEMVNKERNEL = ../mips/cgemv_n_msa.c | |||
| CGEMVTKERNEL = ../mips/cgemv_t_msa.c | |||
| ZGEMVNKERNEL = ../mips/zgemv_n_msa.c | |||
| ZGEMVTKERNEL = ../mips/zgemv_t_msa.c | |||
| else | |||
| SGEMVNKERNEL = gemv_n_loongson3a.c | |||
| SGEMVTKERNEL = gemv_t_loongson3a.c | |||
| DGEMVNKERNEL = gemv_n_loongson3a.c | |||
| DGEMVTKERNEL = gemv_t_loongson3a.c | |||
| CGEMVNKERNEL = zgemv_n_loongson3a.c | |||
| CGEMVTKERNEL = zgemv_t_loongson3a.c | |||
| ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||
| ZGEMVTKERNEL = zgemv_t_loongson3a.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| SASUMKERNEL = ../mips/sasum_msa.c | |||
| DASUMKERNEL = ../mips/dasum_msa.c | |||
| CASUMKERNEL = ../mips/casum_msa.c | |||
| ZASUMKERNEL = ../mips/zasum_msa.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| SSWAPKERNEL = ../mips/sswap_msa.c | |||
| DSWAPKERNEL = ../mips/dswap_msa.c | |||
| CSWAPKERNEL = ../mips/cswap_msa.c | |||
| ZSWAPKERNEL = ../mips/zswap_msa.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c | |||
| SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c | |||
| SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| else | |||
| SGEMMKERNEL = sgemm_kernel_8x4_ps.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifdef HAVE_MSA | |||
| DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c | |||
| DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c | |||
| DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c | |||
| DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c | |||
| DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| else | |||
| DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifdef HAVE_MSA | |||
| CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c | |||
| CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c | |||
| CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c | |||
| CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c | |||
| CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| else | |||
| CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c | |||
| ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c | |||
| ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| else | |||
| ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifdef HAVE_MSA | |||
| STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c | |||
| STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c | |||
| STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c | |||
| STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c | |||
| else | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c | |||
| DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c | |||
| DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c | |||
| DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c | |||
| else | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| endif | |||
| # Complex TRSM: the same generic kernels are used whether or not HAVE_MSA is | |||
| # defined, so no conditional is needed. Reintroduce an ifdef HAVE_MSA branch | |||
| # here only when MSA-specific complex TRSM kernels are selected. | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| @@ -933,6 +933,77 @@ static void init_parameter(void) { | |||
| } | |||
| #else // (ARCH_ARM64) | |||
| #if defined(ARCH_MIPS64) | |||
| static void init_parameter(void) { /* ARCH_MIPS64 variant: fills the gemm p/q/r fields of TABLE_NAME from the *_DEFAULT_* macros */ | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||
| TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; | |||
| TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; | |||
| TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; | |||
| TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; | |||
| TABLE_NAME.dgemm_r = 640; /* literal instead of DGEMM_DEFAULT_R; NOTE(review): the LOONGSON3R3 parameter set in this change defines DGEMM_DEFAULT_R as the identifier dgemm_r, which is presumably why a constant is used here - confirm */ | |||
| TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; | |||
| TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; | |||
| #ifdef EXPRECISION /* qgemm/xgemm fields are only assigned when EXPRECISION is defined */ | |||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||
| TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; | |||
| TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; | |||
| TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R; | |||
| TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; | |||
| #endif | |||
| #if defined(USE_GEMM3M) /* each gemm3m field uses its own *GEMM3M_DEFAULT_* macro when defined, otherwise the sgemm (for c) or dgemm (for z) value assigned above */ | |||
| #ifdef CGEMM3M_DEFAULT_P | |||
| TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; | |||
| #else | |||
| TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; | |||
| #endif | |||
| #ifdef ZGEMM3M_DEFAULT_P | |||
| TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; | |||
| #else | |||
| TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; | |||
| #endif | |||
| #ifdef CGEMM3M_DEFAULT_Q | |||
| TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; | |||
| #else | |||
| TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q; | |||
| #endif | |||
| #ifdef ZGEMM3M_DEFAULT_Q | |||
| TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; | |||
| #else | |||
| TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q; | |||
| #endif | |||
| #ifdef CGEMM3M_DEFAULT_R | |||
| TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R; | |||
| #else | |||
| TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r; | |||
| #endif | |||
| #ifdef ZGEMM3M_DEFAULT_R | |||
| TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R; | |||
| #else | |||
| TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r; /* picks up the literal 640 assigned to dgemm_r above */ | |||
| #endif | |||
| #ifdef EXPRECISION /* xgemm3m fields copy the qgemm values set earlier in this function */ | |||
| TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; | |||
| TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q; | |||
| TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; | |||
| #endif | |||
| #endif | |||
| } | |||
| #else // (ARCH_MIPS64) | |||
| #if (ARCH_POWER) | |||
| static void init_parameter(void) { | |||
| @@ -1780,4 +1851,5 @@ static void init_parameter(void) { | |||
| } | |||
| #endif //POWER | |||
| #endif //ZARCH | |||
| #endif //(ARCH_MIPS64) | |||
| #endif //(ARCH_ARM64) | |||
| @@ -2570,8 +2570,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SYMV_P 16 | |||
| #endif | |||
| #ifdef LOONGSON3A | |||
| /*Copy from SICORTEX*/ | |||
| #if defined(LOONGSON3R4) | |||
| #define SNUMOPT 2 | |||
| #define DNUMOPT 2 | |||
| @@ -2579,6 +2578,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #ifdef HAVE_MSA | |||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
| #else | |||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -2590,6 +2602,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #endif | |||
| #define SGEMM_DEFAULT_P 64 | |||
| #define DGEMM_DEFAULT_P 44 | |||
| @@ -2612,7 +2625,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SYMV_P 16 | |||
| #endif | |||
| #ifdef LOONGSON3B | |||
| #if defined(LOONGSON3R3) | |||
| // Copied from SICORTEX | |||
| #define SNUMOPT 2 | |||
| #define DNUMOPT 2 | |||
| @@ -2620,32 +2634,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 2 | |||
| #define SGEMM_DEFAULT_UNROLL_N 2 | |||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||
| #define DGEMM_DEFAULT_UNROLL_N 2 | |||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 4 | |||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define SGEMM_DEFAULT_P 64 | |||
| #define DGEMM_DEFAULT_P 24 | |||
| #define CGEMM_DEFAULT_P 24 | |||
| #define ZGEMM_DEFAULT_P 20 | |||
| #define DGEMM_DEFAULT_P 44 | |||
| #define CGEMM_DEFAULT_P 64 | |||
| #define ZGEMM_DEFAULT_P 32 | |||
| #define SGEMM_DEFAULT_Q 192 | |||
| #define DGEMM_DEFAULT_Q 128 | |||
| #define DGEMM_DEFAULT_Q 92 | |||
| #define CGEMM_DEFAULT_Q 128 | |||
| #define ZGEMM_DEFAULT_Q 64 | |||
| #define ZGEMM_DEFAULT_Q 80 | |||
| #define SGEMM_DEFAULT_R 512 | |||
| #define DGEMM_DEFAULT_R 512 | |||
| #define CGEMM_DEFAULT_R 512 | |||
| #define ZGEMM_DEFAULT_R 512 | |||
| #define SGEMM_DEFAULT_R 640 | |||
| #define DGEMM_DEFAULT_R dgemm_r | |||
| #define CGEMM_DEFAULT_R 640 | |||
| #define ZGEMM_DEFAULT_R 640 | |||
| #define GEMM_OFFSET_A1 0x10000 | |||
| #define GEMM_OFFSET_B1 0x100000 | |||