| @@ -78,6 +78,66 @@ endif | |||||
| endif | endif | ||||
| endif | endif | ||||
| # -mtune=neoverse-v1 is only available in GCC>=9.4: GCC 9.0-9.3 uses | |||||
| # -mtune=native instead, and older compilers fall back to a72 tunings | |||||
| ifeq ($(CORE), NEOVERSEV1) | |||||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) | |||||
| CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | |||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | |||||
| endif | |||||
| else | |||||
| CCOMMON_OPT += -march=armv8.4-a -mtune=native | |||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8.4-a -mtune=native | |||||
| endif | |||||
| endif | |||||
| else | |||||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||||
| endif | |||||
| endif | |||||
| else | |||||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||||
| endif | |||||
| endif | |||||
| endif | |||||
| # -mtune=neoverse-n2 is only available in GCC>=9.4: GCC 9.0-9.3 uses | |||||
| # -mtune=native instead, and older compilers fall back to a72 tunings | |||||
| ifeq ($(CORE), NEOVERSEN2) | |||||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) | |||||
| CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 | |||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 | |||||
| endif | |||||
| else | |||||
| CCOMMON_OPT += -march=armv8.5-a -mtune=native | |||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8.5-a -mtune=native | |||||
| endif | |||||
| endif | |||||
| else | |||||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||||
| endif | |||||
| endif | |||||
| else | |||||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||||
| endif | |||||
| endif | |||||
| endif | |||||
| # Use a53 tunings because a55 is only available in GCC>=8.1 | # Use a53 tunings because a55 is only available in GCC>=8.1 | ||||
| ifeq ($(CORE), CORTEXA55) | ifeq ($(CORE), CORTEXA55) | ||||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | ||||
| @@ -374,6 +374,7 @@ else | |||||
| endif | endif | ||||
| GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) | GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) | ||||
| GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) | GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) | ||||
| GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4) | |||||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) | GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) | ||||
| endif | endif | ||||
| @@ -654,6 +655,8 @@ DYNAMIC_CORE += CORTEXA57 | |||||
| DYNAMIC_CORE += CORTEXA72 | DYNAMIC_CORE += CORTEXA72 | ||||
| DYNAMIC_CORE += CORTEXA73 | DYNAMIC_CORE += CORTEXA73 | ||||
| DYNAMIC_CORE += NEOVERSEN1 | DYNAMIC_CORE += NEOVERSEN1 | ||||
| DYNAMIC_CORE += NEOVERSEV1 | |||||
| DYNAMIC_CORE += NEOVERSEN2 | |||||
| DYNAMIC_CORE += CORTEXA55 | DYNAMIC_CORE += CORTEXA55 | ||||
| DYNAMIC_CORE += FALKOR | DYNAMIC_CORE += FALKOR | ||||
| DYNAMIC_CORE += THUNDERX | DYNAMIC_CORE += THUNDERX | ||||
| @@ -93,6 +93,8 @@ CORTEXA57 | |||||
| CORTEXA72 | CORTEXA72 | ||||
| CORTEXA73 | CORTEXA73 | ||||
| NEOVERSEN1 | NEOVERSEN1 | ||||
| NEOVERSEV1 | |||||
| NEOVERSEN2 | |||||
| CORTEXA55 | CORTEXA55 | ||||
| EMAG8180 | EMAG8180 | ||||
| FALKOR | FALKOR | ||||
| @@ -44,7 +44,7 @@ endif () | |||||
| if (DYNAMIC_ARCH) | if (DYNAMIC_ARCH) | ||||
| if (ARM64) | if (ARM64) | ||||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110) | |||||
| if (DYNAMIC_LIST) | if (DYNAMIC_LIST) | ||||
| set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | ||||
| endif () | endif () | ||||
| @@ -237,6 +237,61 @@ endif () | |||||
| set(ZGEMM_UNROLL_N 4) | set(ZGEMM_UNROLL_N 4) | ||||
| set(SYMV_P 16) | set(SYMV_P 16) | ||||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN1") | elseif ("${TCORE}" STREQUAL "NEOVERSEN1") | ||||
| file(APPEND ${TARGET_CONF_TEMP} | |||||
| "#define L1_CODE_SIZE\t65536\n" | |||||
| "#define L1_CODE_LINESIZE\t64\n" | |||||
| "#define L1_CODE_ASSOCIATIVE\t4\n" | |||||
| "#define L1_DATA_SIZE\t65536\n" | |||||
| "#define L1_DATA_LINESIZE\t64\n" | |||||
| "#define L1_DATA_ASSOCIATIVE\t4\n" | |||||
| "#define L2_SIZE\t1048576\n\n" | |||||
| "#define L2_LINESIZE\t64\n" | |||||
| "#define L2_ASSOCIATIVE\t8\n" | |||||
| "#define DTB_DEFAULT_ENTRIES\t48\n" | |||||
| "#define DTB_SIZE\t4096\n" | |||||
| "#define HAVE_VFPV4\n" | |||||
| "#define HAVE_VFPV3\n" | |||||
| "#define HAVE_VFP\n" | |||||
| "#define HAVE_NEON\n" | |||||
| "#define ARMV8\n") | |||||
| set(SGEMM_UNROLL_M 16) | |||||
| set(SGEMM_UNROLL_N 4) | |||||
| set(DGEMM_UNROLL_M 8) | |||||
| set(DGEMM_UNROLL_N 4) | |||||
| set(CGEMM_UNROLL_M 8) | |||||
| set(CGEMM_UNROLL_N 4) | |||||
| set(ZGEMM_UNROLL_M 4) | |||||
| set(ZGEMM_UNROLL_N 4) | |||||
| set(SYMV_P 16) | |||||
| elseif ("${TCORE}" STREQUAL "NEOVERSEV1") | |||||
| file(APPEND ${TARGET_CONF_TEMP} | |||||
| "#define L1_CODE_SIZE\t65536\n" | |||||
| "#define L1_CODE_LINESIZE\t64\n" | |||||
| "#define L1_CODE_ASSOCIATIVE\t4\n" | |||||
| "#define L1_DATA_SIZE\t65536\n" | |||||
| "#define L1_DATA_LINESIZE\t64\n" | |||||
| "#define L1_DATA_ASSOCIATIVE\t4\n" | |||||
| "#define L2_SIZE\t1048576\n\n" | |||||
| "#define L2_LINESIZE\t64\n" | |||||
| "#define L2_ASSOCIATIVE\t8\n" | |||||
| "#define DTB_DEFAULT_ENTRIES\t48\n" | |||||
| "#define DTB_SIZE\t4096\n" | |||||
| "#define HAVE_VFPV4\n" | |||||
| "#define HAVE_VFPV3\n" | |||||
| "#define HAVE_VFP\n" | |||||
| "#define HAVE_NEON\n" | |||||
| "#define HAVE_SVE\n" | |||||
| "#define ARMV8\n") | |||||
| set(SGEMM_UNROLL_M 16) | |||||
| set(SGEMM_UNROLL_N 4) | |||||
| set(DGEMM_UNROLL_M 8) | |||||
| set(DGEMM_UNROLL_N 4) | |||||
| set(CGEMM_UNROLL_M 8) | |||||
| set(CGEMM_UNROLL_N 4) | |||||
| set(ZGEMM_UNROLL_M 4) | |||||
| set(ZGEMM_UNROLL_N 4) | |||||
| set(SYMV_P 16) | |||||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN2") | |||||
| file(APPEND ${TARGET_CONF_TEMP} | file(APPEND ${TARGET_CONF_TEMP} | ||||
| "#define L1_CODE_SIZE\t65536\n" | "#define L1_CODE_SIZE\t65536\n" | ||||
| "#define L1_CODE_LINESIZE\t64\n" | "#define L1_CODE_LINESIZE\t64\n" | ||||
| @@ -246,13 +301,14 @@ endif () | |||||
| "#define L1_DATA_ASSOCIATIVE\t2\n" | "#define L1_DATA_ASSOCIATIVE\t2\n" | ||||
| "#define L2_SIZE\t1048576\n\n" | "#define L2_SIZE\t1048576\n\n" | ||||
| "#define L2_LINESIZE\t64\n" | "#define L2_LINESIZE\t64\n" | ||||
| "#define L2_ASSOCIATIVE\t16\n" | |||||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||||
| "#define L2_ASSOCIATIVE\t8\n" | |||||
| "#define DTB_DEFAULT_ENTRIES\t48\n" | |||||
| "#define DTB_SIZE\t4096\n" | "#define DTB_SIZE\t4096\n" | ||||
| "#define HAVE_VFPV4\n" | "#define HAVE_VFPV4\n" | ||||
| "#define HAVE_VFPV3\n" | "#define HAVE_VFPV3\n" | ||||
| "#define HAVE_VFP\n" | "#define HAVE_VFP\n" | ||||
| "#define HAVE_NEON\n" | "#define HAVE_NEON\n" | ||||
| "#define HAVE_SVE\n" | |||||
| "#define ARMV8\n") | "#define ARMV8\n") | ||||
| set(SGEMM_UNROLL_M 16) | set(SGEMM_UNROLL_M 16) | ||||
| set(SGEMM_UNROLL_N 4) | set(SGEMM_UNROLL_N 4) | ||||
| @@ -43,6 +43,8 @@ size_t length64=sizeof(value64); | |||||
| #define CPU_CORTEXA72 4 | #define CPU_CORTEXA72 4 | ||||
| #define CPU_CORTEXA73 5 | #define CPU_CORTEXA73 5 | ||||
| #define CPU_NEOVERSEN1 11 | #define CPU_NEOVERSEN1 11 | ||||
| #define CPU_NEOVERSEV1 16 | |||||
| #define CPU_NEOVERSEN2 17 | |||||
| // Qualcomm | // Qualcomm | ||||
| #define CPU_FALKOR 6 | #define CPU_FALKOR 6 | ||||
| // Cavium | // Cavium | ||||
| @@ -71,6 +73,8 @@ static char *cpuname[] = { | |||||
| "TSV110", | "TSV110", | ||||
| "EMAG8180", | "EMAG8180", | ||||
| "NEOVERSEN1", | "NEOVERSEN1", | ||||
| "NEOVERSEV1", | |||||
| "NEOVERSEN2", | |||||
| "THUNDERX3T110", | "THUNDERX3T110", | ||||
| "VORTEX", | "VORTEX", | ||||
| "CORTEXA55", | "CORTEXA55", | ||||
| @@ -90,6 +94,8 @@ static char *cpuname_lower[] = { | |||||
| "tsv110", | "tsv110", | ||||
| "emag8180", | "emag8180", | ||||
| "neoversen1", | "neoversen1", | ||||
| "neoversev1", | |||||
| "neoversen2", | |||||
| "thunderx3t110", | "thunderx3t110", | ||||
| "vortex", | "vortex", | ||||
| "cortexa55", | "cortexa55", | ||||
| @@ -170,6 +176,10 @@ int detect(void) | |||||
| return CPU_CORTEXA73; | return CPU_CORTEXA73; | ||||
| else if (strstr(cpu_part, "0xd0c")) | else if (strstr(cpu_part, "0xd0c")) | ||||
| return CPU_NEOVERSEN1; | return CPU_NEOVERSEN1; | ||||
| else if (strstr(cpu_part, "0xd40")) | |||||
| return CPU_NEOVERSEV1; | |||||
| else if (strstr(cpu_part, "0xd49")) | |||||
| return CPU_NEOVERSEN2; | |||||
| else if (strstr(cpu_part, "0xd05")) | else if (strstr(cpu_part, "0xd05")) | ||||
| return CPU_CORTEXA55; | return CPU_CORTEXA55; | ||||
| } | } | ||||
| @@ -338,11 +348,41 @@ void get_cpuconfig(void) | |||||
| printf("#define L1_DATA_ASSOCIATIVE 4\n"); | printf("#define L1_DATA_ASSOCIATIVE 4\n"); | ||||
| printf("#define L2_SIZE 1048576\n"); | printf("#define L2_SIZE 1048576\n"); | ||||
| printf("#define L2_LINESIZE 64\n"); | printf("#define L2_LINESIZE 64\n"); | ||||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 48\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | printf("#define DTB_SIZE 4096\n"); | ||||
| break; | break; | ||||
| case CPU_NEOVERSEV1: | |||||
| printf("#define %s\n", cpuname[d]); | |||||
| printf("#define L1_CODE_SIZE 65536\n"); | |||||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||||
| printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||||
| printf("#define L1_DATA_SIZE 65536\n"); | |||||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||||
| printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||||
| printf("#define L2_SIZE 1048576\n"); | |||||
| printf("#define L2_LINESIZE 64\n"); | |||||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 48\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| break; | |||||
| case CPU_NEOVERSEN2: | |||||
| printf("#define %s\n", cpuname[d]); | |||||
| printf("#define L1_CODE_SIZE 65536\n"); | |||||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||||
| printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||||
| printf("#define L1_DATA_SIZE 65536\n"); | |||||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||||
| printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||||
| printf("#define L2_SIZE 1048576\n"); | |||||
| printf("#define L2_LINESIZE 64\n"); | |||||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 48\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| break; | |||||
| case CPU_FALKOR: | case CPU_FALKOR: | ||||
| printf("#define FALKOR\n"); | printf("#define FALKOR\n"); | ||||
| printf("#define L1_CODE_SIZE 65536\n"); | printf("#define L1_CODE_SIZE 65536\n"); | ||||
| @@ -147,6 +147,8 @@ static char *corename[] = { | |||||
| "tsv110", | "tsv110", | ||||
| "emag8180", | "emag8180", | ||||
| "neoversen1", | "neoversen1", | ||||
| "neoversev1", | |||||
| "neoversen2", | |||||
| "thunderx3t110", | "thunderx3t110", | ||||
| "cortexa55", | "cortexa55", | ||||
| "unknown" | "unknown" | ||||
| @@ -1302,12 +1302,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | ||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | ||||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \ | "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \ | ||||
| "-march=armv8.2-a -mtune=cortex-a72" | |||||
| "-march=armv8.2-a -mtune=neoverse-n1" | |||||
| #define LIBNAME "neoversen1" | #define LIBNAME "neoversen1" | ||||
| #define CORENAME "NEOVERSEN1" | #define CORENAME "NEOVERSEN1" | ||||
| #else | #else | ||||
| #endif | #endif | ||||
| #ifdef FORCE_NEOVERSEV1 | |||||
| /* Forced build target for Arm Neoverse V1. The L2 associativity and DTB entry | |||||
|    count match the values emitted for the autodetected CPU_NEOVERSEV1 case. */ | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "ARM64" | |||||
| #define SUBARCHITECTURE "NEOVERSEV1" | |||||
| #define SUBDIRNAME "arm64" | |||||
| #define ARCHCONFIG "-DNEOVERSEV1 " \ | |||||
| "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ | |||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ | |||||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ | |||||
| "-march=armv8.4-a -mtune=neoverse-v1" | |||||
| #define LIBNAME "neoversev1" | |||||
| #define CORENAME "NEOVERSEV1" | |||||
| #else | |||||
| #endif | |||||
| #ifdef FORCE_NEOVERSEN2 | |||||
| /* Forced build target for Arm Neoverse N2. The L2 associativity and DTB entry | |||||
|    count match the values emitted for the autodetected CPU_NEOVERSEN2 case. */ | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "ARM64" | |||||
| #define SUBARCHITECTURE "NEOVERSEN2" | |||||
| #define SUBDIRNAME "arm64" | |||||
| #define ARCHCONFIG "-DNEOVERSEN2 " \ | |||||
| "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ | |||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ | |||||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ | |||||
| "-march=armv8.5-a -mtune=neoverse-n2" | |||||
| #define LIBNAME "neoversen2" | |||||
| #define CORENAME "NEOVERSEN2" | |||||
| #else | |||||
| #endif | |||||
| #ifdef FORCE_CORTEXA55 | #ifdef FORCE_CORTEXA55 | ||||
| #define FORCE | #define FORCE | ||||
| #define ARCHITECTURE "ARM64" | #define ARCHITECTURE "ARM64" | ||||
| @@ -0,0 +1,189 @@ | |||||
| SAMINKERNEL = ../arm/amin.c | |||||
| DAMINKERNEL = ../arm/amin.c | |||||
| CAMINKERNEL = ../arm/zamin.c | |||||
| ZAMINKERNEL = ../arm/zamin.c | |||||
| SMAXKERNEL = ../arm/max.c | |||||
| DMAXKERNEL = ../arm/max.c | |||||
| SMINKERNEL = ../arm/min.c | |||||
| DMINKERNEL = ../arm/min.c | |||||
| ISAMINKERNEL = ../arm/iamin.c | |||||
| IDAMINKERNEL = ../arm/iamin.c | |||||
| ICAMINKERNEL = ../arm/izamin.c | |||||
| IZAMINKERNEL = ../arm/izamin.c | |||||
| ISMAXKERNEL = ../arm/imax.c | |||||
| IDMAXKERNEL = ../arm/imax.c | |||||
| ISMINKERNEL = ../arm/imin.c | |||||
| IDMINKERNEL = ../arm/imin.c | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| SAMAXKERNEL = amax.S | |||||
| DAMAXKERNEL = amax.S | |||||
| CAMAXKERNEL = zamax.S | |||||
| ZAMAXKERNEL = zamax.S | |||||
| SAXPYKERNEL = axpy.S | |||||
| DAXPYKERNEL = daxpy_thunderx2t99.S | |||||
| CAXPYKERNEL = zaxpy.S | |||||
| ZAXPYKERNEL = zaxpy.S | |||||
| SROTKERNEL = rot.S | |||||
| DROTKERNEL = rot.S | |||||
| CROTKERNEL = zrot.S | |||||
| ZROTKERNEL = zrot.S | |||||
| SSCALKERNEL = scal.S | |||||
| DSCALKERNEL = scal.S | |||||
| CSCALKERNEL = zscal.S | |||||
| ZSCALKERNEL = zscal.S | |||||
| SGEMVNKERNEL = gemv_n.S | |||||
| DGEMVNKERNEL = gemv_n.S | |||||
| CGEMVNKERNEL = zgemv_n.S | |||||
| ZGEMVNKERNEL = zgemv_n.S | |||||
| SGEMVTKERNEL = gemv_t.S | |||||
| DGEMVTKERNEL = gemv_t.S | |||||
| CGEMVTKERNEL = zgemv_t.S | |||||
| ZGEMVTKERNEL = zgemv_t.S | |||||
| SASUMKERNEL = sasum_thunderx2t99.c | |||||
| DASUMKERNEL = dasum_thunderx2t99.c | |||||
| CASUMKERNEL = casum_thunderx2t99.c | |||||
| ZASUMKERNEL = zasum_thunderx2t99.c | |||||
| SCOPYKERNEL = copy_thunderx2t99.c | |||||
| DCOPYKERNEL = copy_thunderx2t99.c | |||||
| CCOPYKERNEL = copy_thunderx2t99.c | |||||
| ZCOPYKERNEL = copy_thunderx2t99.c | |||||
| SSWAPKERNEL = swap_thunderx2t99.S | |||||
| DSWAPKERNEL = swap_thunderx2t99.S | |||||
| CSWAPKERNEL = swap_thunderx2t99.S | |||||
| ZSWAPKERNEL = swap_thunderx2t99.S | |||||
| ISAMAXKERNEL = iamax_thunderx2t99.c | |||||
| IDAMAXKERNEL = iamax_thunderx2t99.c | |||||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||||
| SNRM2KERNEL = scnrm2_thunderx2t99.c | |||||
| DNRM2KERNEL = dznrm2_thunderx2t99.c | |||||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||||
| DDOTKERNEL = dot_thunderx2t99.c | |||||
| SDOTKERNEL = dot_thunderx2t99.c | |||||
| CDOTKERNEL = zdot_thunderx2t99.c | |||||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||||
| DSDOTKERNEL = dot.S | |||||
| DGEMM_BETA = dgemm_beta.S | |||||
| SGEMM_BETA = sgemm_beta.S | |||||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||||
| ifeq ($(SGEMM_UNROLL_M), 16) | |||||
| SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S | |||||
| else | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||||
| endif | |||||
| ifeq ($(SGEMM_UNROLL_M), 4) | |||||
| SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S | |||||
| else | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||||
| endif | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| ifeq ($(SGEMM_UNROLL_N), 16) | |||||
| SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S | |||||
| else | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||||
| endif | |||||
| ifeq ($(SGEMM_UNROLL_N), 4) | |||||
| SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||||
| else | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||||
| endif | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||||
| else | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||||
| endif | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||||
| else | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||||
| endif | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| @@ -0,0 +1,189 @@ | |||||
| SAMINKERNEL = ../arm/amin.c | |||||
| DAMINKERNEL = ../arm/amin.c | |||||
| CAMINKERNEL = ../arm/zamin.c | |||||
| ZAMINKERNEL = ../arm/zamin.c | |||||
| SMAXKERNEL = ../arm/max.c | |||||
| DMAXKERNEL = ../arm/max.c | |||||
| SMINKERNEL = ../arm/min.c | |||||
| DMINKERNEL = ../arm/min.c | |||||
| ISAMINKERNEL = ../arm/iamin.c | |||||
| IDAMINKERNEL = ../arm/iamin.c | |||||
| ICAMINKERNEL = ../arm/izamin.c | |||||
| IZAMINKERNEL = ../arm/izamin.c | |||||
| ISMAXKERNEL = ../arm/imax.c | |||||
| IDMAXKERNEL = ../arm/imax.c | |||||
| ISMINKERNEL = ../arm/imin.c | |||||
| IDMINKERNEL = ../arm/imin.c | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| SAMAXKERNEL = amax.S | |||||
| DAMAXKERNEL = amax.S | |||||
| CAMAXKERNEL = zamax.S | |||||
| ZAMAXKERNEL = zamax.S | |||||
| SAXPYKERNEL = axpy.S | |||||
| DAXPYKERNEL = daxpy_thunderx2t99.S | |||||
| CAXPYKERNEL = zaxpy.S | |||||
| ZAXPYKERNEL = zaxpy.S | |||||
| SROTKERNEL = rot.S | |||||
| DROTKERNEL = rot.S | |||||
| CROTKERNEL = zrot.S | |||||
| ZROTKERNEL = zrot.S | |||||
| SSCALKERNEL = scal.S | |||||
| DSCALKERNEL = scal.S | |||||
| CSCALKERNEL = zscal.S | |||||
| ZSCALKERNEL = zscal.S | |||||
| SGEMVNKERNEL = gemv_n.S | |||||
| DGEMVNKERNEL = gemv_n.S | |||||
| CGEMVNKERNEL = zgemv_n.S | |||||
| ZGEMVNKERNEL = zgemv_n.S | |||||
| SGEMVTKERNEL = gemv_t.S | |||||
| DGEMVTKERNEL = gemv_t.S | |||||
| CGEMVTKERNEL = zgemv_t.S | |||||
| ZGEMVTKERNEL = zgemv_t.S | |||||
| SASUMKERNEL = sasum_thunderx2t99.c | |||||
| DASUMKERNEL = dasum_thunderx2t99.c | |||||
| CASUMKERNEL = casum_thunderx2t99.c | |||||
| ZASUMKERNEL = zasum_thunderx2t99.c | |||||
| SCOPYKERNEL = copy_thunderx2t99.c | |||||
| DCOPYKERNEL = copy_thunderx2t99.c | |||||
| CCOPYKERNEL = copy_thunderx2t99.c | |||||
| ZCOPYKERNEL = copy_thunderx2t99.c | |||||
| SSWAPKERNEL = swap_thunderx2t99.S | |||||
| DSWAPKERNEL = swap_thunderx2t99.S | |||||
| CSWAPKERNEL = swap_thunderx2t99.S | |||||
| ZSWAPKERNEL = swap_thunderx2t99.S | |||||
| ISAMAXKERNEL = iamax_thunderx2t99.c | |||||
| IDAMAXKERNEL = iamax_thunderx2t99.c | |||||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||||
| SNRM2KERNEL = scnrm2_thunderx2t99.c | |||||
| DNRM2KERNEL = dznrm2_thunderx2t99.c | |||||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||||
| DDOTKERNEL = dot_thunderx2t99.c | |||||
| SDOTKERNEL = dot_thunderx2t99.c | |||||
| CDOTKERNEL = zdot_thunderx2t99.c | |||||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||||
| DSDOTKERNEL = dot.S | |||||
| DGEMM_BETA = dgemm_beta.S | |||||
| SGEMM_BETA = sgemm_beta.S | |||||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||||
| ifeq ($(SGEMM_UNROLL_M), 16) | |||||
| SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S | |||||
| else | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||||
| endif | |||||
| ifeq ($(SGEMM_UNROLL_M), 4) | |||||
| SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S | |||||
| else | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||||
| endif | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| ifeq ($(SGEMM_UNROLL_N), 16) | |||||
| SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S | |||||
| else | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||||
| endif | |||||
| ifeq ($(SGEMM_UNROLL_N), 4) | |||||
| SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||||
| else | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||||
| endif | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||||
| else | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||||
| endif | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||||
| else | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||||
| endif | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| @@ -3307,6 +3307,64 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||||
| #define CGEMM_DEFAULT_R 4096 | #define CGEMM_DEFAULT_R 4096 | ||||
| #define ZGEMM_DEFAULT_R 4096 | #define ZGEMM_DEFAULT_R 4096 | ||||
| #elif defined(NEOVERSEV1) | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define SGEMM_DEFAULT_P 128 | |||||
| #define DGEMM_DEFAULT_P 160 | |||||
| #define CGEMM_DEFAULT_P 128 | |||||
| #define ZGEMM_DEFAULT_P 128 | |||||
| #define SGEMM_DEFAULT_Q 352 | |||||
| #define DGEMM_DEFAULT_Q 128 | |||||
| #define CGEMM_DEFAULT_Q 224 | |||||
| #define ZGEMM_DEFAULT_Q 112 | |||||
| #define SGEMM_DEFAULT_R 4096 | |||||
| #define DGEMM_DEFAULT_R 4096 | |||||
| #define CGEMM_DEFAULT_R 4096 | |||||
| #define ZGEMM_DEFAULT_R 4096 | |||||
| #elif defined(NEOVERSEN2) | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define SGEMM_DEFAULT_P 128 | |||||
| #define DGEMM_DEFAULT_P 160 | |||||
| #define CGEMM_DEFAULT_P 128 | |||||
| #define ZGEMM_DEFAULT_P 128 | |||||
| #define SGEMM_DEFAULT_Q 352 | |||||
| #define DGEMM_DEFAULT_Q 128 | |||||
| #define CGEMM_DEFAULT_Q 224 | |||||
| #define ZGEMM_DEFAULT_Q 112 | |||||
| #define SGEMM_DEFAULT_R 4096 | |||||
| #define DGEMM_DEFAULT_R 4096 | |||||
| #define CGEMM_DEFAULT_R 4096 | |||||
| #define ZGEMM_DEFAULT_R 4096 | |||||
| #elif defined(ARMV8SVE) || defined(A64FX) | #elif defined(ARMV8SVE) || defined(A64FX) | ||||
| /* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". | /* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". | ||||