| @@ -303,6 +303,11 @@ FCOMMON_OPT += -march=armv8.3-a | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(CORE), VORTEXM4) | |||||
| CCOMMON_OPT += -march=armv8.4-a+sme | |||||
| FCOMMON_OPT += -march=armv8.4-a+sme | |||||
| endif | |||||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) | ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) | ||||
| ifeq ($(CORE), TSV110) | ifeq ($(CORE), TSV110) | ||||
| CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | ||||
| @@ -427,7 +427,7 @@ ifndef MACOSX_DEPLOYMENT_TARGET | |||||
| ifeq ($(ARCH), arm64) | ifeq ($(ARCH), arm64) | ||||
| export MACOSX_DEPLOYMENT_TARGET=11.0 | export MACOSX_DEPLOYMENT_TARGET=11.0 | ||||
| export NO_SVE = 1 | export NO_SVE = 1 | ||||
| export NO_SME = 1 | |||||
| # export NO_SME = 1 | |||||
| else | else | ||||
| export MACOSX_DEPLOYMENT_TARGET=10.8 | export MACOSX_DEPLOYMENT_TARGET=10.8 | ||||
| endif | endif | ||||
| @@ -723,6 +723,7 @@ DYNAMIC_CORE += A64FX | |||||
| endif | endif | ||||
| ifneq ($(NO_SME), 1) | ifneq ($(NO_SME), 1) | ||||
| DYNAMIC_CORE += ARMV9SME | DYNAMIC_CORE += ARMV9SME | ||||
| DYNAMIC_CORE += VORTEXM4 | |||||
| endif | endif | ||||
| DYNAMIC_CORE += THUNDERX | DYNAMIC_CORE += THUNDERX | ||||
| DYNAMIC_CORE += THUNDERX2T99 | DYNAMIC_CORE += THUNDERX2T99 | ||||
| @@ -111,6 +111,7 @@ THUNDERX2T99 | |||||
| TSV110 | TSV110 | ||||
| THUNDERX3T110 | THUNDERX3T110 | ||||
| VORTEX | VORTEX | ||||
| VORTEXM4 | |||||
| A64FX | A64FX | ||||
| ARMV8SVE | ARMV8SVE | ||||
| ARMV9SME | ARMV9SME | ||||
| @@ -39,14 +39,14 @@ if (DYNAMIC_ARCH) | |||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | ||||
| endif () | endif () | ||||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14 | if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14 | ||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) | |||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME VORTEXM4) | |||||
| endif() | endif() | ||||
| elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang") | elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang") | ||||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11 | if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11 | ||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | ||||
| endif () | endif () | ||||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19 | |||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) | |||||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19 OR (${CMAKE_C_COMPILER_ID} MATCHES AppleClang AND ${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 17) ) # SME ACLE supported in LLVM >= 19 and AppleClang >= 17 | |||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME VORTEXM4) | |||||
| endif() | endif() | ||||
| endif () | endif () | ||||
| if (DYNAMIC_LIST) | if (DYNAMIC_LIST) | ||||
| @@ -315,6 +315,16 @@ if (${CORE} STREQUAL ARMV9SME) | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| if (${CORE} STREQUAL VORTEXM4) | |||||
| if (NOT DYNAMIC_ARCH) | |||||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") | |||||
| else () | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sme") | |||||
| endif () | |||||
| endif () | |||||
| endif () | |||||
| if (${CORE} STREQUAL CORTEXA510) | if (${CORE} STREQUAL CORTEXA510) | ||||
| if (NOT DYNAMIC_ARCH) | if (NOT DYNAMIC_ARCH) | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") | set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") | ||||
| @@ -1252,7 +1252,7 @@ endif () | |||||
| set(ZGEMM_UNROLL_M 4) | set(ZGEMM_UNROLL_M 4) | ||||
| set(ZGEMM_UNROLL_N 4) | set(ZGEMM_UNROLL_N 4) | ||||
| set(SYMV_P 16) | set(SYMV_P 16) | ||||
| elseif ("${TCORE}" STREQUAL "VORTEX") | |||||
| elseif ("${TCORE}" STREQUAL "VORTEX" OR "${TCORE}" STREQUAL "VORTEXM4") | |||||
| file(APPEND ${TARGET_CONF_TEMP} | file(APPEND ${TARGET_CONF_TEMP} | ||||
| "#define ARMV8\n" | "#define ARMV8\n" | ||||
| "#define L1_CODE_SIZE\t32768\n" | "#define L1_CODE_SIZE\t32768\n" | ||||
| @@ -361,6 +361,9 @@ if (${TARGET} STREQUAL NEOVERSEV1) | |||||
| if (${TARGET} STREQUAL ARMV9SME) | if (${TARGET} STREQUAL ARMV9SME) | ||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3") | set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3") | ||||
| endif() | endif() | ||||
| if (${TARGET} STREQUAL VORTEXM4) | |||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sme -O3") | |||||
| endif() | |||||
| if (${TARGET} STREQUAL A64FX) | if (${TARGET} STREQUAL A64FX) | ||||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | ||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") | set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") | ||||
| @@ -142,7 +142,7 @@ endif() | |||||
| if (ARM64) | if (ARM64) | ||||
| if (NOT NO_SME) | if (NOT NO_SME) | ||||
| file(WRITE ${PROJECT_BINARY_DIR}/sme.c ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n") | file(WRITE ${PROJECT_BINARY_DIR}/sme.c ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n") | ||||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv9-a+sve2+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME) | |||||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv8.4-a+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME) | |||||
| if (NO_SME EQUAL 1) | if (NO_SME EQUAL 1) | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_SME") | set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_SME") | ||||
| endif() | endif() | ||||
| @@ -257,6 +257,7 @@ int (*shgemm_otcopy )(BLASLONG, BLASLONG, hfloat16 *, BLASLONG, hfloat16 *); | |||||
| #ifdef ARCH_ARM64 | #ifdef ARCH_ARM64 | ||||
| void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | ||||
| void (*sgemm_direct_alpha_beta) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float, float * , BLASLONG); | void (*sgemm_direct_alpha_beta) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float, float * , BLASLONG); | ||||
| int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); | |||||
| #endif | #endif | ||||
| @@ -217,7 +217,7 @@ | |||||
| #define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant | #define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant | ||||
| #define SGEMM_DIRECT gotoblas -> sgemm_direct | #define SGEMM_DIRECT gotoblas -> sgemm_direct | ||||
| #elif ARCH_ARM64 | #elif ARCH_ARM64 | ||||
| #define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant | |||||
| #define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant | |||||
| #define SGEMM_DIRECT gotoblas -> sgemm_direct | #define SGEMM_DIRECT gotoblas -> sgemm_direct | ||||
| #define SGEMM_DIRECT_ALPHA_BETA gotoblas -> sgemm_direct_alpha_beta | #define SGEMM_DIRECT_ALPHA_BETA gotoblas -> sgemm_direct_alpha_beta | ||||
| #endif | #endif | ||||
| @@ -82,6 +82,7 @@ size_t length64=sizeof(value64); | |||||
| #define CPU_AMPERE1 25 | #define CPU_AMPERE1 25 | ||||
| // Apple | // Apple | ||||
| #define CPU_VORTEX 13 | #define CPU_VORTEX 13 | ||||
| #define CPU_VORTEXM4 26 | |||||
| // Fujitsu | // Fujitsu | ||||
| #define CPU_A64FX 15 | #define CPU_A64FX 15 | ||||
| // Phytium | // Phytium | ||||
| @@ -113,7 +114,8 @@ static char *cpuname[] = { | |||||
| "FT2000", | "FT2000", | ||||
| "CORTEXA76", | "CORTEXA76", | ||||
| "NEOVERSEV2", | "NEOVERSEV2", | ||||
| "AMPERE1" | |||||
| "AMPERE1", | |||||
| "VORTEXM4", | |||||
| }; | }; | ||||
| static char *cpuname_lower[] = { | static char *cpuname_lower[] = { | ||||
| @@ -143,7 +145,7 @@ static char *cpuname_lower[] = { | |||||
| "cortexa76", | "cortexa76", | ||||
| "neoversev2", | "neoversev2", | ||||
| "ampere1", | "ampere1", | ||||
| "ampere1a" | |||||
| "vortexm4" | |||||
| }; | }; | ||||
| static int cpulowperf=0; | static int cpulowperf=0; | ||||
| @@ -400,7 +402,7 @@ int detect(void) | |||||
| if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 | if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 | ||||
| if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 | if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 | ||||
| if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 | if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 | ||||
| if (value64 == 1867590060) return CPU_VORTEX; //M4 | |||||
| if (value64 == 1867590060) return CPU_VORTEXM4; //M4 | |||||
| #else | #else | ||||
| #ifdef OS_WINDOWS | #ifdef OS_WINDOWS | ||||
| HKEY reghandle; | HKEY reghandle; | ||||
| @@ -740,6 +742,27 @@ void get_cpuconfig(void) | |||||
| length64 = sizeof(value64); | length64 = sizeof(value64); | ||||
| sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); | sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); | ||||
| printf("#define L2_SIZE %lld \n",value64); | printf("#define L2_SIZE %lld \n",value64); | ||||
| #endif | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||||
| printf("#define DTB_SIZE 4096 \n"); | |||||
| break; | |||||
| case CPU_VORTEXM4: | |||||
| printf("#define VORTEXM4 \n"); | |||||
| printf("#define HAVE_SME 1 \n"); | |||||
| #ifdef __APPLE__ | |||||
| length64 = sizeof(value64); | |||||
| sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); | |||||
| printf("#define L1_CODE_SIZE %lld \n",value64); | |||||
| length64 = sizeof(value64); | |||||
| sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); | |||||
| printf("#define L1_CODE_LINESIZE %lld \n",value64); | |||||
| printf("#define L1_DATA_LINESIZE %lld \n",value64); | |||||
| length64 = sizeof(value64); | |||||
| sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); | |||||
| printf("#define L1_DATA_SIZE %lld \n",value64); | |||||
| length64 = sizeof(value64); | |||||
| sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); | |||||
| printf("#define L2_SIZE %lld \n",value64); | |||||
| #endif | #endif | ||||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | ||||
| printf("#define DTB_SIZE 4096 \n"); | printf("#define DTB_SIZE 4096 \n"); | ||||
| @@ -128,6 +128,12 @@ extern gotoblas_t gotoblas_ARMV9SME; | |||||
| #else | #else | ||||
| #define gotoblas_ARMV9SME gotoblas_ARMV8 | #define gotoblas_ARMV9SME gotoblas_ARMV8 | ||||
| #endif | #endif | ||||
| #ifdef DYN_VORTEXM4 | |||||
| extern gotoblas_t gotoblas_VORTEXM4; | |||||
| #else | |||||
| #error "dont have vortexm4" | |||||
| #define gotoblas_VORTEXM4 gotoblas_ARMV8 | |||||
| #endif | |||||
| #ifdef DYN_CORTEXA55 | #ifdef DYN_CORTEXA55 | ||||
| extern gotoblas_t gotoblas_CORTEXA55; | extern gotoblas_t gotoblas_CORTEXA55; | ||||
| #else | #else | ||||
| @@ -155,17 +161,22 @@ extern gotoblas_t gotoblas_NEOVERSEV1; | |||||
| extern gotoblas_t gotoblas_NEOVERSEN2; | extern gotoblas_t gotoblas_NEOVERSEN2; | ||||
| extern gotoblas_t gotoblas_ARMV8SVE; | extern gotoblas_t gotoblas_ARMV8SVE; | ||||
| extern gotoblas_t gotoblas_A64FX; | extern gotoblas_t gotoblas_A64FX; | ||||
| #ifndef NO_SME | |||||
| extern gotoblas_t gotoblas_ARMV9SME; | |||||
| #else | |||||
| #define gotoblas_ARMV9SME gotoblas_ARMV8SVE | |||||
| #endif | |||||
| #else | #else | ||||
| #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | ||||
| #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | ||||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | #define gotoblas_ARMV8SVE gotoblas_ARMV8 | ||||
| #define gotoblas_A64FX gotoblas_ARMV8 | #define gotoblas_A64FX gotoblas_ARMV8 | ||||
| #define gotoblas_ARMV9SME gotoblas_ARMV8 | |||||
| #endif | |||||
| #ifndef NO_SME | |||||
| extern gotoblas_t gotoblas_ARMV9SME; | |||||
| extern gotoblas_t gotoblas_VORTEXM4; | |||||
| #else | |||||
| #ifndef NO_SVE | |||||
| #define gotoblas_ARMV9SME gotoblas_ARMV8SVE | |||||
| #else | |||||
| #define gotoblas_ARMV9SME gotoblas_NEOVERSEN1 | |||||
| #endif | |||||
| #define gotoblas_VORTEXM4 gotoblas_NEOVERSEN1 | |||||
| #endif | #endif | ||||
| extern gotoblas_t gotoblas_THUNDERX3T110; | extern gotoblas_t gotoblas_THUNDERX3T110; | ||||
| @@ -176,7 +187,7 @@ extern void openblas_warning(int verbose, const char * msg); | |||||
| #define FALLBACK_VERBOSE 1 | #define FALLBACK_VERBOSE 1 | ||||
| #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | ||||
| #define NUM_CORETYPES 19 | |||||
| #define NUM_CORETYPES 20 | |||||
| /* | /* | ||||
| * In case asm/hwcap.h is outdated on the build system, make sure | * In case asm/hwcap.h is outdated on the build system, make sure | ||||
| @@ -216,6 +227,7 @@ static char *corename[] = { | |||||
| "armv8sve", | "armv8sve", | ||||
| "a64fx", | "a64fx", | ||||
| "armv9sme", | "armv9sme", | ||||
| "vortexm4", | |||||
| "unknown" | "unknown" | ||||
| }; | }; | ||||
| @@ -239,6 +251,7 @@ char *gotoblas_corename(void) { | |||||
| if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; | if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; | ||||
| if (gotoblas == &gotoblas_A64FX) return corename[17]; | if (gotoblas == &gotoblas_A64FX) return corename[17]; | ||||
| if (gotoblas == &gotoblas_ARMV9SME) return corename[18]; | if (gotoblas == &gotoblas_ARMV9SME) return corename[18]; | ||||
| if (gotoblas == &gotoblas_VORTEXM4) return corename[19]; | |||||
| return corename[NUM_CORETYPES]; | return corename[NUM_CORETYPES]; | ||||
| } | } | ||||
| @@ -277,6 +290,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||||
| case 16: return (&gotoblas_ARMV8SVE); | case 16: return (&gotoblas_ARMV8SVE); | ||||
| case 17: return (&gotoblas_A64FX); | case 17: return (&gotoblas_A64FX); | ||||
| case 18: return (&gotoblas_ARMV9SME); | case 18: return (&gotoblas_ARMV9SME); | ||||
| case 19: return (&gotoblas_VORTEXM4); | |||||
| } | } | ||||
| snprintf(message, 128, "Core not found: %s\n", coretype); | snprintf(message, 128, "Core not found: %s\n", coretype); | ||||
| openblas_warning(1, message); | openblas_warning(1, message); | ||||
| @@ -288,11 +302,11 @@ static gotoblas_t *get_coretype(void) { | |||||
| char coremsg[128]; | char coremsg[128]; | ||||
| #if defined (OS_DARWIN) | #if defined (OS_DARWIN) | ||||
| //future #if !defined(NO_SME) | |||||
| // if (support_sme1()) { | |||||
| // return &gotoblas_ARMV9SME; | |||||
| // } | |||||
| // #endif | |||||
| #if !defined(NO_SME) | |||||
| if (support_sme1()) { | |||||
| return &gotoblas_VORTEXM4; | |||||
| } | |||||
| #endif | |||||
| return &gotoblas_NEOVERSEN1; | return &gotoblas_NEOVERSEN1; | ||||
| #endif | #endif | ||||
| @@ -463,7 +477,7 @@ static gotoblas_t *get_coretype(void) { | |||||
| } | } | ||||
| break; | break; | ||||
| case 0x61: // Apple | case 0x61: // Apple | ||||
| //future if (support_sme1()) return &gotoblas_ARMV9SME; | |||||
| if (support_sme1()) return &gotoblas_VORTEXM4; | |||||
| return &gotoblas_NEOVERSEN1; | return &gotoblas_NEOVERSEN1; | ||||
| break; | break; | ||||
| default: | default: | ||||
| @@ -1654,6 +1654,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "VORTEX" | #define CORENAME "VORTEX" | ||||
| #endif | #endif | ||||
| #ifdef FORCE_VORTEXM4 | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "ARM64" | |||||
| #define SUBARCHITECTURE "VORTEXM4" | |||||
| #define SUBDIRNAME "arm64" | |||||
| #define ARCHCONFIG "-DVORTEXM4 " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ | |||||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SME -DARMV8" | |||||
| #define LIBNAME "vortexm4" | |||||
| #define CORENAME "VORTEXM4" | |||||
| #endif | |||||
| #ifdef FORCE_A64FX | #ifdef FORCE_A64FX | ||||
| #define ARMV8 | #define ARMV8 | ||||
| #define FORCE | #define FORCE | ||||
| @@ -266,6 +266,7 @@ void NAME(char *TRANSA, char *TRANSB, | |||||
| int transa, transb, nrowa, nrowb; | int transa, transb, nrowa, nrowb; | ||||
| blasint info; | blasint info; | ||||
| int order = -1; | |||||
| char transA, transB; | char transA, transB; | ||||
| IFLOAT *buffer; | IFLOAT *buffer; | ||||
| @@ -424,30 +425,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) | |||||
| #if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | |||||
| #if defined(DYNAMIC_ARCH) | |||||
| if (support_avx512() ) | |||||
| #endif | |||||
| if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { | |||||
| SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| #if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | |||||
| #if defined(DYNAMIC_ARCH) | |||||
| if (support_sme1()) | |||||
| #endif | |||||
| if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) { | |||||
| SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | |||||
| return; | |||||
| }else if (order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) { | |||||
| SGEMM_DIRECT_ALPHA_BETA(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| #endif | |||||
| #ifndef COMPLEX | #ifndef COMPLEX | ||||
| args.alpha = (void *)&alpha; | args.alpha = (void *)&alpha; | ||||
| args.beta = (void *)&beta; | args.beta = (void *)&beta; | ||||
| @@ -564,6 +541,36 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| return; | return; | ||||
| } | } | ||||
| if ((args.m == 0) || (args.n == 0)) return; | |||||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) | |||||
| #if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | |||||
| #if defined(DYNAMIC_ARCH) | |||||
| if (support_avx512() ) | |||||
| #endif | |||||
| if (order == CblasRowMajor && beta == 0 && alpha == 1.0 && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { | |||||
| SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| #if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | |||||
| #if defined(DYNAMIC_ARCH) | |||||
| if (strcmp(gotoblas_corename(), "armv9sme") == 0 || strcmp(gotoblas_corename(), "vortexm4") == 0) | |||||
| // if (support_sme1()) | |||||
| #endif | |||||
| if (order == CblasRowMajor && m==lda && n ==ldb && k==ldc && beta == 0 && alpha == 1.0 && TransA == CblasNoTrans && TransB == CblasNoTrans&& SGEMM_DIRECT_PERFORMANT(m,n,k)) { | |||||
| SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | |||||
| return; | |||||
| } | |||||
| else | |||||
| if (order == CblasRowMajor && m==lda && n==ldb && k==ldc && TransA == CblasNoTrans && TransB == CblasNoTrans&& SGEMM_DIRECT_PERFORMANT(m,n,k)) { | |||||
| SGEMM_DIRECT_ALPHA_BETA(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| #endif | |||||
| #endif | #endif | ||||
| #if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) | #if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) | ||||
| @@ -241,7 +241,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| if (X86_64 OR ARM64) | if (X86_64 OR ARM64) | ||||
| set(USE_DIRECT_SGEMM true) | set(USE_DIRECT_SGEMM true) | ||||
| endif() | endif() | ||||
| if (UC_TARGET_CORE MATCHES ARMV9SME) | |||||
| if (UC_TARGET_CORE MATCHES ARMV9SME OR UC_TARGET_CORE MATCHES VORTEXM4) | |||||
| set (HAVE_SME true) | set (HAVE_SME true) | ||||
| endif () | endif () | ||||
| @@ -254,14 +254,16 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | ||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) | GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) | ||||
| elseif (ARM64) | elseif (ARM64) | ||||
| set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c) | |||||
| set (SGEMMDIRECTKERNEL sgemm_direct_arm64_sme1.c) | set (SGEMMDIRECTKERNEL sgemm_direct_arm64_sme1.c) | ||||
| set (SGEMMDIRECTKERNEL_ALPHA_BETA sgemm_direct_alpha_beta_arm64_sme1.c) | set (SGEMMDIRECTKERNEL_ALPHA_BETA sgemm_direct_alpha_beta_arm64_sme1.c) | ||||
| set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S) | |||||
| set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1_2VLx2VL.S) | |||||
| set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S) | set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S) | ||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | ||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL_ALPHA_BETA}" "" "gemm_direct_alpha_beta" false "" "" false SINGLE) | GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL_ALPHA_BETA}" "" "gemm_direct_alpha_beta" false "" "" false SINGLE) | ||||
| if (HAVE_SME) | if (HAVE_SME) | ||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1_2VLx2VL" false "" "" false SINGLE) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE) | GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE) | ||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -29,6 +29,9 @@ ifdef TARGET_CORE | |||||
| ifeq ($(TARGET_CORE), ARMV9SME) | ifeq ($(TARGET_CORE), ARMV9SME) | ||||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv9-a+sve2+sme | override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv9-a+sve2+sme | ||||
| endif | endif | ||||
| ifeq ($(TARGET_CORE), VORTEXM4) | |||||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv8.4-a+sme | |||||
| endif | |||||
| ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | ||||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | ||||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) | ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) | ||||
| @@ -131,8 +131,12 @@ ifeq ($(ARCH), arm64) | |||||
| ifeq ($(TARGET_CORE), ARMV9SME) | ifeq ($(TARGET_CORE), ARMV9SME) | ||||
| HAVE_SME = 1 | HAVE_SME = 1 | ||||
| endif | endif | ||||
| ifeq ($(TARGET_CORE), VORTEXM4) | |||||
| HAVE_SME = 1 | |||||
| endif | |||||
| SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c | SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c | ||||
| SGEMMDIRECTKERNEL_ALPHA_BETA = sgemm_direct_alpha_beta_arm64_sme1.c | SGEMMDIRECTKERNEL_ALPHA_BETA = sgemm_direct_alpha_beta_arm64_sme1.c | ||||
| SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c | |||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -209,11 +213,12 @@ SKERNELOBJS += \ | |||||
| endif | endif | ||||
| ifeq ($(ARCH), arm64) | ifeq ($(ARCH), arm64) | ||||
| SKERNELOBJS += \ | SKERNELOBJS += \ | ||||
| sgemm_direct_performant$(TSUFFIX).$(SUFFIX) \ | |||||
| sgemm_direct$(TSUFFIX).$(SUFFIX) \ | sgemm_direct$(TSUFFIX).$(SUFFIX) \ | ||||
| sgemm_direct_alpha_beta$(TSUFFIX).$(SUFFIX) | sgemm_direct_alpha_beta$(TSUFFIX).$(SUFFIX) | ||||
| ifdef HAVE_SME | ifdef HAVE_SME | ||||
| SKERNELOBJS += \ | SKERNELOBJS += \ | ||||
| sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \ | |||||
| sgemm_direct_sme1_2VLx2VL$(TSUFFIX).$(SUFFIX) \ | |||||
| sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) | sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) | ||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -969,13 +974,15 @@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | ||||
| endif | endif | ||||
| ifeq ($(ARCH), arm64) | ifeq ($(ARCH), arm64) | ||||
| $(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||||
| $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | ||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | ||||
| $(KDIR)sgemm_direct_alpha_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL_ALPHA_BETA) | $(KDIR)sgemm_direct_alpha_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL_ALPHA_BETA) | ||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | ||||
| ifdef HAVE_SME | ifdef HAVE_SME | ||||
| $(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) : | |||||
| $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@ | |||||
| $(KDIR)sgemm_direct_sme1_2VLx2VL$(TSUFFIX).$(SUFFIX) : | |||||
| $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_2VLx2VL.S -UDOUBLE -UCOMPLEX -o $@ | |||||
| $(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) : | $(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) : | ||||
| $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_preprocess.S -UDOUBLE -UCOMPLEX -o $@ | $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_preprocess.S -UDOUBLE -UCOMPLEX -o $@ | ||||
| endif | endif | ||||
| @@ -0,0 +1 @@ | |||||
| include $(KERNELDIR)/KERNEL.NEOVERSEN1 | |||||
| @@ -14,9 +14,17 @@ | |||||
| #include <arm_sme.h> | #include <arm_sme.h> | ||||
| #endif | #endif | ||||
| #if defined(DYNAMIC_ARCH) | |||||
| #define COMBINE(a,b) a ## b | |||||
| #define COMBINE2(a,b) COMBINE(a,b) | |||||
| #define SME1_PREPROCESS_BASE sgemm_direct_sme1_preprocess | |||||
| #define SME1_PREPROCESS COMBINE2(SME1_PREPROCESS_BASE,TS) | |||||
| #else | |||||
| #define SME1_PREPROCESS sgemm_direct_sme1_preprocess | |||||
| #endif | |||||
| /* Function prototypes */ | /* Function prototypes */ | ||||
| extern void sgemm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc,\ | |||||
| const float * restrict a, float * a_mod) __asm__("sgemm_direct_sme1_preprocess"); | |||||
| extern void SME1_PREPROCESS(uint64_t nbr, uint64_t nbc,\ | |||||
| const float * restrict a, float * a_mod); | |||||
| /* Function Definitions */ | /* Function Definitions */ | ||||
| static uint64_t sve_cntw() { | static uint64_t sve_cntw() { | ||||
| @@ -99,10 +107,11 @@ kernel_2x2(const float *A, const float *B, float *C, size_t shared_dim, | |||||
| svst1_hor_za32(/*tile*/2, /*slice*/i, pg_c_0, &C[i * ldc]); | svst1_hor_za32(/*tile*/2, /*slice*/i, pg_c_0, &C[i * ldc]); | ||||
| svst1_hor_za32(/*tile*/3, /*slice*/i, pg_c_1, &C[i * ldc + svl]); | svst1_hor_za32(/*tile*/3, /*slice*/i, pg_c_1, &C[i * ldc + svl]); | ||||
| } | } | ||||
| return; | |||||
| } | } | ||||
| __arm_new("za") __arm_locally_streaming | __arm_new("za") __arm_locally_streaming | ||||
| void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\ | |||||
| static void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\ | |||||
| const float *ba, const float *restrict bb, const float* beta,\ | const float *ba, const float *restrict bb, const float* beta,\ | ||||
| float *restrict C) { | float *restrict C) { | ||||
| @@ -125,6 +134,7 @@ void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, co | |||||
| // Block over row dimension of C | // Block over row dimension of C | ||||
| for (; row_idx < num_rows; row_idx += row_batch) { | for (; row_idx < num_rows; row_idx += row_batch) { | ||||
| row_batch = MIN(row_batch, num_rows - row_idx); | row_batch = MIN(row_batch, num_rows - row_idx); | ||||
| uint64_t col_idx = 0; | uint64_t col_idx = 0; | ||||
| uint64_t col_batch = 2*svl; | uint64_t col_batch = 2*svl; | ||||
| @@ -143,7 +153,7 @@ void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, co | |||||
| #else | #else | ||||
| void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\ | void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\ | ||||
| const float *ba, const float *restrict bb, const float* beta,\ | const float *ba, const float *restrict bb, const float* beta,\ | ||||
| float *restrict C){} | |||||
| float *restrict C){fprintf(stderr,"empty sgemm_alpha_beta2x2 should never get called!!!\n");} | |||||
| #endif | #endif | ||||
| /*void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K,\ | /*void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K,\ | ||||
| @@ -166,25 +176,27 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float alpha, float * __restrict | |||||
| * of reading directly from vector (z) registers. | * of reading directly from vector (z) registers. | ||||
| * */ | * */ | ||||
| asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | ||||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", | |||||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", | |||||
| "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | ||||
| "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | ||||
| "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | ||||
| "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); | |||||
| "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31","za"); | |||||
| /* Pre-process the left matrix to make it suitable for | /* Pre-process the left matrix to make it suitable for | ||||
| matrix sum of outer-product calculation | matrix sum of outer-product calculation | ||||
| */ | */ | ||||
| sgemm_direct_sme1_preprocess(M, K, A, A_mod); | |||||
| SME1_PREPROCESS(M, K, A, A_mod); | |||||
| asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | ||||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", | |||||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15","d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", | |||||
| "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | ||||
| "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | ||||
| "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | ||||
| "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); | |||||
| "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "za"); | |||||
| /* Calculate C = alpha*A*B + beta*C */ | /* Calculate C = alpha*A*B + beta*C */ | ||||
| sgemm_direct_alpha_beta_sme1_2VLx2VL(M, K, N, &alpha, A_mod, B, &beta, R); | sgemm_direct_alpha_beta_sme1_2VLx2VL(M, K, N, &alpha, A_mod, B, &beta, R); | ||||
| free(A_mod); | free(A_mod); | ||||
| @@ -194,6 +206,6 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float alpha, float * __restrict | |||||
| void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float alpha, float * __restrict A,\ | void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float alpha, float * __restrict A,\ | ||||
| BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ | BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ | ||||
| float beta, float * __restrict R, BLASLONG strideR){} | |||||
| float beta, float * __restrict R, BLASLONG strideR){fprintf(stderr,"empty sgemm_direct_alpha_beta should not be called!!!\n");} | |||||
| #endif | #endif | ||||
| @@ -8,17 +8,28 @@ | |||||
| #include <inttypes.h> | #include <inttypes.h> | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(HAVE_SME) | #if defined(HAVE_SME) | ||||
| #if defined(DYNAMIC_ARCH) | |||||
| #define COMBINE(a,b) a ## b | |||||
| #define COMBINE2(a,b) COMBINE(a,b) | |||||
| #define SME1_PREPROCESS_BASE sgemm_direct_sme1_preprocess | |||||
| #define SME1_PREPROCESS COMBINE2(SME1_PREPROCESS_BASE,TS) | |||||
| #define SME1_DIRECT2X2_BASE sgemm_direct_sme1_2VLx2VL | |||||
| #define SME1_DIRECT2X2 COMBINE2(SME1_DIRECT2X2_BASE,TS) | |||||
| #else | |||||
| #define SME1_PREPROCESS sgemm_direct_sme1_preprocess | |||||
| #define SME1_DIRECT2X2 sgemm_direct_sme1_2VLx2VL | |||||
| #endif | |||||
| /* Function prototypes */ | /* Function prototypes */ | ||||
| extern void sgemm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc,\ | |||||
| const float * restrict a, float * a_mod) __asm__("sgemm_direct_sme1_preprocess"); | |||||
| extern void sgemm_direct_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n,\ | |||||
| extern void SME1_PREPROCESS(uint64_t nbr, uint64_t nbc,\ | |||||
| const float * restrict a, float * a_mod) ; | |||||
| extern void SME1_DIRECT2X2(uint64_t m, uint64_t k, uint64_t n,\ | |||||
| const float * matLeft,\ | const float * matLeft,\ | ||||
| const float * restrict matRight,\ | const float * restrict matRight,\ | ||||
| const float * restrict matResult) __asm__("sgemm_direct_sme1_2VLx2VL"); | |||||
| const float * restrict matResult) ; | |||||
| /* Function Definitions */ | /* Function Definitions */ | ||||
| uint64_t sve_cntw() { | |||||
| static uint64_t sve_cntw() { | |||||
| uint64_t cnt; | uint64_t cnt; | ||||
| asm volatile( | asm volatile( | ||||
| "rdsvl %[res], #1\n" | "rdsvl %[res], #1\n" | ||||
| @@ -39,7 +50,6 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||||
| uint64_t m_mod, vl_elms; | uint64_t m_mod, vl_elms; | ||||
| vl_elms = sve_cntw(); | vl_elms = sve_cntw(); | ||||
| m_mod = ceil((double)M/(double)vl_elms) * vl_elms; | m_mod = ceil((double)M/(double)vl_elms) * vl_elms; | ||||
| float *A_mod = (float *) malloc(m_mod*K*sizeof(float)); | float *A_mod = (float *) malloc(m_mod*K*sizeof(float)); | ||||
| @@ -48,7 +58,7 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||||
| * of reading directly from vector (z) registers. | * of reading directly from vector (z) registers. | ||||
| * */ | * */ | ||||
| asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | ||||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", | |||||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", | |||||
| "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | ||||
| "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | ||||
| "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | ||||
| @@ -57,13 +67,13 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||||
| /* Pre-process the left matrix to make it suitable for | /* Pre-process the left matrix to make it suitable for | ||||
| matrix sum of outer-product calculation | matrix sum of outer-product calculation | ||||
| */ | */ | ||||
| sgemm_direct_sme1_preprocess(M, K, A, A_mod); | |||||
| SME1_PREPROCESS(M, K, A, A_mod); | |||||
| /* Calculate C = A*B */ | /* Calculate C = A*B */ | ||||
| sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R); | |||||
| SME1_DIRECT2X2(M, K, N, A_mod, B, R); | |||||
| asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | ||||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", | |||||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", | |||||
| "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | ||||
| "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | ||||
| "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | ||||
| @@ -75,6 +85,8 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||||
| void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | ||||
| BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ | BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ | ||||
| float * __restrict R, BLASLONG strideR){} | |||||
| float * __restrict R, BLASLONG strideR){ | |||||
| fprintf(stderr,"EMPTY sgemm_kernel_direct should never be called \n"); | |||||
| } | |||||
| #endif | #endif | ||||
| @@ -0,0 +1,31 @@ | |||||
| #include "common.h" | |||||
| /* helper for the direct sgemm code written by Arjan van der Ven */ | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K) | |||||
| { | |||||
| if (M<3 || M%2==1) return 0; | |||||
| unsigned long long mnk = M * N * K; | |||||
| /* large matrices -> not performant */ | |||||
| if (mnk >= 28 * 512 * 512) | |||||
| return 0; | |||||
| /* | |||||
| * if the B matrix is not a nice multiple of 4 we get many unaligned accesses, | |||||
| * and the regular sgemm copy/realignment of data pays off much quicker | |||||
| */ | |||||
| if ((N & 3) != 0 && (mnk >= 8 * 512 * 512)) | |||||
| return 0; | |||||
| #ifdef SMP | |||||
| /* if we can run multithreaded, the threading changes the base threshold */ | |||||
| if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1) | |||||
| return 0; | |||||
| #endif | |||||
| return 1; | |||||
| } | |||||
| @@ -35,16 +35,17 @@ | |||||
| #define K_exit x15 //Exit condition for K loop | #define K_exit x15 //Exit condition for K loop | ||||
| #define M_cntr x16 //M loop counter | #define M_cntr x16 //M loop counter | ||||
| #define C1 x17 //Constant1: N*(SVLs+1);SVLs-No. of 32-bit elements | #define C1 x17 //Constant1: N*(SVLs+1);SVLs-No. of 32-bit elements | ||||
| #define C2 x18 //Constant2: N + SVLs | |||||
| #define C3 x19 //Constant3: K*SVLs + SVLs | |||||
| #define C4 x20 //Constant4: SVLs-2 | |||||
| #define C5 x21 //Constant5: K*SVLs | |||||
| #define C6 x22 //Constant6: N*SVLs | |||||
| #define C2 x19 //Constant2: N + SVLs | |||||
| #define C3 x20 //Constant3: K*SVLs + SVLs | |||||
| #define C4 x21 //Constant4: SVLs-2 | |||||
| #define C5 x22 //Constant5: K*SVLs | |||||
| #define C6 x23 //Constant6: N*SVLs | |||||
| .text | .text | ||||
| .global sgemm_direct_sme1_2VLx2VL | |||||
| .global ASMNAME | |||||
| sgemm_direct_sme1_2VLx2VL: | |||||
| ASMNAME: | |||||
| //sgemm_direct_sme1_2VLx2VL: | |||||
| stp x19, x20, [sp, #-48]! | stp x19, x20, [sp, #-48]! | ||||
| stp x21, x22, [sp, #16] | stp x21, x22, [sp, #16] | ||||
| @@ -61,7 +62,7 @@ | |||||
| add C2, N, C4 //N + SVLs | add C2, N, C4 //N + SVLs | ||||
| add C3, C5, C4 //K*SVLs + SVLs | add C3, C5, C4 //K*SVLs + SVLs | ||||
| whilelt p2.s, M_cntr, M //Tile 0,1 predicate (M dimension) | whilelt p2.s, M_cntr, M //Tile 0,1 predicate (M dimension) | ||||
| sub w20, w20, #2 //SVLs-2 | |||||
| sub w21, w21, #2 //SVLs-2 | |||||
| .M_Loop: | .M_Loop: | ||||
| incw M_cntr | incw M_cntr | ||||
| @@ -198,7 +199,7 @@ process_K_less_than_equal_2: | |||||
| st1w {za1h.s[w13, #0]}, p5, [Cptr1] | st1w {za1h.s[w13, #0]}, p5, [Cptr1] | ||||
| st1w {za2h.s[w13, #0]}, p6, [Cptr0, C6, lsl #2] | st1w {za2h.s[w13, #0]}, p6, [Cptr0, C6, lsl #2] | ||||
| st1w {za3h.s[w13, #0]}, p7, [Cptr1, C6, lsl #2] | st1w {za3h.s[w13, #0]}, p7, [Cptr1, C6, lsl #2] | ||||
| cmp w13, w20 | |||||
| cmp w13, w21 | |||||
| b.mi .Loop_store_ZA | b.mi .Loop_store_ZA | ||||
| psel p4, p0, p2.s[w13, 1] | psel p4, p0, p2.s[w13, 1] | ||||
| psel p5, p1, p2.s[w13, 1] | psel p5, p1, p2.s[w13, 1] | ||||
| @@ -211,12 +212,12 @@ process_K_less_than_equal_2: | |||||
| addvl Cptr, Cptr, #2 | addvl Cptr, Cptr, #2 | ||||
| addvl Bptr, Bptr, #1 | addvl Bptr, Bptr, #1 | ||||
| whilelt p0.b, Bptr, N_exit //1st Tile predicate (N dimension) | whilelt p0.b, Bptr, N_exit //1st Tile predicate (N dimension) | ||||
| b.first .N_Loop | |||||
| b.mi .N_Loop | |||||
| add A_base, A_base, C5, lsl #3 //A_base += 2*K*SVLs FP32 elements | add A_base, A_base, C5, lsl #3 //A_base += 2*K*SVLs FP32 elements | ||||
| add C_base, C_base, C6, lsl #3 //C_base += 2*N*SVLs FP32 elements | add C_base, C_base, C6, lsl #3 //C_base += 2*N*SVLs FP32 elements | ||||
| incw M_cntr | incw M_cntr | ||||
| whilelt p2.s, M_cntr, M //1st Tile predicate (M dimension) | whilelt p2.s, M_cntr, M //1st Tile predicate (M dimension) | ||||
| b.first .M_Loop | |||||
| b.mi .M_Loop | |||||
| smstop | smstop | ||||
| @@ -37,9 +37,9 @@ | |||||
| #define C6 x15 //Constant6: 3*ncol | #define C6 x15 //Constant6: 3*ncol | ||||
| .text | .text | ||||
| .global sgemm_direct_sme1_preprocess | |||||
| .global ASMNAME //sgemm_direct_sme1_preprocess | |||||
| sgemm_direct_sme1_preprocess: | |||||
| ASMNAME: //sgemm_direct_sme1_preprocess: | |||||
| stp x19, x20, [sp, #-48]! | stp x19, x20, [sp, #-48]! | ||||
| stp x21, x22, [sp, #16] | stp x21, x22, [sp, #16] | ||||
| @@ -114,14 +114,14 @@ | |||||
| addvl mat_ptr0, mat_ptr0, #1 //mat_ptr0 += SVLb | addvl mat_ptr0, mat_ptr0, #1 //mat_ptr0 += SVLb | ||||
| whilelt p8.b, mat_ptr0, inner_loop_exit | whilelt p8.b, mat_ptr0, inner_loop_exit | ||||
| b.first .Loop_process | |||||
| b.mi .Loop_process | |||||
| add mat_mod, mat_mod, C3, lsl #2 //mat_mod+=SVLs*nbc FP32 elements | add mat_mod, mat_mod, C3, lsl #2 //mat_mod+=SVLs*nbc FP32 elements | ||||
| add mat, mat, C3, lsl #2 //mat+=SVLs*nbc FP32 elements | add mat, mat, C3, lsl #2 //mat+=SVLs*nbc FP32 elements | ||||
| incw outer_loop_cntr | incw outer_loop_cntr | ||||
| whilelt p0.s, outer_loop_cntr, nrow | whilelt p0.s, outer_loop_cntr, nrow | ||||
| b.first .M_Loop | |||||
| b.mi .M_Loop | |||||
| smstop | smstop | ||||
| @@ -216,6 +216,7 @@ gotoblas_t TABLE_NAME = { | |||||
| #ifdef ARCH_ARM64 | #ifdef ARCH_ARM64 | ||||
| sgemm_directTS, | sgemm_directTS, | ||||
| sgemm_direct_alpha_betaTS, | sgemm_direct_alpha_betaTS, | ||||
| sgemm_direct_performantTS, | |||||
| #endif | #endif | ||||
| sgemm_kernelTS, sgemm_betaTS, | sgemm_kernelTS, sgemm_betaTS, | ||||
| @@ -3353,7 +3353,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(CORTEXA57) || defined(CORTEXX1) || \ | #if defined(CORTEXA57) || defined(CORTEXX1) || \ | ||||
| defined(CORTEXA72) || defined(CORTEXA73) || \ | defined(CORTEXA72) || defined(CORTEXA73) || \ | ||||
| defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) || defined(FT2000) | |||||
| defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) || defined(FT2000) || defined(VORTEXM4) | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | #define SGEMM_DEFAULT_UNROLL_M 16 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -3370,7 +3370,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /*FIXME: this should be using the cache size, but there is currently no easy way to | /*FIXME: this should be using the cache size, but there is currently no easy way to | ||||
| query that on ARM. So if getarch counted more than 8 cores we simply assume the host | query that on ARM. So if getarch counted more than 8 cores we simply assume the host | ||||
| is a big desktop or server with abundant cache rather than a phone or embedded device */ | is a big desktop or server with abundant cache rather than a phone or embedded device */ | ||||
| #if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)|| defined(CORTEXX1) | |||||
| #if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)|| defined(CORTEXX1) || defined(VORTEXM4) | |||||
| #define SGEMM_DEFAULT_P 512 | #define SGEMM_DEFAULT_P 512 | ||||
| #define DGEMM_DEFAULT_P 256 | #define DGEMM_DEFAULT_P 256 | ||||
| #define CGEMM_DEFAULT_P 256 | #define CGEMM_DEFAULT_P 256 | ||||
| @@ -3598,15 +3598,15 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||||
| #undef BGEMM_ALIGN_K | #undef BGEMM_ALIGN_K | ||||
| #undef BGEMM_DEFAULT_UNROLL_M | #undef BGEMM_DEFAULT_UNROLL_M | ||||
| #undef BGEMM_DEFAULT_UNROLL_N | #undef BGEMM_DEFAULT_UNROLL_N | ||||
| #define BGEMM_ALIGN_K 4 | |||||
| #define BGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define BGEMM_ALIGN_K 8 | |||||
| #define BGEMM_DEFAULT_UNROLL_N 4 | #define BGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define BGEMM_DEFAULT_UNROLL_M 4 | |||||
| #undef SBGEMM_ALIGN_K | #undef SBGEMM_ALIGN_K | ||||
| #undef SBGEMM_DEFAULT_UNROLL_M | #undef SBGEMM_DEFAULT_UNROLL_M | ||||
| #undef SBGEMM_DEFAULT_UNROLL_N | #undef SBGEMM_DEFAULT_UNROLL_N | ||||
| #define SBGEMM_ALIGN_K 4 | |||||
| #define SBGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define SBGEMM_ALIGN_K 8 | |||||
| #define SBGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define SBGEMM_DEFAULT_UNROLL_N 4 | #define SBGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | #define SGEMM_DEFAULT_UNROLL_M 16 | ||||
| @@ -3842,7 +3842,7 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout | |||||
| #endif /* ARMv8 */ | #endif /* ARMv8 */ | ||||
| #if defined(ARMV9SME) /* ARMv9 SME */ | |||||
| #if defined(ARMV9SME) || defined(VORTEXM4) /* ARMv9 SME */ | |||||
| #define USE_SGEMM_KERNEL_DIRECT 1 | #define USE_SGEMM_KERNEL_DIRECT 1 | ||||
| #endif /* ARMv9 SME */ | #endif /* ARMv9 SME */ | ||||