Browse Source

Merge 2b5d8c789d into 06c09deee9

pull/5423/merge
Martin Kroeker GitHub 5 months ago
parent
commit
ef8a44d981
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
25 changed files with 247 additions and 98 deletions
  1. +5
    -0
      Makefile.arm64
  2. +2
    -1
      Makefile.system
  3. +1
    -0
      TargetList.txt
  4. +3
    -3
      cmake/arch.cmake
  5. +10
    -0
      cmake/cc.cmake
  6. +1
    -1
      cmake/prebuild.cmake
  7. +3
    -0
      cmake/system.cmake
  8. +1
    -1
      cmake/system_check.cmake
  9. +1
    -0
      common_param.h
  10. +1
    -1
      common_s.h
  11. +26
    -3
      cpuid_arm64.c
  12. +27
    -13
      driver/others/dynamic_arm64.c
  13. +14
    -0
      getarch.c
  14. +31
    -24
      interface/gemm.c
  15. +5
    -3
      kernel/CMakeLists.txt
  16. +3
    -0
      kernel/Makefile
  17. +10
    -3
      kernel/Makefile.L3
  18. +1
    -0
      kernel/arm64/KERNEL.VORTEXM4
  19. +22
    -10
      kernel/arm64/sgemm_direct_alpha_beta_arm64_sme1.c
  20. +25
    -13
      kernel/arm64/sgemm_direct_arm64_sme1.c
  21. +31
    -0
      kernel/arm64/sgemm_direct_performant.c
  22. +12
    -11
      kernel/arm64/sgemm_direct_sme1_2VLx2VL.S
  23. +4
    -4
      kernel/arm64/sgemm_direct_sme1_preprocess.S
  24. +1
    -0
      kernel/setparam-ref.c
  25. +7
    -7
      param.h

+ 5
- 0
Makefile.arm64 View File

@@ -303,6 +303,11 @@ FCOMMON_OPT += -march=armv8.3-a
endif
endif

# VORTEXM4: compile both C and Fortran sources for Armv8.4-A with the SME
# extension enabled.
ifeq ($(CORE), VORTEXM4)
CCOMMON_OPT += -march=armv8.4-a+sme
FCOMMON_OPT += -march=armv8.4-a+sme
endif

ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110


+ 2
- 1
Makefile.system View File

@@ -427,7 +427,7 @@ ifndef MACOSX_DEPLOYMENT_TARGET
ifeq ($(ARCH), arm64)
export MACOSX_DEPLOYMENT_TARGET=11.0
export NO_SVE = 1
export NO_SME = 1
# export NO_SME = 1
else
export MACOSX_DEPLOYMENT_TARGET=10.8
endif
@@ -723,6 +723,7 @@ DYNAMIC_CORE += A64FX
endif
ifneq ($(NO_SME), 1)
DYNAMIC_CORE += ARMV9SME
DYNAMIC_CORE += VORTEXM4
endif
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99


+ 1
- 0
TargetList.txt View File

@@ -111,6 +111,7 @@ THUNDERX2T99
TSV110
THUNDERX3T110
VORTEX
VORTEXM4
A64FX
ARMV8SVE
ARMV9SME


+ 3
- 3
cmake/arch.cmake View File

@@ -39,14 +39,14 @@ if (DYNAMIC_ARCH)
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
endif ()
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME)
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME VORTEXM4)
endif()
elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang")
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
endif ()
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19 OR (${CMAKE_C_COMPILER_ID} MATCHES AppleClang AND ${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 17) ) # SME ACLE supported in LLVM >= 19 and AppleClang >= 17
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME VORTEXM4)
endif()
endif ()
if (DYNAMIC_LIST)


+ 10
- 0
cmake/cc.cmake View File

@@ -315,6 +315,16 @@ if (${CORE} STREQUAL ARMV9SME)
endif ()
endif ()

# VORTEXM4, fixed-target builds only (nothing is added under DYNAMIC_ARCH):
# choose the compiler target option appended to CCOMMON_OPT.
if (${CORE} STREQUAL VORTEXM4 AND NOT DYNAMIC_ARCH)
  if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE)
    # NVC compiler with SVE not disabled takes -tp=host instead of -march.
    set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host")
  else ()
    # All other cases: Armv8.4-A with the SME extension.
    set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sme")
  endif ()
endif ()

if (${CORE} STREQUAL CORTEXA510)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve")


+ 1
- 1
cmake/prebuild.cmake View File

@@ -1252,7 +1252,7 @@ endif ()
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "VORTEX")
elseif ("${TCORE}" STREQUAL "VORTEX" OR "${TCORE}" STREQUAL "VORTEXM4")
file(APPEND ${TARGET_CONF_TEMP}
"#define ARMV8\n"
"#define L1_CODE_SIZE\t32768\n"


+ 3
- 0
cmake/system.cmake View File

@@ -361,6 +361,9 @@ if (${TARGET} STREQUAL NEOVERSEV1)
if (${TARGET} STREQUAL ARMV9SME)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3")
endif()
# VORTEXM4 kernels are compiled for Armv8.4-A with SME at -O3.
if (${TARGET} STREQUAL VORTEXM4)
  string (APPEND KERNEL_DEFINITIONS " -march=armv8.4-a+sme -O3")
endif()
if (${TARGET} STREQUAL A64FX)
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx")


+ 1
- 1
cmake/system_check.cmake View File

@@ -142,7 +142,7 @@ endif()
if (ARM64)
if (NOT NO_SME)
file(WRITE ${PROJECT_BINARY_DIR}/sme.c ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n")
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv9-a+sve2+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME)
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv8.4-a+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME)
if (NO_SME EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_SME")
endif()


+ 1
- 0
common_param.h View File

@@ -257,6 +257,7 @@ int (*shgemm_otcopy )(BLASLONG, BLASLONG, hfloat16 *, BLASLONG, hfloat16 *);
#ifdef ARCH_ARM64
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
void (*sgemm_direct_alpha_beta) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float, float * , BLASLONG);
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K);
#endif



+ 1
- 1
common_s.h View File

@@ -217,7 +217,7 @@
#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant
#define SGEMM_DIRECT gotoblas -> sgemm_direct
#elif ARCH_ARM64
#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant
#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant
#define SGEMM_DIRECT gotoblas -> sgemm_direct
#define SGEMM_DIRECT_ALPHA_BETA gotoblas -> sgemm_direct_alpha_beta
#endif


+ 26
- 3
cpuid_arm64.c View File

@@ -82,6 +82,7 @@ size_t length64=sizeof(value64);
#define CPU_AMPERE1 25
// Apple
#define CPU_VORTEX 13
#define CPU_VORTEXM4 26
// Fujitsu
#define CPU_A64FX 15
// Phytium
@@ -113,7 +114,8 @@ static char *cpuname[] = {
"FT2000",
"CORTEXA76",
"NEOVERSEV2",
"AMPERE1"
"AMPERE1",
"VORTEXM4",
};

static char *cpuname_lower[] = {
@@ -143,7 +145,7 @@ static char *cpuname_lower[] = {
"cortexa76",
"neoversev2",
"ampere1",
"ampere1a"
"vortexm4"
};

static int cpulowperf=0;
@@ -400,7 +402,7 @@ int detect(void)
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
if (value64 == 2271604202) return CPU_VORTEX; //A16/M3
if (value64 == 1867590060) return CPU_VORTEX; //M4
if (value64 == 1867590060) return CPU_VORTEXM4; //M4
#else
#ifdef OS_WINDOWS
HKEY reghandle;
@@ -740,6 +742,27 @@ void get_cpuconfig(void)
length64 = sizeof(value64);
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
printf("#define L2_SIZE %lld \n",value64);
#endif
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
case CPU_VORTEXM4:
printf("#define VORTEXM4 \n");
printf("#define HAVE_SME 1 \n");
#ifdef __APPLE__
length64 = sizeof(value64);
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_SIZE %lld \n",value64);
length64 = sizeof(value64);
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_LINESIZE %lld \n",value64);
printf("#define L1_DATA_LINESIZE %lld \n",value64);
length64 = sizeof(value64);
sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
printf("#define L1_DATA_SIZE %lld \n",value64);
length64 = sizeof(value64);
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
printf("#define L2_SIZE %lld \n",value64);
#endif
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");


+ 27
- 13
driver/others/dynamic_arm64.c View File

@@ -128,6 +128,12 @@ extern gotoblas_t gotoblas_ARMV9SME;
#else
#define gotoblas_ARMV9SME gotoblas_ARMV8
#endif
/* VORTEXM4 dispatch table: reference the real table when DYN_VORTEXM4 is
   defined; otherwise alias it to the generic ARMV8 table so that every use
   of gotoblas_VORTEXM4 below still resolves. (A leftover debugging #error in
   the fallback branch used to abort such builds and has been removed.) */
#ifdef DYN_VORTEXM4
extern gotoblas_t gotoblas_VORTEXM4;
#else
#define gotoblas_VORTEXM4 gotoblas_ARMV8
#endif
#ifdef DYN_CORTEXA55
extern gotoblas_t gotoblas_CORTEXA55;
#else
@@ -155,17 +161,22 @@ extern gotoblas_t gotoblas_NEOVERSEV1;
extern gotoblas_t gotoblas_NEOVERSEN2;
extern gotoblas_t gotoblas_ARMV8SVE;
extern gotoblas_t gotoblas_A64FX;
#ifndef NO_SME
extern gotoblas_t gotoblas_ARMV9SME;
#else
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE
#endif
#else
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#define gotoblas_A64FX gotoblas_ARMV8
#define gotoblas_ARMV9SME gotoblas_ARMV8
#endif
/* SME-capable dispatch tables. Without NO_SME both tables are external
   symbols; with NO_SME their names are aliased to non-SME tables. */
#ifndef NO_SME
extern gotoblas_t gotoblas_ARMV9SME;
extern gotoblas_t gotoblas_VORTEXM4;
#else
/* ARMV9SME falls back to the SVE table unless NO_SVE is also defined, in
   which case it falls back to NEOVERSEN1. */
#ifndef NO_SVE
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE
#else
#define gotoblas_ARMV9SME gotoblas_NEOVERSEN1
#endif
/* VORTEXM4 always falls back to NEOVERSEN1 when SME is disabled. */
#define gotoblas_VORTEXM4 gotoblas_NEOVERSEN1
#endif

extern gotoblas_t gotoblas_THUNDERX3T110;
@@ -176,7 +187,7 @@ extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"

#define NUM_CORETYPES 19
#define NUM_CORETYPES 20

/*
* In case asm/hwcap.h is outdated on the build system, make sure
@@ -216,6 +227,7 @@ static char *corename[] = {
"armv8sve",
"a64fx",
"armv9sme",
"vortexm4",
"unknown"
};

@@ -239,6 +251,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_ARMV8SVE) return corename[16];
if (gotoblas == &gotoblas_A64FX) return corename[17];
if (gotoblas == &gotoblas_ARMV9SME) return corename[18];
if (gotoblas == &gotoblas_VORTEXM4) return corename[19];
return corename[NUM_CORETYPES];
}

@@ -277,6 +290,7 @@ static gotoblas_t *force_coretype(char *coretype) {
case 16: return (&gotoblas_ARMV8SVE);
case 17: return (&gotoblas_A64FX);
case 18: return (&gotoblas_ARMV9SME);
case 19: return (&gotoblas_VORTEXM4);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
@@ -288,11 +302,11 @@ static gotoblas_t *get_coretype(void) {
char coremsg[128];

#if defined (OS_DARWIN)
//future #if !defined(NO_SME)
// if (support_sme1()) {
// return &gotoblas_ARMV9SME;
// }
// #endif
#if !defined(NO_SME)
if (support_sme1()) {
return &gotoblas_VORTEXM4;
}
#endif
return &gotoblas_NEOVERSEN1;
#endif
@@ -463,7 +477,7 @@ static gotoblas_t *get_coretype(void) {
}
break;
case 0x61: // Apple
//future if (support_sme1()) return &gotoblas_ARMV9SME;
if (support_sme1()) return &gotoblas_VORTEXM4;
return &gotoblas_NEOVERSEN1;
break;
default:


+ 14
- 0
getarch.c View File

@@ -1654,6 +1654,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "VORTEX"
#endif

/* Target description emitted when FORCE_VORTEXM4 is defined: ARM64
   architecture, "arm64" kernel subdirectory, fixed L1/L2/DTB parameters and
   the feature defines listed in ARCHCONFIG (including -DHAVE_SME and
   -DARMV8).
   NOTE(review): the cache sizes are hard-coded constants here, not probed
   values -- confirm they are appropriate for the intended hardware. */
#ifdef FORCE_VORTEXM4
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "VORTEXM4"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DVORTEXM4 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SME -DARMV8"
#define LIBNAME "vortexm4"
#define CORENAME "VORTEXM4"
#endif

#ifdef FORCE_A64FX
#define ARMV8
#define FORCE


+ 31
- 24
interface/gemm.c View File

@@ -266,6 +266,7 @@ void NAME(char *TRANSA, char *TRANSB,

int transa, transb, nrowa, nrowb;
blasint info;
int order = -1;

char transA, transB;
IFLOAT *buffer;
@@ -424,30 +425,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS

PRINT_DEBUG_CNAME;

#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16)
#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_avx512() )
#endif
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
#endif
#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_sme1())
#endif
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}else if (order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) {
SGEMM_DIRECT_ALPHA_BETA(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
return;
}
#endif
#endif

#ifndef COMPLEX
args.alpha = (void *)α
args.beta = (void *)β
@@ -564,6 +541,36 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
return;
}


if ((args.m == 0) || (args.n == 0)) return;
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16)
#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_avx512() )
#endif
if (order == CblasRowMajor && beta == 0 && alpha == 1.0 && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
#endif
#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (strcmp(gotoblas_corename(), "armv9sme") == 0 || strcmp(gotoblas_corename(), "vortexm4") == 0)
// if (support_sme1())
#endif
if (order == CblasRowMajor && m==lda && n ==ldb && k==ldc && beta == 0 && alpha == 1.0 && TransA == CblasNoTrans && TransB == CblasNoTrans&& SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
else
if (order == CblasRowMajor && m==lda && n==ldb && k==ldc && TransA == CblasNoTrans && TransB == CblasNoTrans&& SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT_ALPHA_BETA(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
return;
}

#endif
#endif

#endif

#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)


+ 5
- 3
kernel/CMakeLists.txt View File

@@ -241,7 +241,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
if (X86_64 OR ARM64)
set(USE_DIRECT_SGEMM true)
endif()
if (UC_TARGET_CORE MATCHES ARMV9SME)
if (UC_TARGET_CORE MATCHES ARMV9SME OR UC_TARGET_CORE MATCHES VORTEXM4)
set (HAVE_SME true)
endif ()

@@ -254,14 +254,16 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE)
elseif (ARM64)
set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c)
set (SGEMMDIRECTKERNEL sgemm_direct_arm64_sme1.c)
set (SGEMMDIRECTKERNEL_ALPHA_BETA sgemm_direct_alpha_beta_arm64_sme1.c)
set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S)
set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1_2VLx2VL.S)
set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL_ALPHA_BETA}" "" "gemm_direct_alpha_beta" false "" "" false SINGLE)
if (HAVE_SME)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1_2VLx2VL" false "" "" false SINGLE)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE)
endif ()
endif ()


+ 3
- 0
kernel/Makefile View File

@@ -29,6 +29,9 @@ ifdef TARGET_CORE
ifeq ($(TARGET_CORE), ARMV9SME)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv9-a+sve2+sme
endif
# VORTEXM4 kernel objects: name the dispatch table after the core, define
# HAVE_SME and compile for Armv8.4-A with the SME extension.
ifeq ($(TARGET_CORE), VORTEXM4)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv8.4-a+sme
endif
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12)))


+ 10
- 3
kernel/Makefile.L3 View File

@@ -131,8 +131,12 @@ ifeq ($(ARCH), arm64)
ifeq ($(TARGET_CORE), ARMV9SME)
HAVE_SME = 1
endif
ifeq ($(TARGET_CORE), VORTEXM4)
HAVE_SME = 1
endif
SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c
SGEMMDIRECTKERNEL_ALPHA_BETA = sgemm_direct_alpha_beta_arm64_sme1.c
SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c
endif
endif
endif
@@ -209,11 +213,12 @@ SKERNELOBJS += \
endif
ifeq ($(ARCH), arm64)
SKERNELOBJS += \
sgemm_direct_performant$(TSUFFIX).$(SUFFIX) \
sgemm_direct$(TSUFFIX).$(SUFFIX) \
sgemm_direct_alpha_beta$(TSUFFIX).$(SUFFIX)
ifdef HAVE_SME
SKERNELOBJS += \
sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \
sgemm_direct_sme1_2VLx2VL$(TSUFFIX).$(SUFFIX) \
sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX)
endif
endif
@@ -969,13 +974,15 @@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
endif
ifeq ($(ARCH), arm64)
$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_direct_alpha_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL_ALPHA_BETA)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
ifdef HAVE_SME
$(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) :
$(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@
$(KDIR)sgemm_direct_sme1_2VLx2VL$(TSUFFIX).$(SUFFIX) :
$(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_2VLx2VL.S -UDOUBLE -UCOMPLEX -o $@
$(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) :
$(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_preprocess.S -UDOUBLE -UCOMPLEX -o $@
endif


+ 1
- 0
kernel/arm64/KERNEL.VORTEXM4 View File

@@ -0,0 +1 @@
include $(KERNELDIR)/KERNEL.NEOVERSEN1

+ 22
- 10
kernel/arm64/sgemm_direct_alpha_beta_arm64_sme1.c View File

@@ -14,9 +14,17 @@
#include <arm_sme.h>
#endif
#if defined(DYNAMIC_ARCH)
#define COMBINE(a,b) a ## b
#define COMBINE2(a,b) COMBINE(a,b)
#define SME1_PREPROCESS_BASE sgemm_direct_sme1_preprocess
#define SME1_PREPROCESS COMBINE2(SME1_PREPROCESS_BASE,TS)
#else
#define SME1_PREPROCESS sgemm_direct_sme1_preprocess
#endif
/* Function prototypes */
extern void sgemm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc,\
const float * restrict a, float * a_mod) __asm__("sgemm_direct_sme1_preprocess");
extern void SME1_PREPROCESS(uint64_t nbr, uint64_t nbc,\
const float * restrict a, float * a_mod);
/* Function Definitions */
static uint64_t sve_cntw() {
@@ -99,10 +107,11 @@ kernel_2x2(const float *A, const float *B, float *C, size_t shared_dim,
svst1_hor_za32(/*tile*/2, /*slice*/i, pg_c_0, &C[i * ldc]);
svst1_hor_za32(/*tile*/3, /*slice*/i, pg_c_1, &C[i * ldc + svl]);
}
return;
}
__arm_new("za") __arm_locally_streaming
void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\
static void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\
const float *ba, const float *restrict bb, const float* beta,\
float *restrict C) {
@@ -125,6 +134,7 @@ void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, co
// Block over row dimension of C
for (; row_idx < num_rows; row_idx += row_batch) {
row_batch = MIN(row_batch, num_rows - row_idx);
uint64_t col_idx = 0;
uint64_t col_batch = 2*svl;
@@ -143,7 +153,7 @@ void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, co
#else
void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\
const float *ba, const float *restrict bb, const float* beta,\
float *restrict C){}
float *restrict C){fprintf(stderr,"empty sgemm_alpha_beta2x2 should never get called!!!\n");}
#endif
/*void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K,\
@@ -166,25 +176,27 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float alpha, float * __restrict
* of reading directly from vector (z) registers.
* */
asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23",
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31","za");
/* Pre-process the left matrix to make it suitable for
matrix sum of outer-product calculation
*/
sgemm_direct_sme1_preprocess(M, K, A, A_mod);
SME1_PREPROCESS(M, K, A, A_mod);
asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15","d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23",
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "za");
/* Calculate C = alpha*A*B + beta*C */
sgemm_direct_alpha_beta_sme1_2VLx2VL(M, K, N, &alpha, A_mod, B, &beta, R);
free(A_mod);
@@ -194,6 +206,6 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float alpha, float * __restrict
void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float alpha, float * __restrict A,\
BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\
float beta, float * __restrict R, BLASLONG strideR){}
float beta, float * __restrict R, BLASLONG strideR){fprintf(stderr,"empty sgemm_direct_alpha_beta should not be called!!!\n");}
#endif

+ 25
- 13
kernel/arm64/sgemm_direct_arm64_sme1.c View File

@@ -8,17 +8,28 @@
#include <inttypes.h>
#include <math.h>
#if defined(HAVE_SME)
#if defined(DYNAMIC_ARCH)
#define COMBINE(a,b) a ## b
#define COMBINE2(a,b) COMBINE(a,b)
#define SME1_PREPROCESS_BASE sgemm_direct_sme1_preprocess
#define SME1_PREPROCESS COMBINE2(SME1_PREPROCESS_BASE,TS)
#define SME1_DIRECT2X2_BASE sgemm_direct_sme1_2VLx2VL
#define SME1_DIRECT2X2 COMBINE2(SME1_DIRECT2X2_BASE,TS)
#else
#define SME1_PREPROCESS sgemm_direct_sme1_preprocess
#define SME1_DIRECT2X2 sgemm_direct_sme1_2VLx2VL
#endif
/* Function prototypes */
extern void sgemm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc,\
const float * restrict a, float * a_mod) __asm__("sgemm_direct_sme1_preprocess");
extern void sgemm_direct_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n,\
extern void SME1_PREPROCESS(uint64_t nbr, uint64_t nbc,\
const float * restrict a, float * a_mod) ;
extern void SME1_DIRECT2X2(uint64_t m, uint64_t k, uint64_t n,\
const float * matLeft,\
const float * restrict matRight,\
const float * restrict matResult) __asm__("sgemm_direct_sme1_2VLx2VL");
const float * restrict matResult) ;
/* Function Definitions */
uint64_t sve_cntw() {
static uint64_t sve_cntw() {
uint64_t cnt;
asm volatile(
"rdsvl %[res], #1\n"
@@ -39,7 +50,6 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
uint64_t m_mod, vl_elms;
vl_elms = sve_cntw();
m_mod = ceil((double)M/(double)vl_elms) * vl_elms;
float *A_mod = (float *) malloc(m_mod*K*sizeof(float));
@@ -48,7 +58,7 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
* of reading directly from vector (z) registers.
* */
asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23",
@@ -57,13 +67,13 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
/* Pre-process the left matrix to make it suitable for
matrix sum of outer-product calculation
*/
sgemm_direct_sme1_preprocess(M, K, A, A_mod);
SME1_PREPROCESS(M, K, A, A_mod);
/* Calculate C = A*B */
sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R);
SME1_DIRECT2X2(M, K, N, A_mod, B, R);
asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23",
@@ -75,6 +85,8 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\
float * __restrict R, BLASLONG strideR){}
float * __restrict R, BLASLONG strideR){
fprintf(stderr,"EMPTY sgemm_kernel_direct should never be called \n");
}
#endif

+ 31
- 0
kernel/arm64/sgemm_direct_performant.c View File

@@ -0,0 +1,31 @@
#include "common.h"
/* helper for the direct sgemm code written by Arjan van der Ven */




/*
 * Heuristic gate for the direct SGEMM path.
 *
 * M, N, K: the three problem dimensions handed over by the caller.
 *
 * Returns 1 when the direct kernel should be used for this problem size and
 * 0 when the caller should take the regular (copy-based) SGEMM path.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K)
{
	/* very small or odd M is never routed to the direct kernel */
	if (M<3 || M%2==1) return 0;

	/*
	 * Widen every operand before multiplying: forming the product in
	 * BLASLONG first is signed arithmetic and may overflow (undefined
	 * behaviour) before the result ever reaches the wider type.
	 */
	unsigned long long mnk = (unsigned long long)M
	                       * (unsigned long long)N
	                       * (unsigned long long)K;

	/* large matrices -> not performant */
	if (mnk >= 28 * 512 * 512)
		return 0;

	/*
	 * if the B matrix is not a nice multiple of 4 we get many unaligned accesses,
	 * and the regular sgemm copy/realignment of data pays off much quicker
	 */
	if ((N & 3) != 0 && (mnk >= 8 * 512 * 512))
		return 0;

#ifdef SMP
	/* if we can run multithreaded, the threading changes the base threshold */
	if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1)
		return 0;
#endif

	return 1;
}



kernel/arm64/sgemm_direct_sme1.S → kernel/arm64/sgemm_direct_sme1_2VLx2VL.S View File

@@ -35,16 +35,17 @@
#define K_exit x15 //Exit condition for K loop
#define M_cntr x16 //M loop counter
#define C1 x17 //Constant1: N*(SVLs+1);SVLs-No. of 32-bit elements
#define C2 x18 //Constant2: N + SVLs
#define C3 x19 //Constant3: K*SVLs + SVLs
#define C4 x20 //Constant4: SVLs-2
#define C5 x21 //Constant5: K*SVLs
#define C6 x22 //Constant6: N*SVLs
#define C2 x19 //Constant2: N + SVLs
#define C3 x20 //Constant3: K*SVLs + SVLs
#define C4 x21 //Constant4: SVLs-2
#define C5 x22 //Constant5: K*SVLs
#define C6 x23 //Constant6: N*SVLs

.text
.global sgemm_direct_sme1_2VLx2VL
.global ASMNAME

sgemm_direct_sme1_2VLx2VL:
ASMNAME:
//sgemm_direct_sme1_2VLx2VL:

stp x19, x20, [sp, #-48]!
stp x21, x22, [sp, #16]
@@ -61,7 +62,7 @@
add C2, N, C4 //N + SVLs
add C3, C5, C4 //K*SVLs + SVLs
whilelt p2.s, M_cntr, M //Tile 0,1 predicate (M dimension)
sub w20, w20, #2 //SVLs-2
sub w21, w21, #2 //SVLs-2

.M_Loop:
incw M_cntr
@@ -198,7 +199,7 @@ process_K_less_than_equal_2:
st1w {za1h.s[w13, #0]}, p5, [Cptr1]
st1w {za2h.s[w13, #0]}, p6, [Cptr0, C6, lsl #2]
st1w {za3h.s[w13, #0]}, p7, [Cptr1, C6, lsl #2]
cmp w13, w20
cmp w13, w21
b.mi .Loop_store_ZA
psel p4, p0, p2.s[w13, 1]
psel p5, p1, p2.s[w13, 1]
@@ -211,12 +212,12 @@ process_K_less_than_equal_2:
addvl Cptr, Cptr, #2
addvl Bptr, Bptr, #1
whilelt p0.b, Bptr, N_exit //1st Tile predicate (N dimension)
b.first .N_Loop
b.mi .N_Loop
add A_base, A_base, C5, lsl #3 //A_base += 2*K*SVLs FP32 elements
add C_base, C_base, C6, lsl #3 //C_base += 2*N*SVLs FP32 elements
incw M_cntr
whilelt p2.s, M_cntr, M //1st Tile predicate (M dimension)
b.first .M_Loop
b.mi .M_Loop

smstop


+ 4
- 4
kernel/arm64/sgemm_direct_sme1_preprocess.S View File

@@ -37,9 +37,9 @@
#define C6 x15 //Constant6: 3*ncol

.text
.global sgemm_direct_sme1_preprocess
.global ASMNAME //sgemm_direct_sme1_preprocess

sgemm_direct_sme1_preprocess:
ASMNAME: //sgemm_direct_sme1_preprocess:

stp x19, x20, [sp, #-48]!
stp x21, x22, [sp, #16]
@@ -114,14 +114,14 @@

addvl mat_ptr0, mat_ptr0, #1 //mat_ptr0 += SVLb
whilelt p8.b, mat_ptr0, inner_loop_exit
b.first .Loop_process
b.mi .Loop_process

add mat_mod, mat_mod, C3, lsl #2 //mat_mod+=SVLs*nbc FP32 elements
add mat, mat, C3, lsl #2 //mat+=SVLs*nbc FP32 elements
incw outer_loop_cntr

whilelt p0.s, outer_loop_cntr, nrow
b.first .M_Loop
b.mi .M_Loop

smstop



+ 1
- 0
kernel/setparam-ref.c View File

@@ -216,6 +216,7 @@ gotoblas_t TABLE_NAME = {
#ifdef ARCH_ARM64
sgemm_directTS,
sgemm_direct_alpha_betaTS,
sgemm_direct_performantTS,
#endif

sgemm_kernelTS, sgemm_betaTS,


+ 7
- 7
param.h View File

@@ -3353,7 +3353,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(CORTEXA57) || defined(CORTEXX1) || \
defined(CORTEXA72) || defined(CORTEXA73) || \
defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) || defined(FT2000)
defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) || defined(FT2000) || defined(VORTEXM4)

#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
@@ -3370,7 +3370,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*FIXME: this should be using the cache size, but there is currently no easy way to
query that on ARM. So if getarch counted more than 8 cores we simply assume the host
is a big desktop or server with abundant cache rather than a phone or embedded device */
#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)|| defined(CORTEXX1)
#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)|| defined(CORTEXX1) || defined(VORTEXM4)
#define SGEMM_DEFAULT_P 512
#define DGEMM_DEFAULT_P 256
#define CGEMM_DEFAULT_P 256
@@ -3598,15 +3598,15 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#undef BGEMM_ALIGN_K
#undef BGEMM_DEFAULT_UNROLL_M
#undef BGEMM_DEFAULT_UNROLL_N
#define BGEMM_ALIGN_K 4
#define BGEMM_DEFAULT_UNROLL_M 8
#define BGEMM_ALIGN_K 8
#define BGEMM_DEFAULT_UNROLL_N 4
#define BGEMM_DEFAULT_UNROLL_M 4

#undef SBGEMM_ALIGN_K
#undef SBGEMM_DEFAULT_UNROLL_M
#undef SBGEMM_DEFAULT_UNROLL_N
#define SBGEMM_ALIGN_K 4
#define SBGEMM_DEFAULT_UNROLL_M 8
#define SBGEMM_ALIGN_K 8
#define SBGEMM_DEFAULT_UNROLL_M 4
#define SBGEMM_DEFAULT_UNROLL_N 4

#define SGEMM_DEFAULT_UNROLL_M 16
@@ -3842,7 +3842,7 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout

#endif /* ARMv8 */

#if defined(ARMV9SME) /* ARMv9 SME */
#if defined(ARMV9SME) || defined(VORTEXM4) /* ARMv9 SME */
#define USE_SGEMM_KERNEL_DIRECT 1
#endif /* ARMv9 SME */



Loading…
Cancel
Save