diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index 3bb53ba4f..3931b2e55 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -484,7 +484,7 @@ jobs: CXX: g++-12 run: | mkdir build-avx512-spr && cd build-avx512-spr - cmake cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=ON -DNCNN_AVX512FP16=ON -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=ON -DNCNN_AVX512FP16=ON -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j 2 - name: test-avx512-spr run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 9fc66061e..05361daa4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,8 +163,32 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm") set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16") check_cxx_source_compiles("#include \nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16) - set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16+dotprod") - check_cxx_source_compiles("#include \nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16_DOTPROD) + set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+dotprod") + check_cxx_source_compiles("#include \nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD) + + set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16fml") + check_cxx_source_compiles("#include \nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML) + + set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+bf16") + check_cxx_source_compiles("#include \nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vbfmmlaq_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16) + + set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+i8mm") + check_cxx_source_compiles("#include \nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM) + + set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve") + check_cxx_source_compiles("#include \nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE) + + set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve2") + check_cxx_source_compiles("#include \nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2) + + set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+bf16") + check_cxx_source_compiles("#include \nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16) + + set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+i8mm") + check_cxx_source_compiles("#include \nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM) + + set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+f32mm") + check_cxx_source_compiles("#include \nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM) unset(CMAKE_REQUIRED_FLAGS) endif() @@ -176,16 +200,72 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm") endif() if(NCNN_COMPILER_SUPPORT_ARM82_FP16) - option(NCNN_ARM82 "optimize aarch64 platform with armv8.2" ON) - if(NCNN_COMPILER_SUPPORT_ARM82_FP16_DOTPROD) + option(NCNN_ARM82 "optimize aarch64 platform with armv8.2 fp16" ON) + if(NCNN_COMPILER_SUPPORT_ARM82_DOTPROD) if(NCNN_ARM82) option(NCNN_ARM82DOT "optimize aarch64 platform with armv8.2 dotprod" ON) + if(NCNN_COMPILER_SUPPORT_ARM82_FP16FML) + if(NCNN_ARM82DOT) + option(NCNN_ARM82FP16FML "optimize aarch64 platform with armv8.2 fp16fml" ON) + if(NCNN_COMPILER_SUPPORT_ARM84_BF16) + if(NCNN_ARM82FP16FML) + option(NCNN_ARM84BF16 "optimize aarch64 platform with armv8.4 bf16" ON) + endif() + else() + message(WARNING "The compiler does not support armv8.4 bf16. NCNN_ARM86BF16 will be OFF.") + endif() + if(NCNN_COMPILER_SUPPORT_ARM84_I8MM) + if(NCNN_ARM82FP16FML) + option(NCNN_ARM86I8MM "optimize aarch64 platform with armv8.4 i8mm" ON) + endif() + else() + message(WARNING "The compiler does not support armv8.4 i8mm. NCNN_ARM86I8MM will be OFF.") + endif() + if(NCNN_COMPILER_SUPPORT_ARM86_SVE) + if(NCNN_ARM84BF16 AND NCNN_ARM84I8MM) + option(NCNN_ARM86SVE "optimize aarch64 platform with armv8.6 sve" ON) + if(NCNN_COMPILER_SUPPORT_ARM86_SVE2) + if(NCNN_ARM86SVE) + option(NCNN_ARM86SVE2 "optimize aarch64 platform with armv8.6 sve2" ON) + endif() + else() + message(WARNING "The compiler does not support armv8.6 sve2. NCNN_ARM86SVE2 will be OFF.") + endif() + if(NCNN_COMPILER_SUPPORT_ARM86_SVEBF16) + if(NCNN_ARM86SVE) + option(NCNN_ARM86SVEBF16 "optimize aarch64 platform with armv8.6 sve bf16" ON) + endif() + else() + message(WARNING "The compiler does not support armv8.6 sve bf16. NCNN_ARM86SVEBF16 will be OFF.") + endif() + if(NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM) + if(NCNN_ARM86SVE) + option(NCNN_ARM86SVEI8MM "optimize aarch64 platform with armv8.6 sve i8mm" ON) + endif() + else() + message(WARNING "The compiler does not support armv8.6 sve i8mm. NCNN_ARM86SVEI8MM will be OFF.") + endif() + if(NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM) + if(NCNN_ARM86SVE) + option(NCNN_ARM86SVEF32MM "optimize aarch64 platform with armv8.6 sve f32mm" ON) + endif() + else() + message(WARNING "The compiler does not support armv8.6 sve f32mm. NCNN_ARM86SVEF32MM will be OFF.") + endif() + endif() + else() + message(WARNING "The compiler does not support armv8.6 sve. NCNN_ARM86SVE will be OFF.") + endif() + endif() + else() + message(WARNING "The compiler does not support armv8.2 fp16fml. NCNN_ARM82FP16FML will be OFF.") + endif() endif() else() message(WARNING "The compiler does not support armv8.2 dotprod. NCNN_ARM82DOT will be OFF.") endif() else() - message(WARNING "The compiler does not support armv8.2. NCNN_ARM82 will be OFF.") + message(WARNING "The compiler does not support armv8.2 fp16. NCNN_ARM82 will be OFF.") endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips)") set(NCNN_TARGET_ARCH mips) diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake index 400e03306..5f0adb009 100644 --- a/cmake/ncnn_add_layer.cmake +++ b/cmake/ncnn_add_layer.cmake @@ -235,6 +235,30 @@ macro(ncnn_add_layer class) if(NCNN_ARM82DOT) ncnn_add_arch_opt_source(${class} asimddp "-march=armv8.2-a+fp16+dotprod") endif() + if(NCNN_ARM82FP16FML) + ncnn_add_arch_opt_source(${class} asimdfhm "-march=armv8.2-a+fp16+dotprod+fp16fml") + endif() + if(NCNN_ARM84BF16) + ncnn_add_arch_opt_source(${class} bf16 "-march=armv8.4-a+bf16") + endif() + if(NCNN_ARM84I8MM) + ncnn_add_arch_opt_source(${class} i8mm "-march=armv8.4-a+i8mm") + endif() + if(NCNN_ARM86SVE) + ncnn_add_arch_opt_source(${class} sve "-march=armv8.6-a+sve") + endif() + if(NCNN_ARM86SVE2) + ncnn_add_arch_opt_source(${class} sve2 "-march=armv8.6-a+sve2") + endif() + if(NCNN_ARM86SVEBF16) + ncnn_add_arch_opt_source(${class} svebf16 "-march=armv8.6-a+sve+bf16") + endif() + if(NCNN_ARM86SVEI8MM) + ncnn_add_arch_opt_source(${class} svei8mm "-march=armv8.6-a+sve+i8mm") + endif() + if(NCNN_ARM86SVEF32MM) + ncnn_add_arch_opt_source(${class} svef32mm "-march=armv8.6-a+sve+f32mm") + endif() endif() if(NCNN_RUNTIME_CPU AND NCNN_TARGET_ARCH STREQUAL "mips") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3eb31c8b3..95878bae6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -413,13 +413,38 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm") OR (CMAKE_SYSTEM_PROCESSOR MA endif() if(((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)"))) - if(NOT NCNN_RUNTIME_CPU AND NCNN_ARM82) + if(NOT NCNN_RUNTIME_CPU AND NCNN_ARM86SVE) + set(ARM_MARCH_FLAG "-march=armv8.6-a+sve") + if(NCNN_ARM86SVE2) + set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+sve2") + endif() + if(NCNN_ARM86SVEBF16) + set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+bf16") + endif() + if(NCNN_ARM86SVEI8MM) + set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+i8mm") + endif() + if(NCNN_ARM86SVEF32MM) + set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+f32mm") + endif() + elseif(NOT NCNN_RUNTIME_CPU AND (NCNN_ARM84BF16 OR NCNN_ARM84I8MM)) + set(ARM_MARCH_FLAG "-march=armv8.4-a") + if(NCNN_ARM84BF16) + set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+bf16") + endif() + if(NCNN_ARM84I8MM) + set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+i8mm") + endif() + elseif(NOT NCNN_RUNTIME_CPU AND NCNN_ARM82) + set(ARM_MARCH_FLAG "-march=armv8.2-a+fp16") if(NCNN_ARM82DOT) - target_compile_options(ncnn PRIVATE -march=armv8.2-a+fp16+dotprod) - else() - target_compile_options(ncnn PRIVATE -march=armv8.2-a+fp16) + set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+dotprod") + endif() + if(NCNN_ARM82FP16FML) + set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+fp16fml") endif() endif() + target_compile_options(ncnn PRIVATE ${ARM_MARCH_FLAG}) endif() if(NCNN_TARGET_ARCH STREQUAL "mips") diff --git a/src/cpu.cpp b/src/cpu.cpp index bc329deb4..7129a4566 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -114,10 +114,10 @@ namespace ncnn { // its implementation does not parse /proc/self/auxv. Instead it depends // on values that are passed by the kernel at process-init time to the // C runtime initialization layer. -static unsigned int get_elf_hwcap_from_getauxval() +static unsigned int get_elf_hwcap_from_getauxval(unsigned int type) { #if __ANDROID_API__ >= 18 - unsigned int hwcap = getauxval(AT_HWCAP); + unsigned int hwcap = getauxval(type); if (hwcap) return hwcap; #endif @@ -141,7 +141,7 @@ static unsigned int get_elf_hwcap_from_getauxval() else { // Note: getauxval() returns 0 on failure. Doesn't touch errno. - result = (unsigned int)(*func)(AT_HWCAP); + result = (unsigned int)(*func)(type); } dlclose(libc_handle); @@ -150,7 +150,7 @@ static unsigned int get_elf_hwcap_from_getauxval() #endif // defined __ANDROID__ // extract the ELF HW capabilities bitmap from /proc/self/auxv -static unsigned int get_elf_hwcap_from_proc_self_auxv() +static unsigned int get_elf_hwcap_from_proc_self_auxv(unsigned int type) { FILE* fp = fopen("/proc/self/auxv", "rb"); if (!fp) @@ -184,7 +184,7 @@ static unsigned int get_elf_hwcap_from_proc_self_auxv() if (entry.tag == 0 && entry.value == 0) break; - if (entry.tag == AT_HWCAP) + if (entry.tag == type) { result = entry.value; break; @@ -196,24 +196,33 @@ static unsigned int get_elf_hwcap_from_proc_self_auxv() return result; } -static unsigned int get_elf_hwcap() +static unsigned int get_elf_hwcap(unsigned int type) { #if defined __ANDROID__ - unsigned int hwcap = get_elf_hwcap_from_getauxval(); + unsigned int hwcap = get_elf_hwcap_from_getauxval(type); if (hwcap) return hwcap; #endif - return get_elf_hwcap_from_proc_self_auxv(); + return get_elf_hwcap_from_proc_self_auxv(type); } -static unsigned int g_hwcaps = get_elf_hwcap(); +static unsigned int g_hwcaps = get_elf_hwcap(AT_HWCAP); +static unsigned int g_hwcaps2 = get_elf_hwcap(AT_HWCAP2); #if __aarch64__ // from arch/arm64/include/uapi/asm/hwcap.h -#define HWCAP_ASIMD (1 << 1) -#define HWCAP_ASIMDHP (1 << 10) -#define HWCAP_ASIMDDP (1 << 20) +#define HWCAP_ASIMD (1 << 1) +#define HWCAP_ASIMDHP (1 << 10) +#define HWCAP_ASIMDDP (1 << 20) +#define HWCAP_SVE (1 << 22) +#define HWCAP_ASIMDFHM (1 << 23) +#define HWCAP2_SVE2 (1 << 1) +#define HWCAP2_SVEI8MM (1 << 9) +#define HWCAP2_SVEF32MM (1 << 10) +#define HWCAP2_SVEBF16 (1 << 12) +#define HWCAP2_I8MM (1 << 13) +#define HWCAP2_BF16 (1 << 14) #else // from arch/arm/include/uapi/asm/hwcap.h #define HWCAP_NEON (1 << 12) @@ -443,6 +452,158 @@ int cpu_support_arm_asimddp() #endif } +int cpu_support_arm_asimdfhm() +{ +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps & HWCAP_ASIMDFHM; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD; +#else + return 0; +#endif +#else + return 0; +#endif +} + +int cpu_support_arm_bf16() +{ +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps2 & HWCAP2_BF16; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return 0; // no known apple cpu support armv8.6 bf16 +#else + return 0; +#endif +#else + return 0; +#endif +} + +int cpu_support_arm_i8mm() +{ +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps2 & HWCAP2_I8MM; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return 0; // no known apple cpu support armv8.6 i8mm +#else + return 0; +#endif +#else + return 0; +#endif +} + +int cpu_support_arm_sve() +{ +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps & HWCAP_SVE; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return 0; // no known apple cpu support armv8.6 sve +#else + return 0; +#endif +#else + return 0; +#endif +} + +int cpu_support_arm_sve2() +{ +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps2 & HWCAP2_SVE2; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return 0; // no known apple cpu support armv8.6 sve2 +#else + return 0; +#endif +#else + return 0; +#endif +} + +int cpu_support_arm_svebf16() +{ +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps2 & HWCAP2_SVEBF16; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return 0; // no known apple cpu support armv8.6 svebf16 +#else + return 0; +#endif +#else + return 0; +#endif +} + +int cpu_support_arm_svei8mm() +{ +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps2 & HWCAP2_SVEI8MM; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return 0; // no known apple cpu support armv8.6 svei8mm +#else + return 0; +#endif +#else + return 0; +#endif +} + +int cpu_support_arm_svef32mm() +{ +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps2 & HWCAP2_SVEF32MM; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return 0; // no known apple cpu support armv8.6 svef32mm +#else + return 0; +#endif +#else + return 0; +#endif +} + #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) static inline void x86_cpuid(int level, unsigned int out[4]) { diff --git a/src/cpu.h b/src/cpu.h index be6d1d436..5946b2863 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -53,6 +53,22 @@ NCNN_EXPORT int cpu_support_arm_vfpv4(); NCNN_EXPORT int cpu_support_arm_asimdhp(); // asimddp = aarch64 asimd dot product NCNN_EXPORT int cpu_support_arm_asimddp(); +// asimdfhm = aarch64 asimd fhm +NCNN_EXPORT int cpu_support_arm_asimdfhm(); +// bf16 = aarch64 bf16 +NCNN_EXPORT int cpu_support_arm_bf16(); +// i8mm = aarch64 i8mm +NCNN_EXPORT int cpu_support_arm_i8mm(); +// sve = aarch64 sve +NCNN_EXPORT int cpu_support_arm_sve(); +// sve2 = aarch64 sve2 +NCNN_EXPORT int cpu_support_arm_sve2(); +// svebf16 = aarch64 svebf16 +NCNN_EXPORT int cpu_support_arm_svebf16(); +// svei8mm = aarch64 svei8mm +NCNN_EXPORT int cpu_support_arm_svei8mm(); +// svef32mm = aarch64 svef32mm +NCNN_EXPORT int cpu_support_arm_svef32mm(); // avx = x86 avx NCNN_EXPORT int cpu_support_x86_avx(); diff --git a/src/platform.h.in b/src/platform.h.in index b6831d794..755f8294b 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -45,6 +45,14 @@ #if __aarch64__ #cmakedefine01 NCNN_ARM82 #cmakedefine01 NCNN_ARM82DOT +#cmakedefine01 NCNN_ARM82FP16FML +#cmakedefine01 NCNN_ARM84BF16 +#cmakedefine01 NCNN_ARM84I8MM +#cmakedefine01 NCNN_ARM86SVE +#cmakedefine01 NCNN_ARM86SVE2 +#cmakedefine01 NCNN_ARM86SVEBF16 +#cmakedefine01 NCNN_ARM86SVEI8MM +#cmakedefine01 NCNN_ARM86SVEF32MM #endif // __aarch64__ #cmakedefine01 NCNN_MSA #cmakedefine01 NCNN_MMI