Browse Source

armv8.2 asimdfhm and armv8.4 bf16 i8mm and armv8.6 sve sve2 compiler flags and runtime detection functions (#3964)

tags/20220701
nihui GitHub 4 years ago
parent
commit
b85bfb6085
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 336 additions and 22 deletions
  1. +1
    -1
      .github/workflows/test-coverage.yml
  2. +85
    -5
      CMakeLists.txt
  3. +24
    -0
      cmake/ncnn_add_layer.cmake
  4. +29
    -4
      src/CMakeLists.txt
  5. +173
    -12
      src/cpu.cpp
  6. +16
    -0
      src/cpu.h
  7. +8
    -0
      src/platform.h.in

+ 1
- 1
.github/workflows/test-coverage.yml View File

@@ -484,7 +484,7 @@ jobs:
CXX: g++-12
run: |
mkdir build-avx512-spr && cd build-avx512-spr
cmake cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=ON -DNCNN_AVX512FP16=ON -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=ON -DNCNN_AVX512FP16=ON -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake --build . -j 2
- name: test-avx512-spr
run: |


+ 85
- 5
CMakeLists.txt View File

@@ -163,8 +163,32 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)

set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16+dotprod")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16_DOTPROD)
set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+dotprod")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)

set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16fml")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)

set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+bf16")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vbfmmlaq_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)

set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+i8mm")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)

set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve")
check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)

set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve2")
check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)

set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+bf16")
check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)

set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+i8mm")
check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)

set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+f32mm")
check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)

unset(CMAKE_REQUIRED_FLAGS)
endif()
@@ -176,16 +200,72 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
endif()

if(NCNN_COMPILER_SUPPORT_ARM82_FP16)
option(NCNN_ARM82 "optimize aarch64 platform with armv8.2" ON)
if(NCNN_COMPILER_SUPPORT_ARM82_FP16_DOTPROD)
option(NCNN_ARM82 "optimize aarch64 platform with armv8.2 fp16" ON)
if(NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
if(NCNN_ARM82)
option(NCNN_ARM82DOT "optimize aarch64 platform with armv8.2 dotprod" ON)
if(NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
if(NCNN_ARM82DOT)
option(NCNN_ARM82FP16FML "optimize aarch64 platform with armv8.2 fp16fml" ON)
if(NCNN_COMPILER_SUPPORT_ARM84_BF16)
if(NCNN_ARM82FP16FML)
option(NCNN_ARM84BF16 "optimize aarch64 platform with armv8.4 bf16" ON)
endif()
else()
message(WARNING "The compiler does not support armv8.4 bf16. NCNN_ARM86BF16 will be OFF.")
endif()
if(NCNN_COMPILER_SUPPORT_ARM84_I8MM)
if(NCNN_ARM82FP16FML)
option(NCNN_ARM86I8MM "optimize aarch64 platform with armv8.4 i8mm" ON)
endif()
else()
message(WARNING "The compiler does not support armv8.4 i8mm. NCNN_ARM86I8MM will be OFF.")
endif()
if(NCNN_COMPILER_SUPPORT_ARM86_SVE)
if(NCNN_ARM84BF16 AND NCNN_ARM84I8MM)
option(NCNN_ARM86SVE "optimize aarch64 platform with armv8.6 sve" ON)
if(NCNN_COMPILER_SUPPORT_ARM86_SVE2)
if(NCNN_ARM86SVE)
option(NCNN_ARM86SVE2 "optimize aarch64 platform with armv8.6 sve2" ON)
endif()
else()
message(WARNING "The compiler does not support armv8.6 sve2. NCNN_ARM86SVE2 will be OFF.")
endif()
if(NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
if(NCNN_ARM86SVE)
option(NCNN_ARM86SVEBF16 "optimize aarch64 platform with armv8.6 sve bf16" ON)
endif()
else()
message(WARNING "The compiler does not support armv8.6 sve bf16. NCNN_ARM86SVEBF16 will be OFF.")
endif()
if(NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
if(NCNN_ARM86SVE)
option(NCNN_ARM86SVEI8MM "optimize aarch64 platform with armv8.6 sve i8mm" ON)
endif()
else()
message(WARNING "The compiler does not support armv8.6 sve i8mm. NCNN_ARM86SVEI8MM will be OFF.")
endif()
if(NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
if(NCNN_ARM86SVE)
option(NCNN_ARM86SVEF32MM "optimize aarch64 platform with armv8.6 sve f32mm" ON)
endif()
else()
message(WARNING "The compiler does not support armv8.6 sve f32mm. NCNN_ARM86SVEF32MM will be OFF.")
endif()
endif()
else()
message(WARNING "The compiler does not support armv8.6 sve. NCNN_ARM86SVE will be OFF.")
endif()
endif()
else()
message(WARNING "The compiler does not support armv8.2 fp16fml. NCNN_ARM82FP16FML will be OFF.")
endif()
endif()
else()
message(WARNING "The compiler does not support armv8.2 dotprod. NCNN_ARM82DOT will be OFF.")
endif()
else()
message(WARNING "The compiler does not support armv8.2. NCNN_ARM82 will be OFF.")
message(WARNING "The compiler does not support armv8.2 fp16. NCNN_ARM82 will be OFF.")
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips)")
set(NCNN_TARGET_ARCH mips)


+ 24
- 0
cmake/ncnn_add_layer.cmake View File

@@ -235,6 +235,30 @@ macro(ncnn_add_layer class)
if(NCNN_ARM82DOT)
ncnn_add_arch_opt_source(${class} asimddp "-march=armv8.2-a+fp16+dotprod")
endif()
if(NCNN_ARM82FP16FML)
ncnn_add_arch_opt_source(${class} asimdfhm "-march=armv8.2-a+fp16+dotprod+fp16fml")
endif()
if(NCNN_ARM84BF16)
ncnn_add_arch_opt_source(${class} bf16 "-march=armv8.4-a+bf16")
endif()
if(NCNN_ARM84I8MM)
ncnn_add_arch_opt_source(${class} i8mm "-march=armv8.4-a+i8mm")
endif()
if(NCNN_ARM86SVE)
ncnn_add_arch_opt_source(${class} sve "-march=armv8.6-a+sve")
endif()
if(NCNN_ARM86SVE2)
ncnn_add_arch_opt_source(${class} sve2 "-march=armv8.6-a+sve2")
endif()
if(NCNN_ARM86SVEBF16)
ncnn_add_arch_opt_source(${class} svebf16 "-march=armv8.6-a+sve+bf16")
endif()
if(NCNN_ARM86SVEI8MM)
ncnn_add_arch_opt_source(${class} svei8mm "-march=armv8.6-a+sve+i8mm")
endif()
if(NCNN_ARM86SVEF32MM)
ncnn_add_arch_opt_source(${class} svef32mm "-march=armv8.6-a+sve+f32mm")
endif()
endif()

if(NCNN_RUNTIME_CPU AND NCNN_TARGET_ARCH STREQUAL "mips")


+ 29
- 4
src/CMakeLists.txt View File

@@ -413,13 +413,38 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm") OR (CMAKE_SYSTEM_PROCESSOR MA
endif()

if(((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)")))
if(NOT NCNN_RUNTIME_CPU AND NCNN_ARM82)
if(NOT NCNN_RUNTIME_CPU AND NCNN_ARM86SVE)
set(ARM_MARCH_FLAG "-march=armv8.6-a+sve")
if(NCNN_ARM86SVE2)
set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+sve2")
endif()
if(NCNN_ARM86SVEBF16)
set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+bf16")
endif()
if(NCNN_ARM86SVEI8MM)
set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+i8mm")
endif()
if(NCNN_ARM86SVEF32MM)
set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+f32mm")
endif()
elseif(NOT NCNN_RUNTIME_CPU AND (NCNN_ARM84BF16 OR NCNN_ARM84I8MM))
set(ARM_MARCH_FLAG "-march=armv8.4-a")
if(NCNN_ARM84BF16)
set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+bf16")
endif()
if(NCNN_ARM84I8MM)
set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+i8mm")
endif()
elseif(NOT NCNN_RUNTIME_CPU AND NCNN_ARM82)
set(ARM_MARCH_FLAG "-march=armv8.2-a+fp16")
if(NCNN_ARM82DOT)
target_compile_options(ncnn PRIVATE -march=armv8.2-a+fp16+dotprod)
else()
target_compile_options(ncnn PRIVATE -march=armv8.2-a+fp16)
set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+dotprod")
endif()
if(NCNN_ARM82FP16FML)
set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+fp16fml")
endif()
endif()
target_compile_options(ncnn PRIVATE ${ARM_MARCH_FLAG})
endif()

if(NCNN_TARGET_ARCH STREQUAL "mips")


+ 173
- 12
src/cpu.cpp View File

@@ -114,10 +114,10 @@ namespace ncnn {
// its implementation does not parse /proc/self/auxv. Instead it depends
// on values that are passed by the kernel at process-init time to the
// C runtime initialization layer.
static unsigned int get_elf_hwcap_from_getauxval()
static unsigned int get_elf_hwcap_from_getauxval(unsigned int type)
{
#if __ANDROID_API__ >= 18
unsigned int hwcap = getauxval(AT_HWCAP);
unsigned int hwcap = getauxval(type);
if (hwcap)
return hwcap;
#endif
@@ -141,7 +141,7 @@ static unsigned int get_elf_hwcap_from_getauxval()
else
{
// Note: getauxval() returns 0 on failure. Doesn't touch errno.
result = (unsigned int)(*func)(AT_HWCAP);
result = (unsigned int)(*func)(type);
}
dlclose(libc_handle);

@@ -150,7 +150,7 @@ static unsigned int get_elf_hwcap_from_getauxval()
#endif // defined __ANDROID__

// extract the ELF HW capabilities bitmap from /proc/self/auxv
static unsigned int get_elf_hwcap_from_proc_self_auxv()
static unsigned int get_elf_hwcap_from_proc_self_auxv(unsigned int type)
{
FILE* fp = fopen("/proc/self/auxv", "rb");
if (!fp)
@@ -184,7 +184,7 @@ static unsigned int get_elf_hwcap_from_proc_self_auxv()
if (entry.tag == 0 && entry.value == 0)
break;

if (entry.tag == AT_HWCAP)
if (entry.tag == type)
{
result = entry.value;
break;
@@ -196,24 +196,33 @@ static unsigned int get_elf_hwcap_from_proc_self_auxv()
return result;
}

static unsigned int get_elf_hwcap()
static unsigned int get_elf_hwcap(unsigned int type)
{
#if defined __ANDROID__
unsigned int hwcap = get_elf_hwcap_from_getauxval();
unsigned int hwcap = get_elf_hwcap_from_getauxval(type);
if (hwcap)
return hwcap;
#endif

return get_elf_hwcap_from_proc_self_auxv();
return get_elf_hwcap_from_proc_self_auxv(type);
}

static unsigned int g_hwcaps = get_elf_hwcap();
static unsigned int g_hwcaps = get_elf_hwcap(AT_HWCAP);
static unsigned int g_hwcaps2 = get_elf_hwcap(AT_HWCAP2);

#if __aarch64__
// from arch/arm64/include/uapi/asm/hwcap.h
#define HWCAP_ASIMD (1 << 1)
#define HWCAP_ASIMDHP (1 << 10)
#define HWCAP_ASIMDDP (1 << 20)
#define HWCAP_ASIMD (1 << 1)
#define HWCAP_ASIMDHP (1 << 10)
#define HWCAP_ASIMDDP (1 << 20)
#define HWCAP_SVE (1 << 22)
#define HWCAP_ASIMDFHM (1 << 23)
#define HWCAP2_SVE2 (1 << 1)
#define HWCAP2_SVEI8MM (1 << 9)
#define HWCAP2_SVEF32MM (1 << 10)
#define HWCAP2_SVEBF16 (1 << 12)
#define HWCAP2_I8MM (1 << 13)
#define HWCAP2_BF16 (1 << 14)
#else
// from arch/arm/include/uapi/asm/hwcap.h
#define HWCAP_NEON (1 << 12)
@@ -443,6 +452,158 @@ int cpu_support_arm_asimddp()
#endif
}

int cpu_support_arm_asimdfhm()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps & HWCAP_ASIMDFHM;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD;
#else
return 0;
#endif
#else
return 0;
#endif
}

int cpu_support_arm_bf16()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps2 & HWCAP2_BF16;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return 0; // no known apple cpu support armv8.6 bf16
#else
return 0;
#endif
#else
return 0;
#endif
}

int cpu_support_arm_i8mm()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps2 & HWCAP2_I8MM;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return 0; // no known apple cpu support armv8.6 i8mm
#else
return 0;
#endif
#else
return 0;
#endif
}

int cpu_support_arm_sve()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps & HWCAP_SVE;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return 0; // no known apple cpu support armv8.6 sve
#else
return 0;
#endif
#else
return 0;
#endif
}

int cpu_support_arm_sve2()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps2 & HWCAP2_SVE2;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return 0; // no known apple cpu support armv8.6 sve2
#else
return 0;
#endif
#else
return 0;
#endif
}

int cpu_support_arm_svebf16()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps2 & HWCAP2_SVEBF16;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return 0; // no known apple cpu support armv8.6 svebf16
#else
return 0;
#endif
#else
return 0;
#endif
}

int cpu_support_arm_svei8mm()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps2 & HWCAP2_SVEI8MM;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return 0; // no known apple cpu support armv8.6 svei8mm
#else
return 0;
#endif
#else
return 0;
#endif
}

int cpu_support_arm_svef32mm()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps2 & HWCAP2_SVEF32MM;
#else
return 0;
#endif
#elif __APPLE__
#if __aarch64__
return 0; // no known apple cpu support armv8.6 svef32mm
#else
return 0;
#endif
#else
return 0;
#endif
}

#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
static inline void x86_cpuid(int level, unsigned int out[4])
{


+ 16
- 0
src/cpu.h View File

@@ -53,6 +53,22 @@ NCNN_EXPORT int cpu_support_arm_vfpv4();
NCNN_EXPORT int cpu_support_arm_asimdhp();
// asimddp = aarch64 asimd dot product
NCNN_EXPORT int cpu_support_arm_asimddp();
// asimdfhm = aarch64 asimd fhm
NCNN_EXPORT int cpu_support_arm_asimdfhm();
// bf16 = aarch64 bf16
NCNN_EXPORT int cpu_support_arm_bf16();
// i8mm = aarch64 i8mm
NCNN_EXPORT int cpu_support_arm_i8mm();
// sve = aarch64 sve
NCNN_EXPORT int cpu_support_arm_sve();
// sve2 = aarch64 sve2
NCNN_EXPORT int cpu_support_arm_sve2();
// svebf16 = aarch64 svebf16
NCNN_EXPORT int cpu_support_arm_svebf16();
// svei8mm = aarch64 svei8mm
NCNN_EXPORT int cpu_support_arm_svei8mm();
// svef32mm = aarch64 svef32mm
NCNN_EXPORT int cpu_support_arm_svef32mm();

// avx = x86 avx
NCNN_EXPORT int cpu_support_x86_avx();


+ 8
- 0
src/platform.h.in View File

@@ -45,6 +45,14 @@
#if __aarch64__
#cmakedefine01 NCNN_ARM82
#cmakedefine01 NCNN_ARM82DOT
#cmakedefine01 NCNN_ARM82FP16FML
#cmakedefine01 NCNN_ARM84BF16
#cmakedefine01 NCNN_ARM84I8MM
#cmakedefine01 NCNN_ARM86SVE
#cmakedefine01 NCNN_ARM86SVE2
#cmakedefine01 NCNN_ARM86SVEBF16
#cmakedefine01 NCNN_ARM86SVEI8MM
#cmakedefine01 NCNN_ARM86SVEF32MM
#endif // __aarch64__
#cmakedefine01 NCNN_MSA
#cmakedefine01 NCNN_MMI


Loading…
Cancel
Save