diff --git a/src/cpu.cpp b/src/cpu.cpp index 7656da982..00dcd836e 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -113,7 +113,47 @@ #include #endif -namespace ncnn { +// topology info +static int g_cpucount; +static int g_physical_cpucount; +static int g_powersave; +static ncnn::CpuSet g_cpu_affinity_mask_all; +static ncnn::CpuSet g_cpu_affinity_mask_little; +static ncnn::CpuSet g_cpu_affinity_mask_big; + +// isa info +#if defined __ANDROID__ || defined __linux__ +static unsigned int g_hwcaps; +static unsigned int g_hwcaps2; +#endif // defined __ANDROID__ || defined __linux__ +#if __APPLE__ +static unsigned int g_hw_cpufamily; +static cpu_type_t g_hw_cputype; +static cpu_subtype_t g_hw_cpusubtype; +static int g_hw_optional_arm_FEAT_FP16; +static int g_hw_optional_arm_FEAT_DotProd; +static int g_hw_optional_arm_FEAT_FHM; +static int g_hw_optional_arm_FEAT_BF16; +static int g_hw_optional_arm_FEAT_I8MM; +#endif // __APPLE__ +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) +static int g_cpu_support_x86_avx; +static int g_cpu_support_x86_fma; +static int g_cpu_support_x86_xop; +static int g_cpu_support_x86_f16c; +static int g_cpu_support_x86_avx2; +static int g_cpu_support_x86_avx_vnni; +static int g_cpu_support_x86_avx512; +static int g_cpu_support_x86_avx512_vnni; +static int g_cpu_support_x86_avx512_bf16; +static int g_cpu_support_x86_avx512_fp16; +#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + +static int g_cpu_level2_cachesize; +static int g_cpu_level3_cachesize; + +// misc info +static int g_cpu_is_arm_a53_a55; #if defined __ANDROID__ || defined __linux__ @@ -282,9 +322,6 @@ static unsigned int get_elf_hwcap(unsigned int type) return hwcap; } - -static unsigned int g_hwcaps = get_elf_hwcap(AT_HWCAP); -static unsigned int g_hwcaps2 = get_elf_hwcap(AT_HWCAP2); #endif // defined __ANDROID__ || defined __linux__ #if __APPLE__ @@ -312,10 +349,6 @@ static cpu_subtype_t get_hw_cpusubtype() return value; } -static unsigned int g_hw_cpufamily = get_hw_cpufamily(); -static cpu_type_t g_hw_cputype = get_hw_cputype(); -static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype(); - static int get_hw_capability(const char* cap) { int64_t value = 0; @@ -323,914 +356,1154 @@ static int get_hw_capability(const char* cap) sysctlbyname(cap, &value, &len, NULL, 0); return value; } - -static int g_hw_optional_arm_FEAT_FP16 = get_hw_capability("hw.optional.arm.FEAT_FP16"); -static int g_hw_optional_arm_FEAT_DotProd = get_hw_capability("hw.optional.arm.FEAT_DotProd"); -static int g_hw_optional_arm_FEAT_FHM = get_hw_capability("hw.optional.arm.FEAT_FHM"); -static int g_hw_optional_arm_FEAT_BF16 = get_hw_capability("hw.optional.arm.FEAT_BF16"); -static int g_hw_optional_arm_FEAT_I8MM = get_hw_capability("hw.optional.arm.FEAT_I8MM"); #endif // __APPLE__ -#if (defined _WIN32 && !(defined __MINGW32__)) -CpuSet::CpuSet() +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) +static inline void x86_cpuid(int level, unsigned int out[4]) { - disable_all(); +#if defined(_MSC_VER) + __cpuid((int*)out, level); +#elif defined(__clang__) || defined(__GNUC__) + __get_cpuid(level, out, out + 1, out + 2, out + 3); +#else + NCNN_LOGE("x86_cpuid is unknown for current compiler"); + out[0] = 0; + out[1] = 0; + out[2] = 0; + out[3] = 0; +#endif } -void CpuSet::enable(int cpu) +static inline void x86_cpuid_sublevel(int level, int sublevel, unsigned int out[4]) { - mask |= (1 << cpu); +#if defined(_MSC_VER) + __cpuidex((int*)out, level, sublevel); +#elif defined(__clang__) || defined(__GNUC__) + __cpuid_count(level, sublevel, out[0], out[1], out[2], out[3]); +#else + NCNN_LOGE("x86_cpuid_sublevel is unknown for current compiler"); + out[0] = 0; + out[1] = 0; + out[2] = 0; + out[3] = 0; +#endif } -void CpuSet::disable(int cpu) +static inline int x86_get_xcr0() { - mask &= ~(1 << cpu); +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) + return _xgetbv(0); +#elif defined(__i386__) || defined(__x86_64__) + int xcr0 = 0; + asm(".byte 0x0f, 0x01, 0xd0" + : "=a"(xcr0) + : "c"(0) + : "%edx"); + return xcr0; +#else + NCNN_LOGE("x86_get_xcr0 is unknown for current compiler"); + return 0xffffffff; // assume it will work +#endif } -void CpuSet::disable_all() +static int get_cpu_support_x86_avx() { - mask = 0; -} +#if !NCNN_AVX + return 0; +#endif + unsigned int cpu_info[4] = {0}; + x86_cpuid(0, cpu_info); -bool CpuSet::is_enabled(int cpu) const -{ - return mask & (1 << cpu); -} + int nIds = cpu_info[0]; + if (nIds < 1) + return 0; -int CpuSet::num_enabled() const -{ - int num_enabled = 0; - for (int i = 0; i < (int)sizeof(mask) * 8; i++) - { - if (is_enabled(i)) - num_enabled++; - } + x86_cpuid(1, cpu_info); + // check AVX XSAVE OSXSAVE + if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) + return 0; - return num_enabled; -} -#elif defined __ANDROID__ || defined __linux__ -CpuSet::CpuSet() -{ - disable_all(); -} + // check XSAVE enabled by kernel + if ((x86_get_xcr0() & 6) != 6) + return 0; -void CpuSet::enable(int cpu) -{ - CPU_SET(cpu, &cpu_set); + return 1; } -void CpuSet::disable(int cpu) +static int get_cpu_support_x86_fma() { - CPU_CLR(cpu, &cpu_set); -} +#if !NCNN_FMA + return 0; +#endif + unsigned int cpu_info[4] = {0}; + x86_cpuid(0, cpu_info); -void CpuSet::disable_all() -{ - CPU_ZERO(&cpu_set); -} + int nIds = cpu_info[0]; + if (nIds < 7) + return 0; -bool CpuSet::is_enabled(int cpu) const -{ - return CPU_ISSET(cpu, &cpu_set); -} + x86_cpuid(1, cpu_info); + // check AVX XSAVE OSXSAVE + if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) + return 0; -int CpuSet::num_enabled() const -{ - int num_enabled = 0; - for (int i = 0; i < (int)sizeof(cpu_set_t) * 8; i++) - { - if (is_enabled(i)) - num_enabled++; - } + // check XSAVE enabled by kernel + if ((x86_get_xcr0() & 6) != 6) + return 0; - return num_enabled; -} -#elif __APPLE__ -CpuSet::CpuSet() -{ - disable_all(); + return cpu_info[2] & (1u << 12); } -void CpuSet::enable(int cpu) +static int get_cpu_support_x86_xop() { - policy |= (1 << cpu); -} +#if !NCNN_XOP + return 0; +#endif + unsigned int cpu_info[4] = {0}; + x86_cpuid(0x80000000, cpu_info); -void CpuSet::disable(int cpu) -{ - policy &= ~(1 << cpu); -} + if (cpu_info[0] < 0x80000001) + return 0; -void CpuSet::disable_all() -{ - policy = 0; -} + x86_cpuid(0x80000001, cpu_info); -bool CpuSet::is_enabled(int cpu) const -{ - return policy & (1 << cpu); + return cpu_info[2] & (1u << 11); } -int CpuSet::num_enabled() const +static int get_cpu_support_x86_f16c() { - int num_enabled = 0; - for (int i = 0; i < (int)sizeof(policy) * 8; i++) - { - if (is_enabled(i)) - num_enabled++; - } +#if !NCNN_F16C + return 0; +#endif + unsigned int cpu_info[4] = {0}; + x86_cpuid(0, cpu_info); - return num_enabled; -} -#else -CpuSet::CpuSet() -{ -} + int nIds = cpu_info[0]; + if (nIds < 1) + return 0; -void CpuSet::enable(int /* cpu */) -{ -} + x86_cpuid(1, cpu_info); -void CpuSet::disable(int /* cpu */) -{ + return cpu_info[2] & (1u << 29); } -void CpuSet::disable_all() +static int get_cpu_support_x86_avx2() { -} +#if !NCNN_AVX2 + return 0; +#endif + unsigned int cpu_info[4] = {0}; + x86_cpuid(0, cpu_info); -bool CpuSet::is_enabled(int /* cpu */) const -{ - return true; + int nIds = cpu_info[0]; + if (nIds < 7) + return 0; + + x86_cpuid(1, cpu_info); + // check AVX XSAVE OSXSAVE + if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) + return 0; + + // check XSAVE enabled by kernel + if ((x86_get_xcr0() & 6) != 6) + return 0; + + x86_cpuid_sublevel(7, 0, cpu_info); + return cpu_info[1] & (1u << 5); } -int CpuSet::num_enabled() const +static int get_cpu_support_x86_avx_vnni() { - return get_cpu_count(); -} +#if !NCNN_AVXVNNI + return 0; #endif + unsigned int cpu_info[4] = {0}; + x86_cpuid(0, cpu_info); -int cpu_support_arm_edsp() + int nIds = cpu_info[0]; + if (nIds < 7) + return 0; + + x86_cpuid(1, cpu_info); + // check AVX XSAVE OSXSAVE + if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) + return 0; + + // check XSAVE enabled by kernel + if ((x86_get_xcr0() & 6) != 6) + return 0; + + x86_cpuid_sublevel(7, 1, cpu_info); + return cpu_info[0] & (1u << 4); +} + +static int get_cpu_support_x86_avx512() { -#if defined __ANDROID__ || defined __linux__ -#if __aarch64__ +#if !NCNN_AVX512 return 0; -#else - return g_hwcaps & HWCAP_EDSP; #endif -#elif __APPLE__ -#if __aarch64__ - return 0; -#else - return g_hw_cputype == CPU_TYPE_ARM; -#endif -#else - return 0; -#endif -} + unsigned int cpu_info[4] = {0}; + x86_cpuid(0, cpu_info); -int cpu_support_arm_neon() -{ -#if defined __ANDROID__ || defined __linux__ -#if __aarch64__ - return g_hwcaps & HWCAP_ASIMD; -#else - return g_hwcaps & HWCAP_NEON; -#endif -#elif __APPLE__ -#if __aarch64__ - return g_hw_cputype == CPU_TYPE_ARM64; -#else - return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7; -#endif -#else - return 0; -#endif -} + int nIds = cpu_info[0]; + if (nIds < 7) + return 0; -int cpu_support_arm_vfpv4() -{ -#if defined __ANDROID__ || defined __linux__ -#if __aarch64__ - // neon always enable fma and fp16 - return g_hwcaps & HWCAP_ASIMD; -#else - return g_hwcaps & HWCAP_VFPv4; -#endif -#elif __APPLE__ -#if __aarch64__ - return g_hw_cputype == CPU_TYPE_ARM64; -#else - return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S; -#endif -#else - return 0; -#endif + x86_cpuid(1, cpu_info); + // check AVX XSAVE OSXSAVE + if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) + return 0; + + // check XSAVE enabled by kernel + if ((x86_get_xcr0() & 6) != 6) + return 0; + + // check avx512 XSAVE enabled by kernel + if ((x86_get_xcr0() & 0xe0) != 0xe0) + return 0; + + x86_cpuid_sublevel(7, 0, cpu_info); + return (cpu_info[1] & (1u << 16)) && (cpu_info[1] & (1u << 17)) && (cpu_info[1] & (1u << 28)) && (cpu_info[1] & (1u << 30)) && (cpu_info[1] & (1u << 31)); } -int cpu_support_arm_asimdhp() +static int get_cpu_support_x86_avx512_vnni() { -#if defined __ANDROID__ || defined __linux__ -#if __aarch64__ - return g_hwcaps & HWCAP_ASIMDHP; -#else - return 0; -#endif -#elif __APPLE__ -#if __aarch64__ - return g_hw_optional_arm_FEAT_FP16 - || g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL - || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST - || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER - || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM - || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD - || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; -#else - return 0; -#endif -#else +#if !NCNN_AVX512VNNI return 0; #endif + unsigned int cpu_info[4] = {0}; + x86_cpuid(0, cpu_info); + + int nIds = cpu_info[0]; + if (nIds < 7) + return 0; + + x86_cpuid(1, cpu_info); + // check AVX XSAVE OSXSAVE + if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) + return 0; + + // check XSAVE enabled by kernel + if ((x86_get_xcr0() & 6) != 6) + return 0; + + // check avx512 XSAVE enabled by kernel + if ((x86_get_xcr0() & 0xe0) != 0xe0) + return 0; + + x86_cpuid_sublevel(7, 0, cpu_info); + return cpu_info[2] & (1u << 11); } -int cpu_support_arm_cpuid() +static int get_cpu_support_x86_avx512_bf16() { -#if defined __ANDROID__ || defined __linux__ -#if __aarch64__ - return g_hwcaps & HWCAP_CPUID; -#else - return 0; -#endif -#elif __APPLE__ - return 0; -#else +#if !NCNN_AVX512BF16 return 0; #endif + unsigned int cpu_info[4] = {0}; + x86_cpuid(0, cpu_info); + + int nIds = cpu_info[0]; + if (nIds < 7) + return 0; + + x86_cpuid(1, cpu_info); + // check AVX XSAVE OSXSAVE + if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) + return 0; + + // check XSAVE enabled by kernel + if ((x86_get_xcr0() & 6) != 6) + return 0; + + x86_cpuid_sublevel(7, 1, cpu_info); + return cpu_info[0] & (1u << 5); } -int cpu_support_arm_asimddp() +static int get_cpu_support_x86_avx512_fp16() { -#if defined __ANDROID__ || defined __linux__ -#if __aarch64__ - return g_hwcaps & HWCAP_ASIMDDP; -#else - return 0; -#endif -#elif __APPLE__ -#if __aarch64__ - return g_hw_optional_arm_FEAT_DotProd - || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER - || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM - || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD - || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; -#else - return 0; -#endif -#else +#if !NCNN_AVX512FP16 return 0; #endif + unsigned int cpu_info[4] = {0}; + x86_cpuid(0, cpu_info); + + int nIds = cpu_info[0]; + if (nIds < 7) + return 0; + + x86_cpuid(1, cpu_info); + // check AVX XSAVE OSXSAVE + if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) + return 0; + + // check XSAVE enabled by kernel + if ((x86_get_xcr0() & 6) != 6) + return 0; + + // check avx512 XSAVE enabled by kernel + if ((x86_get_xcr0() & 0xe0) != 0xe0) + return 0; + + x86_cpuid_sublevel(7, 0, cpu_info); + return cpu_info[3] & (1u << 23); } +#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) -int cpu_support_arm_asimdfhm() +static int get_cpucount() { -#if defined __ANDROID__ || defined __linux__ -#if __aarch64__ - return g_hwcaps & HWCAP_ASIMDFHM; -#else - return 0; -#endif + int count = 0; +#ifdef __EMSCRIPTEN__ + if (emscripten_has_threading_support()) + count = emscripten_num_logical_cores(); + else + count = 1; +#elif (defined _WIN32 && !(defined __MINGW32__)) + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + count = system_info.dwNumberOfProcessors; +#elif defined __ANDROID__ || defined __linux__ + // get cpu count from /proc/cpuinfo + FILE* fp = fopen("/proc/cpuinfo", "rb"); + if (!fp) + return 1; + + char line[1024]; + while (!feof(fp)) + { + char* s = fgets(line, 1024, fp); + if (!s) + break; + + if (memcmp(line, "processor", 9) == 0) + { + count++; + } + } + + fclose(fp); #elif __APPLE__ -#if __aarch64__ - return g_hw_optional_arm_FEAT_FHM - || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER - || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM - || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD - || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; + size_t len = sizeof(count); + sysctlbyname("hw.ncpu", &count, &len, NULL, 0); #else - return 0; -#endif +#ifdef _OPENMP + count = omp_get_max_threads(); #else - return 0; + count = 1; +#endif // _OPENMP #endif + + if (count < 1) + count = 1; + + return count; } -int cpu_support_arm_bf16() -{ #if defined __ANDROID__ || defined __linux__ -#if __aarch64__ - return g_hwcaps2 & HWCAP2_BF16; -#else - return 0; -#endif -#elif __APPLE__ -#if __aarch64__ - return g_hw_optional_arm_FEAT_BF16 - || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD - || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; -#else - return 0; -#endif -#else - return 0; -#endif -} - -int cpu_support_arm_i8mm() +static int get_thread_siblings(int cpuid) { -#if defined __ANDROID__ || defined __linux__ -#if __aarch64__ - return g_hwcaps2 & HWCAP2_I8MM; -#else - return 0; -#endif -#elif __APPLE__ -#if __aarch64__ - return g_hw_optional_arm_FEAT_I8MM - || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD - || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; -#else - return 0; -#endif -#else - return 0; -#endif -} + char path[256]; + sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpuid); -int cpu_support_arm_sve() -{ -#if defined __ANDROID__ || defined __linux__ -#if __aarch64__ - return g_hwcaps & HWCAP_SVE; -#else - return 0; -#endif -#elif __APPLE__ -#if __aarch64__ - return 0; // no known apple cpu support armv8.6 sve -#else - return 0; -#endif -#else - return 0; -#endif -} + FILE* fp = fopen(path, "rb"); + if (!fp) + return -1; -int cpu_support_arm_sve2() -{ -#if defined __ANDROID__ || defined __linux__ -#if __aarch64__ - return g_hwcaps2 & HWCAP2_SVE2; -#else - return 0; -#endif -#elif __APPLE__ -#if __aarch64__ - return 0; // no known apple cpu support armv8.6 sve2 -#else - return 0; -#endif -#else - return 0; -#endif -} + int thread_siblings = -1; + int nscan = fscanf(fp, "%x", &thread_siblings); + if (nscan != 1) + { + // ignore + } -int cpu_support_arm_svebf16() -{ -#if defined __ANDROID__ || defined __linux__ -#if __aarch64__ - return g_hwcaps2 & HWCAP2_SVEBF16; -#else - return 0; -#endif -#elif __APPLE__ -#if __aarch64__ - return 0; // no known apple cpu support armv8.6 svebf16 -#else - return 0; -#endif -#else - return 0; -#endif + fclose(fp); + + return thread_siblings; } +#endif // defined __ANDROID__ || defined __linux__ -int cpu_support_arm_svei8mm() +static int get_physical_cpucount() { -#if defined __ANDROID__ || defined __linux__ -#if __aarch64__ - return g_hwcaps2 & HWCAP2_SVEI8MM; -#else - return 0; -#endif + int count = 0; +#if (defined _WIN32 && !(defined __MINGW32__)) + typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); + if (glpi == NULL) + { + NCNN_LOGE("GetLogicalProcessorInformation is not supported"); + return g_cpucount; + } + + DWORD return_length = 0; + glpi(NULL, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); + glpi(buffer, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; + DWORD byte_offset = 0; + while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) + { + if (ptr->Relationship == RelationProcessorCore) + { + count++; + } + + byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + ptr++; + } + + free(buffer); +#elif defined __ANDROID__ || defined __linux__ + std::vector thread_set; + for (int i = 0; i < g_cpucount; i++) + { + int thread_siblings = get_thread_siblings(i); + if (thread_siblings == -1) + { + // ignore malformed one + continue; + } + + bool thread_siblings_exists = false; + for (size_t j = 0; j < thread_set.size(); j++) + { + if (thread_set[j] == thread_siblings) + { + thread_siblings_exists = true; + break; + } + } + + if (!thread_siblings_exists) + { + thread_set.push_back(thread_siblings); + count++; + } + } #elif __APPLE__ -#if __aarch64__ - return 0; // no known apple cpu support armv8.6 svei8mm -#else - return 0; -#endif + size_t len = sizeof(count); + sysctlbyname("hw.physicalcpu_max", &count, &len, NULL, 0); #else - return 0; + count = g_cpucount; #endif + + if (count > g_cpucount) + count = g_cpucount; + + return count; } -int cpu_support_arm_svef32mm() -{ #if defined __ANDROID__ || defined __linux__ -#if __aarch64__ - return g_hwcaps2 & HWCAP2_SVEF32MM; -#else - return 0; -#endif -#elif __APPLE__ -#if __aarch64__ - return 0; // no known apple cpu support armv8.6 svef32mm -#else - return 0; -#endif -#else - return 0; -#endif +static int get_data_cache_size(int cpuid, int level) +{ + char path[256]; + + // discover sysfs cache entry + int indexid = -1; + for (int i = 0;; i++) + { + // check level + { + sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpuid, i); + FILE* fp = fopen(path, "rb"); + if (!fp) + break; + + int cache_level = -1; + int nscan = fscanf(fp, "%d", &cache_level); + fclose(fp); + if (nscan != 1 || cache_level != level) + continue; + } + + // check type + { + sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/type", cpuid, i); + FILE* fp = fopen(path, "rb"); + if (!fp) + break; + + char type[32]; + int nscan = fscanf(fp, "%31s", type); + fclose(fp); + if (nscan != 1 || (strcmp(type, "Data") != 0 && strcmp(type, "Unified") != 0)) + continue; + } + + indexid = i; + break; + } + + if (indexid == -1) + { + // no sysfs entry + return 0; + } + + // get size + int cache_size_K = 0; + { + sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpuid, indexid); + FILE* fp = fopen(path, "rb"); + if (!fp) + return 0; + + int nscan = fscanf(fp, "%dK", &cache_size_K); + fclose(fp); + if (nscan != 1) + { + NCNN_LOGE("fscanf cache_size_K error %d", nscan); + return 0; + } + } + + // parse shared_cpu_map mask + ncnn::CpuSet shared_cpu_map; + { + sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map", cpuid, indexid); + FILE* fp = fopen(path, "rb"); + if (!fp) + return 0; + + char shared_cpu_map_str[256]; + int nscan = fscanf(fp, "%255s", shared_cpu_map_str); + fclose(fp); + if (nscan != 1) + { + NCNN_LOGE("fscanf shared_cpu_map error %d", nscan); + return 0; + } + + int len = strlen(shared_cpu_map_str); + + if (shared_cpu_map_str[0] == '0' && shared_cpu_map_str[1] == 'x') + { + // skip leading 0x + len -= 2; + } + + int ci = 0; + for (int i = len - 1; i >= 0; i--) + { + char x = shared_cpu_map_str[i]; + if (x & 1) shared_cpu_map.enable(ci + 0); + if (x & 2) shared_cpu_map.enable(ci + 1); + if (x & 4) shared_cpu_map.enable(ci + 2); + if (x & 8) shared_cpu_map.enable(ci + 3); + + ci += 4; + } + } + + if (shared_cpu_map.num_enabled() == 1) + return cache_size_K * 1024; + + // resolve physical cpu count in the shared_cpu_map + int shared_physical_cpu_count = 0; + { + std::vector thread_set; + for (int i = 0; i < g_cpucount; i++) + { + if (!shared_cpu_map.is_enabled(i)) + continue; + + int thread_siblings = get_thread_siblings(i); + if (thread_siblings == -1) + { + // ignore malformed one + continue; + } + + bool thread_siblings_exists = false; + for (size_t j = 0; j < thread_set.size(); j++) + { + if (thread_set[j] == thread_siblings) + { + thread_siblings_exists = true; + break; + } + } + + if (!thread_siblings_exists) + { + thread_set.push_back(thread_siblings); + shared_physical_cpu_count++; + } + } + } + + // return per-physical-core cache size with 4K aligned + cache_size_K = (cache_size_K / shared_physical_cpu_count + 3) / 4 * 4; + + return cache_size_K * 1024; } -#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) -static inline void x86_cpuid(int level, unsigned int out[4]) +static int get_big_cpu_data_cache_size(int level) { -#if defined(_MSC_VER) - __cpuid((int*)out, level); -#elif defined(__clang__) || defined(__GNUC__) - __get_cpuid(level, out, out + 1, out + 2, out + 3); -#else - NCNN_LOGE("x86_cpuid is unknown for current compiler"); - out[0] = 0; - out[1] = 0; - out[2] = 0; - out[3] = 0; -#endif + if (g_cpu_affinity_mask_big.num_enabled() == 0) + { + // smp cpu + return get_data_cache_size(0, level); + } + + for (int i = 0; i < g_cpucount; i++) + { + if (g_cpu_affinity_mask_big.is_enabled(i)) + { + return get_data_cache_size(i, level); + } + } + + // should never reach here, fallback to cpu0 + return get_data_cache_size(0, level); } +#endif // defined __ANDROID__ || defined __linux__ -static inline void x86_cpuid_sublevel(int level, int sublevel, unsigned int out[4]) +static int get_cpu_level2_cachesize() { -#if defined(_MSC_VER) - __cpuidex((int*)out, level, sublevel); -#elif defined(__clang__) || defined(__GNUC__) - __cpuid_count(level, sublevel, out[0], out[1], out[2], out[3]); -#else - NCNN_LOGE("x86_cpuid_sublevel is unknown for current compiler"); - out[0] = 0; - out[1] = 0; - out[2] = 0; - out[3] = 0; + int size = 0; +#if (defined _WIN32 && !(defined __MINGW32__)) + typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); + if (glpi != NULL) + { + DWORD return_length = 0; + glpi(NULL, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); + glpi(buffer, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; + DWORD byte_offset = 0; + while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) + { + if (ptr->Relationship == RelationCache) + { + PCACHE_DESCRIPTOR Cache = &ptr->Cache; + if (Cache->Level == 2) + { + size = std::max(size, (int)Cache->Size); + } + } + + byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + ptr++; + } + + free(buffer); + } +#elif defined __ANDROID__ || defined __linux__ + size = get_big_cpu_data_cache_size(2); +#if defined(_SC_LEVEL2_CACHE_SIZE) + if (size <= 0) + size = sysconf(_SC_LEVEL2_CACHE_SIZE); +#endif +#elif __APPLE__ + // perflevel 0 is the higher performance cluster + int cpusperl2 = get_hw_capability("hw.perflevel0.cpusperl2"); + int l2cachesize = get_hw_capability("hw.perflevel0.l2cachesize"); + size = cpusperl2 > 1 ? l2cachesize / cpusperl2 : l2cachesize; #endif -} -static inline int x86_get_xcr0() -{ -#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) - return _xgetbv(0); -#elif defined(__i386__) || defined(__x86_64__) - int xcr0 = 0; - asm(".byte 0x0f, 0x01, 0xd0" - : "=a"(xcr0) - : "c"(0) - : "%edx"); - return xcr0; + // fallback to a common value + if (size <= 0) + { +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + size = 64 * 1024; + if (g_cpu_support_x86_avx) + size = 128 * 1024; + if (g_cpu_support_x86_avx2) + size = 256 * 1024; + if (g_cpu_support_x86_avx512) + size = 1024 * 1024; +#elif __aarch64__ + size = 256 * 1024; +#elif __arm__ + size = 128 * 1024; #else - NCNN_LOGE("x86_get_xcr0 is unknown for current compiler"); - return 0xffffffff; // assume it will work + // is 64k still too large here ? + size = 64 * 1024; #endif + } + + return size; } -static int get_cpu_support_x86_avx() +static int get_cpu_level3_cachesize() { -#if !NCNN_AVX - return 0; -#endif - unsigned int cpu_info[4] = {0}; - x86_cpuid(0, cpu_info); + int size = 0; +#if (defined _WIN32 && !(defined __MINGW32__)) + typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); + if (glpi != NULL) + { + DWORD return_length = 0; + glpi(NULL, &return_length); - int nIds = cpu_info[0]; - if (nIds < 1) - return 0; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); + glpi(buffer, &return_length); - x86_cpuid(1, cpu_info); - // check AVX XSAVE OSXSAVE - if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) - return 0; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; + DWORD byte_offset = 0; + while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) + { + if (ptr->Relationship == RelationCache) + { + PCACHE_DESCRIPTOR Cache = &ptr->Cache; + if (Cache->Level == 3) + { + size = std::max(size, (int)Cache->Size); + } + } - // check XSAVE enabled by kernel - if ((x86_get_xcr0() & 6) != 6) - return 0; + byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + ptr++; + } - return 1; + free(buffer); + } +#elif defined __ANDROID__ || defined __linux__ + size = get_big_cpu_data_cache_size(3); +#if defined(_SC_LEVEL3_CACHE_SIZE) + if (size <= 0) + size = sysconf(_SC_LEVEL3_CACHE_SIZE); +#endif +#elif __APPLE__ + // perflevel 0 is the higher performance cluster + // get the size shared among all cpus + size = get_hw_capability("hw.perflevel0.l3cachesize"); +#endif + + // l3 cache size can be zero + + return size; } -static int get_cpu_support_x86_fma() +#if (defined _WIN32 && !(defined __MINGW32__)) +static ncnn::CpuSet get_smt_cpu_mask() { -#if !NCNN_FMA - return 0; -#endif - unsigned int cpu_info[4] = {0}; - x86_cpuid(0, cpu_info); - - int nIds = cpu_info[0]; - if (nIds < 7) - return 0; + ncnn::CpuSet smt_cpu_mask; - x86_cpuid(1, cpu_info); - // check AVX XSAVE OSXSAVE - if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) - return 0; + typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); + if (glpi == NULL) + { + NCNN_LOGE("GetLogicalProcessorInformation is not supported"); + return smt_cpu_mask; + } - // check XSAVE enabled by kernel - if ((x86_get_xcr0() & 6) != 6) - return 0; + DWORD return_length = 0; + glpi(NULL, &return_length); - return cpu_info[2] & (1u << 12); -} + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); + glpi(buffer, &return_length); -static int get_cpu_support_x86_xop() -{ -#if !NCNN_XOP - return 0; -#endif - unsigned int cpu_info[4] = {0}; - x86_cpuid(0x80000000, cpu_info); + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; + DWORD byte_offset = 0; + while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) + { + if (ptr->Relationship == RelationProcessorCore) + { + ncnn::CpuSet smt_set; + smt_set.mask = ptr->ProcessorMask; + if (smt_set.num_enabled() > 1) + { + // this core is smt + smt_cpu_mask.mask |= smt_set.mask; + } + } - if (cpu_info[0] < 0x80000001) - return 0; + byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + ptr++; + } - x86_cpuid(0x80000001, cpu_info); + free(buffer); - return cpu_info[2] & (1u << 11); + return smt_cpu_mask; } -static int get_cpu_support_x86_f16c() +static std::vector get_max_freq_mhz() { -#if !NCNN_F16C - return 0; -#endif - unsigned int cpu_info[4] = {0}; - x86_cpuid(0, cpu_info); + typedef struct _PROCESSOR_POWER_INFORMATION + { + ULONG Number; + ULONG MaxMhz; + ULONG CurrentMhz; + ULONG MhzLimit; + ULONG MaxIdleState; + ULONG CurrentIdleState; + } PROCESSOR_POWER_INFORMATION, *PPROCESSOR_POWER_INFORMATION; - int nIds = cpu_info[0]; - if (nIds < 1) - return 0; + HMODULE powrprof = LoadLibrary(TEXT("powrprof.dll")); - x86_cpuid(1, cpu_info); + typedef LONG(WINAPI * LPFN_CNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG); + LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(powrprof, "CallNtPowerInformation"); + if (cnpi == NULL) + { + NCNN_LOGE("CallNtPowerInformation is not supported"); + FreeLibrary(powrprof); + return std::vector(g_cpucount, 0); + } - return cpu_info[2] & (1u << 29); + DWORD return_length = sizeof(PROCESSOR_POWER_INFORMATION) * g_cpucount; + PPROCESSOR_POWER_INFORMATION buffer = (PPROCESSOR_POWER_INFORMATION)malloc(return_length); + + cnpi(ProcessorInformation, NULL, 0, buffer, return_length); + + std::vector ret; + for (int i = 0; i < g_cpucount; i++) + { + ULONG max_mhz = buffer[i].MaxMhz; + ret.push_back(max_mhz); + } + + free(buffer); + FreeLibrary(powrprof); + return ret; } -static int get_cpu_support_x86_avx2() +static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask) { -#if !NCNN_AVX2 + DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask); + if (prev_mask == 0) + { + NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError()); + return -1; + } + return 0; -#endif - unsigned int cpu_info[4] = {0}; - x86_cpuid(0, cpu_info); +} +#endif // (defined _WIN32 && !(defined __MINGW32__)) - int nIds = cpu_info[0]; - if (nIds < 7) - return 0; +#if defined __ANDROID__ || defined __linux__ +static int get_max_freq_khz(int cpuid) +{ + // first try, for all possible cpu + char path[256]; + sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid); - x86_cpuid(1, cpu_info); - // check AVX XSAVE OSXSAVE - if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) - return 0; + FILE* fp = fopen(path, "rb"); - // check XSAVE enabled by kernel - if ((x86_get_xcr0() & 6) != 6) - return 0; + if (!fp) + { + // second try, for online cpu + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid); + fp = fopen(path, "rb"); - x86_cpuid_sublevel(7, 0, cpu_info); - return cpu_info[1] & (1u << 5); -} + if (fp) + { + int max_freq_khz = 0; + while (!feof(fp)) + { + int freq_khz = 0; + int nscan = fscanf(fp, "%d %*d", &freq_khz); + if (nscan != 1) + break; -static int get_cpu_support_x86_avx_vnni() -{ -#if !NCNN_AVXVNNI - return 0; -#endif - unsigned int cpu_info[4] = {0}; - x86_cpuid(0, cpu_info); + if (freq_khz > max_freq_khz) + max_freq_khz = freq_khz; + } - int nIds = cpu_info[0]; - if (nIds < 7) - return 0; + fclose(fp); - x86_cpuid(1, cpu_info); - // check AVX XSAVE OSXSAVE - if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) - return 0; + if (max_freq_khz != 0) + return max_freq_khz; - // check XSAVE enabled by kernel - if ((x86_get_xcr0() & 6) != 6) - return 0; + fp = NULL; + } - x86_cpuid_sublevel(7, 1, cpu_info); - return cpu_info[0] & (1u << 4); -} + if (!fp) + { + // third try, for online cpu + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid); + fp = fopen(path, "rb"); -static int get_cpu_support_x86_avx512() -{ -#if !NCNN_AVX512 - return 0; -#endif - unsigned int cpu_info[4] = {0}; - x86_cpuid(0, cpu_info); + if (!fp) + return -1; - int nIds = cpu_info[0]; - if (nIds < 7) - return 0; + int max_freq_khz = -1; + int nscan = fscanf(fp, "%d", &max_freq_khz); + if (nscan != 1) + { + NCNN_LOGE("fscanf cpuinfo_max_freq error %d", nscan); + } + fclose(fp); - x86_cpuid(1, cpu_info); - // check AVX XSAVE OSXSAVE - if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) - return 0; + return max_freq_khz; + } + } - // check XSAVE enabled by kernel - if ((x86_get_xcr0() & 6) != 6) - return 0; + int max_freq_khz = 0; + while (!feof(fp)) + { + int freq_khz = 0; + int nscan = fscanf(fp, "%d %*d", &freq_khz); + if (nscan != 1) + break; - // check avx512 XSAVE enabled by kernel - if ((x86_get_xcr0() & 0xe0) != 0xe0) - return 0; + if (freq_khz > max_freq_khz) + max_freq_khz = freq_khz; + } - x86_cpuid_sublevel(7, 0, cpu_info); - return (cpu_info[1] & (1u << 16)) && (cpu_info[1] & (1u << 17)) && (cpu_info[1] & (1u << 28)) && (cpu_info[1] & (1u << 30)) && (cpu_info[1] & (1u << 31)); + fclose(fp); + + return max_freq_khz; } -static int get_cpu_support_x86_avx512_vnni() +static bool is_smt_cpu(int cpuid) { -#if !NCNN_AVX512VNNI - return 0; -#endif - unsigned int cpu_info[4] = {0}; - x86_cpuid(0, cpu_info); + // https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-72 + char path[256]; + sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/core_cpus_list", cpuid); - int nIds = cpu_info[0]; - if (nIds < 7) - return 0; + FILE* fp = fopen(path, "rb"); - x86_cpuid(1, cpu_info); - // check AVX XSAVE OSXSAVE - if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) - return 0; + if (!fp) + { + sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid); + fp = fopen(path, "rb"); - // check XSAVE enabled by kernel - if ((x86_get_xcr0() & 6) != 6) - return 0; + if (!fp) + return false; + } - // check avx512 XSAVE enabled by kernel - if ((x86_get_xcr0() & 0xe0) != 0xe0) - return 0; + bool is_smt = false; + while (!feof(fp)) + { + char ch = fgetc(fp); + if (ch == ',' || ch == '-') + { + is_smt = true; + break; + } + } - x86_cpuid_sublevel(7, 0, cpu_info); - return cpu_info[2] & (1u << 11); + fclose(fp); + + return is_smt; } -static int get_cpu_support_x86_avx512_bf16() +static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask) { -#if !NCNN_AVX512BF16 - return 0; + // set affinity for thread +#if defined(__BIONIC__) + pid_t pid = gettid(); +#else + pid_t pid = syscall(SYS_gettid); #endif - unsigned int cpu_info[4] = {0}; - x86_cpuid(0, cpu_info); - - int nIds = cpu_info[0]; - if (nIds < 7) - return 0; - - x86_cpuid(1, cpu_info); - // check AVX XSAVE OSXSAVE - if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) - return 0; - // check XSAVE enabled by kernel - if ((x86_get_xcr0() & 6) != 6) - return 0; + int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set); + if (syscallret) + { + NCNN_LOGE("syscall error %d", syscallret); + return -1; + } - x86_cpuid_sublevel(7, 1, cpu_info); - return cpu_info[0] & (1u << 5); + return 0; } +#endif // defined __ANDROID__ || defined __linux__ -static int get_cpu_support_x86_avx512_fp16() +#if __APPLE__ +static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask) { -#if !NCNN_AVX512FP16 - return 0; -#endif - unsigned int cpu_info[4] = {0}; - x86_cpuid(0, cpu_info); + // https://developer.apple.com/library/archive/releasenotes/Performance/RN-AffinityAPI/index.html + // http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html + // https://gist.github.com/Coneko/4234842 - int nIds = cpu_info[0]; - if (nIds < 7) - return 0; + // This is a quite outdated document. Apple will not allow developers to set CPU affinity. + // In OS X 10.5 it worked, later it became a suggestion to OS X, then in 10.10 or so (as well in later ones), macOS will ignore any affinity settings. + // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919 --- AmeAkio - x86_cpuid(1, cpu_info); - // check AVX XSAVE OSXSAVE - if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27))) - return 0; + int affinity_tag = THREAD_AFFINITY_TAG_NULL; + for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++) + { + if (thread_affinity_mask.is_enabled(i)) + { + affinity_tag = i + 1; + break; + } + } - // check XSAVE enabled by kernel - if ((x86_get_xcr0() & 6) != 6) - return 0; + mach_port_t tid = pthread_mach_thread_np(pthread_self()); - // check avx512 XSAVE enabled by kernel - if ((x86_get_xcr0() & 0xe0) != 0xe0) - return 0; + thread_affinity_policy_data_t policy_data; + policy_data.affinity_tag = affinity_tag; + int ret = thread_policy_set(tid, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy_data, THREAD_AFFINITY_POLICY_COUNT); + if (ret && ret != KERN_NOT_SUPPORTED) + { + NCNN_LOGE("thread_policy_set error %d", ret); + return -1; + } - x86_cpuid_sublevel(7, 0, cpu_info); - return cpu_info[3] & (1u << 23); + return 0; } +#endif // __APPLE__ -static int g_cpu_support_x86_avx = get_cpu_support_x86_avx(); -static int g_cpu_support_x86_fma = get_cpu_support_x86_fma(); -static int g_cpu_support_x86_xop = get_cpu_support_x86_xop(); -static int g_cpu_support_x86_f16c = get_cpu_support_x86_f16c(); -static int g_cpu_support_x86_avx2 = get_cpu_support_x86_avx2(); -static int g_cpu_support_x86_avx_vnni = get_cpu_support_x86_avx_vnni(); -static int g_cpu_support_x86_avx512 = get_cpu_support_x86_avx512(); -static int g_cpu_support_x86_avx512_vnni = get_cpu_support_x86_avx512_vnni(); -static int g_cpu_support_x86_avx512_bf16 = get_cpu_support_x86_avx512_bf16(); -static int g_cpu_support_x86_avx512_fp16 = get_cpu_support_x86_avx512_fp16(); -#else // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) -static const int g_cpu_support_x86_avx = 0; -static const int g_cpu_support_x86_fma = 0; -static const int g_cpu_support_x86_xop = 0; -static const int g_cpu_support_x86_f16c = 0; -static const int g_cpu_support_x86_avx2 = 0; -static const int g_cpu_support_x86_avx_vnni = 0; -static const int g_cpu_support_x86_avx512 = 0; -static const int g_cpu_support_x86_avx512_vnni = 0; -static const int g_cpu_support_x86_avx512_bf16 = 0; -static const int g_cpu_support_x86_avx512_fp16 = 0; -#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) - -int cpu_support_x86_avx() +static void initialize_cpu_thread_affinity_mask(ncnn::CpuSet& mask_all, ncnn::CpuSet& mask_little, ncnn::CpuSet& mask_big) { - return g_cpu_support_x86_avx; -} + mask_all.disable_all(); -int cpu_support_x86_fma() -{ - return g_cpu_support_x86_fma; -} +#if (defined _WIN32 && !(defined __MINGW32__)) + // get max freq mhz for all cores + int max_freq_mhz_min = INT_MAX; + int max_freq_mhz_max = 0; + std::vector cpu_max_freq_mhz = get_max_freq_mhz(); + for (int i = 0; i < g_cpucount; i++) + { + int max_freq_mhz = cpu_max_freq_mhz[i]; -int cpu_support_x86_xop() -{ - return g_cpu_support_x86_xop; -} + // NCNN_LOGE("%d max freq = %d khz", i, max_freq_mhz); -int cpu_support_x86_f16c() -{ - return g_cpu_support_x86_f16c; -} + if (max_freq_mhz > max_freq_mhz_max) + max_freq_mhz_max = max_freq_mhz; + if (max_freq_mhz < max_freq_mhz_min) + max_freq_mhz_min = max_freq_mhz; + } -int cpu_support_x86_avx2() -{ - return g_cpu_support_x86_avx2; -} + int max_freq_mhz_medium = (max_freq_mhz_min + max_freq_mhz_max) / 2; + if (max_freq_mhz_medium == max_freq_mhz_max) + { + mask_little.disable_all(); + mask_big = mask_all; + return; + } -int cpu_support_x86_avx_vnni() -{ - return g_cpu_support_x86_avx_vnni; -} + ncnn::CpuSet smt_cpu_mask = get_smt_cpu_mask(); + + for (int i = 0; i < g_cpucount; i++) + { + if (smt_cpu_mask.is_enabled(i)) + { + // always treat smt core as big core + mask_big.enable(i); + continue; + } + + if (cpu_max_freq_mhz[i] < max_freq_mhz_medium) + mask_little.enable(i); + else + mask_big.enable(i); + } +#elif defined __ANDROID__ || defined __linux__ + int max_freq_khz_min = INT_MAX; + int max_freq_khz_max = 0; + std::vector cpu_max_freq_khz(g_cpucount); + for (int i = 0; i < g_cpucount; i++) + { + int max_freq_khz = get_max_freq_khz(i); -int cpu_support_x86_avx512() -{ - return g_cpu_support_x86_avx512; -} + // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz); -int cpu_support_x86_avx512_vnni() -{ - return g_cpu_support_x86_avx512_vnni; -} + cpu_max_freq_khz[i] = max_freq_khz; -int cpu_support_x86_avx512_bf16() -{ - return g_cpu_support_x86_avx512_bf16; -} + if (max_freq_khz > max_freq_khz_max) + max_freq_khz_max = max_freq_khz; + if (max_freq_khz < max_freq_khz_min) + max_freq_khz_min = max_freq_khz; + } -int cpu_support_x86_avx512_fp16() -{ - return g_cpu_support_x86_avx512_fp16; -} + int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2; + if (max_freq_khz_medium == max_freq_khz_max) + { + mask_little.disable_all(); + mask_big = mask_all; + return; + } -int cpu_support_mips_msa() -{ -#if defined __ANDROID__ || defined __linux__ -#if __mips__ - return g_hwcaps & HWCAP_MIPS_MSA; -#else - return 0; -#endif -#else - return 0; -#endif -} + for (int i = 0; i < g_cpucount; i++) + { + if (is_smt_cpu(i)) + { + // always treat smt core as big core + mask_big.enable(i); + continue; + } -int cpu_support_loongarch_lsx() -{ -#if defined __ANDROID__ || defined __linux__ -#if __loongarch64 - return g_hwcaps & HWCAP_LOONGARCH_LSX; -#else - return 0; -#endif + if (cpu_max_freq_khz[i] < max_freq_khz_medium) + mask_little.enable(i); + else + mask_big.enable(i); + } +#elif __APPLE__ + int nperflevels = get_hw_capability("hw.nperflevels"); + if (nperflevels == 1) + { + // smp models + mask_little.disable_all(); + mask_big = mask_all; + } + else + { + // two or more clusters, level0 is the high-performance cluster + int perflevel0_logicalcpu = get_hw_capability("hw.perflevel0.logicalcpu_max"); + for (int i = 0; i < perflevel0_logicalcpu; i++) + { + mask_big.enable(i); + } + for (int i = perflevel0_logicalcpu; i < g_cpucount; i++) + { + mask_little.enable(i); + } + } #else - return 0; + // TODO implement me for other platforms + mask_little.disable_all(); + mask_big = mask_all; #endif } -int cpu_support_loongarch_lasx() -{ #if defined __ANDROID__ || defined __linux__ -#if __loongarch64 - return g_hwcaps & HWCAP_LOONGARCH_LASX; -#else - return 0; -#endif -#else - return 0; -#endif -} - -int cpu_support_loongson_mmi() +#if __aarch64__ +union midr_info_t { -#if defined __ANDROID__ || defined __linux__ -#if __mips__ - return g_hwcaps & HWCAP_LOONGSON_MMI; -#else - return 0; -#endif -#else - return 0; -#endif -} + struct __attribute__((packed)) + { + unsigned int revision : 4; + unsigned int part : 12; + unsigned int architecture : 4; + unsigned int variant : 4; + unsigned int implementer : 8; + }; + unsigned int midr; -int cpu_support_riscv_v() -{ -#if defined __ANDROID__ || defined __linux__ -#if __riscv - return g_hwcaps & COMPAT_HWCAP_ISA_V; -#else - return 0; -#endif -#else - return 0; -#endif -} + midr_info_t(unsigned int _midr) + : midr(_midr) + { + } +}; -int cpu_support_riscv_zfh() +static unsigned int get_midr_from_sysfs(int cpuid) { -#if defined __ANDROID__ || defined __linux__ -#if __riscv - // v + f does not imply zfh, but how to discover zfh properly ? - // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414 - return g_hwcaps & COMPAT_HWCAP_ISA_V && g_hwcaps & COMPAT_HWCAP_ISA_F; -#else - return 0; -#endif -#else - return 0; -#endif -} + char path[256]; + sprintf(path, "/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1", cpuid); -int cpu_riscv_vlenb() -{ -#if __riscv - if (!cpu_support_riscv_v()) + FILE* fp = fopen(path, "rb"); + if (!fp) return 0; - int a = 0; - asm volatile( - ".word 0xc22026f3 \n" // csrr a3, vlenb - "mv %0, a3 \n" - : "=r"(a) - : - : "memory", "a3"); - return a; -#else - return 0; -#endif + unsigned int midr_el1 = 0; + int nscan = fscanf(fp, "%x", &midr_el1); + if (nscan != 1) + { + // ignore + } + + fclose(fp); + + return midr_el1; } -static int get_cpucount() +static int get_midr_from_proc_cpuinfo(std::vector& midrs) { - int count = 0; -#ifdef __EMSCRIPTEN__ - if (emscripten_has_threading_support()) - count = emscripten_num_logical_cores(); - else - count = 1; -#elif (defined _WIN32 && !(defined __MINGW32__)) - SYSTEM_INFO system_info; - GetSystemInfo(&system_info); - count = system_info.dwNumberOfProcessors; -#elif defined __ANDROID__ || defined __linux__ - // get cpu count from /proc/cpuinfo FILE* fp = fopen("/proc/cpuinfo", "rb"); if (!fp) - return 1; + return -1; + + midrs.resize(g_cpucount, 0); + + int cpuid = -1; + midr_info_t midr_info(0); char line[1024]; while (!feof(fp)) @@ -1241,1295 +1514,1145 @@ static int get_cpucount() if (memcmp(line, "processor", 9) == 0) { - count++; - } - } - - fclose(fp); -#elif __APPLE__ - size_t len = sizeof(count); - sysctlbyname("hw.ncpu", &count, &len, NULL, 0); -#else -#ifdef _OPENMP - count = omp_get_max_threads(); -#else - count = 1; -#endif // _OPENMP -#endif - - if (count < 1) - count = 1; - - return count; -} - -static int g_cpucount = get_cpucount(); + // processor : 4 + int id = -1; + int nscan = sscanf(line, "%*[^:]: %d", &id); + if (nscan != 1) + continue; -int get_cpu_count() -{ - return g_cpucount; -} + if (cpuid >= 0 && cpuid < g_cpucount) + { + if (midr_info.midr == 0) + { + // shared midr + midrs[cpuid] = (unsigned int)-1; + } + else + { + // save midr and reset + midrs[cpuid] = midr_info.midr; + for (int i = 0; i < g_cpucount; i++) + { + if (midrs[i] == (unsigned int)-1) + midrs[i] = midr_info.midr; + } + } -int get_little_cpu_count() -{ - return get_cpu_thread_affinity_mask(1).num_enabled(); -} + midr_info.midr = 0; + } -int get_big_cpu_count() -{ - int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled(); - return big_cpu_count ? big_cpu_count : g_cpucount; -} + cpuid = id; + } -#if defined __ANDROID__ || defined __linux__ -static int get_thread_siblings(int cpuid) -{ - char path[256]; - sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpuid); + if (cpuid == -1) + continue; - FILE* fp = fopen(path, "rb"); - if (!fp) - return -1; + if (memcmp(line, "CPU implementer", 15) == 0) + { + // CPU implementer : 0x51 + unsigned int id = 0; + int nscan = sscanf(line, "%*[^:]: %x", &id); + if (nscan != 1) + continue; - int thread_siblings = -1; - int nscan = fscanf(fp, "%x", &thread_siblings); - if (nscan != 1) - { - // ignore - } + midr_info.implementer = id; + } + else if (memcmp(line, "CPU architecture", 16) == 0) + { + // CPU architecture: 8 + int id = 0; + int nscan = sscanf(line, "%*[^:]: %d", &id); + if (nscan != 1) + continue; - fclose(fp); + midr_info.architecture = id; + } + else if (memcmp(line, "CPU variant", 11) == 0) + { + // CPU variant : 0xd + int id = 0; + int nscan = sscanf(line, "%*[^:]: %x", &id); + if (nscan != 1) + continue; - return thread_siblings; -} -#endif // defined __ANDROID__ || defined __linux__ + midr_info.variant = id; + } + else if (memcmp(line, "CPU part", 8) == 0) + { + // CPU part : 0x804 + int id = 0; + int nscan = sscanf(line, "%*[^:]: %x", &id); + if (nscan != 1) + continue; -static int get_physical_cpucount() -{ - int count = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) - typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); - LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); - if (glpi == NULL) - { - NCNN_LOGE("GetLogicalProcessorInformation is not supported"); - return g_cpucount; - } + midr_info.part = id; + } + else if (memcmp(line, "CPU revision", 12) == 0) + { + // CPU revision : 14 + int id = 0; + int nscan = sscanf(line, "%*[^:]: %d", &id); + if (nscan != 1) + continue; - DWORD return_length = 0; - glpi(NULL, &return_length); + midr_info.revision = id; + } + } - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); - glpi(buffer, &return_length); + fclose(fp); - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; - DWORD byte_offset = 0; - while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) + if (cpuid >= 0 && cpuid < g_cpucount) { - if (ptr->Relationship == RelationProcessorCore) + if (midr_info.midr == 0) { - count++; + // shared midr + midrs[cpuid] = (unsigned int)-1; + } + else + { + // save midr and reset + midrs[cpuid] = midr_info.midr; + for (int i = 0; i < g_cpucount; i++) + { + if (midrs[i] == (unsigned int)-1) + midrs[i] = midr_info.midr; + } } - byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); - ptr++; + midr_info.midr = 0; } - free(buffer); -#elif defined __ANDROID__ || defined __linux__ - std::vector thread_set; - for (int i = 0; i < g_cpucount; i++) + // /proc/cpuinfo may only report little/online cores on old kernel + if (g_cpu_affinity_mask_big.num_enabled() == g_cpucount) { - int thread_siblings = get_thread_siblings(i); - if (thread_siblings == -1) + // assign the remaining unknown midrs for smp cpu + for (int i = 0; i < g_cpucount; i++) { - // ignore malformed one - continue; + if (midrs[i] == 0) + midrs[i] = midr_info.midr; } - - bool thread_siblings_exists = false; - for (size_t j = 0; j < thread_set.size(); j++) + } + else + { + // clear the big core midrs for hmp cpu if they are the same as little cores + unsigned int little_midr = 0; + for (int i = 0; i < g_cpucount; i++) { - if (thread_set[j] == thread_siblings) + if (g_cpu_affinity_mask_little.is_enabled(i)) { - thread_siblings_exists = true; + little_midr = midrs[i]; break; } } - if (!thread_siblings_exists) + for (int i = 0; i < g_cpucount; i++) { - thread_set.push_back(thread_siblings); - count++; + if (g_cpu_affinity_mask_big.is_enabled(i)) + { + if (midrs[i] == little_midr) + { + midrs[i] = 0; + } + } } } -#elif __APPLE__ - size_t len = sizeof(count); - sysctlbyname("hw.physicalcpu_max", &count, &len, NULL, 0); -#else - count = g_cpucount; -#endif - - if (count > g_cpucount) - count = g_cpucount; - - return count; -} - -static int g_physical_cpucount = get_physical_cpucount(); -int get_physical_cpu_count() -{ - return g_physical_cpucount; + return 0; } -int get_physical_little_cpu_count() +// return midr for the current running core +static unsigned int get_midr_from_register() { - if (g_physical_cpucount == g_cpucount) - return get_little_cpu_count(); + uint64_t midr; + asm volatile("mrs %0, MIDR_EL1" + : "=r"(midr)); - return g_physical_cpucount * 2 - g_cpucount; + return (unsigned int)midr; } -int get_physical_big_cpu_count() +static int get_sched_affinity(ncnn::CpuSet& thread_affinity_mask) { - if (g_physical_cpucount == g_cpucount) - return get_big_cpu_count(); - - return g_cpucount - g_physical_cpucount; -} + // get affinity for thread +#if defined(__BIONIC__) + pid_t pid = gettid(); +#else + pid_t pid = syscall(SYS_gettid); +#endif -#if defined __ANDROID__ || defined __linux__ -static int get_data_cache_size(int cpuid, int level) -{ - char path[256]; + thread_affinity_mask.disable_all(); - // discover sysfs cache entry - int indexid = -1; - for (int i = 0;; i++) + int syscallret = syscall(__NR_sched_getaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set); + if (syscallret) { - // check level - { - sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpuid, i); - FILE* fp = fopen(path, "rb"); - if (!fp) - break; - - int cache_level = -1; - int nscan = fscanf(fp, "%d", &cache_level); - fclose(fp); - if (nscan != 1 || cache_level != level) - continue; - } - - // check type - { - sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/type", cpuid, i); - FILE* fp = fopen(path, "rb"); - if (!fp) - break; - - char type[32]; - int nscan = fscanf(fp, "%31s", type); - fclose(fp); - if (nscan != 1 || (strcmp(type, "Data") != 0 && strcmp(type, "Unified") != 0)) - continue; - } - - indexid = i; - break; + // handle get error silently + return -1; } - if (indexid == -1) - { - // no sysfs entry - return 0; - } + return 0; +} - // get size - int cache_size_K = 0; - { - sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpuid, indexid); - FILE* fp = fopen(path, "rb"); - if (!fp) - return 0; +static int midr_is_a53_a55(unsigned int midr) +{ + // 0x 41 ? f d03 ? = arm cortex-a53 + // 0x 51 ? f 801 ? = qcom kryo200 a53 + // 0x 41 ? f d04 ? = arm cortex-a35 + // 0x 41 ? f d05 ? = arm cortex-a55 + // 0x 51 ? f 803 ? = qcom kryo300 a55 + // 0x 51 ? f 805 ? = qcom kryo400 a55 - int nscan = fscanf(fp, "%dK", &cache_size_K); - fclose(fp); - if (nscan != 1) - { - NCNN_LOGE("fscanf cache_size_K error %d", nscan); - return 0; - } - } + midr_info_t midr_info(midr); - // parse shared_cpu_map mask - CpuSet shared_cpu_map; - { - sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map", cpuid, indexid); - FILE* fp = fopen(path, "rb"); - if (!fp) - return 0; + return (midr_info.implementer == 0x41 && midr_info.part == 0xd03) + || (midr_info.implementer == 0x51 && midr_info.part == 0x801) + || (midr_info.implementer == 0x41 && midr_info.part == 0xd04) + || (midr_info.implementer == 0x41 && midr_info.part == 0xd05) + || (midr_info.implementer == 0x51 && midr_info.part == 0x803) + || (midr_info.implementer == 0x51 && midr_info.part == 0x805); +} - char shared_cpu_map_str[256]; - int nscan = fscanf(fp, "%255s", shared_cpu_map_str); - fclose(fp); - if (nscan != 1) - { - NCNN_LOGE("fscanf shared_cpu_map error %d", nscan); - return 0; - } +static int detect_cpu_is_arm_a53_a55() +{ + int a53_a55_cpu_count = 0; - int len = strlen(shared_cpu_map_str); + // first try, iterate /sys/devices/system/cpu/cpuX/regs/identification/midr_el1 + bool sysfs_midr = true; + for (int i = 0; i < g_cpucount; i++) + { + unsigned int midr = 0; - if (shared_cpu_map_str[0] == '0' && shared_cpu_map_str[1] == 'x') + // for kernel 4.7+ + midr = get_midr_from_sysfs(i); + if (midr == 0) { - // skip leading 0x - len -= 2; + sysfs_midr = false; + break; } - int ci = 0; - for (int i = len - 1; i >= 0; i--) + if (midr_is_a53_a55(midr)) { - char x = shared_cpu_map_str[i]; - if (x & 1) shared_cpu_map.enable(ci + 0); - if (x & 2) shared_cpu_map.enable(ci + 1); - if (x & 4) shared_cpu_map.enable(ci + 2); - if (x & 8) shared_cpu_map.enable(ci + 3); - - ci += 4; + a53_a55_cpu_count++; } } - if (shared_cpu_map.num_enabled() == 1) - return cache_size_K * 1024; - - // resolve physical cpu count in the shared_cpu_map - int shared_physical_cpu_count = 0; + if (!sysfs_midr) { - std::vector thread_set; - for (int i = 0; i < g_cpucount; i++) + // second try, collect midr from /proc/cpuinfo + std::vector midrs; + int ret = get_midr_from_proc_cpuinfo(midrs); + if (ret == 0 && (int)midrs.size() == g_cpucount) { - if (!shared_cpu_map.is_enabled(i)) - continue; - - int thread_siblings = get_thread_siblings(i); - if (thread_siblings == -1) - { - // ignore malformed one - continue; - } - - bool thread_siblings_exists = false; - for (size_t j = 0; j < thread_set.size(); j++) + for (int i = 0; i < g_cpucount; i++) { - if (thread_set[j] == thread_siblings) + if (midr_is_a53_a55(midrs[i])) { - thread_siblings_exists = true; - break; + a53_a55_cpu_count++; } } - - if (!thread_siblings_exists) - { - thread_set.push_back(thread_siblings); - shared_physical_cpu_count++; - } + } + else + { + // third try, assume all aarch64 little cores are a53/a55 + a53_a55_cpu_count = g_cpu_affinity_mask_little.num_enabled(); } } - // return per-physical-core cache size with 4K aligned - cache_size_K = (cache_size_K / shared_physical_cpu_count + 3) / 4 * 4; + if (a53_a55_cpu_count == 0) + return 0; // all non a53/a55 - return cache_size_K * 1024; + if (a53_a55_cpu_count == g_cpucount) + return 1; // all a53/a55 + + // little cores are a53/a55 + return 2; } +#endif // __aarch64__ +#endif // defined __ANDROID__ || defined __linux__ -static int get_big_cpu_data_cache_size(int level) +// the initialization +static void initialize_global_cpu_info() { - const CpuSet& big_cs = get_cpu_thread_affinity_mask(2); - if (big_cs.num_enabled() == 0) - { - // smp cpu - return get_data_cache_size(0, level); - } + g_cpucount = get_cpucount(); + g_physical_cpucount = get_physical_cpucount(); + g_powersave = 0; + initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big); - for (int i = 0; i < g_cpucount; i++) - { - if (big_cs.is_enabled(i)) - { - return get_data_cache_size(i, level); - } - } +#if defined __ANDROID__ || defined __linux__ + g_hwcaps = get_elf_hwcap(AT_HWCAP); + g_hwcaps2 = get_elf_hwcap(AT_HWCAP2); +#endif // defined __ANDROID__ || defined __linux__ - // should never reach here, fallback to cpu0 - return get_data_cache_size(0, level); -} +#if __APPLE__ + g_hw_cpufamily = get_hw_cpufamily(); + g_hw_cputype = get_hw_cputype(); + g_hw_cpusubtype = get_hw_cpusubtype(); + + g_hw_optional_arm_FEAT_FP16 = get_hw_capability("hw.optional.arm.FEAT_FP16"); + g_hw_optional_arm_FEAT_DotProd = get_hw_capability("hw.optional.arm.FEAT_DotProd"); + g_hw_optional_arm_FEAT_FHM = get_hw_capability("hw.optional.arm.FEAT_FHM"); + g_hw_optional_arm_FEAT_BF16 = get_hw_capability("hw.optional.arm.FEAT_BF16"); + g_hw_optional_arm_FEAT_I8MM = get_hw_capability("hw.optional.arm.FEAT_I8MM"); +#endif // __APPLE__ + +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + g_cpu_support_x86_avx = get_cpu_support_x86_avx(); + g_cpu_support_x86_fma = get_cpu_support_x86_fma(); + g_cpu_support_x86_xop = get_cpu_support_x86_xop(); + g_cpu_support_x86_f16c = get_cpu_support_x86_f16c(); + g_cpu_support_x86_avx2 = get_cpu_support_x86_avx2(); + g_cpu_support_x86_avx_vnni = get_cpu_support_x86_avx_vnni(); + g_cpu_support_x86_avx512 = get_cpu_support_x86_avx512(); + g_cpu_support_x86_avx512_vnni = get_cpu_support_x86_avx512_vnni(); + g_cpu_support_x86_avx512_bf16 = get_cpu_support_x86_avx512_bf16(); + g_cpu_support_x86_avx512_fp16 = get_cpu_support_x86_avx512_fp16(); +#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + + g_cpu_level2_cachesize = get_cpu_level2_cachesize(); + g_cpu_level3_cachesize = get_cpu_level3_cachesize(); + +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + g_cpu_is_arm_a53_a55 = detect_cpu_is_arm_a53_a55(); +#endif // __aarch64__ #endif // defined __ANDROID__ || defined __linux__ +} -static int get_cpu_level2_cachesize() +static int g_cpu_info_initialized = 0; + +static inline void try_initialize_global_cpu_info() { - int size = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) - typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); - LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); - if (glpi != NULL) + if (!g_cpu_info_initialized) { - DWORD return_length = 0; - glpi(NULL, &return_length); + initialize_global_cpu_info(); + g_cpu_info_initialized = 1; + } +} - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); - glpi(buffer, &return_length); +namespace ncnn { - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; - DWORD byte_offset = 0; - while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) - { - if (ptr->Relationship == RelationCache) - { - PCACHE_DESCRIPTOR Cache = &ptr->Cache; - if (Cache->Level == 2) - { - size = std::max(size, (int)Cache->Size); - } - } +#if (defined _WIN32 && !(defined __MINGW32__)) +CpuSet::CpuSet() +{ + disable_all(); +} - byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); - ptr++; - } +void CpuSet::enable(int cpu) +{ + mask |= (1 << cpu); +} - free(buffer); - } -#elif defined __ANDROID__ || defined __linux__ - size = get_big_cpu_data_cache_size(2); -#if defined(_SC_LEVEL2_CACHE_SIZE) - if (size <= 0) - size = sysconf(_SC_LEVEL2_CACHE_SIZE); -#endif -#elif __APPLE__ - // perflevel 0 is the higher performance cluster - int cpusperl2 = get_hw_capability("hw.perflevel0.cpusperl2"); - int l2cachesize = get_hw_capability("hw.perflevel0.l2cachesize"); - size = cpusperl2 > 1 ? l2cachesize / cpusperl2 : l2cachesize; -#endif +void CpuSet::disable(int cpu) +{ + mask &= ~(1 << cpu); +} - // fallback to a common value - if (size <= 0) - { -#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) - size = 64 * 1024; - if (cpu_support_x86_avx()) - size = 128 * 1024; - if (cpu_support_x86_avx2()) - size = 256 * 1024; - if (cpu_support_x86_avx512()) - size = 1024 * 1024; -#elif __aarch64__ - size = 256 * 1024; -#elif __arm__ - size = 128 * 1024; -#else - // is 64k still too large here ? - size = 64 * 1024; -#endif - } +void CpuSet::disable_all() +{ + mask = 0; +} - return size; +bool CpuSet::is_enabled(int cpu) const +{ + return mask & (1 << cpu); } -static int get_cpu_level3_cachesize() +int CpuSet::num_enabled() const { - int size = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) - typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); - LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); - if (glpi != NULL) + int num_enabled = 0; + for (int i = 0; i < (int)sizeof(mask) * 8; i++) { - DWORD return_length = 0; - glpi(NULL, &return_length); + if (is_enabled(i)) + num_enabled++; + } - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); - glpi(buffer, &return_length); + return num_enabled; +} +#elif defined __ANDROID__ || defined __linux__ +CpuSet::CpuSet() +{ + disable_all(); +} - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; - DWORD byte_offset = 0; - while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) - { - if (ptr->Relationship == RelationCache) - { - PCACHE_DESCRIPTOR Cache = &ptr->Cache; - if (Cache->Level == 3) - { - size = std::max(size, (int)Cache->Size); - } - } +void CpuSet::enable(int cpu) +{ + CPU_SET(cpu, &cpu_set); +} - byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); - ptr++; - } +void CpuSet::disable(int cpu) +{ + CPU_CLR(cpu, &cpu_set); +} - free(buffer); - } -#elif defined __ANDROID__ || defined __linux__ - size = get_big_cpu_data_cache_size(3); -#if defined(_SC_LEVEL3_CACHE_SIZE) - if (size <= 0) - size = sysconf(_SC_LEVEL3_CACHE_SIZE); -#endif -#elif __APPLE__ - // perflevel 0 is the higher performance cluster - // get the size shared among all cpus - size = get_hw_capability("hw.perflevel0.l3cachesize"); -#endif +void CpuSet::disable_all() +{ + CPU_ZERO(&cpu_set); +} - // l3 cache size can be zero +bool CpuSet::is_enabled(int cpu) const +{ + return CPU_ISSET(cpu, &cpu_set); +} - return size; +int CpuSet::num_enabled() const +{ + int num_enabled = 0; + for (int i = 0; i < (int)sizeof(cpu_set_t) * 8; i++) + { + if (is_enabled(i)) + num_enabled++; + } + + return num_enabled; +} +#elif __APPLE__ +CpuSet::CpuSet() +{ + disable_all(); } -static int g_cpu_level2_cachesize = get_cpu_level2_cachesize(); -static int g_cpu_level3_cachesize = get_cpu_level3_cachesize(); +void CpuSet::enable(int cpu) +{ + policy |= (1 << cpu); +} -int get_cpu_level2_cache_size() +void CpuSet::disable(int cpu) { - return g_cpu_level2_cachesize; + policy &= ~(1 << cpu); } -int get_cpu_level3_cache_size() +void CpuSet::disable_all() { - return g_cpu_level3_cachesize; + policy = 0; } -#if (defined _WIN32 && !(defined __MINGW32__)) -static CpuSet get_smt_cpu_mask() +bool CpuSet::is_enabled(int cpu) const { - CpuSet smt_cpu_mask; + return policy & (1 << cpu); +} - typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); - LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); - if (glpi == NULL) +int CpuSet::num_enabled() const +{ + int num_enabled = 0; + for (int i = 0; i < (int)sizeof(policy) * 8; i++) { - NCNN_LOGE("GetLogicalProcessorInformation is not supported"); - return smt_cpu_mask; + if (is_enabled(i)) + num_enabled++; } - DWORD return_length = 0; - glpi(NULL, &return_length); - - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); - glpi(buffer, &return_length); + return num_enabled; +} +#else +CpuSet::CpuSet() +{ +} - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; - DWORD byte_offset = 0; - while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) - { - if (ptr->Relationship == RelationProcessorCore) - { - CpuSet smt_set; - smt_set.mask = ptr->ProcessorMask; - if (smt_set.num_enabled() > 1) - { - // this core is smt - smt_cpu_mask.mask |= smt_set.mask; - } - } +void CpuSet::enable(int /* cpu */) +{ +} - byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); - ptr++; - } +void CpuSet::disable(int /* cpu */) +{ +} - free(buffer); +void CpuSet::disable_all() +{ +} - return smt_cpu_mask; +bool CpuSet::is_enabled(int /* cpu */) const +{ + return true; } -static std::vector get_max_freq_mhz() +int CpuSet::num_enabled() const { - typedef struct _PROCESSOR_POWER_INFORMATION - { - ULONG Number; - ULONG MaxMhz; - ULONG CurrentMhz; - ULONG MhzLimit; - ULONG MaxIdleState; - ULONG CurrentIdleState; - } PROCESSOR_POWER_INFORMATION, *PPROCESSOR_POWER_INFORMATION; + return get_cpu_count(); +} +#endif - HMODULE powrprof = LoadLibrary(TEXT("powrprof.dll")); +int cpu_support_arm_edsp() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return 0; +#else + return g_hwcaps & HWCAP_EDSP; +#endif +#elif __APPLE__ +#if __aarch64__ + return 0; +#else + return g_hw_cputype == CPU_TYPE_ARM; +#endif +#else + return 0; +#endif +} - typedef LONG(WINAPI * LPFN_CNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG); - LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(powrprof, "CallNtPowerInformation"); - if (cnpi == NULL) - { - NCNN_LOGE("CallNtPowerInformation is not supported"); - FreeLibrary(powrprof); - return std::vector(g_cpucount, 0); - } +int cpu_support_arm_neon() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps & HWCAP_ASIMD; +#else + return g_hwcaps & HWCAP_NEON; +#endif +#elif __APPLE__ +#if __aarch64__ + return g_hw_cputype == CPU_TYPE_ARM64; +#else + return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7; +#endif +#else + return 0; +#endif +} - DWORD return_length = sizeof(PROCESSOR_POWER_INFORMATION) * g_cpucount; - PPROCESSOR_POWER_INFORMATION buffer = (PPROCESSOR_POWER_INFORMATION)malloc(return_length); +int cpu_support_arm_vfpv4() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + // neon always enable fma and fp16 + return g_hwcaps & HWCAP_ASIMD; +#else + return g_hwcaps & HWCAP_VFPv4; +#endif +#elif __APPLE__ +#if __aarch64__ + return g_hw_cputype == CPU_TYPE_ARM64; +#else + return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S; +#endif +#else + return 0; +#endif +} - cnpi(ProcessorInformation, NULL, 0, buffer, return_length); +int cpu_support_arm_asimdhp() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps & HWCAP_ASIMDHP; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return g_hw_optional_arm_FEAT_FP16 + || g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL + || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST + || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER + || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; +#else + return 0; +#endif +#else + return 0; +#endif +} - std::vector ret; - for (int i = 0; i < g_cpucount; i++) - { - ULONG max_mhz = buffer[i].MaxMhz; - ret.push_back(max_mhz); - } +int cpu_support_arm_cpuid() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps & HWCAP_CPUID; +#else + return 0; +#endif +#elif __APPLE__ + return 0; +#else + return 0; +#endif +} - free(buffer); - FreeLibrary(powrprof); - return ret; +int cpu_support_arm_asimddp() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps & HWCAP_ASIMDDP; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return g_hw_optional_arm_FEAT_DotProd + || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER + || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; +#else + return 0; +#endif +#else + return 0; +#endif } -static int set_sched_affinity(const CpuSet& thread_affinity_mask) +int cpu_support_arm_asimdfhm() { - DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask); - if (prev_mask == 0) - { - NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError()); - return -1; - } - + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps & HWCAP_ASIMDFHM; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return g_hw_optional_arm_FEAT_FHM + || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER + || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; +#else + return 0; +#endif +#else return 0; +#endif } -#endif // (defined _WIN32 && !(defined __MINGW32__)) -#if defined __ANDROID__ || defined __linux__ -static int get_max_freq_khz(int cpuid) +int cpu_support_arm_bf16() { - // first try, for all possible cpu - char path[256]; - sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid); - - FILE* fp = fopen(path, "rb"); - - if (!fp) - { - // second try, for online cpu - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid); - fp = fopen(path, "rb"); - - if (fp) - { - int max_freq_khz = 0; - while (!feof(fp)) - { - int freq_khz = 0; - int nscan = fscanf(fp, "%d %*d", &freq_khz); - if (nscan != 1) - break; - - if (freq_khz > max_freq_khz) - max_freq_khz = freq_khz; - } - - fclose(fp); - - if (max_freq_khz != 0) - return max_freq_khz; - - fp = NULL; - } - - if (!fp) - { - // third try, for online cpu - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid); - fp = fopen(path, "rb"); - - if (!fp) - return -1; - - int max_freq_khz = -1; - int nscan = fscanf(fp, "%d", &max_freq_khz); - if (nscan != 1) - { - NCNN_LOGE("fscanf cpuinfo_max_freq error %d", nscan); - } - fclose(fp); - - return max_freq_khz; - } - } - - int max_freq_khz = 0; - while (!feof(fp)) - { - int freq_khz = 0; - int nscan = fscanf(fp, "%d %*d", &freq_khz); - if (nscan != 1) - break; - - if (freq_khz > max_freq_khz) - max_freq_khz = freq_khz; - } - - fclose(fp); - - return max_freq_khz; + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps2 & HWCAP2_BF16; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return g_hw_optional_arm_FEAT_BF16 + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; +#else + return 0; +#endif +#else + return 0; +#endif } -static bool is_smt_cpu(int cpuid) +int cpu_support_arm_i8mm() { - // https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-72 - char path[256]; - sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/core_cpus_list", cpuid); - - FILE* fp = fopen(path, "rb"); - - if (!fp) - { - sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid); - fp = fopen(path, "rb"); - - if (!fp) - return false; - } - - bool is_smt = false; - while (!feof(fp)) - { - char ch = fgetc(fp); - if (ch == ',' || ch == '-') - { - is_smt = true; - break; - } - } - - fclose(fp); - - return is_smt; + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps2 & HWCAP2_I8MM; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return g_hw_optional_arm_FEAT_I8MM + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; +#else + return 0; +#endif +#else + return 0; +#endif } -static int set_sched_affinity(const CpuSet& thread_affinity_mask) +int cpu_support_arm_sve() { - // set affinity for thread -#if defined(__BIONIC__) - pid_t pid = gettid(); + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps & HWCAP_SVE; #else - pid_t pid = syscall(SYS_gettid); + return 0; #endif - - int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set); - if (syscallret) - { - NCNN_LOGE("syscall error %d", syscallret); - return -1; - } - +#elif __APPLE__ +#if __aarch64__ + return 0; // no known apple cpu support armv8.6 sve +#else + return 0; +#endif +#else return 0; +#endif } -#endif // defined __ANDROID__ || defined __linux__ -#if __APPLE__ -static int set_sched_affinity(const CpuSet& thread_affinity_mask) +int cpu_support_arm_sve2() { - // https://developer.apple.com/library/archive/releasenotes/Performance/RN-AffinityAPI/index.html - // http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html - // https://gist.github.com/Coneko/4234842 - - // This is a quite outdated document. Apple will not allow developers to set CPU affinity. - // In OS X 10.5 it worked, later it became a suggestion to OS X, then in 10.10 or so (as well in later ones), macOS will ignore any affinity settings. - // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919 --- AmeAkio - - int affinity_tag = THREAD_AFFINITY_TAG_NULL; - for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++) - { - if (thread_affinity_mask.is_enabled(i)) - { - affinity_tag = i + 1; - break; - } - } - - mach_port_t tid = pthread_mach_thread_np(pthread_self()); - - thread_affinity_policy_data_t policy_data; - policy_data.affinity_tag = affinity_tag; - int ret = thread_policy_set(tid, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy_data, THREAD_AFFINITY_POLICY_COUNT); - if (ret && ret != KERN_NOT_SUPPORTED) - { - NCNN_LOGE("thread_policy_set error %d", ret); - return -1; - } + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps2 & HWCAP2_SVE2; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return 0; // no known apple cpu support armv8.6 sve2 +#else + return 0; +#endif +#else + return 0; +#endif +} +int cpu_support_arm_svebf16() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps2 & HWCAP2_SVEBF16; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return 0; // no known apple cpu support armv8.6 svebf16 +#else return 0; -} -#endif // __APPLE__ - -static int g_powersave = 0; - -int get_cpu_powersave() -{ - return g_powersave; +#endif +#else + return 0; +#endif } -int set_cpu_powersave(int powersave) +int cpu_support_arm_svei8mm() { - if (powersave < 0 || powersave > 2) - { - NCNN_LOGE("powersave %d not supported", powersave); - return -1; - } - - const CpuSet& thread_affinity_mask = get_cpu_thread_affinity_mask(powersave); - - int ret = set_cpu_thread_affinity(thread_affinity_mask); - if (ret != 0) - return ret; - - g_powersave = powersave; - + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps2 & HWCAP2_SVEI8MM; +#else + return 0; +#endif +#elif __APPLE__ +#if __aarch64__ + return 0; // no known apple cpu support armv8.6 svei8mm +#else + return 0; +#endif +#else return 0; +#endif } -class cpu_thread_affinity_mask -{ -public: - cpu_thread_affinity_mask(); - - CpuSet mask_all; - CpuSet mask_little; - CpuSet mask_big; -}; - -cpu_thread_affinity_mask::cpu_thread_affinity_mask() +int cpu_support_arm_svef32mm() { - mask_all.disable_all(); - -#if (defined _WIN32 && !(defined __MINGW32__)) - // get max freq mhz for all cores - int max_freq_mhz_min = INT_MAX; - int max_freq_mhz_max = 0; - std::vector cpu_max_freq_mhz = get_max_freq_mhz(); - for (int i = 0; i < g_cpucount; i++) - { - int max_freq_mhz = cpu_max_freq_mhz[i]; - - // NCNN_LOGE("%d max freq = %d khz", i, max_freq_mhz); - - if (max_freq_mhz > max_freq_mhz_max) - max_freq_mhz_max = max_freq_mhz; - if (max_freq_mhz < max_freq_mhz_min) - max_freq_mhz_min = max_freq_mhz; - } - - int max_freq_mhz_medium = (max_freq_mhz_min + max_freq_mhz_max) / 2; - if (max_freq_mhz_medium == max_freq_mhz_max) - { - mask_little.disable_all(); - mask_big = mask_all; - return; - } - - CpuSet smt_cpu_mask = get_smt_cpu_mask(); - - for (int i = 0; i < g_cpucount; i++) - { - if (smt_cpu_mask.is_enabled(i)) - { - // always treat smt core as big core - mask_big.enable(i); - continue; - } - - if (cpu_max_freq_mhz[i] < max_freq_mhz_medium) - mask_little.enable(i); - else - mask_big.enable(i); - } -#elif defined __ANDROID__ || defined __linux__ - int max_freq_khz_min = INT_MAX; - int max_freq_khz_max = 0; - std::vector cpu_max_freq_khz(g_cpucount); - for (int i = 0; i < g_cpucount; i++) - { - int max_freq_khz = get_max_freq_khz(i); - - // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz); - - cpu_max_freq_khz[i] = max_freq_khz; - - if (max_freq_khz > max_freq_khz_max) - max_freq_khz_max = max_freq_khz; - if (max_freq_khz < max_freq_khz_min) - max_freq_khz_min = max_freq_khz; - } - - int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2; - if (max_freq_khz_medium == max_freq_khz_max) - { - mask_little.disable_all(); - mask_big = mask_all; - return; - } - - for (int i = 0; i < g_cpucount; i++) - { - if (is_smt_cpu(i)) - { - // always treat smt core as big core - mask_big.enable(i); - continue; - } - - if (cpu_max_freq_khz[i] < max_freq_khz_medium) - mask_little.enable(i); - else - mask_big.enable(i); - } + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __aarch64__ + return g_hwcaps2 & HWCAP2_SVEF32MM; +#else + return 0; +#endif #elif __APPLE__ - int nperflevels = get_hw_capability("hw.nperflevels"); - if (nperflevels == 1) - { - // smp models - mask_little.disable_all(); - mask_big = mask_all; - } - else - { - // two or more clusters, level0 is the high-performance cluster - int perflevel0_logicalcpu = get_hw_capability("hw.perflevel0.logicalcpu_max"); - for (int i = 0; i < perflevel0_logicalcpu; i++) - { - mask_big.enable(i); - } - for (int i = perflevel0_logicalcpu; i < g_cpucount; i++) - { - mask_little.enable(i); - } - } +#if __aarch64__ + return 0; // no known apple cpu support armv8.6 svef32mm #else - // TODO implement me for other platforms - mask_little.disable_all(); - mask_big = mask_all; + return 0; +#endif +#else + return 0; #endif } -static cpu_thread_affinity_mask g_thread_affinity_mask; - -const CpuSet& get_cpu_thread_affinity_mask(int powersave) +int cpu_support_x86_avx() { - if (powersave == 0) - return g_thread_affinity_mask.mask_all; - - if (powersave == 1) - return g_thread_affinity_mask.mask_little; - - if (powersave == 2) - return g_thread_affinity_mask.mask_big; - - NCNN_LOGE("powersave %d not supported", powersave); - - // fallback to all cores anyway - return g_thread_affinity_mask.mask_all; + try_initialize_global_cpu_info(); +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + return g_cpu_support_x86_avx; +#else + return 0; +#endif } -int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask) +int cpu_support_x86_fma() { -#if defined __ANDROID__ || defined __linux__ || (defined _WIN32 && !(defined __MINGW32__)) - int num_threads = thread_affinity_mask.num_enabled(); - -#ifdef _OPENMP - // set affinity for each thread - set_omp_num_threads(num_threads); - std::vector ssarets(num_threads, 0); - #pragma omp parallel for num_threads(num_threads) - for (int i = 0; i < num_threads; i++) - { - ssarets[i] = set_sched_affinity(thread_affinity_mask); - } - for (int i = 0; i < num_threads; i++) - { - if (ssarets[i] != 0) - return -1; - } + try_initialize_global_cpu_info(); +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + return g_cpu_support_x86_fma; #else - int ssaret = set_sched_affinity(thread_affinity_mask); - if (ssaret != 0) - return -1; + return 0; #endif +} +int cpu_support_x86_xop() +{ + try_initialize_global_cpu_info(); +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + return g_cpu_support_x86_xop; +#else return 0; -#elif __APPLE__ - -#ifdef _OPENMP - int num_threads = thread_affinity_mask.num_enabled(); - - // set affinity for each thread - set_omp_num_threads(num_threads); - std::vector ssarets(num_threads, 0); - #pragma omp parallel for num_threads(num_threads) - for (int i = 0; i < num_threads; i++) - { - // assign one core for each thread - int core = -1 - i; - for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++) - { - if (thread_affinity_mask.is_enabled(j)) - { - if (core == -1) - { - core = j; - break; - } - else - { - core++; - } - } - } - CpuSet this_thread_affinity_mask; - if (core != -1 - i) - { - this_thread_affinity_mask.enable(core); - } +#endif +} - ssarets[i] = set_sched_affinity(this_thread_affinity_mask); - } - for (int i = 0; i < num_threads; i++) - { - if (ssarets[i] != 0) - return -1; - } +int cpu_support_x86_f16c() +{ + try_initialize_global_cpu_info(); +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + return g_cpu_support_x86_f16c; #else - int ssaret = set_sched_affinity(thread_affinity_mask); - if (ssaret != 0) - return -1; + return 0; #endif +} - return 0; +int cpu_support_x86_avx2() +{ + try_initialize_global_cpu_info(); +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + return g_cpu_support_x86_avx2; #else - // TODO - (void)thread_affinity_mask; - return -1; + return 0; #endif } -#if defined __ANDROID__ || defined __linux__ -#if __aarch64__ -union midr_info_t +int cpu_support_x86_avx_vnni() { - struct __attribute__((packed)) - { - unsigned int revision : 4; - unsigned int part : 12; - unsigned int architecture : 4; - unsigned int variant : 4; - unsigned int implementer : 8; - }; - unsigned int midr; - - midr_info_t(unsigned int _midr) - : midr(_midr) - { - } -}; + try_initialize_global_cpu_info(); +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + return g_cpu_support_x86_avx_vnni; +#else + return 0; +#endif +} -static unsigned int get_midr_from_sysfs(int cpuid) +int cpu_support_x86_avx512() { - char path[256]; - sprintf(path, "/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1", cpuid); - - FILE* fp = fopen(path, "rb"); - if (!fp) - return 0; - - unsigned int midr_el1 = 0; - int nscan = fscanf(fp, "%x", &midr_el1); - if (nscan != 1) - { - // ignore - } - - fclose(fp); - - return midr_el1; + try_initialize_global_cpu_info(); +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + return g_cpu_support_x86_avx512; +#else + return 0; +#endif } -static int get_midr_from_proc_cpuinfo(std::vector& midrs) +int cpu_support_x86_avx512_vnni() { - FILE* fp = fopen("/proc/cpuinfo", "rb"); - if (!fp) - return -1; - - midrs.resize(g_cpucount, 0); - - int cpuid = -1; - midr_info_t midr_info(0); - - char line[1024]; - while (!feof(fp)) - { - char* s = fgets(line, 1024, fp); - if (!s) - break; - - if (memcmp(line, "processor", 9) == 0) - { - // processor : 4 - int id = -1; - int nscan = sscanf(line, "%*[^:]: %d", &id); - if (nscan != 1) - continue; - - if (cpuid >= 0 && cpuid < g_cpucount) - { - if (midr_info.midr == 0) - { - // shared midr - midrs[cpuid] = (unsigned int)-1; - } - else - { - // save midr and reset - midrs[cpuid] = midr_info.midr; - for (int i = 0; i < g_cpucount; i++) - { - if (midrs[i] == (unsigned int)-1) - midrs[i] = midr_info.midr; - } - } + try_initialize_global_cpu_info(); +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + return g_cpu_support_x86_avx512_vnni; +#else + return 0; +#endif +} - midr_info.midr = 0; - } +int cpu_support_x86_avx512_bf16() +{ + try_initialize_global_cpu_info(); +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + return g_cpu_support_x86_avx512_bf16; +#else + return 0; +#endif +} - cpuid = id; - } +int cpu_support_x86_avx512_fp16() +{ + try_initialize_global_cpu_info(); +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + return g_cpu_support_x86_avx512_fp16; +#else + return 0; +#endif +} - if (cpuid == -1) - continue; +int cpu_support_mips_msa() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __mips__ + return g_hwcaps & HWCAP_MIPS_MSA; +#else + return 0; +#endif +#else + return 0; +#endif +} - if (memcmp(line, "CPU implementer", 15) == 0) - { - // CPU implementer : 0x51 - unsigned int id = 0; - int nscan = sscanf(line, "%*[^:]: %x", &id); - if (nscan != 1) - continue; +int cpu_support_loongarch_lsx() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __loongarch64 + return g_hwcaps & HWCAP_LOONGARCH_LSX; +#else + return 0; +#endif +#else + return 0; +#endif +} - midr_info.implementer = id; - } - else if (memcmp(line, "CPU architecture", 16) == 0) - { - // CPU architecture: 8 - int id = 0; - int nscan = sscanf(line, "%*[^:]: %d", &id); - if (nscan != 1) - continue; +int cpu_support_loongarch_lasx() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __loongarch64 + return g_hwcaps & HWCAP_LOONGARCH_LASX; +#else + return 0; +#endif +#else + return 0; +#endif +} - midr_info.architecture = id; - } - else if (memcmp(line, "CPU variant", 11) == 0) - { - // CPU variant : 0xd - int id = 0; - int nscan = sscanf(line, "%*[^:]: %x", &id); - if (nscan != 1) - continue; +int cpu_support_loongson_mmi() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __mips__ + return g_hwcaps & HWCAP_LOONGSON_MMI; +#else + return 0; +#endif +#else + return 0; +#endif +} - midr_info.variant = id; - } - else if (memcmp(line, "CPU part", 8) == 0) - { - // CPU part : 0x804 - int id = 0; - int nscan = sscanf(line, "%*[^:]: %x", &id); - if (nscan != 1) - continue; +int cpu_support_riscv_v() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __riscv + return g_hwcaps & COMPAT_HWCAP_ISA_V; +#else + return 0; +#endif +#else + return 0; +#endif +} - midr_info.part = id; - } - else if (memcmp(line, "CPU revision", 12) == 0) - { - // CPU revision : 14 - int id = 0; - int nscan = sscanf(line, "%*[^:]: %d", &id); - if (nscan != 1) - continue; +int cpu_support_riscv_zfh() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __riscv + // v + f does not imply zfh, but how to discover zfh properly ? + // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414 + return g_hwcaps & COMPAT_HWCAP_ISA_V && g_hwcaps & COMPAT_HWCAP_ISA_F; +#else + return 0; +#endif +#else + return 0; +#endif +} - midr_info.revision = id; - } - } +int cpu_riscv_vlenb() +{ + try_initialize_global_cpu_info(); +#if __riscv + if (!cpu_support_riscv_v()) + return 0; - fclose(fp); + int a = 0; + asm volatile( + ".word 0xc22026f3 \n" // csrr a3, vlenb + "mv %0, a3 \n" + : "=r"(a) + : + : "memory", "a3"); + return a; +#else + return 0; +#endif +} - if (cpuid >= 0 && cpuid < g_cpucount) - { - if (midr_info.midr == 0) - { - // shared midr - midrs[cpuid] = (unsigned int)-1; - } - else - { - // save midr and reset - midrs[cpuid] = midr_info.midr; - for (int i = 0; i < g_cpucount; i++) - { - if (midrs[i] == (unsigned int)-1) - midrs[i] = midr_info.midr; - } - } +int get_cpu_count() +{ + try_initialize_global_cpu_info(); + return g_cpucount; +} - midr_info.midr = 0; - } +int get_little_cpu_count() +{ + try_initialize_global_cpu_info(); + return get_cpu_thread_affinity_mask(1).num_enabled(); +} - // /proc/cpuinfo may only report little/online cores on old kernel - if (get_big_cpu_count() == get_cpu_count()) - { - // assign the remaining unknown midrs for smp cpu - for (int i = 0; i < g_cpucount; i++) - { - if (midrs[i] == 0) - midrs[i] = midr_info.midr; - } - } - else - { - // clear the big core midrs for hmp cpu if they are the same as little cores - const CpuSet& little_cs = get_cpu_thread_affinity_mask(1); - const CpuSet& big_cs = get_cpu_thread_affinity_mask(2); +int get_big_cpu_count() +{ + try_initialize_global_cpu_info(); + int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled(); + return big_cpu_count ? big_cpu_count : g_cpucount; +} - unsigned int little_midr = 0; - for (int i = 0; i < g_cpucount; i++) - { - if (little_cs.is_enabled(i)) - { - little_midr = midrs[i]; - break; - } - } +int get_physical_cpu_count() +{ + try_initialize_global_cpu_info(); + return g_physical_cpucount; +} - for (int i = 0; i < g_cpucount; i++) - { - if (big_cs.is_enabled(i)) - { - if (midrs[i] == little_midr) - { - midrs[i] = 0; - } - } - } - } +int get_physical_little_cpu_count() +{ + try_initialize_global_cpu_info(); + if (g_physical_cpucount == g_cpucount) + return get_little_cpu_count(); - return 0; + return g_physical_cpucount * 2 - g_cpucount; } -// return midr for the current running core -static unsigned int get_midr_from_register() +int get_physical_big_cpu_count() { - uint64_t midr; - asm volatile("mrs %0, MIDR_EL1" - : "=r"(midr)); + try_initialize_global_cpu_info(); + if (g_physical_cpucount == g_cpucount) + return get_big_cpu_count(); - return (unsigned int)midr; + return g_cpucount - g_physical_cpucount; } -static int get_sched_affinity(CpuSet& thread_affinity_mask) +int get_cpu_level2_cache_size() { - // get affinity for thread -#if defined(__BIONIC__) - pid_t pid = gettid(); -#else - pid_t pid = syscall(SYS_gettid); -#endif + try_initialize_global_cpu_info(); + return g_cpu_level2_cachesize; +} - thread_affinity_mask.disable_all(); +int get_cpu_level3_cache_size() +{ + try_initialize_global_cpu_info(); + return g_cpu_level3_cachesize; +} - int syscallret = syscall(__NR_sched_getaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set); - if (syscallret) +int get_cpu_powersave() +{ + try_initialize_global_cpu_info(); + return g_powersave; +} + +int set_cpu_powersave(int powersave) +{ + try_initialize_global_cpu_info(); + if (powersave < 0 || powersave > 2) { - // handle get error silently + NCNN_LOGE("powersave %d not supported", powersave); return -1; } + const CpuSet& thread_affinity_mask = get_cpu_thread_affinity_mask(powersave); + + int ret = set_cpu_thread_affinity(thread_affinity_mask); + if (ret != 0) + return ret; + + g_powersave = powersave; + return 0; } -static int midr_is_a53_a55(unsigned int midr) +const CpuSet& get_cpu_thread_affinity_mask(int powersave) { - // 0x 41 ? f d03 ? = arm cortex-a53 - // 0x 51 ? f 801 ? = qcom kryo200 a53 - // 0x 41 ? f d04 ? = arm cortex-a35 - // 0x 41 ? f d05 ? = arm cortex-a55 - // 0x 51 ? f 803 ? = qcom kryo300 a55 - // 0x 51 ? f 805 ? = qcom kryo400 a55 + try_initialize_global_cpu_info(); + if (powersave == 0) + return g_cpu_affinity_mask_all; - midr_info_t midr_info(midr); + if (powersave == 1) + return g_cpu_affinity_mask_little; - return (midr_info.implementer == 0x41 && midr_info.part == 0xd03) - || (midr_info.implementer == 0x51 && midr_info.part == 0x801) - || (midr_info.implementer == 0x41 && midr_info.part == 0xd04) - || (midr_info.implementer == 0x41 && midr_info.part == 0xd05) - || (midr_info.implementer == 0x51 && midr_info.part == 0x803) - || (midr_info.implementer == 0x51 && midr_info.part == 0x805); + if (powersave == 2) + return g_cpu_affinity_mask_big; + + NCNN_LOGE("powersave %d not supported", powersave); + + // fallback to all cores anyway + return g_cpu_affinity_mask_all; } -static int detect_cpu_is_arm_a53_a55() +int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask) { - int a53_a55_cpu_count = 0; + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ || (defined _WIN32 && !(defined __MINGW32__)) +#ifdef _OPENMP + int num_threads = thread_affinity_mask.num_enabled(); - // first try, iterate /sys/devices/system/cpu/cpuX/regs/identification/midr_el1 - bool sysfs_midr = true; - for (int i = 0; i < g_cpucount; i++) + // set affinity for each thread + set_omp_num_threads(num_threads); + std::vector ssarets(num_threads, 0); + #pragma omp parallel for num_threads(num_threads) + for (int i = 0; i < num_threads; i++) { - unsigned int midr = 0; + ssarets[i] = set_sched_affinity(thread_affinity_mask); + } + for (int i = 0; i < num_threads; i++) + { + if (ssarets[i] != 0) + return -1; + } +#else + int ssaret = set_sched_affinity(thread_affinity_mask); + if (ssaret != 0) + return -1; +#endif - // for kernel 4.7+ - midr = get_midr_from_sysfs(i); - if (midr == 0) - { - sysfs_midr = false; - break; - } + return 0; +#elif __APPLE__ - if (midr_is_a53_a55(midr)) - { - a53_a55_cpu_count++; - } - } +#ifdef _OPENMP + int num_threads = thread_affinity_mask.num_enabled(); - if (!sysfs_midr) + // set affinity for each thread + set_omp_num_threads(num_threads); + std::vector ssarets(num_threads, 0); + #pragma omp parallel for num_threads(num_threads) + for (int i = 0; i < num_threads; i++) { - // second try, collect midr from /proc/cpuinfo - std::vector midrs; - int ret = get_midr_from_proc_cpuinfo(midrs); - if (ret == 0 && (int)midrs.size() == g_cpucount) + // assign one core for each thread + int core = -1 - i; + for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++) { - for (int i = 0; i < g_cpucount; i++) + if (thread_affinity_mask.is_enabled(j)) { - if (midr_is_a53_a55(midrs[i])) + if (core == -1) { - a53_a55_cpu_count++; + core = j; + break; + } + else + { + core++; } } } - else + CpuSet this_thread_affinity_mask; + if (core != -1 - i) { - // third try, assume all aarch64 little cores are a53/a55 - a53_a55_cpu_count = get_little_cpu_count(); + this_thread_affinity_mask.enable(core); } - } - if (a53_a55_cpu_count == 0) - return 0; // all non a53/a55 - - if (a53_a55_cpu_count == g_cpucount) - return 1; // all a53/a55 + ssarets[i] = set_sched_affinity(this_thread_affinity_mask); + } + for (int i = 0; i < num_threads; i++) + { + if (ssarets[i] != 0) + return -1; + } +#else + int ssaret = set_sched_affinity(thread_affinity_mask); + if (ssaret != 0) + return -1; +#endif - // little cores are a53/a55 - return 2; + return 0; +#else + // TODO + (void)thread_affinity_mask; + return -1; +#endif } -static int g_cpu_is_arm_a53_a55 = detect_cpu_is_arm_a53_a55(); -#endif // __aarch64__ -#endif // defined __ANDROID__ || defined __linux__ - int is_current_thread_running_on_a53_a55() { + try_initialize_global_cpu_info(); #if defined __ANDROID__ || defined __linux__ #if __aarch64__ if (g_cpu_is_arm_a53_a55 == 0) diff --git a/src/layer/arm/convolution_3x3_winograd.h b/src/layer/arm/convolution_3x3_winograd.h index 681979b10..5b5872f9f 100644 --- a/src/layer/arm/convolution_3x3_winograd.h +++ b/src/layer/arm/convolution_3x3_winograd.h @@ -4464,6 +4464,9 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk(int M, int N, int K, int B, // resolve optimal tile size from cache size const int l2_cache_size_fp32 = (int)(get_cpu_level2_cache_size() / sizeof(float)); + if (nT == 0) + nT = get_physical_big_cpu_count(); + // we shall take B into account for batched gemm, but that will be slower on arm in practice, why ? (void)B; diff --git a/src/layer/arm/convolution_3x3_winograd_fp16s.h b/src/layer/arm/convolution_3x3_winograd_fp16s.h index 3e5dee1ae..813b81299 100644 --- a/src/layer/arm/convolution_3x3_winograd_fp16s.h +++ b/src/layer/arm/convolution_3x3_winograd_fp16s.h @@ -1999,6 +1999,9 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk_fp16(int M, int N, int K, in // resolve optimal tile size from cache size const int l2_cache_size_fp16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short)); + if (nT == 0) + nT = get_physical_big_cpu_count(); + // we shall take B into account for batched gemm, but that will be slower on arm in practice, why ? (void)B; diff --git a/src/layer/arm/convolution_im2col_gemm.h b/src/layer/arm/convolution_im2col_gemm.h index eeb6df6bb..45651ddae 100644 --- a/src/layer/arm/convolution_im2col_gemm.h +++ b/src/layer/arm/convolution_im2col_gemm.h @@ -5950,6 +5950,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk(int M, int N, int K, in // resolve optimal tile size from cache size const int l2_cache_size_fp32 = (int)(get_cpu_level2_cache_size() / sizeof(float)); + if (nT == 0) + nT = get_physical_big_cpu_count(); + // solve K { // try not to split K diff --git a/src/layer/arm/convolution_im2col_gemm_bf16s.h b/src/layer/arm/convolution_im2col_gemm_bf16s.h index 1cb603e7d..f29420e97 100644 --- a/src/layer/arm/convolution_im2col_gemm_bf16s.h +++ b/src/layer/arm/convolution_im2col_gemm_bf16s.h @@ -5831,6 +5831,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_bf16s(int M, int N, int // resolve optimal tile size from cache size const int l2_cache_size_bf16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short)); + if (nT == 0) + nT = get_physical_big_cpu_count(); + // solve K { // try not to split K diff --git a/src/layer/arm/convolution_im2col_gemm_fp16s.h b/src/layer/arm/convolution_im2col_gemm_fp16s.h index 360f05f41..a4cc82d70 100644 --- a/src/layer/arm/convolution_im2col_gemm_fp16s.h +++ b/src/layer/arm/convolution_im2col_gemm_fp16s.h @@ -3022,6 +3022,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_fp16sa(int M, int N, in // resolve optimal tile size from cache size const int l2_cache_size_fp16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short)); + if (nT == 0) + nT = get_physical_big_cpu_count(); + // solve K { // try not to split K diff --git a/src/layer/arm/gemm_arm.cpp b/src/layer/arm/gemm_arm.cpp index 86079cd6b..e5cfede04 100644 --- a/src/layer/arm/gemm_arm.cpp +++ b/src/layer/arm/gemm_arm.cpp @@ -3662,7 +3662,11 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) { // resolve optimal tile size from cache size - size_t l2_cache_size = get_cpu_level2_cache_size(); + const size_t l2_cache_size = get_cpu_level2_cache_size(); + + if (nT == 0) + nT = get_physical_big_cpu_count(); + int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(float)); #if __aarch64__ diff --git a/src/layer/arm/gemm_arm_asimdhp.cpp b/src/layer/arm/gemm_arm_asimdhp.cpp index e73a82473..f2b352b80 100644 --- a/src/layer/arm/gemm_arm_asimdhp.cpp +++ b/src/layer/arm/gemm_arm_asimdhp.cpp @@ -2281,7 +2281,11 @@ static void gemm_transB_packed_tile_fp16sa(const Mat& AT_tile, const Mat& BT_til static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) { // resolve optimal tile size from cache size - size_t l2_cache_size = get_cpu_level2_cache_size(); + const size_t l2_cache_size = get_cpu_level2_cache_size(); + + if (nT == 0) + nT = get_physical_big_cpu_count(); + int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(__fp16)); TILE_M = std::max(8, tile_size / 8 * 8); diff --git a/src/layer/arm/gemm_bf16s_fp16s.h b/src/layer/arm/gemm_bf16s_fp16s.h index 768200a6e..7f93d003c 100644 --- a/src/layer/arm/gemm_bf16s_fp16s.h +++ b/src/layer/arm/gemm_bf16s_fp16s.h @@ -1522,7 +1522,11 @@ static void transpose_unpack_output_tile_bf16_fp16(const Mat& topT, Mat& top_blo static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) { // resolve optimal tile size from cache size - size_t l2_cache_size = get_cpu_level2_cache_size(); + const size_t l2_cache_size = get_cpu_level2_cache_size(); + + if (nT == 0) + nT = get_physical_big_cpu_count(); + int tile_size = (int)sqrtf((float)l2_cache_size / (2 * sizeof(unsigned short) + sizeof(float))); TILE_M = std::max(8, tile_size / 8 * 8); diff --git a/src/layer/x86/convolution_3x3_winograd.h b/src/layer/x86/convolution_3x3_winograd.h index 0c273bed0..ca7751c29 100644 --- a/src/layer/x86/convolution_3x3_winograd.h +++ b/src/layer/x86/convolution_3x3_winograd.h @@ -1820,7 +1820,10 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, Mat& static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) { // resolve optimal tile size from cache size - size_t l2_cache_size = get_cpu_level2_cache_size(); + const size_t l2_cache_size = get_cpu_level2_cache_size(); + + if (nT == 0) + nT = get_physical_big_cpu_count(); // solve M { diff --git a/src/layer/x86/gemm_x86.cpp b/src/layer/x86/gemm_x86.cpp index a85c802c2..8ee06d443 100644 --- a/src/layer/x86/gemm_x86.cpp +++ b/src/layer/x86/gemm_x86.cpp @@ -6699,7 +6699,11 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) { // resolve optimal tile size from cache size - size_t l2_cache_size = get_cpu_level2_cache_size(); + const size_t l2_cache_size = get_cpu_level2_cache_size(); + + if (nT == 0) + nT = get_physical_big_cpu_count(); + int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float)); #if __AVX512F__