- // Copyright 2017 Tencent
- // SPDX-License-Identifier: BSD-3-Clause
-
- #include "cpu.h"
-
- #include "platform.h"
-
- #include <limits.h>
- #ifndef __wasi__
- #include <setjmp.h>
- #include <signal.h>
- #endif // __wasi__
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
-
- #if !NCNN_SIMPLESTL
- #include <algorithm>
- #include <cstdint>
- #include <utility>
- #include <vector>
- #endif
-
- #ifdef _OPENMP
- #if NCNN_SIMPLEOMP
- #include "simpleomp.h"
- #else
- #include <omp.h>
- #endif
- #endif
-
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- #ifdef _MSC_VER
- #include <intrin.h> // __cpuid()
- #include <immintrin.h> // _xgetbv()
- #endif
- #if defined(__clang__) || defined(__GNUC__)
- #include <cpuid.h> // __get_cpuid() and __cpuid_count()
- #endif
- #endif
-
- #ifdef __EMSCRIPTEN__
- #include <emscripten/threading.h>
- #endif
-
- #if defined _WIN32
- #define WIN32_LEAN_AND_MEAN
- #include <windows.h>
- #endif
-
- #if defined __ANDROID__ || defined __OHOS__ || __linux__
- #if defined __ANDROID__
- #if __ANDROID_API__ >= 18
- #include <sys/auxv.h> // getauxval()
- #endif
- #include <sys/system_properties.h> // __system_property_get()
- #include <dlfcn.h>
- #endif
- #if defined __OHOS__
- #include <sys/auxv.h> // getauxval()
- #endif
- #include <ctype.h>
- #include <stdint.h>
- #include <fcntl.h>
- #include <sys/stat.h>
- #include <sys/syscall.h>
- #include <unistd.h>
- #endif
-
- #if __APPLE__
- #include <mach/mach.h>
- #include <mach/machine.h>
- #include <mach/thread_act.h>
- #include <sys/sysctl.h>
- #include <sys/types.h>
- #include <unistd.h>
- #include "TargetConditionals.h"
- #if TARGET_OS_IPHONE
- #define __IOS__ 1
- #endif
- // define missing cpu model for old sdk
- #ifndef CPUFAMILY_ARM_HURRICANE
- #define CPUFAMILY_ARM_HURRICANE 0x67ceee93
- #endif
- // A11
- #ifndef CPUFAMILY_ARM_MONSOON_MISTRAL
- #define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6
- #endif
- // A12
- #ifndef CPUFAMILY_ARM_VORTEX_TEMPEST
- #define CPUFAMILY_ARM_VORTEX_TEMPEST 0x07d34b9f
- #endif
- // A13
- #ifndef CPUFAMILY_ARM_LIGHTNING_THUNDER
- #define CPUFAMILY_ARM_LIGHTNING_THUNDER 0x462504d2
- #endif
- // A14 / M1
- #ifndef CPUFAMILY_ARM_FIRESTORM_ICESTORM
- #define CPUFAMILY_ARM_FIRESTORM_ICESTORM 0x1b588bb3
- #endif
- // A15 / M2
- #ifndef CPUFAMILY_ARM_AVALANCHE_BLIZZARD
- #define CPUFAMILY_ARM_AVALANCHE_BLIZZARD 0xda33d83d
- #endif
- // A16
- #ifndef CPUFAMILY_ARM_EVEREST_SAWTOOTH
- #define CPUFAMILY_ARM_EVEREST_SAWTOOTH 0x8765edea
- #endif
- // A17
- #ifndef CPUFAMILY_ARM_COLL
- #define CPUFAMILY_ARM_COLL 0x2876f5b5
- #endif
- // A18
- #ifndef CPUFAMILY_ARM_TUPAI
- #define CPUFAMILY_ARM_TUPAI 0x204526d0
- #endif
- // A18 Pro
- #ifndef CPUFAMILY_ARM_TAHITI
- #define CPUFAMILY_ARM_TAHITI 0x75d4acb9
- #endif
- // M3
- #ifndef CPUFAMILY_ARM_IBIZA
- #define CPUFAMILY_ARM_IBIZA 0xfa33415e
- #endif
- // M3 Pro
- #ifndef CPUFAMILY_ARM_LOBOS
- #define CPUFAMILY_ARM_LOBOS 0x5f4dea93
- #endif
- // M3 Max
- #ifndef CPUFAMILY_ARM_PALMA
- #define CPUFAMILY_ARM_PALMA 0x72015832
- #endif
- // M4
- #ifndef CPUFAMILY_ARM_DONAN
- #define CPUFAMILY_ARM_DONAN 0x6f5129ac
- #endif
- // M4 Pro / M4 Max
- #ifndef CPUFAMILY_ARM_BRAVA
- #define CPUFAMILY_ARM_BRAVA 0x17d5b93a
- #endif
- #endif // __APPLE__
-
- #if defined(__SSE3__)
- #include <immintrin.h>
- #endif
-
- #if (defined _WIN32 && (__aarch64__ || __arm__)) || ((defined __ANDROID__ || defined __linux__) && __riscv)
- #define RUAPU_IMPLEMENTATION
- #include "ruapu.h"
- #endif
-
- #if defined(_OPENMP) && (__clang__ || defined(_OPENMP_LLVM_RUNTIME))
// Load-time hook (constructor attribute) that presets two environment
// variables read by the LLVM OpenMP runtime. It has to run before any
// OpenMP facility is touched, otherwise the runtime may abort() internally.
//
// KMP_AFFINITY=disabled
//   The runtime's own affinity routines abort when sched_getaffinity /
//   sched_setaffinity fail (see KMPNativeAffinity::get_system_affinity and
//   set_system_affinity in openmp/runtime/src/kmp_affinity.h). That happens
//   on android when cores go offline in powersave mode. Affinity of openmp
//   threads is handled by ncnn itself, so the runtime feature is turned off.
//
// KMP_DUPLICATE_LIB_OK=1
//   Runtime start-up aborts when a second openmp runtime is detected (see
//   __kmp_register_library_startup in openmp/runtime/src/kmp_runtime.cpp),
//   which occurs when several libraries each link openmp statically.
//   Continuing works well in most cases and avoids an unexpected crash.
__attribute__((constructor)) void ncnn_kmp_env_initializer()
{
#if defined _WIN32
#if _WIN32_WINNT >= 0x0600
    _putenv_s("KMP_AFFINITY", "disabled");
    _putenv_s("KMP_DUPLICATE_LIB_OK", "1");
#else
    _putenv("KMP_AFFINITY=disabled");
    _putenv("KMP_DUPLICATE_LIB_OK=1");
#endif
#else
    setenv("KMP_AFFINITY", "disabled", 1);
    setenv("KMP_DUPLICATE_LIB_OK", "1", 1);
#endif
}
- #endif
-
- // topology info
- static int g_cpucount;
- static int g_physical_cpucount;
- static int g_powersave;
- static int g_max_cpu_count = 0; // Maximum CPU count detected at runtime
- static ncnn::CpuSet g_cpu_affinity_mask_all;
- static ncnn::CpuSet g_cpu_affinity_mask_little;
- static ncnn::CpuSet g_cpu_affinity_mask_big;
-
- // isa info
- #if defined _WIN32
- #if __aarch64__
- static int g_cpu_support_arm_asimdhp;
- static int g_cpu_support_arm_cpuid;
- static int g_cpu_support_arm_asimddp;
- static int g_cpu_support_arm_asimdfhm;
- static int g_cpu_support_arm_bf16;
- static int g_cpu_support_arm_i8mm;
- static int g_cpu_support_arm_sve;
- static int g_cpu_support_arm_sve2;
- static int g_cpu_support_arm_svebf16;
- static int g_cpu_support_arm_svei8mm;
- static int g_cpu_support_arm_svef32mm;
- #elif __arm__
- static int g_cpu_support_arm_edsp;
- static int g_cpu_support_arm_neon;
- static int g_cpu_support_arm_vfpv4;
- #endif // __aarch64__ || __arm__
- #elif defined __ANDROID__ || defined __linux__
- static unsigned int g_hwcaps;
- static unsigned int g_hwcaps2;
- #elif __APPLE__
- static unsigned int g_hw_cpufamily;
- static cpu_type_t g_hw_cputype;
- static cpu_subtype_t g_hw_cpusubtype;
- #if __aarch64__
- static int g_hw_optional_arm_FEAT_FP16;
- static int g_hw_optional_arm_FEAT_DotProd;
- static int g_hw_optional_arm_FEAT_FHM;
- static int g_hw_optional_arm_FEAT_BF16;
- static int g_hw_optional_arm_FEAT_I8MM;
- #endif // __aarch64__
- #endif
-
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- static int g_cpu_support_x86_avx;
- static int g_cpu_support_x86_fma;
- static int g_cpu_support_x86_xop;
- static int g_cpu_support_x86_f16c;
- static int g_cpu_support_x86_avx2;
- static int g_cpu_support_x86_avx_vnni;
- static int g_cpu_support_x86_avx_vnni_int8;
- static int g_cpu_support_x86_avx_vnni_int16;
- static int g_cpu_support_x86_avx_ne_convert;
- static int g_cpu_support_x86_avx512;
- static int g_cpu_support_x86_avx512_vnni;
- static int g_cpu_support_x86_avx512_bf16;
- static int g_cpu_support_x86_avx512_fp16;
- #endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-
- #if defined __ANDROID__ || defined __linux__
- #if __riscv
- static int g_cpu_support_riscv_zfh;
- static int g_cpu_support_riscv_zvfh;
- static int g_cpu_support_riscv_xtheadvector;
- #endif // __riscv
- #endif // defined __ANDROID__ || defined __linux__
-
- static int g_cpu_level2_cachesize;
- static int g_cpu_level3_cachesize;
-
- // misc info
- #if defined __ANDROID__ || defined __linux__
- #if __aarch64__
- static int g_cpu_is_arm_a53_a55;
- #endif // __aarch64__
- #endif // defined __ANDROID__ || defined __linux__
-
// Tell whether a debugger / tracer is attached to the current process.
//
//   windows        IsDebuggerPresent()
//   android/linux  non-zero TracerPid field in /proc/self/status
//   apple          P_TRACED flag of the process, queried via sysctl
//   otherwise      prints "unknown platform!" to stderr and returns false
//
// Any failure to obtain the information is reported as "not debugged".
static bool is_being_debugged()
{
#if defined _WIN32
    return IsDebuggerPresent() != FALSE;
#elif defined __ANDROID__ || defined __linux__
    // https://stackoverflow.com/questions/3596781/how-to-detect-if-the-current-process-is-being-run-by-gdb
    int status_fd = open("/proc/self/status", O_RDONLY);
    if (status_fd == -1)
        return false;

    char buf[4096];
    const ssize_t num_read = read(status_fd, buf, sizeof(buf) - 1);
    close(status_fd);

    if (num_read <= 0)
        return false;

    buf[num_read] = '\0';

    // The key is anchored on the preceding newline so that only a field at
    // the start of a line matches, never the same text inside another field.
    const char tracer_pid_key[] = "\nTracerPid:";
    const char* tracer_pid_ptr = strstr(buf, tracer_pid_key);
    if (!tracer_pid_ptr)
        return false;

    // skip the blanks after the key, then judge by the first digit of the pid
    for (const char* ch = tracer_pid_ptr + sizeof(tracer_pid_key) - 1; *ch != '\0'; ++ch)
    {
        // ctype functions require a value representable as unsigned char
        const unsigned char c = (unsigned char)*ch;
        if (isspace(c))
            continue;

        return isdigit(c) != 0 && c != '0';
    }

    return false;
#elif defined __APPLE__
    // https://stackoverflow.com/questions/2200277/detecting-debugger-on-mac-os-x
    struct kinfo_proc info;
    info.kp_proc.p_flag = 0;

    int mib[4];
    mib[0] = CTL_KERN;
    mib[1] = KERN_PROC;
    mib[2] = KERN_PROC_PID;
    mib[3] = getpid();

    size_t size = sizeof(info);
    if (sysctl(mib, sizeof(mib) / sizeof(*mib), &info, &size, NULL, 0) != 0)
        return false;

    return ((info.kp_proc.p_flag & P_TRACED) != 0);
#else
    // unknown platform :(
    fprintf(stderr, "unknown platform!\n");
    return false;
#endif
}
-
- #if defined __ANDROID__ || defined __OHOS__ || defined __linux__
-
- #define AT_HWCAP 16
- #define AT_HWCAP2 26
-
- #if __aarch64__
- // from arch/arm64/include/uapi/asm/hwcap.h
- #define HWCAP_ASIMD (1 << 1)
- #define HWCAP_ASIMDHP (1 << 10)
- #define HWCAP_CPUID (1 << 11)
- #define HWCAP_ASIMDDP (1 << 20)
- #define HWCAP_SVE (1 << 22)
- #define HWCAP_ASIMDFHM (1 << 23)
- #define HWCAP2_SVE2 (1 << 1)
- #define HWCAP2_SVEI8MM (1 << 9)
- #define HWCAP2_SVEF32MM (1 << 10)
- #define HWCAP2_SVEBF16 (1 << 12)
- #define HWCAP2_I8MM (1 << 13)
- #define HWCAP2_BF16 (1 << 14)
- #else
- // from arch/arm/include/uapi/asm/hwcap.h
- #define HWCAP_EDSP (1 << 7)
- #define HWCAP_NEON (1 << 12)
- #define HWCAP_VFPv4 (1 << 16)
- #endif
-
- #if __mips__
- // from arch/mips/include/uapi/asm/hwcap.h
- #define HWCAP_MIPS_MSA (1 << 1)
- #define HWCAP_LOONGSON_MMI (1 << 11)
- #endif
-
- #if __loongarch64
- // from arch/loongarch/include/uapi/asm/hwcap.h
- #define HWCAP_LOONGARCH_LSX (1 << 4)
- #define HWCAP_LOONGARCH_LASX (1 << 5)
- #endif
-
- #if __riscv
- // from arch/riscv/include/uapi/asm/hwcap.h
- #define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A'))
- #define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
- #endif
-
- #if defined __ANDROID__ || defined __OHOS__
- // Probe the system's C library for a 'getauxval' function and call it if
- // it exits, or return 0 for failure. This function is available since API
- // level 18.
- //
- // HarmonyOS NEXT support `getauxval` directly.
- //
- // Note that getauxval() can't really be re-implemented here, because
- // its implementation does not parse /proc/self/auxv. Instead it depends
- // on values that are passed by the kernel at process-init time to the
- // C runtime initialization layer.
- static unsigned int get_elf_hwcap_from_getauxval(unsigned int type)
- {
- #if defined __OHOS__
- return getauxval(type);
- #else
- #if __ANDROID_API__ >= 18
- unsigned int hwcap = getauxval(type);
- if (hwcap)
- return hwcap;
- #endif
-
- typedef unsigned long getauxval_func_t(unsigned long);
-
- dlerror();
- void* libc_handle = dlopen("libc.so", RTLD_NOW);
- if (!libc_handle)
- {
- NCNN_LOGE("dlopen libc.so failed %s", dlerror());
- return 0;
- }
-
- unsigned int result = 0;
- getauxval_func_t* func = (getauxval_func_t*)dlsym(libc_handle, "getauxval");
- if (!func)
- {
- NCNN_LOGE("dlsym getauxval failed");
- }
- else
- {
- // Note: getauxval() returns 0 on failure. Doesn't touch errno.
- result = (unsigned int)(*func)(type);
- }
- dlclose(libc_handle);
-
- return result;
- #endif
- }
- #endif // defined __ANDROID__ || defined __OHOS__
-
- // extract the ELF HW capabilities bitmap from /proc/self/auxv
- static unsigned int get_elf_hwcap_from_proc_self_auxv(unsigned int type)
- {
- FILE* fp = fopen("/proc/self/auxv", "rb");
- if (!fp)
- {
- NCNN_LOGE("fopen /proc/self/auxv failed");
- return 0;
- }
-
- #if __aarch64__ || __mips64 || __riscv_xlen == 64 || __loongarch64
- struct
- {
- uint64_t tag;
- uint64_t value;
- } entry;
- #else
- struct
- {
- unsigned int tag;
- unsigned int value;
- } entry;
-
- #endif
-
- unsigned int result = 0;
- while (!feof(fp))
- {
- int nread = fread((char*)&entry, sizeof(entry), 1, fp);
- if (nread != 1)
- break;
-
- if (entry.tag == 0 && entry.value == 0)
- break;
-
- if (entry.tag == type)
- {
- result = entry.value;
- break;
- }
- }
-
- fclose(fp);
-
- return result;
- }
-
- static unsigned int get_elf_hwcap(unsigned int type)
- {
- unsigned int hwcap = 0;
-
- #if defined __ANDROID__ || defined __OHOS__
- hwcap = get_elf_hwcap_from_getauxval(type);
- #endif
-
- if (!hwcap)
- hwcap = get_elf_hwcap_from_proc_self_auxv(type);
-
- #if defined __ANDROID__
- #if __aarch64__
- if (type == AT_HWCAP)
- {
- // samsung exynos9810 on android pre-9 incorrectly reports armv8.2
- // for little cores, but big cores only support armv8.0
- // drop all armv8.2 features used by ncnn for preventing SIGILLs
- // ref https://reviews.llvm.org/D114523
- char arch[PROP_VALUE_MAX];
- int len = __system_property_get("ro.arch", arch);
- if (len > 0 && strncmp(arch, "exynos9810", 10) == 0)
- {
- hwcap &= ~HWCAP_ASIMDHP;
- hwcap &= ~HWCAP_ASIMDDP;
- }
- }
- #endif // __aarch64__
- #endif // defined __ANDROID__
-
- return hwcap;
- }
- #endif // defined __ANDROID__ || defined __OHOS__ || defined __linux__
-
- #if __APPLE__
- static unsigned int get_hw_cpufamily()
- {
- unsigned int value = 0;
- size_t len = sizeof(value);
- sysctlbyname("hw.cpufamily", &value, &len, NULL, 0);
- return value;
- }
-
- static cpu_type_t get_hw_cputype()
- {
- cpu_type_t value = 0;
- size_t len = sizeof(value);
- sysctlbyname("hw.cputype", &value, &len, NULL, 0);
- return value;
- }
-
- static cpu_subtype_t get_hw_cpusubtype()
- {
- cpu_subtype_t value = 0;
- size_t len = sizeof(value);
- sysctlbyname("hw.cpusubtype", &value, &len, NULL, 0);
- return value;
- }
-
- static int get_hw_capability(const char* cap)
- {
- int64_t value = 0;
- size_t len = sizeof(value);
- sysctlbyname(cap, &value, &len, NULL, 0);
- return value;
- }
- #endif // __APPLE__
-
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
// Run CPUID for leaf `level` and store the four result registers in
// out[0..3] (eax, ebx, ecx, edx in that order).
//
// With gcc / clang the return value of __get_cpuid() is ignored; the callers
// in this file pass a zero-filled array. With an unknown compiler an error is
// logged and the outputs are zeroed.
static inline void x86_cpuid(int level, unsigned int out[4])
{
#if defined(_MSC_VER) && !defined(__clang__)
    __cpuid((int*)out, level);
#elif defined(__clang__) || defined(__GNUC__)
    __get_cpuid(level, &out[0], &out[1], &out[2], &out[3]);
#else
    NCNN_LOGE("x86_cpuid is unknown for current compiler");
    for (int i = 0; i < 4; i++)
    {
        out[i] = 0;
    }
#endif
}
-
// Run CPUID for leaf `level` with sub-leaf `sublevel` and store the four
// result registers in out[0..3] (eax, ebx, ecx, edx in that order).
// With an unknown compiler an error is logged and the outputs are zeroed.
static inline void x86_cpuid_sublevel(int level, int sublevel, unsigned int out[4])
{
#if defined(_MSC_VER)
    __cpuidex((int*)out, level, sublevel);
#elif defined(__clang__) || defined(__GNUC__)
    __cpuid_count(level, sublevel, out[0], out[1], out[2], out[3]);
#else
    NCNN_LOGE("x86_cpuid_sublevel is unknown for current compiler");
    for (int i = 0; i < 4; i++)
    {
        out[i] = 0;
    }
#endif
}
-
// Return the low 32 bits of extended control register XCR0.
//
// msvc:        _xgetbv(0) intrinsic
// gcc / clang: the instruction is emitted as raw opcode bytes with ecx = 0;
//              only eax is returned, edx is received and discarded
// otherwise:   an error is logged and all bits are reported as set
//
// The callers in this file invoke it only after CPUID reported OSXSAVE.
static inline int x86_get_xcr0()
{
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
    return _xgetbv(0);
#elif defined(__i386__) || defined(__x86_64__)
    int low_word = 0;
    int high_word = 0;
    asm(".byte 0x0f, 0x01, 0xd0"
        : "=a"(low_word), "=d"(high_word)
        : "c"(0));
    (void)high_word;
    return low_word;
#else
    NCNN_LOGE("x86_get_xcr0 is unknown for current compiler");
    return 0xffffffff; // assume it will work
#endif
}
-
- static int get_cpu_support_x86_avx()
- {
- unsigned int cpu_info[4] = {0};
- x86_cpuid(0, cpu_info);
-
- int nIds = cpu_info[0];
- if (nIds < 1)
- return 0;
-
- x86_cpuid(1, cpu_info);
- // check AVX XSAVE OSXSAVE
- if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
- return 0;
-
- // check XSAVE enabled by kernel
- if ((x86_get_xcr0() & 6) != 6)
- return 0;
-
- return 1;
- }
-
- static int get_cpu_support_x86_fma()
- {
- unsigned int cpu_info[4] = {0};
- x86_cpuid(0, cpu_info);
-
- int nIds = cpu_info[0];
- if (nIds < 7)
- return 0;
-
- x86_cpuid(1, cpu_info);
- // check AVX XSAVE OSXSAVE
- if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
- return 0;
-
- // check XSAVE enabled by kernel
- if ((x86_get_xcr0() & 6) != 6)
- return 0;
-
- return cpu_info[2] & (1u << 12);
- }
-
- static int get_cpu_support_x86_xop()
- {
- unsigned int cpu_info[4] = {0};
- x86_cpuid(0x80000000, cpu_info);
-
- if (cpu_info[0] < 0x80000001)
- return 0;
-
- x86_cpuid(0x80000001, cpu_info);
-
- return cpu_info[2] & (1u << 11);
- }
-
- static int get_cpu_support_x86_f16c()
- {
- unsigned int cpu_info[4] = {0};
- x86_cpuid(0, cpu_info);
-
- int nIds = cpu_info[0];
- if (nIds < 1)
- return 0;
-
- x86_cpuid(1, cpu_info);
-
- return cpu_info[2] & (1u << 29);
- }
-
- static int get_cpu_support_x86_avx2()
- {
- unsigned int cpu_info[4] = {0};
- x86_cpuid(0, cpu_info);
-
- int nIds = cpu_info[0];
- if (nIds < 7)
- return 0;
-
- x86_cpuid(1, cpu_info);
- // check AVX XSAVE OSXSAVE
- if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
- return 0;
-
- // check XSAVE enabled by kernel
- if ((x86_get_xcr0() & 6) != 6)
- return 0;
-
- x86_cpuid_sublevel(7, 0, cpu_info);
- return cpu_info[1] & (1u << 5);
- }
-
- static int get_cpu_support_x86_avx_vnni()
- {
- unsigned int cpu_info[4] = {0};
- x86_cpuid(0, cpu_info);
-
- int nIds = cpu_info[0];
- if (nIds < 7)
- return 0;
-
- x86_cpuid(1, cpu_info);
- // check AVX XSAVE OSXSAVE
- if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
- return 0;
-
- // check XSAVE enabled by kernel
- if ((x86_get_xcr0() & 6) != 6)
- return 0;
-
- x86_cpuid_sublevel(7, 1, cpu_info);
- return cpu_info[0] & (1u << 4);
- }
-
- static int get_cpu_support_x86_avx_vnni_int8()
- {
- unsigned int cpu_info[4] = {0};
- x86_cpuid(0, cpu_info);
-
- int nIds = cpu_info[0];
- if (nIds < 7)
- return 0;
-
- x86_cpuid(1, cpu_info);
- // check AVX XSAVE OSXSAVE
- if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
- return 0;
-
- // check XSAVE enabled by kernel
- if ((x86_get_xcr0() & 6) != 6)
- return 0;
-
- x86_cpuid_sublevel(7, 1, cpu_info);
- return cpu_info[3] & (1u << 4);
- }
-
- static int get_cpu_support_x86_avx_vnni_int16()
- {
- unsigned int cpu_info[4] = {0};
- x86_cpuid(0, cpu_info);
-
- int nIds = cpu_info[0];
- if (nIds < 7)
- return 0;
-
- x86_cpuid(1, cpu_info);
- // check AVX XSAVE OSXSAVE
- if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
- return 0;
-
- // check XSAVE enabled by kernel
- if ((x86_get_xcr0() & 6) != 6)
- return 0;
-
- x86_cpuid_sublevel(7, 1, cpu_info);
- return cpu_info[3] & (1u << 10);
- }
-
- static int get_cpu_support_x86_avx_ne_convert()
- {
- unsigned int cpu_info[4] = {0};
- x86_cpuid(0, cpu_info);
-
- int nIds = cpu_info[0];
- if (nIds < 7)
- return 0;
-
- x86_cpuid(1, cpu_info);
- // check AVX XSAVE OSXSAVE
- if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
- return 0;
-
- // check XSAVE enabled by kernel
- if ((x86_get_xcr0() & 6) != 6)
- return 0;
-
- x86_cpuid_sublevel(7, 1, cpu_info);
- return cpu_info[3] & (1u << 5);
- }
-
- static int get_cpu_support_x86_avx512()
- {
- #if __APPLE__
- return get_hw_capability("hw.optional.avx512f")
- && get_hw_capability("hw.optional.avx512bw")
- && get_hw_capability("hw.optional.avx512cd")
- && get_hw_capability("hw.optional.avx512dq")
- && get_hw_capability("hw.optional.avx512vl");
- #else
- unsigned int cpu_info[4] = {0};
- x86_cpuid(0, cpu_info);
-
- int nIds = cpu_info[0];
- if (nIds < 7)
- return 0;
-
- x86_cpuid(1, cpu_info);
- // check AVX XSAVE OSXSAVE
- if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
- return 0;
-
- // check XSAVE enabled by kernel
- if ((x86_get_xcr0() & 6) != 6)
- return 0;
-
- // check avx512 XSAVE enabled by kernel
- if ((x86_get_xcr0() & 0xe0) != 0xe0)
- return 0;
-
- x86_cpuid_sublevel(7, 0, cpu_info);
- return (cpu_info[1] & (1u << 16)) && (cpu_info[1] & (1u << 17)) && (cpu_info[1] & (1u << 28)) && (cpu_info[1] & (1u << 30)) && (cpu_info[1] & (1u << 31));
- #endif
- }
-
- static int get_cpu_support_x86_avx512_vnni()
- {
- #if __APPLE__
- return get_hw_capability("hw.optional.avx512vnni");
- #else
- unsigned int cpu_info[4] = {0};
- x86_cpuid(0, cpu_info);
-
- int nIds = cpu_info[0];
- if (nIds < 7)
- return 0;
-
- x86_cpuid(1, cpu_info);
- // check AVX XSAVE OSXSAVE
- if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
- return 0;
-
- // check XSAVE enabled by kernel
- if ((x86_get_xcr0() & 6) != 6)
- return 0;
-
- // check avx512 XSAVE enabled by kernel
- if ((x86_get_xcr0() & 0xe0) != 0xe0)
- return 0;
-
- x86_cpuid_sublevel(7, 0, cpu_info);
- return cpu_info[2] & (1u << 11);
- #endif
- }
-
- static int get_cpu_support_x86_avx512_bf16()
- {
- #if __APPLE__
- return get_hw_capability("hw.optional.avx512bf16");
- #else
- unsigned int cpu_info[4] = {0};
- x86_cpuid(0, cpu_info);
-
- int nIds = cpu_info[0];
- if (nIds < 7)
- return 0;
-
- x86_cpuid(1, cpu_info);
- // check AVX XSAVE OSXSAVE
- if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
- return 0;
-
- // check XSAVE enabled by kernel
- if ((x86_get_xcr0() & 6) != 6)
- return 0;
-
- x86_cpuid_sublevel(7, 1, cpu_info);
- return cpu_info[0] & (1u << 5);
- #endif
- }
-
- static int get_cpu_support_x86_avx512_fp16()
- {
- #if __APPLE__
- return get_hw_capability("hw.optional.avx512fp16");
- #else
- unsigned int cpu_info[4] = {0};
- x86_cpuid(0, cpu_info);
-
- int nIds = cpu_info[0];
- if (nIds < 7)
- return 0;
-
- x86_cpuid(1, cpu_info);
- // check AVX XSAVE OSXSAVE
- if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
- return 0;
-
- // check XSAVE enabled by kernel
- if ((x86_get_xcr0() & 6) != 6)
- return 0;
-
- // check avx512 XSAVE enabled by kernel
- if ((x86_get_xcr0() & 0xe0) != 0xe0)
- return 0;
-
- x86_cpuid_sublevel(7, 0, cpu_info);
- return cpu_info[3] & (1u << 23);
- #endif
- }
- #endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-
// Count the logical cpus visible to the process. The result is never less
// than 1.
//
//   emscripten     emscripten_num_logical_cores(), or 1 without threading
//   windows        dwNumberOfProcessors from GetSystemInfo()
//   android/linux  number of /proc/cpuinfo lines starting with "processor";
//                  1 when the file cannot be opened
//   apple          the "hw.ncpu" sysctl
//   otherwise      omp_get_max_threads() with openmp, else 1
static int get_cpucount()
{
    int count = 0;
#ifdef __EMSCRIPTEN__
    if (emscripten_has_threading_support())
        count = emscripten_num_logical_cores();
    else
        count = 1;
#elif defined _WIN32
    SYSTEM_INFO system_info;
    GetSystemInfo(&system_info);
    count = system_info.dwNumberOfProcessors;
#elif defined __ANDROID__ || defined __linux__
    // get cpu count from /proc/cpuinfo
    FILE* fp = fopen("/proc/cpuinfo", "rb");
    if (!fp)
        return 1;

    char line[1024];

    // fgets() returns NULL at end of file or on error, which ends the scan
    while (fgets(line, sizeof(line), fp))
    {
        // strncmp stops at the terminator fgets wrote, so a line shorter
        // than the key is never compared against leftover buffer bytes
        if (strncmp(line, "processor", 9) == 0)
        {
            count++;
        }
    }

    fclose(fp);
#elif __APPLE__
    size_t len = sizeof(count);
    sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
#else
#ifdef _OPENMP
    count = omp_get_max_threads();
#else
    count = 1;
#endif // _OPENMP
#endif

    if (count < 1)
        count = 1;

    return count;
}
-
- #if defined __ANDROID__ || defined __linux__
- static void get_thread_siblings(int cpuid, ncnn::CpuSet& siblings)
- {
- siblings.disable_all();
-
- char path[256];
- sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpuid);
-
- FILE* fp = 0; //fopen(path, "rb");
- if (fp)
- {
- // Try to read hex mask directly (this path is currently disabled)
- char hex_str[256];
- int nscan = fscanf(fp, "%255s", hex_str);
- if (nscan == 1)
- {
- // Parse hex string into CpuSet
- int len = strlen(hex_str);
- if (hex_str[0] == '0' && hex_str[1] == 'x')
- {
- // Skip "0x" prefix
- len -= 2;
- memmove(hex_str, hex_str + 2, len + 1);
- }
-
- int ci = 0;
- for (int i = len - 1; i >= 0; i--)
- {
- char c = hex_str[i];
- int hex_val = 0;
-
- if (c >= '0' && c <= '9')
- hex_val = c - '0';
- else if (c >= 'a' && c <= 'f')
- hex_val = c - 'a' + 10;
- else if (c >= 'A' && c <= 'F')
- hex_val = c - 'A' + 10;
- else
- continue;
-
- if (hex_val & 1) siblings.enable(ci + 0);
- if (hex_val & 2) siblings.enable(ci + 1);
- if (hex_val & 4) siblings.enable(ci + 2);
- if (hex_val & 8) siblings.enable(ci + 3);
-
- ci += 4;
- }
- }
-
- fclose(fp);
-
- if (!siblings.is_empty())
- return;
- }
-
- // second try, parse from human-readable thread_siblings_list
- sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid);
-
- fp = fopen(path, "rb");
- if (fp)
- {
- int id0;
- char sep;
- int id1;
-
- int nscan = fscanf(fp, "%d", &id0);
- if (nscan == 1)
- {
- siblings.enable(id0);
-
- while (fscanf(fp, "%c%d", &sep, &id1) == 2)
- {
- if (sep == ',')
- {
- siblings.enable(id1);
- }
- if (sep == '-' && id0 < id1)
- {
- for (int i = id0 + 1; i <= id1; i++)
- {
- siblings.enable(i);
- }
- }
-
- id0 = id1;
- }
- }
-
- fclose(fp);
- }
- }
- #endif // defined __ANDROID__ || defined __linux__
-
- static int get_physical_cpucount()
- {
- int count = 0;
- #if defined _WIN32
- typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
- LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
- if (glpi == NULL)
- {
- NCNN_LOGE("GetLogicalProcessorInformation is not supported");
- return g_cpucount;
- }
-
- DWORD return_length = 0;
- glpi(NULL, &return_length);
-
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
- glpi(buffer, &return_length);
-
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
- DWORD byte_offset = 0;
- while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
- {
- if (ptr->Relationship == RelationProcessorCore)
- {
- count++;
- }
-
- byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
- ptr++;
- }
-
- free(buffer);
- #elif defined __ANDROID__ || defined __linux__
- std::vector<ncnn::CpuSet> thread_set;
- for (int i = 0; i < g_cpucount; i++)
- {
- ncnn::CpuSet thread_siblings;
- get_thread_siblings(i, thread_siblings);
- if (thread_siblings.is_empty())
- {
- // ignore malformed one
- continue;
- }
-
- bool thread_siblings_exists = false;
- for (size_t j = 0; j < thread_set.size(); j++)
- {
- // Compare CpuSets by checking if they have the same enabled CPUs
- bool same = true;
- int max_cpu = std::max(thread_siblings.max_cpu_id(), thread_set[j].max_cpu_id());
- for (int k = 0; k <= max_cpu; k++)
- {
- if (thread_siblings.is_enabled(k) != thread_set[j].is_enabled(k))
- {
- same = false;
- break;
- }
- }
- if (same)
- {
- thread_siblings_exists = true;
- break;
- }
- }
-
- if (!thread_siblings_exists)
- {
- thread_set.push_back(thread_siblings);
- count++;
- }
- }
- if (count == 0)
- {
- // cannot resolve siblings, fallback to all cpu count
- count = g_cpucount;
- }
- #elif __APPLE__
- size_t len = sizeof(count);
- sysctlbyname("hw.physicalcpu_max", &count, &len, NULL, 0);
- #else
- count = g_cpucount;
- #endif
-
- if (count > g_cpucount)
- count = g_cpucount;
-
- return count;
- }
-
- #if defined __ANDROID__ || defined __linux__
- static int get_data_cache_size(int cpuid, int level)
- {
- char path[256];
-
- // discover sysfs cache entry
- int indexid = -1;
- for (int i = 0;; i++)
- {
- // check level
- {
- sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpuid, i);
- FILE* fp = fopen(path, "rb");
- if (!fp)
- break;
-
- int cache_level = -1;
- int nscan = fscanf(fp, "%d", &cache_level);
- fclose(fp);
- if (nscan != 1 || cache_level != level)
- continue;
- }
-
- // check type
- {
- sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/type", cpuid, i);
- FILE* fp = fopen(path, "rb");
- if (!fp)
- break;
-
- char type[32];
- int nscan = fscanf(fp, "%31s", type);
- fclose(fp);
- if (nscan != 1 || (strcmp(type, "Data") != 0 && strcmp(type, "Unified") != 0))
- continue;
- }
-
- indexid = i;
- break;
- }
-
- if (indexid == -1)
- {
- // no sysfs entry
- return 0;
- }
-
- // get size
- int cache_size_K = 0;
- {
- sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpuid, indexid);
- FILE* fp = fopen(path, "rb");
- if (!fp)
- return 0;
-
- int nscan = fscanf(fp, "%dK", &cache_size_K);
- fclose(fp);
- if (nscan != 1)
- {
- NCNN_LOGE("fscanf cache_size_K error %d", nscan);
- return 0;
- }
- }
-
- // parse shared_cpu_map mask
- ncnn::CpuSet shared_cpu_map;
- {
- sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map", cpuid, indexid);
- FILE* fp = fopen(path, "rb");
- if (!fp)
- return 0;
-
- char shared_cpu_map_str[256];
- int nscan = fscanf(fp, "%255s", shared_cpu_map_str);
- fclose(fp);
- if (nscan != 1)
- {
- NCNN_LOGE("fscanf shared_cpu_map error %d", nscan);
- return 0;
- }
-
- int len = strlen(shared_cpu_map_str);
-
- if (shared_cpu_map_str[0] == '0' && shared_cpu_map_str[1] == 'x')
- {
- // skip leading 0x
- len -= 2;
- }
-
- int ci = 0;
- for (int i = len - 1; i >= 0; i--)
- {
- char c = shared_cpu_map_str[i];
- int hex_val = 0;
-
- // Convert hex character to value
- if (c >= '0' && c <= '9')
- hex_val = c - '0';
- else if (c >= 'a' && c <= 'f')
- hex_val = c - 'a' + 10;
- else if (c >= 'A' && c <= 'F')
- hex_val = c - 'A' + 10;
- else
- continue; // Skip invalid characters
-
- // Set bits according to hex value
- if (hex_val & 1) shared_cpu_map.enable(ci + 0);
- if (hex_val & 2) shared_cpu_map.enable(ci + 1);
- if (hex_val & 4) shared_cpu_map.enable(ci + 2);
- if (hex_val & 8) shared_cpu_map.enable(ci + 3);
-
- ci += 4;
- }
- }
-
- if (shared_cpu_map.num_enabled() == 1)
- return cache_size_K * 1024;
-
- // resolve physical cpu count in the shared_cpu_map
- int shared_physical_cpu_count = 0;
- {
- std::vector<ncnn::CpuSet> thread_set;
- for (int i = 0; i < g_cpucount; i++)
- {
- if (!shared_cpu_map.is_enabled(i))
- continue;
-
- ncnn::CpuSet thread_siblings;
- get_thread_siblings(i, thread_siblings);
- if (thread_siblings.is_empty())
- {
- // ignore malformed one
- continue;
- }
-
- bool thread_siblings_exists = false;
- for (size_t j = 0; j < thread_set.size(); j++)
- {
- // Compare CpuSets by checking if they have the same enabled CPUs
- bool same = true;
- int max_cpu = std::max(thread_siblings.max_cpu_id(), thread_set[j].max_cpu_id());
- for (int k = 0; k <= max_cpu; k++)
- {
- if (thread_siblings.is_enabled(k) != thread_set[j].is_enabled(k))
- {
- same = false;
- break;
- }
- }
- if (same)
- {
- thread_siblings_exists = true;
- break;
- }
- }
-
- if (!thread_siblings_exists)
- {
- thread_set.push_back(thread_siblings);
- shared_physical_cpu_count++;
- }
- }
- }
-
- // return per-physical-core cache size with 4K aligned
- cache_size_K = (cache_size_K / shared_physical_cpu_count + 3) / 4 * 4;
-
- return cache_size_K * 1024;
- }
-
- static int get_big_cpu_data_cache_size(int level)
- {
- if (g_cpu_affinity_mask_big.num_enabled() == 0)
- {
- // smp cpu
- return get_data_cache_size(0, level);
- }
-
- for (int i = 0; i < g_cpucount; i++)
- {
- if (g_cpu_affinity_mask_big.is_enabled(i))
- {
- return get_data_cache_size(i, level);
- }
- }
-
- // should never reach here, fallback to cpu0
- return get_data_cache_size(0, level);
- }
- #endif // defined __ANDROID__ || defined __linux__
-
- static int get_cpu_level2_cachesize()
- {
- int size = 0;
- #if defined _WIN32
- typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
- LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
- if (glpi != NULL)
- {
- DWORD return_length = 0;
- glpi(NULL, &return_length);
-
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
- glpi(buffer, &return_length);
-
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
- DWORD byte_offset = 0;
- while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
- {
- if (ptr->Relationship == RelationCache)
- {
- PCACHE_DESCRIPTOR Cache = &ptr->Cache;
- if (Cache->Level == 2)
- {
- size = std::max(size, (int)Cache->Size);
- }
- }
-
- byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
- ptr++;
- }
-
- free(buffer);
- }
- #elif defined __ANDROID__ || defined __linux__
- size = get_big_cpu_data_cache_size(2);
- #if defined(_SC_LEVEL2_CACHE_SIZE)
- if (size <= 0)
- size = sysconf(_SC_LEVEL2_CACHE_SIZE);
- #endif
- #elif __APPLE__
- // perflevel 0 is the higher performance cluster
- int cpusperl2 = get_hw_capability("hw.perflevel0.cpusperl2");
- int l2cachesize = get_hw_capability("hw.perflevel0.l2cachesize");
- size = cpusperl2 > 1 ? l2cachesize / cpusperl2 : l2cachesize;
- #endif
-
- // fallback to a common value
- if (size <= 0)
- {
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- size = 64 * 1024;
- if (g_cpu_support_x86_avx)
- size = 128 * 1024;
- if (g_cpu_support_x86_avx2)
- size = 256 * 1024;
- if (g_cpu_support_x86_avx512)
- size = 1024 * 1024;
- #elif __aarch64__
- size = 256 * 1024;
- #elif __arm__
- size = 128 * 1024;
- #else
- // is 64k still too large here ?
- size = 64 * 1024;
- #endif
- }
-
- return size;
- }
-
- static int get_cpu_level3_cachesize()
- {
- int size = 0;
- #if defined _WIN32
- typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
- LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
- if (glpi != NULL)
- {
- DWORD return_length = 0;
- glpi(NULL, &return_length);
-
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
- glpi(buffer, &return_length);
-
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
- DWORD byte_offset = 0;
- while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
- {
- if (ptr->Relationship == RelationCache)
- {
- PCACHE_DESCRIPTOR Cache = &ptr->Cache;
- if (Cache->Level == 3)
- {
- size = std::max(size, (int)Cache->Size);
- }
- }
-
- byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
- ptr++;
- }
-
- free(buffer);
- }
- #elif defined __ANDROID__ || defined __linux__
- size = get_big_cpu_data_cache_size(3);
- #if defined(_SC_LEVEL3_CACHE_SIZE)
- if (size <= 0)
- size = sysconf(_SC_LEVEL3_CACHE_SIZE);
- #endif
- #elif __APPLE__
- // perflevel 0 is the higher performance cluster
- // get the size shared among all cpus
- size = get_hw_capability("hw.perflevel0.l3cachesize");
- #endif
-
- // l3 cache size can be zero
-
- return size;
- }
-
- #if defined _WIN32
- static ncnn::CpuSet get_smt_cpu_mask()
- {
- ncnn::CpuSet smt_cpu_mask;
-
- typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
- LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
- if (glpi == NULL)
- {
- NCNN_LOGE("GetLogicalProcessorInformation is not supported");
- return smt_cpu_mask;
- }
-
- DWORD return_length = 0;
- glpi(NULL, &return_length);
-
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
- glpi(buffer, &return_length);
-
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
- DWORD byte_offset = 0;
- while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
- {
- if (ptr->Relationship == RelationProcessorCore)
- {
- ncnn::CpuSet smt_set;
- smt_set.set_legacy_mask(ptr->ProcessorMask);
- if (smt_set.num_enabled() > 1)
- {
- // this core is smt - merge with existing smt_cpu_mask
- for (int i = 0; i < 64; i++) // ProcessorMask is limited to 64 bits
- {
- if (smt_set.is_enabled(i))
- {
- smt_cpu_mask.enable(i);
- }
- }
- }
- }
-
- byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
- ptr++;
- }
-
- free(buffer);
-
- return smt_cpu_mask;
- }
-
- static std::vector<int> get_max_freq_mhz()
- {
- typedef struct _PROCESSOR_POWER_INFORMATION
- {
- ULONG Number;
- ULONG MaxMhz;
- ULONG CurrentMhz;
- ULONG MhzLimit;
- ULONG MaxIdleState;
- ULONG CurrentIdleState;
- } PROCESSOR_POWER_INFORMATION, *PPROCESSOR_POWER_INFORMATION;
-
- HMODULE powrprof = LoadLibrary(TEXT("powrprof.dll"));
-
- typedef LONG(WINAPI * LPFN_CNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG);
- LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(powrprof, "CallNtPowerInformation");
- if (cnpi == NULL)
- {
- NCNN_LOGE("CallNtPowerInformation is not supported");
- FreeLibrary(powrprof);
- return std::vector<int>(g_cpucount, 0);
- }
-
- DWORD return_length = sizeof(PROCESSOR_POWER_INFORMATION) * g_cpucount;
- PPROCESSOR_POWER_INFORMATION buffer = (PPROCESSOR_POWER_INFORMATION)malloc(return_length);
-
- cnpi(ProcessorInformation, NULL, 0, buffer, return_length);
-
- std::vector<int> ret;
- for (int i = 0; i < g_cpucount; i++)
- {
- ULONG max_mhz = buffer[i].MaxMhz;
- ret.push_back(max_mhz);
- }
-
- free(buffer);
- FreeLibrary(powrprof);
- return ret;
- }
-
- static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
- {
- // Check if we can use the legacy method (<=64 CPUs)
- int max_cpu = thread_affinity_mask.max_cpu_id();
- if (max_cpu < 64)
- {
- ULONG_PTR legacy_mask = thread_affinity_mask.get_legacy_mask();
- if (legacy_mask != 0)
- {
- DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), legacy_mask);
- if (prev_mask == 0)
- {
- NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
- return -1;
- }
- return 0;
- }
- }
-
- // For >64 CPU support, use SetThreadGroupAffinity
- // Windows organizes CPUs into groups of 64
- typedef BOOL(WINAPI * LPFN_STGA)(HANDLE, const GROUP_AFFINITY*, GROUP_AFFINITY*);
-
- HMODULE kernel32 = GetModuleHandle(TEXT("kernel32.dll"));
- if (!kernel32)
- {
- NCNN_LOGE("Failed to get kernel32.dll handle");
- return -1;
- }
-
- LPFN_STGA SetThreadGroupAffinityFunc = (LPFN_STGA)GetProcAddress(kernel32, "SetThreadGroupAffinity");
- if (!SetThreadGroupAffinityFunc)
- {
- NCNN_LOGE("SetThreadGroupAffinity not available, >64 CPU affinity not supported");
- return -1;
- }
-
- // Find the first enabled CPU and set affinity to its group
- // This is a simplified implementation - ideally we'd handle multiple groups
- for (int cpu = 0; cpu <= max_cpu; cpu++)
- {
- if (thread_affinity_mask.is_enabled(cpu))
- {
- GROUP_AFFINITY group_affinity = {0};
- group_affinity.Group = (WORD)(cpu / 64);
- group_affinity.Mask = 1ULL << (cpu % 64);
-
- // Add other CPUs in the same group
- for (int other_cpu = cpu + 1; other_cpu <= max_cpu && other_cpu < (group_affinity.Group + 1) * 64; other_cpu++)
- {
- if (thread_affinity_mask.is_enabled(other_cpu))
- {
- group_affinity.Mask |= 1ULL << (other_cpu % 64);
- }
- }
-
- GROUP_AFFINITY prev_affinity;
- if (!SetThreadGroupAffinityFunc(GetCurrentThread(), &group_affinity, &prev_affinity))
- {
- NCNN_LOGE("SetThreadGroupAffinity failed %d", GetLastError());
- return -1;
- }
-
- return 0;
- }
- }
-
- NCNN_LOGE("No CPUs enabled in affinity mask");
- return -1;
- }
- #endif // defined _WIN32
-
- #if defined __ANDROID__ || defined __linux__
- static int get_max_freq_khz(int cpuid)
- {
- // first try, for all possible cpu
- char path[256];
- sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
-
- FILE* fp = fopen(path, "rb");
-
- if (!fp)
- {
- // second try, for online cpu
- sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
- fp = fopen(path, "rb");
-
- if (fp)
- {
- int max_freq_khz = 0;
- while (!feof(fp))
- {
- int freq_khz = 0;
- int nscan = fscanf(fp, "%d %*d", &freq_khz);
- if (nscan != 1)
- break;
-
- if (freq_khz > max_freq_khz)
- max_freq_khz = freq_khz;
- }
-
- fclose(fp);
-
- if (max_freq_khz != 0)
- return max_freq_khz;
-
- fp = NULL;
- }
-
- if (!fp)
- {
- // third try, for online cpu
- sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
- fp = fopen(path, "rb");
-
- if (!fp)
- return -1;
-
- int max_freq_khz = -1;
- int nscan = fscanf(fp, "%d", &max_freq_khz);
- if (nscan != 1)
- {
- NCNN_LOGE("fscanf cpuinfo_max_freq error %d", nscan);
- }
- fclose(fp);
-
- return max_freq_khz;
- }
- }
-
- int max_freq_khz = 0;
- while (!feof(fp))
- {
- int freq_khz = 0;
- int nscan = fscanf(fp, "%d %*d", &freq_khz);
- if (nscan != 1)
- break;
-
- if (freq_khz > max_freq_khz)
- max_freq_khz = freq_khz;
- }
-
- fclose(fp);
-
- return max_freq_khz;
- }
-
// Tell whether the given cpu shares its physical core with other hardware threads.
// The sibling list of a non-smt core holds a single cpu number, so any ','
// or '-' separator in the list means more than one thread lives on the core.
// Returns false when neither topology file can be opened.
static bool is_smt_cpu(int cpuid)
{
    // https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-72
    char path[256];
    sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/core_cpus_list", cpuid);

    FILE* fp = fopen(path, "rb");

    if (!fp)
    {
        // older name of the same list
        sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid);
        fp = fopen(path, "rb");

        if (!fp)
            return false;
    }

    bool is_smt = false;

    // fgetc returns EOF both at end of file and on a read error, so testing
    // its int result terminates the loop in either case
    int ch;
    while ((ch = fgetc(fp)) != EOF)
    {
        if (ch == ',' || ch == '-')
        {
            is_smt = true;
            break;
        }
    }

    fclose(fp);

    return is_smt;
}
-
- static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
- {
- // set affinity for thread
- #if defined(__BIONIC__) && !defined(__OHOS__)
- pid_t pid = gettid();
- #else
- pid_t pid = syscall(SYS_gettid);
- #endif
-
- const cpu_set_t* cpuset = thread_affinity_mask.get_cpu_set();
- if (!cpuset)
- {
- NCNN_LOGE("Failed to get cpu_set from CpuSet");
- return -1;
- }
-
- int syscallret = syscall(__NR_sched_setaffinity, pid, CPU_ALLOC_SIZE(CPU_SETSIZE), cpuset);
- if (syscallret)
- {
- NCNN_LOGE("syscall error %d", syscallret);
- return -1;
- }
-
- return 0;
- }
- #endif // defined __ANDROID__ || defined __linux__
-
- #if __APPLE__
- static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
- {
- // https://developer.apple.com/library/archive/releasenotes/Performance/RN-AffinityAPI/index.html
- // http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html
- // https://gist.github.com/Coneko/4234842
-
- // This is a quite outdated document. Apple will not allow developers to set CPU affinity.
- // In OS X 10.5 it worked, later it became a suggestion to OS X, then in 10.10 or so (as well in later ones), macOS will ignore any affinity settings.
- // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919 --- AmeAkio
-
- int affinity_tag = THREAD_AFFINITY_TAG_NULL;
- int max_cpu = thread_affinity_mask.max_cpu_id();
- for (int i = 0; i <= max_cpu && i < 32; i++) // Apple policy is limited to 32 bits
- {
- if (thread_affinity_mask.is_enabled(i))
- {
- affinity_tag = i + 1;
- break;
- }
- }
-
- mach_port_t tid = pthread_mach_thread_np(pthread_self());
-
- thread_affinity_policy_data_t policy_data;
- policy_data.affinity_tag = affinity_tag;
- int ret = thread_policy_set(tid, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy_data, THREAD_AFFINITY_POLICY_COUNT);
- if (ret && ret != KERN_NOT_SUPPORTED)
- {
- NCNN_LOGE("thread_policy_set error %d", ret);
- return -1;
- }
-
- return 0;
- }
- #endif // __APPLE__
-
- static void initialize_cpu_thread_affinity_mask(ncnn::CpuSet& mask_all, ncnn::CpuSet& mask_little, ncnn::CpuSet& mask_big)
- {
- mask_all.disable_all();
- for (int i = 0; i < g_cpucount; i++)
- {
- mask_all.enable(i);
- }
-
- #if defined _WIN32
- // Check SDK >= Win7
- #if _WIN32_WINNT >= _WIN32_WINNT_WIN7 // win7
-
- // Load GetLogicalProcessorInformationEx
- HMODULE kernel32 = LoadLibrary(TEXT("kernel32.dll"));
- if (!kernel32)
- {
- NCNN_LOGE("LoadLibrary kernel32.dll failed");
- return;
- }
-
- typedef BOOL(WINAPI * LPFN_GLPIE)(LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
- LPFN_GLPIE glpie = (LPFN_GLPIE)GetProcAddress(kernel32, "GetLogicalProcessorInformationEx");
-
- if (glpie != NULL)
- {
- DWORD bufferSize = 0;
- glpie(RelationProcessorCore, nullptr, &bufferSize);
- std::vector<BYTE> buffer(bufferSize);
- if (!glpie(RelationProcessorCore, (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)(buffer.data()), &bufferSize))
- {
- NCNN_LOGE("GetLogicalProcessorInformationEx failed");
- return;
- }
-
- // A map from processor number to whether it is an E core
- std::vector<std::pair<DWORD, bool> > processorCoreType;
- BYTE maxEfficiencyClass = 0; // In a system without E cores, all cores EfficiencyClass is 0
-
- BYTE* ptr = buffer.data();
- while (ptr < buffer.data() + bufferSize)
- {
- SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)ptr;
- if (info->Relationship == RelationProcessorCore)
- {
- // Mingw and some old MSVC do not have EfficiencyClass in PROCESSOR_RELATIONSHIP
- // So we should redefine PROCESSOR_RELATIONSHIP
- // Because ncnn need to support c++98, so we can't use some new features in c++11
- // So there is a ugly implementation
-
- BYTE efficiencyClass = ((BYTE*)&info->Processor)[1];
-
- bool isECore = (efficiencyClass == 0);
- maxEfficiencyClass = (std::max)(maxEfficiencyClass, efficiencyClass);
-
- for (WORD g = 0; g < info->Processor.GroupCount; ++g)
- {
- const GROUP_AFFINITY& ga = info->Processor.GroupMask[g];
- KAFFINITY mask = ga.Mask;
- WORD group = ga.Group;
- for (int bit = 0; bit < 64; ++bit)
- { // for each bit in the mask
- if (mask & (static_cast<KAFFINITY>(1) << bit))
- {
- DWORD processorNumber = group * 64 + bit;
- processorCoreType.push_back(std::pair<DWORD, bool>(processorNumber, isECore));
- }
- }
- }
- }
- ptr += info->Size;
- }
-
- if (maxEfficiencyClass == 0)
- {
- // All cores are P cores
- mask_little.disable_all();
- mask_big = mask_all;
- }
- else
- {
- for (int i = 0; i < g_cpucount; i++)
- {
- bool isECore = false;
- for (int j = 0; j < processorCoreType.size(); j++)
- {
- std::pair<DWORD, bool> p = processorCoreType[j];
- if (p.first == i)
- {
- isECore = p.second;
- break;
- }
- }
- // fprintf(stderr, "processor %d is %s\n", i, isECore ? "E" : "P");
-
- if (isECore)
- {
- mask_little.enable(i);
- }
- else
- {
- mask_big.enable(i);
- }
- }
- }
- }
- else
- #endif
- {
- // get max freq mhz for all cores
- int max_freq_mhz_min = INT_MAX;
- int max_freq_mhz_max = 0;
- std::vector<int> cpu_max_freq_mhz = get_max_freq_mhz();
- for (int i = 0; i < g_cpucount; i++)
- {
- int max_freq_mhz = cpu_max_freq_mhz[i];
-
- // NCNN_LOGE("%d max freq = %d khz", i, max_freq_mhz);
-
- if (max_freq_mhz > max_freq_mhz_max)
- max_freq_mhz_max = max_freq_mhz;
- if (max_freq_mhz < max_freq_mhz_min)
- max_freq_mhz_min = max_freq_mhz;
- }
-
- int max_freq_mhz_medium = (max_freq_mhz_min + max_freq_mhz_max) / 2;
- if (max_freq_mhz_medium == max_freq_mhz_max)
- {
- mask_little.disable_all();
- mask_big = mask_all;
- return;
- }
-
- ncnn::CpuSet smt_cpu_mask = get_smt_cpu_mask();
-
- for (int i = 0; i < g_cpucount; i++)
- {
- if (smt_cpu_mask.is_enabled(i))
- {
- // always treat smt core as big core
- mask_big.enable(i);
- continue;
- }
-
- if (cpu_max_freq_mhz[i] < max_freq_mhz_medium)
- mask_little.enable(i);
- else
- mask_big.enable(i);
- }
- }
- #elif defined __ANDROID__ || defined __linux__
- int max_freq_khz_min = INT_MAX;
- int max_freq_khz_max = 0;
- std::vector<int> cpu_max_freq_khz(g_cpucount);
- for (int i = 0; i < g_cpucount; i++)
- {
- int max_freq_khz = get_max_freq_khz(i);
-
- // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz);
-
- cpu_max_freq_khz[i] = max_freq_khz;
-
- if (max_freq_khz > max_freq_khz_max)
- max_freq_khz_max = max_freq_khz;
- if (max_freq_khz < max_freq_khz_min)
- max_freq_khz_min = max_freq_khz;
- }
-
- int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2;
- if (max_freq_khz_medium == max_freq_khz_max)
- {
- mask_little.disable_all();
- mask_big = mask_all;
- return;
- }
-
- for (int i = 0; i < g_cpucount; i++)
- {
- if (is_smt_cpu(i))
- {
- // always treat smt core as big core
- mask_big.enable(i);
- continue;
- }
-
- if (cpu_max_freq_khz[i] < max_freq_khz_medium)
- mask_little.enable(i);
- else
- mask_big.enable(i);
- }
- #elif __APPLE__
- int nperflevels = get_hw_capability("hw.nperflevels");
- if (nperflevels == 1)
- {
- // smp models
- mask_little.disable_all();
- mask_big = mask_all;
- }
- else
- {
- // two or more clusters, level0 is the high-performance cluster
- int perflevel0_logicalcpu = get_hw_capability("hw.perflevel0.logicalcpu_max");
- for (int i = 0; i < perflevel0_logicalcpu; i++)
- {
- mask_big.enable(i);
- }
- for (int i = perflevel0_logicalcpu; i < g_cpucount; i++)
- {
- mask_little.enable(i);
- }
- }
- #else
- // TODO implement me for other platforms
- mask_little.disable_all();
- mask_big = mask_all;
- #endif
- }
-
- #if defined __ANDROID__ || defined __linux__
- #if __aarch64__
// View of a 32bit midr value, either as the raw word or as its bit-fields.
// The fields are declared from the lowest field (revision) to the highest
// (implementer) and together cover exactly 32 bits.
union midr_info_t
{
    struct __attribute__((packed))
    {
        unsigned int revision : 4;
        unsigned int part : 12;
        unsigned int architecture : 4;
        unsigned int variant : 4;
        unsigned int implementer : 8;
    };
    unsigned int midr;

    // wrap an already assembled midr word
    midr_info_t(unsigned int raw_midr)
        : midr(raw_midr)
    {
    }
};
-
// Read the midr_el1 value the kernel exposes for the given cpu in sysfs.
// Returns 0 when the file cannot be opened or holds no hexadecimal number.
static unsigned int get_midr_from_sysfs(int cpuid)
{
    char path[256];
    sprintf(path, "/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1", cpuid);

    FILE* fp = fopen(path, "rb");
    if (fp == 0)
        return 0;

    // stays 0 when nothing can be parsed, a parse failure is deliberately ignored
    unsigned int value = 0;
    if (fscanf(fp, "%x", &value) != 1)
    {
        // ignore
    }

    fclose(fp);

    return value;
}
-
- static int get_midr_from_proc_cpuinfo(std::vector<unsigned int>& midrs)
- {
- FILE* fp = fopen("/proc/cpuinfo", "rb");
- if (!fp)
- return -1;
-
- midrs.resize(g_cpucount, 0);
-
- int cpuid = -1;
- midr_info_t midr_info(0);
-
- char line[1024];
- while (!feof(fp))
- {
- char* s = fgets(line, 1024, fp);
- if (!s)
- break;
-
- if (memcmp(line, "processor", 9) == 0)
- {
- // processor : 4
- int id = -1;
- int nscan = sscanf(line, "%*[^:]: %d", &id);
- if (nscan != 1)
- continue;
-
- if (cpuid >= 0 && cpuid < g_cpucount)
- {
- if (midr_info.midr == 0)
- {
- // shared midr
- midrs[cpuid] = (unsigned int)-1;
- }
- else
- {
- // save midr and reset
- midrs[cpuid] = midr_info.midr;
- for (int i = 0; i < g_cpucount; i++)
- {
- if (midrs[i] == (unsigned int)-1)
- midrs[i] = midr_info.midr;
- }
- }
-
- midr_info.midr = 0;
- }
-
- cpuid = id;
- }
-
- if (cpuid == -1)
- continue;
-
- if (memcmp(line, "CPU implementer", 15) == 0)
- {
- // CPU implementer : 0x51
- unsigned int id = 0;
- int nscan = sscanf(line, "%*[^:]: %x", &id);
- if (nscan != 1)
- continue;
-
- midr_info.implementer = id;
- }
- else if (memcmp(line, "CPU architecture", 16) == 0)
- {
- // CPU architecture: 8
- int id = 0;
- int nscan = sscanf(line, "%*[^:]: %d", &id);
- if (nscan != 1)
- continue;
-
- midr_info.architecture = id;
- }
- else if (memcmp(line, "CPU variant", 11) == 0)
- {
- // CPU variant : 0xd
- int id = 0;
- int nscan = sscanf(line, "%*[^:]: %x", &id);
- if (nscan != 1)
- continue;
-
- midr_info.variant = id;
- }
- else if (memcmp(line, "CPU part", 8) == 0)
- {
- // CPU part : 0x804
- int id = 0;
- int nscan = sscanf(line, "%*[^:]: %x", &id);
- if (nscan != 1)
- continue;
-
- midr_info.part = id;
- }
- else if (memcmp(line, "CPU revision", 12) == 0)
- {
- // CPU revision : 14
- int id = 0;
- int nscan = sscanf(line, "%*[^:]: %d", &id);
- if (nscan != 1)
- continue;
-
- midr_info.revision = id;
- }
- }
-
- fclose(fp);
-
- if (cpuid >= 0 && cpuid < g_cpucount)
- {
- if (midr_info.midr == 0)
- {
- // shared midr
- midrs[cpuid] = (unsigned int)-1;
- }
- else
- {
- // save midr and reset
- midrs[cpuid] = midr_info.midr;
- for (int i = 0; i < g_cpucount; i++)
- {
- if (midrs[i] == (unsigned int)-1)
- midrs[i] = midr_info.midr;
- }
- }
-
- midr_info.midr = 0;
- }
-
- // /proc/cpuinfo may only report little/online cores on old kernel
- if (g_cpu_affinity_mask_big.num_enabled() == g_cpucount)
- {
- // assign the remaining unknown midrs for smp cpu
- for (int i = 0; i < g_cpucount; i++)
- {
- if (midrs[i] == 0)
- midrs[i] = midr_info.midr;
- }
- }
- else
- {
- // clear the big core midrs for hmp cpu if they are the same as little cores
- unsigned int little_midr = 0;
- for (int i = 0; i < g_cpucount; i++)
- {
- if (g_cpu_affinity_mask_little.is_enabled(i))
- {
- little_midr = midrs[i];
- break;
- }
- }
-
- for (int i = 0; i < g_cpucount; i++)
- {
- if (g_cpu_affinity_mask_big.is_enabled(i))
- {
- if (midrs[i] == little_midr)
- {
- midrs[i] = 0;
- }
- }
- }
- }
-
- return 0;
- }
-
- // return midr for the current running core
// Read MIDR_EL1 with an mrs instruction on the core that executes the call
// and return the low 32 bits of the value.
static unsigned int get_midr_from_register()
{
    uint64_t raw_midr_el1;
    asm volatile("mrs %0, MIDR_EL1"
                 : "=r"(raw_midr_el1));

    return (unsigned int)raw_midr_el1;
}
-
- static int get_sched_affinity(ncnn::CpuSet& thread_affinity_mask)
- {
- // get affinity for thread
- #if defined(__BIONIC__) && !defined(__OHOS__)
- pid_t pid = gettid();
- #else
- pid_t pid = syscall(SYS_gettid);
- #endif
-
- thread_affinity_mask.disable_all();
-
- // Allocate a temporary cpu_set_t for the syscall
- cpu_set_t* temp_cpuset = CPU_ALLOC(CPU_SETSIZE);
- if (!temp_cpuset)
- {
- return -1;
- }
-
- int syscallret = syscall(__NR_sched_getaffinity, pid, CPU_ALLOC_SIZE(CPU_SETSIZE), temp_cpuset);
- if (syscallret)
- {
- CPU_FREE(temp_cpuset);
- // handle get error silently
- return -1;
- }
-
- // Copy the result to our CpuSet
- thread_affinity_mask.set_cpu_set(temp_cpuset);
- CPU_FREE(temp_cpuset);
-
- return 0;
- }
-
- static int midr_is_a53_a55(unsigned int midr)
- {
- // 0x 41 ? f d03 ? = arm cortex-a53
- // 0x 51 ? f 801 ? = qcom kryo200 a53
- // 0x 41 ? f d04 ? = arm cortex-a35
- // 0x 41 ? f d05 ? = arm cortex-a55
- // 0x 51 ? f 803 ? = qcom kryo300 a55
- // 0x 51 ? f 805 ? = qcom kryo400 a55
-
- midr_info_t midr_info(midr);
-
- return (midr_info.implementer == 0x41 && midr_info.part == 0xd03)
- || (midr_info.implementer == 0x51 && midr_info.part == 0x801)
- || (midr_info.implementer == 0x41 && midr_info.part == 0xd04)
- || (midr_info.implementer == 0x41 && midr_info.part == 0xd05)
- || (midr_info.implementer == 0x51 && midr_info.part == 0x803)
- || (midr_info.implementer == 0x51 && midr_info.part == 0x805);
- }
-
- static int detect_cpu_is_arm_a53_a55()
- {
- int a53_a55_cpu_count = 0;
-
- // first try, iterate /sys/devices/system/cpu/cpuX/regs/identification/midr_el1
- bool sysfs_midr = true;
- for (int i = 0; i < g_cpucount; i++)
- {
- unsigned int midr = 0;
-
- // for kernel 4.7+
- midr = get_midr_from_sysfs(i);
- if (midr == 0)
- {
- sysfs_midr = false;
- break;
- }
-
- if (midr_is_a53_a55(midr))
- {
- a53_a55_cpu_count++;
- }
- }
-
- if (!sysfs_midr)
- {
- // second try, collect midr from /proc/cpuinfo
- std::vector<unsigned int> midrs;
- int ret = get_midr_from_proc_cpuinfo(midrs);
- if (ret == 0 && (int)midrs.size() == g_cpucount)
- {
- for (int i = 0; i < g_cpucount; i++)
- {
- if (midr_is_a53_a55(midrs[i]))
- {
- a53_a55_cpu_count++;
- }
- }
- }
- else
- {
- // third try, assume all aarch64 little cores are a53/a55
- a53_a55_cpu_count = g_cpu_affinity_mask_little.num_enabled();
- }
- }
-
- if (a53_a55_cpu_count == 0)
- return 0; // all non a53/a55
-
- if (a53_a55_cpu_count == g_cpucount)
- return 1; // all a53/a55
-
- // little cores are a53/a55
- return 2;
- }
- #endif // __aarch64__
- #endif // defined __ANDROID__ || defined __linux__
-
- // the initialization
- static void initialize_global_cpu_info()
- {
- #if defined(_OPENMP) && (__clang__ || defined(_OPENMP_LLVM_RUNTIME))
- ncnn_kmp_env_initializer();
- #endif
-
- g_cpucount = get_cpucount();
- g_physical_cpucount = get_physical_cpucount();
- g_powersave = 0;
-
- // Set global max CPU count for CpuSet optimization
- g_max_cpu_count = g_cpucount;
-
- initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big);
-
- #if (defined _WIN32 && (__aarch64__ || __arm__)) || ((defined __ANDROID__ || defined __linux__) && __riscv)
- if (!is_being_debugged())
- {
- ruapu_init();
- }
- #endif
-
- #if defined _WIN32
- #if __aarch64__
- g_cpu_support_arm_cpuid = ruapu_supports("cpuid");
- g_cpu_support_arm_asimdhp = ruapu_supports("asimdhp") || IsProcessorFeaturePresent(43); // dp implies hp
- g_cpu_support_arm_asimddp = ruapu_supports("asimddp") || IsProcessorFeaturePresent(43); // 43 is PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
- g_cpu_support_arm_asimdfhm = ruapu_supports("asimdfhm");
- g_cpu_support_arm_bf16 = ruapu_supports("bf16");
- g_cpu_support_arm_i8mm = ruapu_supports("i8mm");
- g_cpu_support_arm_sve = ruapu_supports("sve");
- g_cpu_support_arm_sve2 = ruapu_supports("sve2");
- g_cpu_support_arm_svebf16 = ruapu_supports("svebf16");
- g_cpu_support_arm_svei8mm = ruapu_supports("svei8mm");
- g_cpu_support_arm_svef32mm = ruapu_supports("svef32mm");
- #elif __arm__
- g_cpu_support_arm_edsp = ruapu_supports("edsp");
- g_cpu_support_arm_neon = 1; // all modern windows arm devices have neon
- g_cpu_support_arm_vfpv4 = ruapu_supports("vfpv4");
- #endif // __aarch64__ || __arm__
- #elif defined __ANDROID__ || defined __linux__
- g_hwcaps = get_elf_hwcap(AT_HWCAP);
- g_hwcaps2 = get_elf_hwcap(AT_HWCAP2);
- #elif __APPLE__
- g_hw_cpufamily = get_hw_cpufamily();
- g_hw_cputype = get_hw_cputype();
- g_hw_cpusubtype = get_hw_cpusubtype();
- #if __aarch64__
- g_hw_optional_arm_FEAT_FP16 = get_hw_capability("hw.optional.arm.FEAT_FP16");
- g_hw_optional_arm_FEAT_DotProd = get_hw_capability("hw.optional.arm.FEAT_DotProd");
- g_hw_optional_arm_FEAT_FHM = get_hw_capability("hw.optional.arm.FEAT_FHM");
- g_hw_optional_arm_FEAT_BF16 = get_hw_capability("hw.optional.arm.FEAT_BF16");
- g_hw_optional_arm_FEAT_I8MM = get_hw_capability("hw.optional.arm.FEAT_I8MM");
-
- switch (g_hw_cpufamily)
- {
- case CPUFAMILY_ARM_TUPAI:
- case CPUFAMILY_ARM_TAHITI:
- case CPUFAMILY_ARM_DONAN:
- case CPUFAMILY_ARM_BRAVA:
- // TODO check sve sme
- case CPUFAMILY_ARM_AVALANCHE_BLIZZARD:
- case CPUFAMILY_ARM_EVEREST_SAWTOOTH:
- case CPUFAMILY_ARM_COLL:
- case CPUFAMILY_ARM_IBIZA:
- case CPUFAMILY_ARM_LOBOS:
- case CPUFAMILY_ARM_PALMA:
- g_hw_optional_arm_FEAT_BF16 = 1;
- g_hw_optional_arm_FEAT_I8MM = 1;
- case CPUFAMILY_ARM_LIGHTNING_THUNDER:
- case CPUFAMILY_ARM_FIRESTORM_ICESTORM:
- g_hw_optional_arm_FEAT_DotProd = 1;
- g_hw_optional_arm_FEAT_FHM = 1;
- case CPUFAMILY_ARM_MONSOON_MISTRAL:
- case CPUFAMILY_ARM_VORTEX_TEMPEST:
- g_hw_optional_arm_FEAT_FP16 = 1;
- default:
- break;
- }
- #endif // __aarch64__
- #endif
-
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- g_cpu_support_x86_avx = get_cpu_support_x86_avx();
- g_cpu_support_x86_fma = get_cpu_support_x86_fma();
- g_cpu_support_x86_xop = get_cpu_support_x86_xop();
- g_cpu_support_x86_f16c = get_cpu_support_x86_f16c();
- g_cpu_support_x86_avx2 = get_cpu_support_x86_avx2();
- g_cpu_support_x86_avx_vnni = get_cpu_support_x86_avx_vnni();
- g_cpu_support_x86_avx_vnni_int8 = get_cpu_support_x86_avx_vnni_int8();
- g_cpu_support_x86_avx_vnni_int16 = get_cpu_support_x86_avx_vnni_int16();
- g_cpu_support_x86_avx_ne_convert = get_cpu_support_x86_avx_ne_convert();
- g_cpu_support_x86_avx512 = get_cpu_support_x86_avx512();
- g_cpu_support_x86_avx512_vnni = get_cpu_support_x86_avx512_vnni();
- g_cpu_support_x86_avx512_bf16 = get_cpu_support_x86_avx512_bf16();
- g_cpu_support_x86_avx512_fp16 = get_cpu_support_x86_avx512_fp16();
- #endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-
- #if defined __ANDROID__ || defined __linux__
- #if __riscv
- g_cpu_support_riscv_zfh = ruapu_supports("zfh") || ruapu_supports("xtheadvector"); // xtheadvector implies zfh
- g_cpu_support_riscv_zvfh = ruapu_supports("zvfh") || ruapu_supports("xtheadvector"); // xtheadvector implies zvfh
- g_cpu_support_riscv_xtheadvector = ruapu_supports("xtheadvector");
- #endif // __riscv
- #endif // defined __ANDROID__ || defined __linux__
-
- g_cpu_level2_cachesize = get_cpu_level2_cachesize();
- g_cpu_level3_cachesize = get_cpu_level3_cachesize();
-
- #if defined __ANDROID__ || defined __linux__
- #if __aarch64__
- g_cpu_is_arm_a53_a55 = detect_cpu_is_arm_a53_a55();
- #endif // __aarch64__
- #endif // defined __ANDROID__ || defined __linux__
- }
-
- static int g_cpu_info_initialized = 0;
-
- static inline void try_initialize_global_cpu_info()
- {
- if (!g_cpu_info_initialized)
- {
- initialize_global_cpu_info();
- g_cpu_info_initialized = 1;
- }
- }
-
- namespace ncnn {
-
- // New unified CpuSet implementation supporting >64 CPUs
- CpuSet::CpuSet()
- : fast_mask(0)
- , extended_mask(nullptr)
- , extended_capacity(0)
- , use_extended(false)
- #if defined _WIN32
- , legacy_mask_cache(0)
- , legacy_mask_valid(false)
- #endif
- #if defined __ANDROID__ || defined __linux__
- , cpu_set_cache(nullptr)
- , cpu_set_valid(false)
- #endif
- #if __APPLE__
- , legacy_policy_cache(0)
- , legacy_policy_valid(false)
- #endif
- {
- }
-
- CpuSet::CpuSet(const CpuSet& other)
- : fast_mask(0)
- , extended_mask(nullptr)
- , extended_capacity(0)
- , use_extended(false)
- #if defined _WIN32
- , legacy_mask_cache(0)
- , legacy_mask_valid(false)
- #endif
- #if defined __ANDROID__ || defined __linux__
- , cpu_set_cache(nullptr)
- , cpu_set_valid(false)
- #endif
- #if __APPLE__
- , legacy_policy_cache(0)
- , legacy_policy_valid(false)
- #endif
- {
- copy_from(other);
- }
-
- CpuSet& CpuSet::operator=(const CpuSet& other)
- {
- if (this != &other)
- {
- copy_from(other);
- }
- return *this;
- }
-
- CpuSet::~CpuSet()
- {
- if (extended_mask)
- {
- free(extended_mask);
- }
- #if defined __ANDROID__ || defined __linux__
- if (cpu_set_cache)
- {
- CPU_FREE(cpu_set_cache);
- }
- #endif
- }
-
- void CpuSet::copy_from(const CpuSet& other)
- {
- // Clean up existing state
- if (extended_mask)
- {
- free(extended_mask);
- extended_mask = nullptr;
- }
- extended_capacity = 0;
-
- // Copy basic state
- fast_mask = other.fast_mask;
- use_extended = other.use_extended;
-
- // Copy extended mask if needed
- if (other.use_extended && other.extended_mask)
- {
- extended_capacity = other.extended_capacity;
- extended_mask = (uint64_t*)malloc(extended_capacity * sizeof(uint64_t));
- if (extended_mask)
- {
- memcpy(extended_mask, other.extended_mask, extended_capacity * sizeof(uint64_t));
- }
- }
-
- // Invalidate caches
- #if defined _WIN32
- legacy_mask_valid = false;
- #endif
- #if defined __ANDROID__ || defined __linux__
- cpu_set_valid = false;
- if (cpu_set_cache)
- {
- CPU_FREE(cpu_set_cache);
- cpu_set_cache = nullptr;
- }
- #endif
- #if __APPLE__
- legacy_policy_valid = false;
- #endif
- }
-
- void CpuSet::ensure_capacity(int cpu_id)
- {
- if (cpu_id < FAST_PATH_BITS && !use_extended)
- {
- return; // Fast path is sufficient
- }
-
- // Need to switch to extended mode
- if (!use_extended)
- {
- use_extended = true;
- // Calculate required capacity
- int required_words = (cpu_id / BITS_PER_WORD) + 1;
- extended_capacity = std::max(required_words, 2); // Minimum 2 words
- extended_mask = (uint64_t*)calloc(extended_capacity, sizeof(uint64_t));
- if (extended_mask)
- {
- // Copy fast_mask to extended_mask[0]
- extended_mask[0] = fast_mask;
- }
- return;
- }
-
- // Already in extended mode, check if we need more capacity
- int required_words = (cpu_id / BITS_PER_WORD) + 1;
- if (required_words > extended_capacity)
- {
- int new_capacity = std::max(required_words, extended_capacity * 2);
- uint64_t* new_mask = (uint64_t*)realloc(extended_mask, new_capacity * sizeof(uint64_t));
- if (new_mask)
- {
- // Zero out new memory
- memset(new_mask + extended_capacity, 0, (new_capacity - extended_capacity) * sizeof(uint64_t));
- extended_mask = new_mask;
- extended_capacity = new_capacity;
- }
- }
- }
- void CpuSet::enable(int cpu)
- {
- if (cpu < 0) return;
-
- ensure_capacity(cpu);
-
- if (!use_extended && cpu < FAST_PATH_BITS)
- {
- fast_mask |= (1ULL << cpu);
- }
- else if (use_extended && extended_mask)
- {
- int word_idx = cpu / BITS_PER_WORD;
- int bit_idx = cpu % BITS_PER_WORD;
- if (word_idx < extended_capacity)
- {
- extended_mask[word_idx] |= (1ULL << bit_idx);
- }
- }
-
- // Invalidate caches
- #if defined _WIN32
- legacy_mask_valid = false;
- #endif
- #if defined __ANDROID__ || defined __linux__
- cpu_set_valid = false;
- #endif
- #if __APPLE__
- legacy_policy_valid = false;
- #endif
- }
-
- void CpuSet::disable(int cpu)
- {
- if (cpu < 0) return;
-
- if (!use_extended && cpu < FAST_PATH_BITS)
- {
- fast_mask &= ~(1ULL << cpu);
- }
- else if (use_extended && extended_mask)
- {
- int word_idx = cpu / BITS_PER_WORD;
- int bit_idx = cpu % BITS_PER_WORD;
- if (word_idx < extended_capacity)
- {
- extended_mask[word_idx] &= ~(1ULL << bit_idx);
- }
- }
-
- // Invalidate caches
- #if defined _WIN32
- legacy_mask_valid = false;
- #endif
- #if defined __ANDROID__ || defined __linux__
- cpu_set_valid = false;
- #endif
- #if __APPLE__
- legacy_policy_valid = false;
- #endif
- }
-
- void CpuSet::disable_all()
- {
- fast_mask = 0;
- if (use_extended && extended_mask)
- {
- memset(extended_mask, 0, extended_capacity * sizeof(uint64_t));
- }
-
- // Invalidate caches
- #if defined _WIN32
- legacy_mask_valid = false;
- #endif
- #if defined __ANDROID__ || defined __linux__
- cpu_set_valid = false;
- #endif
- #if __APPLE__
- legacy_policy_valid = false;
- #endif
- }
-
- bool CpuSet::is_enabled(int cpu) const
- {
- if (cpu < 0) return false;
-
- if (!use_extended && cpu < FAST_PATH_BITS)
- {
- return (fast_mask & (1ULL << cpu)) != 0;
- }
- else if (use_extended && extended_mask)
- {
- int word_idx = cpu / BITS_PER_WORD;
- int bit_idx = cpu % BITS_PER_WORD;
- if (word_idx < extended_capacity)
- {
- return (extended_mask[word_idx] & (1ULL << bit_idx)) != 0;
- }
- }
-
- return false;
- }
// Count the number of set bits in a 64-bit integer.
//
// GCC/Clang use the compiler builtin. MSVC only provides __popcnt64 when
// targeting x64, so the intrinsic is restricted to that target (ARM64EC also
// defines _M_X64 and is excluded); every other configuration uses the
// portable loop below.
static int popcount64(uint64_t x)
{
#if defined(__GNUC__) || defined(__clang__)
    return __builtin_popcountll(x);
#elif defined(_MSC_VER) && defined(_M_X64) && !defined(_M_ARM64EC)
    return (int)__popcnt64(x);
#else
    // Portable fallback: each iteration clears the lowest set bit, so the
    // loop runs once per set bit instead of once per bit position.
    int count = 0;
    while (x)
    {
        x &= x - 1;
        count++;
    }
    return count;
#endif
}
-
- int CpuSet::num_enabled() const
- {
- int count = 0;
-
- if (!use_extended)
- {
- // Fast path: count bits in fast_mask
- count = popcount64(fast_mask);
- }
- else if (extended_mask)
- {
- // Extended path: count bits in all words
- for (int i = 0; i < extended_capacity; i++)
- {
- count += popcount64(extended_mask[i]);
- }
- }
-
- return count;
- }
-
- int CpuSet::max_cpu_id() const
- {
- if (!use_extended)
- {
- if (fast_mask == 0) return -1;
-
- // Find highest set bit in fast_mask
- for (int i = FAST_PATH_BITS - 1; i >= 0; i--)
- {
- if (fast_mask & (1ULL << i))
- return i;
- }
- return -1;
- }
- else if (extended_mask)
- {
- // Find highest set bit in extended_mask
- for (int word = extended_capacity - 1; word >= 0; word--)
- {
- if (extended_mask[word] != 0)
- {
- for (int bit = BITS_PER_WORD - 1; bit >= 0; bit--)
- {
- if (extended_mask[word] & (1ULL << bit))
- return word * BITS_PER_WORD + bit;
- }
- }
- }
- }
-
- return -1;
- }
-
- bool CpuSet::is_empty() const
- {
- if (!use_extended)
- {
- return fast_mask == 0;
- }
- else if (extended_mask)
- {
- for (int i = 0; i < extended_capacity; i++)
- {
- if (extended_mask[i] != 0)
- return false;
- }
- }
-
- return true;
- }
-
- void CpuSet::set_range(int start_cpu, int end_cpu, bool enabled)
- {
- if (start_cpu < 0 || end_cpu < start_cpu) return;
-
- for (int cpu = start_cpu; cpu <= end_cpu; cpu++)
- {
- if (enabled)
- enable(cpu);
- else
- disable(cpu);
- }
- }
- // Platform-specific compatibility methods
- #if defined _WIN32
- ULONG_PTR CpuSet::get_legacy_mask() const
- {
- if (!legacy_mask_valid)
- {
- legacy_mask_cache = 0;
-
- if (!use_extended)
- {
- // Fast path: directly use fast_mask (truncated to ULONG_PTR size)
- if (sizeof(ULONG_PTR) >= sizeof(uint64_t))
- {
- legacy_mask_cache = (ULONG_PTR)fast_mask;
- }
- else
- {
- // Create mask for ULONG_PTR size without undefined behavior
- const uint64_t ptr_mask = (sizeof(ULONG_PTR) == 4) ? 0xFFFFFFFFULL : 0xFFFFFFFFFFFFFFFFULL;
- legacy_mask_cache = (ULONG_PTR)(fast_mask & ptr_mask);
- }
- }
- else if (extended_mask && extended_capacity > 0)
- {
- // Extended path: use first word, truncated to ULONG_PTR size
- if (sizeof(ULONG_PTR) >= sizeof(uint64_t))
- {
- legacy_mask_cache = (ULONG_PTR)extended_mask[0];
- }
- else
- {
- // Create mask for ULONG_PTR size without undefined behavior
- const uint64_t ptr_mask = (sizeof(ULONG_PTR) == 4) ? 0xFFFFFFFFULL : 0xFFFFFFFFFFFFFFFFULL;
- legacy_mask_cache = (ULONG_PTR)(extended_mask[0] & ptr_mask);
- }
- }
-
- legacy_mask_valid = true;
- }
-
- return legacy_mask_cache;
- }
-
- void CpuSet::set_legacy_mask(ULONG_PTR mask)
- {
- disable_all();
-
- // Set bits according to the legacy mask
- for (int i = 0; i < (int)(sizeof(ULONG_PTR) * 8); i++)
- {
- if (mask & ((ULONG_PTR)1 << i))
- {
- enable(i);
- }
- }
- }
- #endif
-
- #if defined __ANDROID__ || defined __linux__
- const cpu_set_t* CpuSet::get_cpu_set() const
- {
- if (!cpu_set_valid)
- {
- // Allocate cpu_set_t if not already done
- if (!cpu_set_cache)
- {
- cpu_set_cache = CPU_ALLOC(CPU_SETSIZE);
- if (!cpu_set_cache)
- return nullptr;
- }
-
- CPU_ZERO_S(CPU_ALLOC_SIZE(CPU_SETSIZE), cpu_set_cache);
-
- // Copy our internal representation to cpu_set_t
- if (!use_extended)
- {
- for (int i = 0; i < FAST_PATH_BITS && i < CPU_SETSIZE; i++)
- {
- if (fast_mask & (1ULL << i))
- {
- CPU_SET_S(i, CPU_ALLOC_SIZE(CPU_SETSIZE), cpu_set_cache);
- }
- }
- }
- else if (extended_mask)
- {
- for (int word = 0; word < extended_capacity; word++)
- {
- uint64_t mask = extended_mask[word];
- for (int bit = 0; bit < BITS_PER_WORD; bit++)
- {
- int cpu_id = word * BITS_PER_WORD + bit;
- if (cpu_id >= CPU_SETSIZE) break;
-
- if (mask & (1ULL << bit))
- {
- CPU_SET_S(cpu_id, CPU_ALLOC_SIZE(CPU_SETSIZE), cpu_set_cache);
- }
- }
- if ((word + 1) * BITS_PER_WORD >= CPU_SETSIZE) break;
- }
- }
-
- cpu_set_valid = true;
- }
-
- return cpu_set_cache;
- }
-
- cpu_set_t* CpuSet::get_cpu_set_mutable()
- {
- get_cpu_set(); // Ensure cache is valid
- return cpu_set_cache;
- }
-
- void CpuSet::set_cpu_set(const cpu_set_t* cpuset)
- {
- if (!cpuset) return;
-
- disable_all();
-
- // Copy from cpu_set_t to our internal representation
- for (int i = 0; i < CPU_SETSIZE; i++)
- {
- if (CPU_ISSET(i, cpuset))
- {
- enable(i);
- }
- }
- }
- #endif
-
- #if __APPLE__
- unsigned int CpuSet::get_legacy_policy() const
- {
- if (!legacy_policy_valid)
- {
- legacy_policy_cache = 0;
-
- if (!use_extended)
- {
- // Fast path: directly use fast_mask (truncated to 32 bits)
- legacy_policy_cache = (unsigned int)(fast_mask & 0xFFFFFFFFU);
- }
- else if (extended_mask && extended_capacity > 0)
- {
- // Extended path: use first word, truncated to 32 bits
- legacy_policy_cache = (unsigned int)(extended_mask[0] & 0xFFFFFFFFU);
- }
-
- legacy_policy_valid = true;
- }
-
- return legacy_policy_cache;
- }
-
- void CpuSet::set_legacy_policy(unsigned int policy)
- {
- disable_all();
-
- // Set bits according to the legacy policy
- for (int i = 0; i < 32; i++)
- {
- if (policy & (1U << i))
- {
- enable(i);
- }
- }
- }
- #endif
-
- int cpu_support_arm_edsp()
- {
- try_initialize_global_cpu_info();
- #if __arm__ && !__aarch64__
- #if defined _WIN32
- return g_cpu_support_arm_edsp;
- #elif defined __ANDROID__ || defined __linux__
- return g_hwcaps & HWCAP_EDSP;
- #elif __APPLE__
- return g_hw_cputype == CPU_TYPE_ARM;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_arm_neon()
- {
- try_initialize_global_cpu_info();
- #if __aarch64__
- return 1;
- #elif __arm__
- #if defined _WIN32
- return g_cpu_support_arm_neon;
- #elif defined __ANDROID__ || defined __linux__
- return g_hwcaps & HWCAP_NEON;
- #elif __APPLE__
- return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_arm_vfpv4()
- {
- try_initialize_global_cpu_info();
- #if __aarch64__
- return 1;
- #elif __arm__
- #if defined _WIN32
- return g_cpu_support_arm_vfpv4;
- #elif defined __ANDROID__ || defined __linux__
- return g_hwcaps & HWCAP_VFPv4;
- #elif __APPLE__
- return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_arm_asimdhp()
- {
- try_initialize_global_cpu_info();
- #if __aarch64__
- #if defined _WIN32
- return g_cpu_support_arm_asimdhp;
- #elif defined __ANDROID__ || defined __linux__
- return g_hwcaps & HWCAP_ASIMDHP;
- #elif __APPLE__
- return g_hw_optional_arm_FEAT_FP16;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_arm_cpuid()
- {
- try_initialize_global_cpu_info();
- #if __aarch64__
- #if defined _WIN32
- return g_cpu_support_arm_cpuid;
- #elif defined __ANDROID__ || defined __linux__
- return g_hwcaps & HWCAP_CPUID;
- #elif __APPLE__
- return 0;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_arm_asimddp()
- {
- try_initialize_global_cpu_info();
- #if __aarch64__
- #if defined _WIN32
- return g_cpu_support_arm_asimddp;
- #elif defined __ANDROID__ || defined __linux__
- return g_hwcaps & HWCAP_ASIMDDP;
- #elif __APPLE__
- return g_hw_optional_arm_FEAT_DotProd;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_arm_asimdfhm()
- {
- try_initialize_global_cpu_info();
- #if __aarch64__
- #if defined _WIN32
- return g_cpu_support_arm_asimdfhm;
- #elif defined __ANDROID__ || defined __linux__
- return g_hwcaps & HWCAP_ASIMDFHM;
- #elif __APPLE__
- return g_hw_optional_arm_FEAT_FHM;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_arm_bf16()
- {
- try_initialize_global_cpu_info();
- #if __aarch64__
- #if defined _WIN32
- return g_cpu_support_arm_bf16;
- #elif defined __ANDROID__ || defined __linux__
- return g_hwcaps2 & HWCAP2_BF16;
- #elif __APPLE__
- return g_hw_optional_arm_FEAT_BF16;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_arm_i8mm()
- {
- try_initialize_global_cpu_info();
- #if __aarch64__
- #if defined _WIN32
- return g_cpu_support_arm_i8mm;
- #elif defined __ANDROID__ || defined __linux__
- return g_hwcaps2 & HWCAP2_I8MM;
- #elif __APPLE__
- return g_hw_optional_arm_FEAT_I8MM;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_arm_sve()
- {
- try_initialize_global_cpu_info();
- #if __aarch64__
- #if defined _WIN32
- return g_cpu_support_arm_sve;
- #elif defined __ANDROID__ || defined __linux__
- return g_hwcaps & HWCAP_SVE;
- #elif __APPLE__
- return 0; // no known apple cpu support armv8.6 sve
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_arm_sve2()
- {
- try_initialize_global_cpu_info();
- #if __aarch64__
- #if defined _WIN32
- return g_cpu_support_arm_sve2;
- #elif defined __ANDROID__ || defined __linux__
- return g_hwcaps2 & HWCAP2_SVE2;
- #elif __APPLE__
- return 0; // no known apple cpu support armv8.6 sve2
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_arm_svebf16()
- {
- try_initialize_global_cpu_info();
- #if __aarch64__
- #if defined _WIN32
- return g_cpu_support_arm_svebf16;
- #elif defined __ANDROID__ || defined __linux__
- return g_hwcaps2 & HWCAP2_SVEBF16;
- #elif __APPLE__
- return 0; // no known apple cpu support armv8.6 svebf16
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_arm_svei8mm()
- {
- try_initialize_global_cpu_info();
- #if __aarch64__
- #if defined _WIN32
- return g_cpu_support_arm_svei8mm;
- #elif defined __ANDROID__ || defined __linux__
- return g_hwcaps2 & HWCAP2_SVEI8MM;
- #elif __APPLE__
- return 0; // no known apple cpu support armv8.6 svei8mm
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_arm_svef32mm()
- {
- try_initialize_global_cpu_info();
- #if __aarch64__
- #if defined _WIN32
- return g_cpu_support_arm_svef32mm;
- #elif defined __ANDROID__ || defined __linux__
- return g_hwcaps2 & HWCAP2_SVEF32MM;
- #elif __APPLE__
- return 0; // no known apple cpu support armv8.6 svef32mm
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_x86_avx()
- {
- try_initialize_global_cpu_info();
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- return g_cpu_support_x86_avx;
- #else
- return 0;
- #endif
- }
-
- int cpu_support_x86_fma()
- {
- try_initialize_global_cpu_info();
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- return g_cpu_support_x86_fma;
- #else
- return 0;
- #endif
- }
-
- int cpu_support_x86_xop()
- {
- try_initialize_global_cpu_info();
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- return g_cpu_support_x86_xop;
- #else
- return 0;
- #endif
- }
-
- int cpu_support_x86_f16c()
- {
- try_initialize_global_cpu_info();
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- return g_cpu_support_x86_f16c;
- #else
- return 0;
- #endif
- }
-
- int cpu_support_x86_avx2()
- {
- try_initialize_global_cpu_info();
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- return g_cpu_support_x86_avx2;
- #else
- return 0;
- #endif
- }
-
- int cpu_support_x86_avx_vnni()
- {
- try_initialize_global_cpu_info();
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- return g_cpu_support_x86_avx_vnni;
- #else
- return 0;
- #endif
- }
-
- int cpu_support_x86_avx_vnni_int8()
- {
- try_initialize_global_cpu_info();
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- return g_cpu_support_x86_avx_vnni_int8;
- #else
- return 0;
- #endif
- }
-
- int cpu_support_x86_avx_vnni_int16()
- {
- try_initialize_global_cpu_info();
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- return g_cpu_support_x86_avx_vnni_int16;
- #else
- return 0;
- #endif
- }
-
- int cpu_support_x86_avx_ne_convert()
- {
- try_initialize_global_cpu_info();
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- return g_cpu_support_x86_avx_ne_convert;
- #else
- return 0;
- #endif
- }
-
- int cpu_support_x86_avx512()
- {
- try_initialize_global_cpu_info();
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- return g_cpu_support_x86_avx512;
- #else
- return 0;
- #endif
- }
-
- int cpu_support_x86_avx512_vnni()
- {
- try_initialize_global_cpu_info();
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- return g_cpu_support_x86_avx512_vnni;
- #else
- return 0;
- #endif
- }
-
- int cpu_support_x86_avx512_bf16()
- {
- try_initialize_global_cpu_info();
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- return g_cpu_support_x86_avx512_bf16;
- #else
- return 0;
- #endif
- }
-
- int cpu_support_x86_avx512_fp16()
- {
- try_initialize_global_cpu_info();
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
- return g_cpu_support_x86_avx512_fp16;
- #else
- return 0;
- #endif
- }
-
- int cpu_support_mips_msa()
- {
- try_initialize_global_cpu_info();
- #if defined __ANDROID__ || defined __linux__
- #if __mips__
- return g_hwcaps & HWCAP_MIPS_MSA;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_loongarch_lsx()
- {
- try_initialize_global_cpu_info();
- #if defined __ANDROID__ || defined __linux__
- #if __loongarch64
- return g_hwcaps & HWCAP_LOONGARCH_LSX;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_loongarch_lasx()
- {
- try_initialize_global_cpu_info();
- #if defined __ANDROID__ || defined __linux__
- #if __loongarch64
- return g_hwcaps & HWCAP_LOONGARCH_LASX;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_loongson_mmi()
- {
- try_initialize_global_cpu_info();
- #if defined __ANDROID__ || defined __linux__
- #if __mips__
- return g_hwcaps & HWCAP_LOONGSON_MMI;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_riscv_v()
- {
- try_initialize_global_cpu_info();
- #if defined __ANDROID__ || defined __linux__
- #if __riscv
- return g_hwcaps & COMPAT_HWCAP_ISA_V;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_riscv_zfh()
- {
- try_initialize_global_cpu_info();
- #if defined __ANDROID__ || defined __linux__
- #if __riscv
- return g_cpu_support_riscv_zfh;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_riscv_zvfh()
- {
- try_initialize_global_cpu_info();
- #if defined __ANDROID__ || defined __linux__
- #if __riscv
- return g_cpu_support_riscv_zvfh;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_support_riscv_xtheadvector()
- {
- try_initialize_global_cpu_info();
- #if defined __ANDROID__ || defined __linux__
- #if __riscv
- return g_cpu_support_riscv_xtheadvector;
- #else
- return 0;
- #endif
- #else
- return 0;
- #endif
- }
-
- int cpu_riscv_vlenb()
- {
- #if C906
- // FIXME xuantie qemu reports all zero auxv flags
- return 16;
- #endif
- try_initialize_global_cpu_info();
- #if __riscv
- if (!cpu_support_riscv_v())
- return 0;
-
- int a = 0;
- asm volatile(
- ".word 0xc22026f3 \n" // csrr a3, vlenb
- "mv %0, a3 \n"
- : "=r"(a)
- :
- : "memory", "a3");
- return a;
- #else
- return 0;
- #endif
- }
-
- int get_cpu_count()
- {
- try_initialize_global_cpu_info();
- return g_cpucount;
- }
-
- int get_little_cpu_count()
- {
- try_initialize_global_cpu_info();
- return get_cpu_thread_affinity_mask(1).num_enabled();
- }
-
- int get_big_cpu_count()
- {
- try_initialize_global_cpu_info();
- int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled();
- return big_cpu_count ? big_cpu_count : g_cpucount;
- }
-
- int get_physical_cpu_count()
- {
- try_initialize_global_cpu_info();
- return g_physical_cpucount;
- }
-
- int get_physical_little_cpu_count()
- {
- try_initialize_global_cpu_info();
- if (g_physical_cpucount == g_cpucount)
- return get_little_cpu_count();
-
- return g_physical_cpucount * 2 - g_cpucount;
- }
-
- int get_physical_big_cpu_count()
- {
- try_initialize_global_cpu_info();
- if (g_physical_cpucount == g_cpucount)
- return get_big_cpu_count();
-
- return g_cpucount - g_physical_cpucount;
- }
-
- int get_cpu_level2_cache_size()
- {
- try_initialize_global_cpu_info();
- return g_cpu_level2_cachesize;
- }
-
- int get_cpu_level3_cache_size()
- {
- try_initialize_global_cpu_info();
- return g_cpu_level3_cachesize;
- }
-
- int get_cpu_powersave()
- {
- try_initialize_global_cpu_info();
- return g_powersave;
- }
-
- int set_cpu_powersave(int powersave)
- {
- try_initialize_global_cpu_info();
- if (powersave < 0 || powersave > 2)
- {
- NCNN_LOGE("powersave %d not supported", powersave);
- return -1;
- }
-
- const CpuSet& thread_affinity_mask = get_cpu_thread_affinity_mask(powersave);
-
- int ret = set_cpu_thread_affinity(thread_affinity_mask);
- if (ret != 0)
- return ret;
-
- g_powersave = powersave;
-
- return 0;
- }
-
- const CpuSet& get_cpu_thread_affinity_mask(int powersave)
- {
- try_initialize_global_cpu_info();
- if (powersave == 0)
- return g_cpu_affinity_mask_all;
-
- if (powersave == 1)
- return g_cpu_affinity_mask_little;
-
- if (powersave == 2)
- return g_cpu_affinity_mask_big;
-
- NCNN_LOGE("powersave %d not supported", powersave);
-
- // fallback to all cores anyway
- return g_cpu_affinity_mask_all;
- }
-
- int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
- {
- try_initialize_global_cpu_info();
- #if defined __ANDROID__ || defined __linux__ || defined _WIN32
- #ifdef _OPENMP
- int num_threads = thread_affinity_mask.num_enabled();
-
- // set affinity for each thread
- set_omp_num_threads(num_threads);
- std::vector<int> ssarets(num_threads, 0);
- #pragma omp parallel for num_threads(num_threads)
- for (int i = 0; i < num_threads; i++)
- {
- ssarets[i] = set_sched_affinity(thread_affinity_mask);
- }
- for (int i = 0; i < num_threads; i++)
- {
- if (ssarets[i] != 0)
- return -1;
- }
- #else
- int ssaret = set_sched_affinity(thread_affinity_mask);
- if (ssaret != 0)
- return -1;
- #endif
-
- return 0;
- #elif __APPLE__
-
- #ifdef _OPENMP
- int num_threads = thread_affinity_mask.num_enabled();
-
- // set affinity for each thread
- set_omp_num_threads(num_threads);
- std::vector<int> ssarets(num_threads, 0);
- #pragma omp parallel for num_threads(num_threads)
- for (int i = 0; i < num_threads; i++)
- {
- // assign one core for each thread
- int core = -1 - i;
- int max_cpu = thread_affinity_mask.max_cpu_id();
- for (int j = 0; j <= max_cpu && j < 32; j++) // Apple policy is limited to 32 bits
- {
- if (thread_affinity_mask.is_enabled(j))
- {
- if (core == -1)
- {
- core = j;
- break;
- }
- else
- {
- core++;
- }
- }
- }
- CpuSet this_thread_affinity_mask;
- if (core != -1 - i)
- {
- this_thread_affinity_mask.enable(core);
- }
-
- ssarets[i] = set_sched_affinity(this_thread_affinity_mask);
- }
- for (int i = 0; i < num_threads; i++)
- {
- if (ssarets[i] != 0)
- return -1;
- }
- #else
- int ssaret = set_sched_affinity(thread_affinity_mask);
- if (ssaret != 0)
- return -1;
- #endif
-
- return 0;
- #else
- // TODO
- (void)thread_affinity_mask;
- return -1;
- #endif
- }
-
- int is_current_thread_running_on_a53_a55()
- {
- try_initialize_global_cpu_info();
- #if defined __ANDROID__ || defined __linux__
- #if __aarch64__
- if (g_cpu_is_arm_a53_a55 == 0)
- return 0; // all non a53/a55
-
- if (g_cpu_is_arm_a53_a55 == 1)
- return 1; // all a53/a55
-
- if (g_powersave == 2)
- return 0; // big clusters
-
- if (g_powersave == 1)
- return 1; // little clusters
-
- // little cores are a53/a55
-
- // use cpuid for retrieving midr since kernel 4.7+
- if (cpu_support_arm_cpuid())
- {
- unsigned int midr = get_midr_from_register();
- if (midr)
- return midr_is_a53_a55(midr);
- }
-
- // check if affinity cpuid is in the little ones
- CpuSet thread_cs;
- int ret = get_sched_affinity(thread_cs);
- if (ret != 0)
- {
- // no affinity capability
- return 0;
- }
-
- const CpuSet& little_cs = get_cpu_thread_affinity_mask(1);
- for (int i = 0; i < g_cpucount; i++)
- {
- if (!thread_cs.is_enabled(i))
- continue;
-
- if (!little_cs.is_enabled(i))
- return 0;
- }
-
- // all affinity cpuids are little core
- return 1;
- #else
- return 0;
- #endif // __aarch64__
- #else
- return 0;
- #endif // defined __ANDROID__ || defined __linux__
- }
-
// Return omp_get_num_threads() when built with OpenMP, otherwise 1.
int get_omp_num_threads()
{
#ifndef _OPENMP
    return 1;
#else
    return omp_get_num_threads();
#endif
}
-
// Forward to omp_set_num_threads() when built with OpenMP; otherwise a no-op.
void set_omp_num_threads(int num_threads)
{
#ifndef _OPENMP
    (void)num_threads;
#else
    omp_set_num_threads(num_threads);
#endif
}
-
// Return omp_get_dynamic() when built with OpenMP, otherwise 0.
int get_omp_dynamic()
{
#ifndef _OPENMP
    return 0;
#else
    return omp_get_dynamic();
#endif
}
-
// Forward to omp_set_dynamic() when built with OpenMP; otherwise a no-op.
void set_omp_dynamic(int dynamic)
{
#ifndef _OPENMP
    (void)dynamic;
#else
    omp_set_dynamic(dynamic);
#endif
}
-
// Return omp_get_thread_num() when built with OpenMP, otherwise 0.
int get_omp_thread_num()
{
#ifndef _OPENMP
    return 0;
#else
    return omp_get_thread_num();
#endif
}
-
// Return kmp_get_blocktime() when built with OpenMP and either clang or
// _OPENMP_LLVM_RUNTIME; otherwise 0.
int get_kmp_blocktime()
{
#if !defined(_OPENMP) || !(__clang__ || defined(_OPENMP_LLVM_RUNTIME))
    return 0;
#else
    return kmp_get_blocktime();
#endif
}
-
// Forward to kmp_set_blocktime() when built with OpenMP and either clang or
// _OPENMP_LLVM_RUNTIME; otherwise a no-op.
void set_kmp_blocktime(int time_ms)
{
#if !defined(_OPENMP) || !(__clang__ || defined(_OPENMP_LLVM_RUNTIME))
    (void)time_ms;
#else
    kmp_set_blocktime(time_ms);
#endif
}
-
- static ncnn::ThreadLocalStorage tls_flush_denormals;
-
- int get_flush_denormals()
- {
- #if defined(__SSE3__)
- return (int)reinterpret_cast<size_t>(tls_flush_denormals.get());
- #else
- return 0;
- #endif
- }
-
- int set_flush_denormals(int flush_denormals)
- {
- if (flush_denormals < 0 || flush_denormals > 3)
- {
- NCNN_LOGE("denormals_zero %d not supported", flush_denormals);
- return -1;
- }
- #if defined(__SSE3__)
- if (flush_denormals == 0)
- {
- _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
- _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
- }
- else if (flush_denormals == 1)
- {
- _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
- _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
- }
- else if (flush_denormals == 2)
- {
- _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
- _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
- }
- else if (flush_denormals == 3)
- {
- _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
- _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
- }
-
- tls_flush_denormals.set(reinterpret_cast<void*>((size_t)flush_denormals));
- return 0;
- #else
- return 0;
- #endif
- }
-
- } // namespace ncnn
|