diff --git a/src/cpu.cpp b/src/cpu.cpp
index 7656da982..00dcd836e 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -113,7 +113,47 @@
 #include <immintrin.h>
 #endif
 
-namespace ncnn {
+// topology info
+static int g_cpucount;
+static int g_physical_cpucount;
+static int g_powersave;
+static ncnn::CpuSet g_cpu_affinity_mask_all;
+static ncnn::CpuSet g_cpu_affinity_mask_little;
+static ncnn::CpuSet g_cpu_affinity_mask_big;
+
+// isa info
+#if defined __ANDROID__ || defined __linux__
+static unsigned int g_hwcaps;
+static unsigned int g_hwcaps2;
+#endif // defined __ANDROID__ || defined __linux__
+#if __APPLE__
+static unsigned int g_hw_cpufamily;
+static cpu_type_t g_hw_cputype;
+static cpu_subtype_t g_hw_cpusubtype;
+static int g_hw_optional_arm_FEAT_FP16;
+static int g_hw_optional_arm_FEAT_DotProd;
+static int g_hw_optional_arm_FEAT_FHM;
+static int g_hw_optional_arm_FEAT_BF16;
+static int g_hw_optional_arm_FEAT_I8MM;
+#endif // __APPLE__
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+static int g_cpu_support_x86_avx;
+static int g_cpu_support_x86_fma;
+static int g_cpu_support_x86_xop;
+static int g_cpu_support_x86_f16c;
+static int g_cpu_support_x86_avx2;
+static int g_cpu_support_x86_avx_vnni;
+static int g_cpu_support_x86_avx512;
+static int g_cpu_support_x86_avx512_vnni;
+static int g_cpu_support_x86_avx512_bf16;
+static int g_cpu_support_x86_avx512_fp16;
+#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+
+static int g_cpu_level2_cachesize;
+static int g_cpu_level3_cachesize;
+
+// misc info
+static int g_cpu_is_arm_a53_a55;
 
 #if defined __ANDROID__ || defined __linux__
 
@@ -282,9 +322,6 @@ static unsigned int get_elf_hwcap(unsigned int type)
 
     return hwcap;
 }
-
-static unsigned int g_hwcaps = get_elf_hwcap(AT_HWCAP);
-static unsigned int g_hwcaps2 = get_elf_hwcap(AT_HWCAP2);
 #endif // defined __ANDROID__ || defined __linux__
 
 #if __APPLE__
@@ -312,10 +349,6 @@ static cpu_subtype_t get_hw_cpusubtype()
     return value;
 }
 
-static unsigned int g_hw_cpufamily = get_hw_cpufamily();
-static cpu_type_t g_hw_cputype = get_hw_cputype();
-static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype();
-
 static int get_hw_capability(const char* cap)
 {
     int64_t value = 0;
@@ -323,914 +356,1154 @@ static int get_hw_capability(const char* cap)
     sysctlbyname(cap, &value, &len, NULL, 0);
     return value;
 }
-
-static int g_hw_optional_arm_FEAT_FP16 = get_hw_capability("hw.optional.arm.FEAT_FP16");
-static int g_hw_optional_arm_FEAT_DotProd = get_hw_capability("hw.optional.arm.FEAT_DotProd");
-static int g_hw_optional_arm_FEAT_FHM = get_hw_capability("hw.optional.arm.FEAT_FHM");
-static int g_hw_optional_arm_FEAT_BF16 = get_hw_capability("hw.optional.arm.FEAT_BF16");
-static int g_hw_optional_arm_FEAT_I8MM = get_hw_capability("hw.optional.arm.FEAT_I8MM");
 #endif // __APPLE__
 
-#if (defined _WIN32 && !(defined __MINGW32__))
-CpuSet::CpuSet()
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+static inline void x86_cpuid(int level, unsigned int out[4])
 {
-    disable_all();
+#if defined(_MSC_VER)
+    __cpuid((int*)out, level);
+#elif defined(__clang__) || defined(__GNUC__)
+    __get_cpuid(level, out, out + 1, out + 2, out + 3);
+#else
+    NCNN_LOGE("x86_cpuid is unknown for current compiler");
+    out[0] = 0;
+    out[1] = 0;
+    out[2] = 0;
+    out[3] = 0;
+#endif
 }
 
-void CpuSet::enable(int cpu)
+static inline void x86_cpuid_sublevel(int level, int sublevel, unsigned int out[4])
 {
-    mask |= (1 << cpu);
+#if defined(_MSC_VER)
+    __cpuidex((int*)out, level, sublevel);
+#elif defined(__clang__) || defined(__GNUC__)
+    __cpuid_count(level, sublevel, out[0], out[1], out[2], out[3]);
+#else
+    NCNN_LOGE("x86_cpuid_sublevel is unknown for current compiler");
+    out[0] = 0;
+    out[1] = 0;
+    out[2] = 0;
+    out[3] = 0;
+#endif
 }
 
-void CpuSet::disable(int cpu)
+static inline int x86_get_xcr0()
 {
-    mask &= ~(1 << cpu);
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+    return _xgetbv(0);
+#elif defined(__i386__) || defined(__x86_64__)
+    int xcr0 = 0;
+    asm(".byte 0x0f, 0x01, 0xd0"
+        : "=a"(xcr0)
+        : "c"(0)
+        : "%edx");
+    return xcr0;
+#else
+    NCNN_LOGE("x86_get_xcr0 is unknown for current compiler");
+    return 0xffffffff; // assume it will work
+#endif
 }
 
-void CpuSet::disable_all()
+static int get_cpu_support_x86_avx()
 {
-    mask = 0;
-}
+#if !NCNN_AVX
+    return 0;
+#endif
+    unsigned int cpu_info[4] = {0};
+    x86_cpuid(0, cpu_info);
 
-bool CpuSet::is_enabled(int cpu) const
-{
-    return mask & (1 << cpu);
-}
+    int nIds = cpu_info[0];
+    if (nIds < 1)
+        return 0;
 
-int CpuSet::num_enabled() const
-{
-    int num_enabled = 0;
-    for (int i = 0; i < (int)sizeof(mask) * 8; i++)
-    {
-        if (is_enabled(i))
-            num_enabled++;
-    }
+    x86_cpuid(1, cpu_info);
+    // check AVX XSAVE OSXSAVE
+    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
+        return 0;
 
-    return num_enabled;
-}
-#elif defined __ANDROID__ || defined __linux__
-CpuSet::CpuSet()
-{
-    disable_all();
-}
+    // check XSAVE enabled by kernel
+    if ((x86_get_xcr0() & 6) != 6)
+        return 0;
 
-void CpuSet::enable(int cpu)
-{
-    CPU_SET(cpu, &cpu_set);
+    return 1;
 }
 
-void CpuSet::disable(int cpu)
+static int get_cpu_support_x86_fma()
 {
-    CPU_CLR(cpu, &cpu_set);
-}
+#if !NCNN_FMA
+    return 0;
+#endif
+    unsigned int cpu_info[4] = {0};
+    x86_cpuid(0, cpu_info);
 
-void CpuSet::disable_all()
-{
-    CPU_ZERO(&cpu_set);
-}
+    int nIds = cpu_info[0];
+    if (nIds < 7)
+        return 0;
 
-bool CpuSet::is_enabled(int cpu) const
-{
-    return CPU_ISSET(cpu, &cpu_set);
-}
+    x86_cpuid(1, cpu_info);
+    // check AVX XSAVE OSXSAVE
+    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
+        return 0;
 
-int CpuSet::num_enabled() const
-{
-    int num_enabled = 0;
-    for (int i = 0; i < (int)sizeof(cpu_set_t) * 8; i++)
-    {
-        if (is_enabled(i))
-            num_enabled++;
-    }
+    // check XSAVE enabled by kernel
+    if ((x86_get_xcr0() & 6) != 6)
+        return 0;
 
-    return num_enabled;
-}
-#elif __APPLE__
-CpuSet::CpuSet()
-{
-    disable_all();
+    return cpu_info[2] & (1u << 12);
 }
 
-void CpuSet::enable(int cpu)
+static int get_cpu_support_x86_xop()
 {
-    policy |= (1 << cpu);
-}
+#if !NCNN_XOP
+    return 0;
+#endif
+    unsigned int cpu_info[4] = {0};
+    x86_cpuid(0x80000000, cpu_info);
 
-void CpuSet::disable(int cpu)
-{
-    policy &= ~(1 << cpu);
-}
+    if (cpu_info[0] < 0x80000001)
+        return 0;
 
-void CpuSet::disable_all()
-{
-    policy = 0;
-}
+    x86_cpuid(0x80000001, cpu_info);
 
-bool CpuSet::is_enabled(int cpu) const
-{
-    return policy & (1 << cpu);
+    return cpu_info[2] & (1u << 11);
 }
 
-int CpuSet::num_enabled() const
+static int get_cpu_support_x86_f16c()
 {
-    int num_enabled = 0;
-    for (int i = 0; i < (int)sizeof(policy) * 8; i++)
-    {
-        if (is_enabled(i))
-            num_enabled++;
-    }
+#if !NCNN_F16C
+    return 0;
+#endif
+    unsigned int cpu_info[4] = {0};
+    x86_cpuid(0, cpu_info);
 
-    return num_enabled;
-}
-#else
-CpuSet::CpuSet()
-{
-}
+    int nIds = cpu_info[0];
+    if (nIds < 1)
+        return 0;
 
-void CpuSet::enable(int /* cpu */)
-{
-}
+    x86_cpuid(1, cpu_info);
 
-void CpuSet::disable(int /* cpu */)
-{
+    return cpu_info[2] & (1u << 29);
 }
 
-void CpuSet::disable_all()
+static int get_cpu_support_x86_avx2()
 {
-}
+#if !NCNN_AVX2
+    return 0;
+#endif
+    unsigned int cpu_info[4] = {0};
+    x86_cpuid(0, cpu_info);
 
-bool CpuSet::is_enabled(int /* cpu */) const
-{
-    return true;
+    int nIds = cpu_info[0];
+    if (nIds < 7)
+        return 0;
+
+    x86_cpuid(1, cpu_info);
+    // check AVX XSAVE OSXSAVE
+    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
+        return 0;
+
+    // check XSAVE enabled by kernel
+    if ((x86_get_xcr0() & 6) != 6)
+        return 0;
+
+    x86_cpuid_sublevel(7, 0, cpu_info);
+    return cpu_info[1] & (1u << 5);
 }
 
-int CpuSet::num_enabled() const
+static int get_cpu_support_x86_avx_vnni()
 {
-    return get_cpu_count();
-}
+#if !NCNN_AVXVNNI
+    return 0;
 #endif
+    unsigned int cpu_info[4] = {0};
+    x86_cpuid(0, cpu_info);
 
-int cpu_support_arm_edsp()
+    int nIds = cpu_info[0];
+    if (nIds < 7)
+        return 0;
+
+    x86_cpuid(1, cpu_info);
+    // check AVX XSAVE OSXSAVE
+    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
+        return 0;
+
+    // check XSAVE enabled by kernel
+    if ((x86_get_xcr0() & 6) != 6)
+        return 0;
+
+    x86_cpuid_sublevel(7, 1, cpu_info);
+    return cpu_info[0] & (1u << 4);
+}
+
+static int get_cpu_support_x86_avx512()
 {
-#if defined __ANDROID__ || defined __linux__
-#if __aarch64__
+#if !NCNN_AVX512
     return 0;
-#else
-    return g_hwcaps & HWCAP_EDSP;
 #endif
-#elif __APPLE__
-#if __aarch64__
-    return 0;
-#else
-    return g_hw_cputype == CPU_TYPE_ARM;
-#endif
-#else
-    return 0;
-#endif
-}
+    unsigned int cpu_info[4] = {0};
+    x86_cpuid(0, cpu_info);
 
-int cpu_support_arm_neon()
-{
-#if defined __ANDROID__ || defined __linux__
-#if __aarch64__
-    return g_hwcaps & HWCAP_ASIMD;
-#else
-    return g_hwcaps & HWCAP_NEON;
-#endif
-#elif __APPLE__
-#if __aarch64__
-    return g_hw_cputype == CPU_TYPE_ARM64;
-#else
-    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
-#endif
-#else
-    return 0;
-#endif
-}
+    int nIds = cpu_info[0];
+    if (nIds < 7)
+        return 0;
 
-int cpu_support_arm_vfpv4()
-{
-#if defined __ANDROID__ || defined __linux__
-#if __aarch64__
-    // neon always enable fma and fp16
-    return g_hwcaps & HWCAP_ASIMD;
-#else
-    return g_hwcaps & HWCAP_VFPv4;
-#endif
-#elif __APPLE__
-#if __aarch64__
-    return g_hw_cputype == CPU_TYPE_ARM64;
-#else
-    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
-#endif
-#else
-    return 0;
-#endif
+    x86_cpuid(1, cpu_info);
+    // check AVX XSAVE OSXSAVE
+    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
+        return 0;
+
+    // check XSAVE enabled by kernel
+    if ((x86_get_xcr0() & 6) != 6)
+        return 0;
+
+    // check avx512 XSAVE enabled by kernel
+    if ((x86_get_xcr0() & 0xe0) != 0xe0)
+        return 0;
+
+    x86_cpuid_sublevel(7, 0, cpu_info);
+    return (cpu_info[1] & (1u << 16)) && (cpu_info[1] & (1u << 17)) && (cpu_info[1] & (1u << 28)) && (cpu_info[1] & (1u << 30)) && (cpu_info[1] & (1u << 31));
 }
 
-int cpu_support_arm_asimdhp()
+static int get_cpu_support_x86_avx512_vnni()
 {
-#if defined __ANDROID__ || defined __linux__
-#if __aarch64__
-    return g_hwcaps & HWCAP_ASIMDHP;
-#else
-    return 0;
-#endif
-#elif __APPLE__
-#if __aarch64__
-    return g_hw_optional_arm_FEAT_FP16
-           || g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL
-           || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST
-           || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER
-           || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM
-           || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
-           || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
-#else
-    return 0;
-#endif
-#else
+#if !NCNN_AVX512VNNI
     return 0;
 #endif
+    unsigned int cpu_info[4] = {0};
+    x86_cpuid(0, cpu_info);
+
+    int nIds = cpu_info[0];
+    if (nIds < 7)
+        return 0;
+
+    x86_cpuid(1, cpu_info);
+    // check AVX XSAVE OSXSAVE
+    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
+        return 0;
+
+    // check XSAVE enabled by kernel
+    if ((x86_get_xcr0() & 6) != 6)
+        return 0;
+
+    // check avx512 XSAVE enabled by kernel
+    if ((x86_get_xcr0() & 0xe0) != 0xe0)
+        return 0;
+
+    x86_cpuid_sublevel(7, 0, cpu_info);
+    return cpu_info[2] & (1u << 11);
 }
 
-int cpu_support_arm_cpuid()
+static int get_cpu_support_x86_avx512_bf16()
 {
-#if defined __ANDROID__ || defined __linux__
-#if __aarch64__
-    return g_hwcaps & HWCAP_CPUID;
-#else
-    return 0;
-#endif
-#elif __APPLE__
-    return 0;
-#else
+#if !NCNN_AVX512BF16
     return 0;
 #endif
+    unsigned int cpu_info[4] = {0};
+    x86_cpuid(0, cpu_info);
+
+    int nIds = cpu_info[0];
+    if (nIds < 7)
+        return 0;
+
+    x86_cpuid(1, cpu_info);
+    // check AVX XSAVE OSXSAVE
+    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
+        return 0;
+
+    // check XSAVE enabled by kernel
+    if ((x86_get_xcr0() & 6) != 6)
+        return 0;
+
+    x86_cpuid_sublevel(7, 1, cpu_info);
+    return cpu_info[0] & (1u << 5);
 }
 
-int cpu_support_arm_asimddp()
+static int get_cpu_support_x86_avx512_fp16()
 {
-#if defined __ANDROID__ || defined __linux__
-#if __aarch64__
-    return g_hwcaps & HWCAP_ASIMDDP;
-#else
-    return 0;
-#endif
-#elif __APPLE__
-#if __aarch64__
-    return g_hw_optional_arm_FEAT_DotProd
-           || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER
-           || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM
-           || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
-           || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
-#else
-    return 0;
-#endif
-#else
+#if !NCNN_AVX512FP16
     return 0;
 #endif
+    unsigned int cpu_info[4] = {0};
+    x86_cpuid(0, cpu_info);
+
+    int nIds = cpu_info[0];
+    if (nIds < 7)
+        return 0;
+
+    x86_cpuid(1, cpu_info);
+    // check AVX XSAVE OSXSAVE
+    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
+        return 0;
+
+    // check XSAVE enabled by kernel
+    if ((x86_get_xcr0() & 6) != 6)
+        return 0;
+
+    // check avx512 XSAVE enabled by kernel
+    if ((x86_get_xcr0() & 0xe0) != 0xe0)
+        return 0;
+
+    x86_cpuid_sublevel(7, 0, cpu_info);
+    return cpu_info[3] & (1u << 23);
 }
+#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
 
-int cpu_support_arm_asimdfhm()
+static int get_cpucount()
 {
-#if defined __ANDROID__ || defined __linux__
-#if __aarch64__
-    return g_hwcaps & HWCAP_ASIMDFHM;
-#else
-    return 0;
-#endif
+    int count = 0;
+#ifdef __EMSCRIPTEN__
+    if (emscripten_has_threading_support())
+        count = emscripten_num_logical_cores();
+    else
+        count = 1;
+#elif (defined _WIN32 && !(defined __MINGW32__))
+    SYSTEM_INFO system_info;
+    GetSystemInfo(&system_info);
+    count = system_info.dwNumberOfProcessors;
+#elif defined __ANDROID__ || defined __linux__
+    // get cpu count from /proc/cpuinfo
+    FILE* fp = fopen("/proc/cpuinfo", "rb");
+    if (!fp)
+        return 1;
+
+    char line[1024];
+    while (!feof(fp))
+    {
+        char* s = fgets(line, 1024, fp);
+        if (!s)
+            break;
+
+        if (memcmp(line, "processor", 9) == 0)
+        {
+            count++;
+        }
+    }
+
+    fclose(fp);
 #elif __APPLE__
-#if __aarch64__
-    return g_hw_optional_arm_FEAT_FHM
-           || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER
-           || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM
-           || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
-           || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
+    size_t len = sizeof(count);
+    sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
 #else
-    return 0;
-#endif
+#ifdef _OPENMP
+    count = omp_get_max_threads();
 #else
-    return 0;
+    count = 1;
+#endif // _OPENMP
 #endif
+
+    if (count < 1)
+        count = 1;
+
+    return count;
 }
 
-int cpu_support_arm_bf16()
-{
 #if defined __ANDROID__ || defined __linux__
-#if __aarch64__
-    return g_hwcaps2 & HWCAP2_BF16;
-#else
-    return 0;
-#endif
-#elif __APPLE__
-#if __aarch64__
-    return g_hw_optional_arm_FEAT_BF16
-           || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
-           || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
-#else
-    return 0;
-#endif
-#else
-    return 0;
-#endif
-}
-
-int cpu_support_arm_i8mm()
+static int get_thread_siblings(int cpuid)
 {
-#if defined __ANDROID__ || defined __linux__
-#if __aarch64__
-    return g_hwcaps2 & HWCAP2_I8MM;
-#else
-    return 0;
-#endif
-#elif __APPLE__
-#if __aarch64__
-    return g_hw_optional_arm_FEAT_I8MM
-           || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
-           || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
-#else
-    return 0;
-#endif
-#else
-    return 0;
-#endif
-}
+    char path[256];
+    sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpuid);
 
-int cpu_support_arm_sve()
-{
-#if defined __ANDROID__ || defined __linux__
-#if __aarch64__
-    return g_hwcaps & HWCAP_SVE;
-#else
-    return 0;
-#endif
-#elif __APPLE__
-#if __aarch64__
-    return 0; // no known apple cpu support armv8.6 sve
-#else
-    return 0;
-#endif
-#else
-    return 0;
-#endif
-}
+    FILE* fp = fopen(path, "rb");
+    if (!fp)
+        return -1;
 
-int cpu_support_arm_sve2()
-{
-#if defined __ANDROID__ || defined __linux__
-#if __aarch64__
-    return g_hwcaps2 & HWCAP2_SVE2;
-#else
-    return 0;
-#endif
-#elif __APPLE__
-#if __aarch64__
-    return 0; // no known apple cpu support armv8.6 sve2
-#else
-    return 0;
-#endif
-#else
-    return 0;
-#endif
-}
+    int thread_siblings = -1;
+    int nscan = fscanf(fp, "%x", &thread_siblings);
+    if (nscan != 1)
+    {
+        // ignore
+    }
 
-int cpu_support_arm_svebf16()
-{
-#if defined __ANDROID__ || defined __linux__
-#if __aarch64__
-    return g_hwcaps2 & HWCAP2_SVEBF16;
-#else
-    return 0;
-#endif
-#elif __APPLE__
-#if __aarch64__
-    return 0; // no known apple cpu support armv8.6 svebf16
-#else
-    return 0;
-#endif
-#else
-    return 0;
-#endif
+    fclose(fp);
+
+    return thread_siblings;
 }
+#endif // defined __ANDROID__ || defined __linux__
 
-int cpu_support_arm_svei8mm()
+static int get_physical_cpucount()
 {
-#if defined __ANDROID__ || defined __linux__
-#if __aarch64__
-    return g_hwcaps2 & HWCAP2_SVEI8MM;
-#else
-    return 0;
-#endif
+    int count = 0;
+#if (defined _WIN32 && !(defined __MINGW32__))
+    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
+    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
+    if (glpi == NULL)
+    {
+        NCNN_LOGE("GetLogicalProcessorInformation is not supported");
+        return g_cpucount;
+    }
+
+    DWORD return_length = 0;
+    glpi(NULL, &return_length);
+
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
+    glpi(buffer, &return_length);
+
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
+    DWORD byte_offset = 0;
+    while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
+    {
+        if (ptr->Relationship == RelationProcessorCore)
+        {
+            count++;
+        }
+
+        byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+        ptr++;
+    }
+
+    free(buffer);
+#elif defined __ANDROID__ || defined __linux__
+    std::vector<int> thread_set;
+    for (int i = 0; i < g_cpucount; i++)
+    {
+        int thread_siblings = get_thread_siblings(i);
+        if (thread_siblings == -1)
+        {
+            // ignore malformed one
+            continue;
+        }
+
+        bool thread_siblings_exists = false;
+        for (size_t j = 0; j < thread_set.size(); j++)
+        {
+            if (thread_set[j] == thread_siblings)
+            {
+                thread_siblings_exists = true;
+                break;
+            }
+        }
+
+        if (!thread_siblings_exists)
+        {
+            thread_set.push_back(thread_siblings);
+            count++;
+        }
+    }
 #elif __APPLE__
-#if __aarch64__
-    return 0; // no known apple cpu support armv8.6 svei8mm
-#else
-    return 0;
-#endif
+    size_t len = sizeof(count);
+    sysctlbyname("hw.physicalcpu_max", &count, &len, NULL, 0);
 #else
-    return 0;
+    count = g_cpucount;
 #endif
+
+    if (count > g_cpucount)
+        count = g_cpucount;
+
+    return count;
 }
 
-int cpu_support_arm_svef32mm()
-{
 #if defined __ANDROID__ || defined __linux__
-#if __aarch64__
-    return g_hwcaps2 & HWCAP2_SVEF32MM;
-#else
-    return 0;
-#endif
-#elif __APPLE__
-#if __aarch64__
-    return 0; // no known apple cpu support armv8.6 svef32mm
-#else
-    return 0;
-#endif
-#else
-    return 0;
-#endif
+static int get_data_cache_size(int cpuid, int level)
+{
+    char path[256];
+
+    // discover sysfs cache entry
+    int indexid = -1;
+    for (int i = 0;; i++)
+    {
+        // check level
+        {
+            sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpuid, i);
+            FILE* fp = fopen(path, "rb");
+            if (!fp)
+                break;
+
+            int cache_level = -1;
+            int nscan = fscanf(fp, "%d", &cache_level);
+            fclose(fp);
+            if (nscan != 1 || cache_level != level)
+                continue;
+        }
+
+        // check type
+        {
+            sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/type", cpuid, i);
+            FILE* fp = fopen(path, "rb");
+            if (!fp)
+                break;
+
+            char type[32];
+            int nscan = fscanf(fp, "%31s", type);
+            fclose(fp);
+            if (nscan != 1 || (strcmp(type, "Data") != 0 && strcmp(type, "Unified") != 0))
+                continue;
+        }
+
+        indexid = i;
+        break;
+    }
+
+    if (indexid == -1)
+    {
+        // no sysfs entry
+        return 0;
+    }
+
+    // get size
+    int cache_size_K = 0;
+    {
+        sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpuid, indexid);
+        FILE* fp = fopen(path, "rb");
+        if (!fp)
+            return 0;
+
+        int nscan = fscanf(fp, "%dK", &cache_size_K);
+        fclose(fp);
+        if (nscan != 1)
+        {
+            NCNN_LOGE("fscanf cache_size_K error %d", nscan);
+            return 0;
+        }
+    }
+
+    // parse shared_cpu_map mask
+    ncnn::CpuSet shared_cpu_map;
+    {
+        sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map", cpuid, indexid);
+        FILE* fp = fopen(path, "rb");
+        if (!fp)
+            return 0;
+
+        char shared_cpu_map_str[256];
+        int nscan = fscanf(fp, "%255s", shared_cpu_map_str);
+        fclose(fp);
+        if (nscan != 1)
+        {
+            NCNN_LOGE("fscanf shared_cpu_map error %d", nscan);
+            return 0;
+        }
+
+        int len = strlen(shared_cpu_map_str);
+
+        if (shared_cpu_map_str[0] == '0' && shared_cpu_map_str[1] == 'x')
+        {
+            // skip leading 0x
+            len -= 2;
+        }
+
+        int ci = 0;
+        for (int i = len - 1; i >= 0; i--)
+        {
+            char x = shared_cpu_map_str[i];
+            if (x & 1) shared_cpu_map.enable(ci + 0);
+            if (x & 2) shared_cpu_map.enable(ci + 1);
+            if (x & 4) shared_cpu_map.enable(ci + 2);
+            if (x & 8) shared_cpu_map.enable(ci + 3);
+
+            ci += 4;
+        }
+    }
+
+    if (shared_cpu_map.num_enabled() == 1)
+        return cache_size_K * 1024;
+
+    // resolve physical cpu count in the shared_cpu_map
+    int shared_physical_cpu_count = 0;
+    {
+        std::vector<int> thread_set;
+        for (int i = 0; i < g_cpucount; i++)
+        {
+            if (!shared_cpu_map.is_enabled(i))
+                continue;
+
+            int thread_siblings = get_thread_siblings(i);
+            if (thread_siblings == -1)
+            {
+                // ignore malformed one
+                continue;
+            }
+
+            bool thread_siblings_exists = false;
+            for (size_t j = 0; j < thread_set.size(); j++)
+            {
+                if (thread_set[j] == thread_siblings)
+                {
+                    thread_siblings_exists = true;
+                    break;
+                }
+            }
+
+            if (!thread_siblings_exists)
+            {
+                thread_set.push_back(thread_siblings);
+                shared_physical_cpu_count++;
+            }
+        }
+    }
+
+    // return per-physical-core cache size with 4K aligned
+    cache_size_K = (cache_size_K / shared_physical_cpu_count + 3) / 4 * 4;
+
+    return cache_size_K * 1024;
 }
 
-#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-static inline void x86_cpuid(int level, unsigned int out[4])
+static int get_big_cpu_data_cache_size(int level)
 {
-#if defined(_MSC_VER)
-    __cpuid((int*)out, level);
-#elif defined(__clang__) || defined(__GNUC__)
-    __get_cpuid(level, out, out + 1, out + 2, out + 3);
-#else
-    NCNN_LOGE("x86_cpuid is unknown for current compiler");
-    out[0] = 0;
-    out[1] = 0;
-    out[2] = 0;
-    out[3] = 0;
-#endif
+    if (g_cpu_affinity_mask_big.num_enabled() == 0)
+    {
+        // smp cpu
+        return get_data_cache_size(0, level);
+    }
+
+    for (int i = 0; i < g_cpucount; i++)
+    {
+        if (g_cpu_affinity_mask_big.is_enabled(i))
+        {
+            return get_data_cache_size(i, level);
+        }
+    }
+
+    // should never reach here, fallback to cpu0
+    return get_data_cache_size(0, level);
 }
+#endif // defined __ANDROID__ || defined __linux__
 
-static inline void x86_cpuid_sublevel(int level, int sublevel, unsigned int out[4])
+static int get_cpu_level2_cachesize()
 {
-#if defined(_MSC_VER)
-    __cpuidex((int*)out, level, sublevel);
-#elif defined(__clang__) || defined(__GNUC__)
-    __cpuid_count(level, sublevel, out[0], out[1], out[2], out[3]);
-#else
-    NCNN_LOGE("x86_cpuid_sublevel is unknown for current compiler");
-    out[0] = 0;
-    out[1] = 0;
-    out[2] = 0;
-    out[3] = 0;
+    int size = 0;
+#if (defined _WIN32 && !(defined __MINGW32__))
+    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
+    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
+    if (glpi != NULL)
+    {
+        DWORD return_length = 0;
+        glpi(NULL, &return_length);
+
+        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
+        glpi(buffer, &return_length);
+
+        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
+        DWORD byte_offset = 0;
+        while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
+        {
+            if (ptr->Relationship == RelationCache)
+            {
+                PCACHE_DESCRIPTOR Cache = &ptr->Cache;
+                if (Cache->Level == 2)
+                {
+                    size = std::max(size, (int)Cache->Size);
+                }
+            }
+
+            byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+            ptr++;
+        }
+
+        free(buffer);
+    }
+#elif defined __ANDROID__ || defined __linux__
+    size = get_big_cpu_data_cache_size(2);
+#if defined(_SC_LEVEL2_CACHE_SIZE)
+    if (size <= 0)
+        size = sysconf(_SC_LEVEL2_CACHE_SIZE);
+#endif
+#elif __APPLE__
+    // perflevel 0 is the higher performance cluster
+    int cpusperl2 = get_hw_capability("hw.perflevel0.cpusperl2");
+    int l2cachesize = get_hw_capability("hw.perflevel0.l2cachesize");
+    size = cpusperl2 > 1 ? l2cachesize / cpusperl2 : l2cachesize;
 #endif
-}
 
-static inline int x86_get_xcr0()
-{
-#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
-    return _xgetbv(0);
-#elif defined(__i386__) || defined(__x86_64__)
-    int xcr0 = 0;
-    asm(".byte 0x0f, 0x01, 0xd0"
-        : "=a"(xcr0)
-        : "c"(0)
-        : "%edx");
-    return xcr0;
+    // fallback to a common value
+    if (size <= 0)
+    {
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+        size = 64 * 1024;
+        if (g_cpu_support_x86_avx)
+            size = 128 * 1024;
+        if (g_cpu_support_x86_avx2)
+            size = 256 * 1024;
+        if (g_cpu_support_x86_avx512)
+            size = 1024 * 1024;
+#elif __aarch64__
+        size = 256 * 1024;
+#elif __arm__
+        size = 128 * 1024;
 #else
-    NCNN_LOGE("x86_get_xcr0 is unknown for current compiler");
-    return 0xffffffff; // assume it will work
+        // is 64k still too large here ?
+        size = 64 * 1024;
 #endif
+    }
+
+    return size;
 }
 
-static int get_cpu_support_x86_avx()
+static int get_cpu_level3_cachesize()
 {
-#if !NCNN_AVX
-    return 0;
-#endif
-    unsigned int cpu_info[4] = {0};
-    x86_cpuid(0, cpu_info);
+    int size = 0;
+#if (defined _WIN32 && !(defined __MINGW32__))
+    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
+    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
+    if (glpi != NULL)
+    {
+        DWORD return_length = 0;
+        glpi(NULL, &return_length);
 
-    int nIds = cpu_info[0];
-    if (nIds < 1)
-        return 0;
+        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
+        glpi(buffer, &return_length);
 
-    x86_cpuid(1, cpu_info);
-    // check AVX XSAVE OSXSAVE
-    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
-        return 0;
+        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
+        DWORD byte_offset = 0;
+        while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
+        {
+            if (ptr->Relationship == RelationCache)
+            {
+                PCACHE_DESCRIPTOR Cache = &ptr->Cache;
+                if (Cache->Level == 3)
+                {
+                    size = std::max(size, (int)Cache->Size);
+                }
+            }
 
-    // check XSAVE enabled by kernel
-    if ((x86_get_xcr0() & 6) != 6)
-        return 0;
+            byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+            ptr++;
+        }
 
-    return 1;
+        free(buffer);
+    }
+#elif defined __ANDROID__ || defined __linux__
+    size = get_big_cpu_data_cache_size(3);
+#if defined(_SC_LEVEL3_CACHE_SIZE)
+    if (size <= 0)
+        size = sysconf(_SC_LEVEL3_CACHE_SIZE);
+#endif
+#elif __APPLE__
+    // perflevel 0 is the higher performance cluster
+    // get the size shared among all cpus
+    size = get_hw_capability("hw.perflevel0.l3cachesize");
+#endif
+
+    // l3 cache size can be zero
+
+    return size;
 }
 
-static int get_cpu_support_x86_fma()
+#if (defined _WIN32 && !(defined __MINGW32__))
+static ncnn::CpuSet get_smt_cpu_mask()
 {
-#if !NCNN_FMA
-    return 0;
-#endif
-    unsigned int cpu_info[4] = {0};
-    x86_cpuid(0, cpu_info);
-
-    int nIds = cpu_info[0];
-    if (nIds < 7)
-        return 0;
+    ncnn::CpuSet smt_cpu_mask;
 
-    x86_cpuid(1, cpu_info);
-    // check AVX XSAVE OSXSAVE
-    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
-        return 0;
+    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
+    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
+    if (glpi == NULL)
+    {
+        NCNN_LOGE("GetLogicalProcessorInformation is not supported");
+        return smt_cpu_mask;
+    }
 
-    // check XSAVE enabled by kernel
-    if ((x86_get_xcr0() & 6) != 6)
-        return 0;
+    DWORD return_length = 0;
+    glpi(NULL, &return_length);
 
-    return cpu_info[2] & (1u << 12);
-}
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
+    glpi(buffer, &return_length);
 
-static int get_cpu_support_x86_xop()
-{
-#if !NCNN_XOP
-    return 0;
-#endif
-    unsigned int cpu_info[4] = {0};
-    x86_cpuid(0x80000000, cpu_info);
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
+    DWORD byte_offset = 0;
+    while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
+    {
+        if (ptr->Relationship == RelationProcessorCore)
+        {
+            ncnn::CpuSet smt_set;
+            smt_set.mask = ptr->ProcessorMask;
+            if (smt_set.num_enabled() > 1)
+            {
+                // this core is smt
+                smt_cpu_mask.mask |= smt_set.mask;
+            }
+        }
 
-    if (cpu_info[0] < 0x80000001)
-        return 0;
+        byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+        ptr++;
+    }
 
-    x86_cpuid(0x80000001, cpu_info);
+    free(buffer);
 
-    return cpu_info[2] & (1u << 11);
+    return smt_cpu_mask;
 }
 
-static int get_cpu_support_x86_f16c()
+static std::vector<int> get_max_freq_mhz()
 {
-#if !NCNN_F16C
-    return 0;
-#endif
-    unsigned int cpu_info[4] = {0};
-    x86_cpuid(0, cpu_info);
+    typedef struct _PROCESSOR_POWER_INFORMATION
+    {
+        ULONG Number;
+        ULONG MaxMhz;
+        ULONG CurrentMhz;
+        ULONG MhzLimit;
+        ULONG MaxIdleState;
+        ULONG CurrentIdleState;
+    } PROCESSOR_POWER_INFORMATION, *PPROCESSOR_POWER_INFORMATION;
 
-    int nIds = cpu_info[0];
-    if (nIds < 1)
-        return 0;
+    HMODULE powrprof = LoadLibrary(TEXT("powrprof.dll"));
 
-    x86_cpuid(1, cpu_info);
+    typedef LONG(WINAPI * LPFN_CNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG);
+    LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(powrprof, "CallNtPowerInformation");
+    if (cnpi == NULL)
+    {
+        NCNN_LOGE("CallNtPowerInformation is not supported");
+        FreeLibrary(powrprof);
+        return std::vector<int>(g_cpucount, 0);
+    }
 
-    return cpu_info[2] & (1u << 29);
+    DWORD return_length = sizeof(PROCESSOR_POWER_INFORMATION) * g_cpucount;
+    PPROCESSOR_POWER_INFORMATION buffer = (PPROCESSOR_POWER_INFORMATION)malloc(return_length);
+
+    cnpi(ProcessorInformation, NULL, 0, buffer, return_length);
+
+    std::vector<int> ret;
+    for (int i = 0; i < g_cpucount; i++)
+    {
+        ULONG max_mhz = buffer[i].MaxMhz;
+        ret.push_back(max_mhz);
+    }
+
+    free(buffer);
+    FreeLibrary(powrprof);
+    return ret;
 }
 
-static int get_cpu_support_x86_avx2()
+static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
 {
-#if !NCNN_AVX2
+    DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
+    if (prev_mask == 0)
+    {
+        NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
+        return -1;
+    }
+
     return 0;
-#endif
-    unsigned int cpu_info[4] = {0};
-    x86_cpuid(0, cpu_info);
+}
+#endif // (defined _WIN32 && !(defined __MINGW32__))
 
-    int nIds = cpu_info[0];
-    if (nIds < 7)
-        return 0;
+#if defined __ANDROID__ || defined __linux__
+static int get_max_freq_khz(int cpuid)
+{
+    // first try, for all possible cpu
+    char path[256];
+    sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
 
-    x86_cpuid(1, cpu_info);
-    // check AVX XSAVE OSXSAVE
-    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
-        return 0;
+    FILE* fp = fopen(path, "rb");
 
-    // check XSAVE enabled by kernel
-    if ((x86_get_xcr0() & 6) != 6)
-        return 0;
+    if (!fp)
+    {
+        // second try, for online cpu
+        sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
+        fp = fopen(path, "rb");
 
-    x86_cpuid_sublevel(7, 0, cpu_info);
-    return cpu_info[1] & (1u << 5);
-}
+        if (fp)
+        {
+            int max_freq_khz = 0;
+            while (!feof(fp))
+            {
+                int freq_khz = 0;
+                int nscan = fscanf(fp, "%d %*d", &freq_khz);
+                if (nscan != 1)
+                    break;
 
-static int get_cpu_support_x86_avx_vnni()
-{
-#if !NCNN_AVXVNNI
-    return 0;
-#endif
-    unsigned int cpu_info[4] = {0};
-    x86_cpuid(0, cpu_info);
+                if (freq_khz > max_freq_khz)
+                    max_freq_khz = freq_khz;
+            }
 
-    int nIds = cpu_info[0];
-    if (nIds < 7)
-        return 0;
+            fclose(fp);
 
-    x86_cpuid(1, cpu_info);
-    // check AVX XSAVE OSXSAVE
-    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
-        return 0;
+            if (max_freq_khz != 0)
+                return max_freq_khz;
 
-    // check XSAVE enabled by kernel
-    if ((x86_get_xcr0() & 6) != 6)
-        return 0;
+            fp = NULL;
+        }
 
-    x86_cpuid_sublevel(7, 1, cpu_info);
-    return cpu_info[0] & (1u << 4);
-}
+        if (!fp)
+        {
+            // third try, for online cpu
+            sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
+            fp = fopen(path, "rb");
 
-static int get_cpu_support_x86_avx512()
-{
-#if !NCNN_AVX512
-    return 0;
-#endif
-    unsigned int cpu_info[4] = {0};
-    x86_cpuid(0, cpu_info);
+            if (!fp)
+                return -1;
 
-    int nIds = cpu_info[0];
-    if (nIds < 7)
-        return 0;
+            int max_freq_khz = -1;
+            int nscan = fscanf(fp, "%d", &max_freq_khz);
+            if (nscan != 1)
+            {
+                NCNN_LOGE("fscanf cpuinfo_max_freq error %d", nscan);
+            }
+            fclose(fp);
 
-    x86_cpuid(1, cpu_info);
-    // check AVX XSAVE OSXSAVE
-    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
-        return 0;
+            return max_freq_khz;
+        }
+    }
 
-    // check XSAVE enabled by kernel
-    if ((x86_get_xcr0() & 6) != 6)
-        return 0;
+    int max_freq_khz = 0;
+    while (!feof(fp))
+    {
+        int freq_khz = 0;
+        int nscan = fscanf(fp, "%d %*d", &freq_khz);
+        if (nscan != 1)
+            break;
 
-    // check avx512 XSAVE enabled by kernel
-    if ((x86_get_xcr0() & 0xe0) != 0xe0)
-        return 0;
+        if (freq_khz > max_freq_khz)
+            max_freq_khz = freq_khz;
+    }
 
-    x86_cpuid_sublevel(7, 0, cpu_info);
-    return (cpu_info[1] & (1u << 16)) && (cpu_info[1] & (1u << 17)) && (cpu_info[1] & (1u << 28)) && (cpu_info[1] & (1u << 30)) && (cpu_info[1] & (1u << 31));
+    fclose(fp);
+
+    return max_freq_khz;
 }
 
-static int get_cpu_support_x86_avx512_vnni()
+static bool is_smt_cpu(int cpuid)
 {
-#if !NCNN_AVX512VNNI
-    return 0;
-#endif
-    unsigned int cpu_info[4] = {0};
-    x86_cpuid(0, cpu_info);
+    // https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-72
+    char path[256];
+    sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/core_cpus_list", cpuid);
 
-    int nIds = cpu_info[0];
-    if (nIds < 7)
-        return 0;
+    FILE* fp = fopen(path, "rb");
 
-    x86_cpuid(1, cpu_info);
-    // check AVX XSAVE OSXSAVE
-    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
-        return 0;
+    if (!fp)
+    {
+        sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid);
+        fp = fopen(path, "rb");
 
-    // check XSAVE enabled by kernel
-    if ((x86_get_xcr0() & 6) != 6)
-        return 0;
+        if (!fp)
+            return false;
+    }
 
-    // check avx512 XSAVE enabled by kernel
-    if ((x86_get_xcr0() & 0xe0) != 0xe0)
-        return 0;
+    bool is_smt = false;
+    while (!feof(fp))
+    {
+        char ch = fgetc(fp);
+        if (ch == ',' || ch == '-')
+        {
+            is_smt = true;
+            break;
+        }
+    }
 
-    x86_cpuid_sublevel(7, 0, cpu_info);
-    return cpu_info[2] & (1u << 11);
+    fclose(fp);
+
+    return is_smt;
 }
 
-static int get_cpu_support_x86_avx512_bf16()
+static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
 {
-#if !NCNN_AVX512BF16
-    return 0;
+    // set affinity for thread
+#if defined(__BIONIC__)
+    pid_t pid = gettid();
+#else
+    pid_t pid = syscall(SYS_gettid);
 #endif
-    unsigned int cpu_info[4] = {0};
-    x86_cpuid(0, cpu_info);
-
-    int nIds = cpu_info[0];
-    if (nIds < 7)
-        return 0;
-
-    x86_cpuid(1, cpu_info);
-    // check AVX XSAVE OSXSAVE
-    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
-        return 0;
 
-    // check XSAVE enabled by kernel
-    if ((x86_get_xcr0() & 6) != 6)
-        return 0;
+    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
+    if (syscallret)
+    {
+        NCNN_LOGE("syscall error %d", syscallret);
+        return -1;
+    }
 
-    x86_cpuid_sublevel(7, 1, cpu_info);
-    return cpu_info[0] & (1u << 5);
+    return 0;
 }
+#endif // defined __ANDROID__ || defined __linux__
 
-static int get_cpu_support_x86_avx512_fp16()
+#if __APPLE__
+static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
 {
-#if !NCNN_AVX512FP16
-    return 0;
-#endif
-    unsigned int cpu_info[4] = {0};
-    x86_cpuid(0, cpu_info);
+    // https://developer.apple.com/library/archive/releasenotes/Performance/RN-AffinityAPI/index.html
+    // http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html
+    // https://gist.github.com/Coneko/4234842
 
-    int nIds = cpu_info[0];
-    if (nIds < 7)
-        return 0;
+    // This is a quite outdated document. Apple will not allow developers to set CPU affinity.
+    // In OS X 10.5 it worked, later it became a suggestion to OS X, then in 10.10 or so (as well in later ones), macOS will ignore any affinity settings.
+    // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919   --- AmeAkio
 
-    x86_cpuid(1, cpu_info);
-    // check AVX XSAVE OSXSAVE
-    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
-        return 0;
+    int affinity_tag = THREAD_AFFINITY_TAG_NULL;
+    for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++)
+    {
+        if (thread_affinity_mask.is_enabled(i))
+        {
+            affinity_tag = i + 1;
+            break;
+        }
+    }
 
-    // check XSAVE enabled by kernel
-    if ((x86_get_xcr0() & 6) != 6)
-        return 0;
+    mach_port_t tid = pthread_mach_thread_np(pthread_self());
 
-    // check avx512 XSAVE enabled by kernel
-    if ((x86_get_xcr0() & 0xe0) != 0xe0)
-        return 0;
+    thread_affinity_policy_data_t policy_data;
+    policy_data.affinity_tag = affinity_tag;
+    int ret = thread_policy_set(tid, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy_data, THREAD_AFFINITY_POLICY_COUNT);
+    if (ret && ret != KERN_NOT_SUPPORTED)
+    {
+        NCNN_LOGE("thread_policy_set error %d", ret);
+        return -1;
+    }
 
-    x86_cpuid_sublevel(7, 0, cpu_info);
-    return cpu_info[3] & (1u << 23);
+    return 0;
 }
+#endif // __APPLE__
 
-static int g_cpu_support_x86_avx = get_cpu_support_x86_avx();
-static int g_cpu_support_x86_fma = get_cpu_support_x86_fma();
-static int g_cpu_support_x86_xop = get_cpu_support_x86_xop();
-static int g_cpu_support_x86_f16c = get_cpu_support_x86_f16c();
-static int g_cpu_support_x86_avx2 = get_cpu_support_x86_avx2();
-static int g_cpu_support_x86_avx_vnni = get_cpu_support_x86_avx_vnni();
-static int g_cpu_support_x86_avx512 = get_cpu_support_x86_avx512();
-static int g_cpu_support_x86_avx512_vnni = get_cpu_support_x86_avx512_vnni();
-static int g_cpu_support_x86_avx512_bf16 = get_cpu_support_x86_avx512_bf16();
-static int g_cpu_support_x86_avx512_fp16 = get_cpu_support_x86_avx512_fp16();
-#else  // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-static const int g_cpu_support_x86_avx = 0;
-static const int g_cpu_support_x86_fma = 0;
-static const int g_cpu_support_x86_xop = 0;
-static const int g_cpu_support_x86_f16c = 0;
-static const int g_cpu_support_x86_avx2 = 0;
-static const int g_cpu_support_x86_avx_vnni = 0;
-static const int g_cpu_support_x86_avx512 = 0;
-static const int g_cpu_support_x86_avx512_vnni = 0;
-static const int g_cpu_support_x86_avx512_bf16 = 0;
-static const int g_cpu_support_x86_avx512_fp16 = 0;
-#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-
-int cpu_support_x86_avx()
+static void initialize_cpu_thread_affinity_mask(ncnn::CpuSet& mask_all, ncnn::CpuSet& mask_little, ncnn::CpuSet& mask_big)
 {
-    return g_cpu_support_x86_avx;
-}
+    mask_all.disable_all();
 
-int cpu_support_x86_fma()
-{
-    return g_cpu_support_x86_fma;
-}
+#if (defined _WIN32 && !(defined __MINGW32__))
+    // get max freq mhz for all cores
+    int max_freq_mhz_min = INT_MAX;
+    int max_freq_mhz_max = 0;
+    std::vector<int> cpu_max_freq_mhz = get_max_freq_mhz();
+    for (int i = 0; i < g_cpucount; i++)
+    {
+        int max_freq_mhz = cpu_max_freq_mhz[i];
 
-int cpu_support_x86_xop()
-{
-    return g_cpu_support_x86_xop;
-}
+        // NCNN_LOGE("%d max freq = %d khz", i, max_freq_mhz);
 
-int cpu_support_x86_f16c()
-{
-    return g_cpu_support_x86_f16c;
-}
+        if (max_freq_mhz > max_freq_mhz_max)
+            max_freq_mhz_max = max_freq_mhz;
+        if (max_freq_mhz < max_freq_mhz_min)
+            max_freq_mhz_min = max_freq_mhz;
+    }
 
-int cpu_support_x86_avx2()
-{
-    return g_cpu_support_x86_avx2;
-}
+    int max_freq_mhz_medium = (max_freq_mhz_min + max_freq_mhz_max) / 2;
+    if (max_freq_mhz_medium == max_freq_mhz_max)
+    {
+        mask_little.disable_all();
+        mask_big = mask_all;
+        return;
+    }
 
-int cpu_support_x86_avx_vnni()
-{
-    return g_cpu_support_x86_avx_vnni;
-}
+    ncnn::CpuSet smt_cpu_mask = get_smt_cpu_mask();
+
+    for (int i = 0; i < g_cpucount; i++)
+    {
+        if (smt_cpu_mask.is_enabled(i))
+        {
+            // always treat smt core as big core
+            mask_big.enable(i);
+            continue;
+        }
+
+        if (cpu_max_freq_mhz[i] < max_freq_mhz_medium)
+            mask_little.enable(i);
+        else
+            mask_big.enable(i);
+    }
+#elif defined __ANDROID__ || defined __linux__
+    int max_freq_khz_min = INT_MAX;
+    int max_freq_khz_max = 0;
+    std::vector<int> cpu_max_freq_khz(g_cpucount);
+    for (int i = 0; i < g_cpucount; i++)
+    {
+        int max_freq_khz = get_max_freq_khz(i);
 
-int cpu_support_x86_avx512()
-{
-    return g_cpu_support_x86_avx512;
-}
+        // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz);
 
-int cpu_support_x86_avx512_vnni()
-{
-    return g_cpu_support_x86_avx512_vnni;
-}
+        cpu_max_freq_khz[i] = max_freq_khz;
 
-int cpu_support_x86_avx512_bf16()
-{
-    return g_cpu_support_x86_avx512_bf16;
-}
+        if (max_freq_khz > max_freq_khz_max)
+            max_freq_khz_max = max_freq_khz;
+        if (max_freq_khz < max_freq_khz_min)
+            max_freq_khz_min = max_freq_khz;
+    }
 
-int cpu_support_x86_avx512_fp16()
-{
-    return g_cpu_support_x86_avx512_fp16;
-}
+    int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2;
+    if (max_freq_khz_medium == max_freq_khz_max)
+    {
+        mask_little.disable_all();
+        mask_big = mask_all;
+        return;
+    }
 
-int cpu_support_mips_msa()
-{
-#if defined __ANDROID__ || defined __linux__
-#if __mips__
-    return g_hwcaps & HWCAP_MIPS_MSA;
-#else
-    return 0;
-#endif
-#else
-    return 0;
-#endif
-}
+    for (int i = 0; i < g_cpucount; i++)
+    {
+        if (is_smt_cpu(i))
+        {
+            // always treat smt core as big core
+            mask_big.enable(i);
+            continue;
+        }
 
-int cpu_support_loongarch_lsx()
-{
-#if defined __ANDROID__ || defined __linux__
-#if __loongarch64
-    return g_hwcaps & HWCAP_LOONGARCH_LSX;
-#else
-    return 0;
-#endif
+        if (cpu_max_freq_khz[i] < max_freq_khz_medium)
+            mask_little.enable(i);
+        else
+            mask_big.enable(i);
+    }
+#elif __APPLE__
+    int nperflevels = get_hw_capability("hw.nperflevels");
+    if (nperflevels == 1)
+    {
+        // smp models
+        mask_little.disable_all();
+        mask_big = mask_all;
+    }
+    else
+    {
+        // two or more clusters, level0 is the high-performance cluster
+        int perflevel0_logicalcpu = get_hw_capability("hw.perflevel0.logicalcpu_max");
+        for (int i = 0; i < perflevel0_logicalcpu; i++)
+        {
+            mask_big.enable(i);
+        }
+        for (int i = perflevel0_logicalcpu; i < g_cpucount; i++)
+        {
+            mask_little.enable(i);
+        }
+    }
 #else
-    return 0;
+    // TODO implement me for other platforms
+    mask_little.disable_all();
+    mask_big = mask_all;
 #endif
 }
 
-int cpu_support_loongarch_lasx()
-{
 #if defined __ANDROID__ || defined __linux__
-#if __loongarch64
-    return g_hwcaps & HWCAP_LOONGARCH_LASX;
-#else
-    return 0;
-#endif
-#else
-    return 0;
-#endif
-}
-
-int cpu_support_loongson_mmi()
+#if __aarch64__
+union midr_info_t
 {
-#if defined __ANDROID__ || defined __linux__
-#if __mips__
-    return g_hwcaps & HWCAP_LOONGSON_MMI;
-#else
-    return 0;
-#endif
-#else
-    return 0;
-#endif
-}
+    struct __attribute__((packed))
+    {
+        unsigned int revision : 4;
+        unsigned int part : 12;
+        unsigned int architecture : 4;
+        unsigned int variant : 4;
+        unsigned int implementer : 8;
+    };
+    unsigned int midr;
 
-int cpu_support_riscv_v()
-{
-#if defined __ANDROID__ || defined __linux__
-#if __riscv
-    return g_hwcaps & COMPAT_HWCAP_ISA_V;
-#else
-    return 0;
-#endif
-#else
-    return 0;
-#endif
-}
+    midr_info_t(unsigned int _midr)
+        : midr(_midr)
+    {
+    }
+};
 
-int cpu_support_riscv_zfh()
+static unsigned int get_midr_from_sysfs(int cpuid)
 {
-#if defined __ANDROID__ || defined __linux__
-#if __riscv
-    // v + f does not imply zfh, but how to discover zfh properly ?
-    // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414
-    return g_hwcaps & COMPAT_HWCAP_ISA_V && g_hwcaps & COMPAT_HWCAP_ISA_F;
-#else
-    return 0;
-#endif
-#else
-    return 0;
-#endif
-}
+    char path[256];
+    sprintf(path, "/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1", cpuid);
 
-int cpu_riscv_vlenb()
-{
-#if __riscv
-    if (!cpu_support_riscv_v())
+    FILE* fp = fopen(path, "rb");
+    if (!fp)
         return 0;
 
-    int a = 0;
-    asm volatile(
-        ".word  0xc22026f3  \n" // csrr  a3, vlenb
-        "mv     %0, a3      \n"
-        : "=r"(a)
-        :
-        : "memory", "a3");
-    return a;
-#else
-    return 0;
-#endif
+    unsigned int midr_el1 = 0;
+    int nscan = fscanf(fp, "%x", &midr_el1);
+    if (nscan != 1)
+    {
+        // ignore
+    }
+
+    fclose(fp);
+
+    return midr_el1;
 }
 
-static int get_cpucount()
+static int get_midr_from_proc_cpuinfo(std::vector<unsigned int>& midrs)
 {
-    int count = 0;
-#ifdef __EMSCRIPTEN__
-    if (emscripten_has_threading_support())
-        count = emscripten_num_logical_cores();
-    else
-        count = 1;
-#elif (defined _WIN32 && !(defined __MINGW32__))
-    SYSTEM_INFO system_info;
-    GetSystemInfo(&system_info);
-    count = system_info.dwNumberOfProcessors;
-#elif defined __ANDROID__ || defined __linux__
-    // get cpu count from /proc/cpuinfo
     FILE* fp = fopen("/proc/cpuinfo", "rb");
     if (!fp)
-        return 1;
+        return -1;
+
+    midrs.resize(g_cpucount, 0);
+
+    int cpuid = -1;
+    midr_info_t midr_info(0);
 
     char line[1024];
     while (!feof(fp))
@@ -1241,1295 +1514,1145 @@ static int get_cpucount()
 
         if (memcmp(line, "processor", 9) == 0)
         {
-            count++;
-        }
-    }
-
-    fclose(fp);
-#elif __APPLE__
-    size_t len = sizeof(count);
-    sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
-#else
-#ifdef _OPENMP
-    count = omp_get_max_threads();
-#else
-    count = 1;
-#endif // _OPENMP
-#endif
-
-    if (count < 1)
-        count = 1;
-
-    return count;
-}
-
-static int g_cpucount = get_cpucount();
+            // processor       : 4
+            int id = -1;
+            int nscan = sscanf(line, "%*[^:]: %d", &id);
+            if (nscan != 1)
+                continue;
 
-int get_cpu_count()
-{
-    return g_cpucount;
-}
+            if (cpuid >= 0 && cpuid < g_cpucount)
+            {
+                if (midr_info.midr == 0)
+                {
+                    // shared midr
+                    midrs[cpuid] = (unsigned int)-1;
+                }
+                else
+                {
+                    // save midr and reset
+                    midrs[cpuid] = midr_info.midr;
+                    for (int i = 0; i < g_cpucount; i++)
+                    {
+                        if (midrs[i] == (unsigned int)-1)
+                            midrs[i] = midr_info.midr;
+                    }
+                }
 
-int get_little_cpu_count()
-{
-    return get_cpu_thread_affinity_mask(1).num_enabled();
-}
+                midr_info.midr = 0;
+            }
 
-int get_big_cpu_count()
-{
-    int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled();
-    return big_cpu_count ? big_cpu_count : g_cpucount;
-}
+            cpuid = id;
+        }
 
-#if defined __ANDROID__ || defined __linux__
-static int get_thread_siblings(int cpuid)
-{
-    char path[256];
-    sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpuid);
+        if (cpuid == -1)
+            continue;
 
-    FILE* fp = fopen(path, "rb");
-    if (!fp)
-        return -1;
+        if (memcmp(line, "CPU implementer", 15) == 0)
+        {
+            // CPU implementer : 0x51
+            unsigned int id = 0;
+            int nscan = sscanf(line, "%*[^:]: %x", &id);
+            if (nscan != 1)
+                continue;
 
-    int thread_siblings = -1;
-    int nscan = fscanf(fp, "%x", &thread_siblings);
-    if (nscan != 1)
-    {
-        // ignore
-    }
+            midr_info.implementer = id;
+        }
+        else if (memcmp(line, "CPU architecture", 16) == 0)
+        {
+            // CPU architecture: 8
+            int id = 0;
+            int nscan = sscanf(line, "%*[^:]: %d", &id);
+            if (nscan != 1)
+                continue;
 
-    fclose(fp);
+            midr_info.architecture = id;
+        }
+        else if (memcmp(line, "CPU variant", 11) == 0)
+        {
+            // CPU variant     : 0xd
+            int id = 0;
+            int nscan = sscanf(line, "%*[^:]: %x", &id);
+            if (nscan != 1)
+                continue;
 
-    return thread_siblings;
-}
-#endif // defined __ANDROID__ || defined __linux__
+            midr_info.variant = id;
+        }
+        else if (memcmp(line, "CPU part", 8) == 0)
+        {
+            // CPU part        : 0x804
+            int id = 0;
+            int nscan = sscanf(line, "%*[^:]: %x", &id);
+            if (nscan != 1)
+                continue;
 
-static int get_physical_cpucount()
-{
-    int count = 0;
-#if (defined _WIN32 && !(defined __MINGW32__))
-    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
-    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
-    if (glpi == NULL)
-    {
-        NCNN_LOGE("GetLogicalProcessorInformation is not supported");
-        return g_cpucount;
-    }
+            midr_info.part = id;
+        }
+        else if (memcmp(line, "CPU revision", 12) == 0)
+        {
+            // CPU revision    : 14
+            int id = 0;
+            int nscan = sscanf(line, "%*[^:]: %d", &id);
+            if (nscan != 1)
+                continue;
 
-    DWORD return_length = 0;
-    glpi(NULL, &return_length);
+            midr_info.revision = id;
+        }
+    }
 
-    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
-    glpi(buffer, &return_length);
+    fclose(fp);
 
-    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
-    DWORD byte_offset = 0;
-    while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
+    if (cpuid >= 0 && cpuid < g_cpucount)
     {
-        if (ptr->Relationship == RelationProcessorCore)
+        if (midr_info.midr == 0)
         {
-            count++;
+            // shared midr
+            midrs[cpuid] = (unsigned int)-1;
+        }
+        else
+        {
+            // save midr and reset
+            midrs[cpuid] = midr_info.midr;
+            for (int i = 0; i < g_cpucount; i++)
+            {
+                if (midrs[i] == (unsigned int)-1)
+                    midrs[i] = midr_info.midr;
+            }
         }
 
-        byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
-        ptr++;
+        midr_info.midr = 0;
     }
 
-    free(buffer);
-#elif defined __ANDROID__ || defined __linux__
-    std::vector<int> thread_set;
-    for (int i = 0; i < g_cpucount; i++)
+    // /proc/cpuinfo may only report little/online cores on old kernel
+    if (g_cpu_affinity_mask_big.num_enabled() == g_cpucount)
     {
-        int thread_siblings = get_thread_siblings(i);
-        if (thread_siblings == -1)
+        // assign the remaining unknown midrs for smp cpu
+        for (int i = 0; i < g_cpucount; i++)
         {
-            // ignore malformed one
-            continue;
+            if (midrs[i] == 0)
+                midrs[i] = midr_info.midr;
         }
-
-        bool thread_siblings_exists = false;
-        for (size_t j = 0; j < thread_set.size(); j++)
+    }
+    else
+    {
+        // clear the big core midrs for hmp cpu if they are the same as little cores
+        unsigned int little_midr = 0;
+        for (int i = 0; i < g_cpucount; i++)
         {
-            if (thread_set[j] == thread_siblings)
+            if (g_cpu_affinity_mask_little.is_enabled(i))
             {
-                thread_siblings_exists = true;
+                little_midr = midrs[i];
                 break;
             }
         }
 
-        if (!thread_siblings_exists)
+        for (int i = 0; i < g_cpucount; i++)
         {
-            thread_set.push_back(thread_siblings);
-            count++;
+            if (g_cpu_affinity_mask_big.is_enabled(i))
+            {
+                if (midrs[i] == little_midr)
+                {
+                    midrs[i] = 0;
+                }
+            }
         }
     }
-#elif __APPLE__
-    size_t len = sizeof(count);
-    sysctlbyname("hw.physicalcpu_max", &count, &len, NULL, 0);
-#else
-    count = g_cpucount;
-#endif
-
-    if (count > g_cpucount)
-        count = g_cpucount;
-
-    return count;
-}
-
-static int g_physical_cpucount = get_physical_cpucount();
 
-int get_physical_cpu_count()
-{
-    return g_physical_cpucount;
+    return 0;
 }
 
-int get_physical_little_cpu_count()
+// return midr for the current running core
+static unsigned int get_midr_from_register()
 {
-    if (g_physical_cpucount == g_cpucount)
-        return get_little_cpu_count();
+    uint64_t midr;
+    asm volatile("mrs   %0, MIDR_EL1"
+                 : "=r"(midr));
 
-    return g_physical_cpucount * 2 - g_cpucount;
+    return (unsigned int)midr;
 }
 
-int get_physical_big_cpu_count()
+static int get_sched_affinity(ncnn::CpuSet& thread_affinity_mask)
 {
-    if (g_physical_cpucount == g_cpucount)
-        return get_big_cpu_count();
-
-    return g_cpucount - g_physical_cpucount;
-}
+    // get affinity for thread
+#if defined(__BIONIC__)
+    pid_t pid = gettid();
+#else
+    pid_t pid = syscall(SYS_gettid);
+#endif
 
-#if defined __ANDROID__ || defined __linux__
-static int get_data_cache_size(int cpuid, int level)
-{
-    char path[256];
+    thread_affinity_mask.disable_all();
 
-    // discover sysfs cache entry
-    int indexid = -1;
-    for (int i = 0;; i++)
+    int syscallret = syscall(__NR_sched_getaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
+    if (syscallret)
     {
-        // check level
-        {
-            sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpuid, i);
-            FILE* fp = fopen(path, "rb");
-            if (!fp)
-                break;
-
-            int cache_level = -1;
-            int nscan = fscanf(fp, "%d", &cache_level);
-            fclose(fp);
-            if (nscan != 1 || cache_level != level)
-                continue;
-        }
-
-        // check type
-        {
-            sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/type", cpuid, i);
-            FILE* fp = fopen(path, "rb");
-            if (!fp)
-                break;
-
-            char type[32];
-            int nscan = fscanf(fp, "%31s", type);
-            fclose(fp);
-            if (nscan != 1 || (strcmp(type, "Data") != 0 && strcmp(type, "Unified") != 0))
-                continue;
-        }
-
-        indexid = i;
-        break;
+        // handle get error silently
+        return -1;
     }
 
-    if (indexid == -1)
-    {
-        // no sysfs entry
-        return 0;
-    }
+    return 0;
+}
 
-    // get size
-    int cache_size_K = 0;
-    {
-        sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpuid, indexid);
-        FILE* fp = fopen(path, "rb");
-        if (!fp)
-            return 0;
+static int midr_is_a53_a55(unsigned int midr)
+{
+    // 0x 41 ? f d03 ? = arm cortex-a53
+    // 0x 51 ? f 801 ? = qcom kryo200 a53
+    // 0x 41 ? f d04 ? = arm cortex-a35
+    // 0x 41 ? f d05 ? = arm cortex-a55
+    // 0x 51 ? f 803 ? = qcom kryo300 a55
+    // 0x 51 ? f 805 ? = qcom kryo400 a55
 
-        int nscan = fscanf(fp, "%dK", &cache_size_K);
-        fclose(fp);
-        if (nscan != 1)
-        {
-            NCNN_LOGE("fscanf cache_size_K error %d", nscan);
-            return 0;
-        }
-    }
+    midr_info_t midr_info(midr);
 
-    // parse shared_cpu_map mask
-    CpuSet shared_cpu_map;
-    {
-        sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map", cpuid, indexid);
-        FILE* fp = fopen(path, "rb");
-        if (!fp)
-            return 0;
+    return (midr_info.implementer == 0x41 && midr_info.part == 0xd03)
+           || (midr_info.implementer == 0x51 && midr_info.part == 0x801)
+           || (midr_info.implementer == 0x41 && midr_info.part == 0xd04)
+           || (midr_info.implementer == 0x41 && midr_info.part == 0xd05)
+           || (midr_info.implementer == 0x51 && midr_info.part == 0x803)
+           || (midr_info.implementer == 0x51 && midr_info.part == 0x805);
+}
 
-        char shared_cpu_map_str[256];
-        int nscan = fscanf(fp, "%255s", shared_cpu_map_str);
-        fclose(fp);
-        if (nscan != 1)
-        {
-            NCNN_LOGE("fscanf shared_cpu_map error %d", nscan);
-            return 0;
-        }
+static int detect_cpu_is_arm_a53_a55()
+{
+    int a53_a55_cpu_count = 0;
 
-        int len = strlen(shared_cpu_map_str);
+    // first try, iterate /sys/devices/system/cpu/cpuX/regs/identification/midr_el1
+    bool sysfs_midr = true;
+    for (int i = 0; i < g_cpucount; i++)
+    {
+        unsigned int midr = 0;
 
-        if (shared_cpu_map_str[0] == '0' && shared_cpu_map_str[1] == 'x')
+        // for kernel 4.7+
+        midr = get_midr_from_sysfs(i);
+        if (midr == 0)
         {
-            // skip leading 0x
-            len -= 2;
+            sysfs_midr = false;
+            break;
         }
 
-        int ci = 0;
-        for (int i = len - 1; i >= 0; i--)
+        if (midr_is_a53_a55(midr))
         {
-            char x = shared_cpu_map_str[i];
-            if (x & 1) shared_cpu_map.enable(ci + 0);
-            if (x & 2) shared_cpu_map.enable(ci + 1);
-            if (x & 4) shared_cpu_map.enable(ci + 2);
-            if (x & 8) shared_cpu_map.enable(ci + 3);
-
-            ci += 4;
+            a53_a55_cpu_count++;
         }
     }
 
-    if (shared_cpu_map.num_enabled() == 1)
-        return cache_size_K * 1024;
-
-    // resolve physical cpu count in the shared_cpu_map
-    int shared_physical_cpu_count = 0;
+    if (!sysfs_midr)
     {
-        std::vector<int> thread_set;
-        for (int i = 0; i < g_cpucount; i++)
+        // second try, collect midr from /proc/cpuinfo
+        std::vector<unsigned int> midrs;
+        int ret = get_midr_from_proc_cpuinfo(midrs);
+        if (ret == 0 && (int)midrs.size() == g_cpucount)
         {
-            if (!shared_cpu_map.is_enabled(i))
-                continue;
-
-            int thread_siblings = get_thread_siblings(i);
-            if (thread_siblings == -1)
-            {
-                // ignore malformed one
-                continue;
-            }
-
-            bool thread_siblings_exists = false;
-            for (size_t j = 0; j < thread_set.size(); j++)
+            for (int i = 0; i < g_cpucount; i++)
             {
-                if (thread_set[j] == thread_siblings)
+                if (midr_is_a53_a55(midrs[i]))
                 {
-                    thread_siblings_exists = true;
-                    break;
+                    a53_a55_cpu_count++;
                 }
             }
-
-            if (!thread_siblings_exists)
-            {
-                thread_set.push_back(thread_siblings);
-                shared_physical_cpu_count++;
-            }
+        }
+        else
+        {
+            // third try, assume all aarch64 little cores are a53/a55
+            a53_a55_cpu_count = g_cpu_affinity_mask_little.num_enabled();
         }
     }
 
-    // return per-physical-core cache size with 4K aligned
-    cache_size_K = (cache_size_K / shared_physical_cpu_count + 3) / 4 * 4;
+    if (a53_a55_cpu_count == 0)
+        return 0; // all non a53/a55
 
-    return cache_size_K * 1024;
+    if (a53_a55_cpu_count == g_cpucount)
+        return 1; // all a53/a55
+
+    // little cores are a53/a55
+    return 2;
 }
+#endif // __aarch64__
+#endif // defined __ANDROID__ || defined __linux__
 
-static int get_big_cpu_data_cache_size(int level)
+// the initialization
+static void initialize_global_cpu_info()
 {
-    const CpuSet& big_cs = get_cpu_thread_affinity_mask(2);
-    if (big_cs.num_enabled() == 0)
-    {
-        // smp cpu
-        return get_data_cache_size(0, level);
-    }
+    g_cpucount = get_cpucount();
+    g_physical_cpucount = get_physical_cpucount();
+    g_powersave = 0;
+    initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big);
 
-    for (int i = 0; i < g_cpucount; i++)
-    {
-        if (big_cs.is_enabled(i))
-        {
-            return get_data_cache_size(i, level);
-        }
-    }
+#if defined __ANDROID__ || defined __linux__
+    g_hwcaps = get_elf_hwcap(AT_HWCAP);
+    g_hwcaps2 = get_elf_hwcap(AT_HWCAP2);
+#endif // defined __ANDROID__ || defined __linux__
 
-    // should never reach here, fallback to cpu0
-    return get_data_cache_size(0, level);
-}
+#if __APPLE__
+    g_hw_cpufamily = get_hw_cpufamily();
+    g_hw_cputype = get_hw_cputype();
+    g_hw_cpusubtype = get_hw_cpusubtype();
+
+    g_hw_optional_arm_FEAT_FP16 = get_hw_capability("hw.optional.arm.FEAT_FP16");
+    g_hw_optional_arm_FEAT_DotProd = get_hw_capability("hw.optional.arm.FEAT_DotProd");
+    g_hw_optional_arm_FEAT_FHM = get_hw_capability("hw.optional.arm.FEAT_FHM");
+    g_hw_optional_arm_FEAT_BF16 = get_hw_capability("hw.optional.arm.FEAT_BF16");
+    g_hw_optional_arm_FEAT_I8MM = get_hw_capability("hw.optional.arm.FEAT_I8MM");
+#endif // __APPLE__
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+    g_cpu_support_x86_avx = get_cpu_support_x86_avx();
+    g_cpu_support_x86_fma = get_cpu_support_x86_fma();
+    g_cpu_support_x86_xop = get_cpu_support_x86_xop();
+    g_cpu_support_x86_f16c = get_cpu_support_x86_f16c();
+    g_cpu_support_x86_avx2 = get_cpu_support_x86_avx2();
+    g_cpu_support_x86_avx_vnni = get_cpu_support_x86_avx_vnni();
+    g_cpu_support_x86_avx512 = get_cpu_support_x86_avx512();
+    g_cpu_support_x86_avx512_vnni = get_cpu_support_x86_avx512_vnni();
+    g_cpu_support_x86_avx512_bf16 = get_cpu_support_x86_avx512_bf16();
+    g_cpu_support_x86_avx512_fp16 = get_cpu_support_x86_avx512_fp16();
+#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+
+    g_cpu_level2_cachesize = get_cpu_level2_cachesize();
+    g_cpu_level3_cachesize = get_cpu_level3_cachesize();
+
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    g_cpu_is_arm_a53_a55 = detect_cpu_is_arm_a53_a55();
+#endif // __aarch64__
 #endif // defined __ANDROID__ || defined __linux__
+}
 
-static int get_cpu_level2_cachesize()
+static int g_cpu_info_initialized = 0;
+
+static inline void try_initialize_global_cpu_info()
 {
-    int size = 0;
-#if (defined _WIN32 && !(defined __MINGW32__))
-    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
-    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
-    if (glpi != NULL)
+    if (!g_cpu_info_initialized)
     {
-        DWORD return_length = 0;
-        glpi(NULL, &return_length);
+        initialize_global_cpu_info();
+        g_cpu_info_initialized = 1;
+    }
+}
 
-        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
-        glpi(buffer, &return_length);
+namespace ncnn {
 
-        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
-        DWORD byte_offset = 0;
-        while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
-        {
-            if (ptr->Relationship == RelationCache)
-            {
-                PCACHE_DESCRIPTOR Cache = &ptr->Cache;
-                if (Cache->Level == 2)
-                {
-                    size = std::max(size, (int)Cache->Size);
-                }
-            }
+#if (defined _WIN32 && !(defined __MINGW32__))
+CpuSet::CpuSet()
+{
+    disable_all();
+}
 
-            byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
-            ptr++;
-        }
+void CpuSet::enable(int cpu)
+{
+    mask |= (1 << cpu);
+}
 
-        free(buffer);
-    }
-#elif defined __ANDROID__ || defined __linux__
-    size = get_big_cpu_data_cache_size(2);
-#if defined(_SC_LEVEL2_CACHE_SIZE)
-    if (size <= 0)
-        size = sysconf(_SC_LEVEL2_CACHE_SIZE);
-#endif
-#elif __APPLE__
-    // perflevel 0 is the higher performance cluster
-    int cpusperl2 = get_hw_capability("hw.perflevel0.cpusperl2");
-    int l2cachesize = get_hw_capability("hw.perflevel0.l2cachesize");
-    size = cpusperl2 > 1 ? l2cachesize / cpusperl2 : l2cachesize;
-#endif
+void CpuSet::disable(int cpu)
+{
+    mask &= ~(1 << cpu);
+}
 
-    // fallback to a common value
-    if (size <= 0)
-    {
-#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-        size = 64 * 1024;
-        if (cpu_support_x86_avx())
-            size = 128 * 1024;
-        if (cpu_support_x86_avx2())
-            size = 256 * 1024;
-        if (cpu_support_x86_avx512())
-            size = 1024 * 1024;
-#elif __aarch64__
-        size = 256 * 1024;
-#elif __arm__
-        size = 128 * 1024;
-#else
-        // is 64k still too large here ?
-        size = 64 * 1024;
-#endif
-    }
+void CpuSet::disable_all()
+{
+    mask = 0;
+}
 
-    return size;
+bool CpuSet::is_enabled(int cpu) const
+{
+    return mask & (1 << cpu);
 }
 
-static int get_cpu_level3_cachesize()
+int CpuSet::num_enabled() const
 {
-    int size = 0;
-#if (defined _WIN32 && !(defined __MINGW32__))
-    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
-    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
-    if (glpi != NULL)
+    int num_enabled = 0;
+    for (int i = 0; i < (int)sizeof(mask) * 8; i++)
     {
-        DWORD return_length = 0;
-        glpi(NULL, &return_length);
+        if (is_enabled(i))
+            num_enabled++;
+    }
 
-        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
-        glpi(buffer, &return_length);
+    return num_enabled;
+}
+#elif defined __ANDROID__ || defined __linux__
+CpuSet::CpuSet()
+{
+    disable_all();
+}
 
-        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
-        DWORD byte_offset = 0;
-        while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
-        {
-            if (ptr->Relationship == RelationCache)
-            {
-                PCACHE_DESCRIPTOR Cache = &ptr->Cache;
-                if (Cache->Level == 3)
-                {
-                    size = std::max(size, (int)Cache->Size);
-                }
-            }
+void CpuSet::enable(int cpu)
+{
+    CPU_SET(cpu, &cpu_set);
+}
 
-            byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
-            ptr++;
-        }
+void CpuSet::disable(int cpu)
+{
+    CPU_CLR(cpu, &cpu_set);
+}
 
-        free(buffer);
-    }
-#elif defined __ANDROID__ || defined __linux__
-    size = get_big_cpu_data_cache_size(3);
-#if defined(_SC_LEVEL3_CACHE_SIZE)
-    if (size <= 0)
-        size = sysconf(_SC_LEVEL3_CACHE_SIZE);
-#endif
-#elif __APPLE__
-    // perflevel 0 is the higher performance cluster
-    // get the size shared among all cpus
-    size = get_hw_capability("hw.perflevel0.l3cachesize");
-#endif
+void CpuSet::disable_all()
+{
+    CPU_ZERO(&cpu_set);
+}
 
-    // l3 cache size can be zero
+bool CpuSet::is_enabled(int cpu) const
+{
+    return CPU_ISSET(cpu, &cpu_set);
+}
 
-    return size;
+int CpuSet::num_enabled() const
+{
+    int num_enabled = 0;
+    for (int i = 0; i < (int)sizeof(cpu_set_t) * 8; i++)
+    {
+        if (is_enabled(i))
+            num_enabled++;
+    }
+
+    return num_enabled;
+}
+#elif __APPLE__
+CpuSet::CpuSet()
+{
+    disable_all();
 }
 
-static int g_cpu_level2_cachesize = get_cpu_level2_cachesize();
-static int g_cpu_level3_cachesize = get_cpu_level3_cachesize();
+void CpuSet::enable(int cpu)
+{
+    policy |= (1 << cpu);
+}
 
-int get_cpu_level2_cache_size()
+void CpuSet::disable(int cpu)
 {
-    return g_cpu_level2_cachesize;
+    policy &= ~(1 << cpu);
 }
 
-int get_cpu_level3_cache_size()
+void CpuSet::disable_all()
 {
-    return g_cpu_level3_cachesize;
+    policy = 0;
 }
 
-#if (defined _WIN32 && !(defined __MINGW32__))
-static CpuSet get_smt_cpu_mask()
+bool CpuSet::is_enabled(int cpu) const
 {
-    CpuSet smt_cpu_mask;
+    return policy & (1 << cpu);
+}
 
-    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
-    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
-    if (glpi == NULL)
+int CpuSet::num_enabled() const
+{
+    int num_enabled = 0;
+    for (int i = 0; i < (int)sizeof(policy) * 8; i++)
     {
-        NCNN_LOGE("GetLogicalProcessorInformation is not supported");
-        return smt_cpu_mask;
+        if (is_enabled(i))
+            num_enabled++;
     }
 
-    DWORD return_length = 0;
-    glpi(NULL, &return_length);
-
-    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
-    glpi(buffer, &return_length);
+    return num_enabled;
+}
+#else
+CpuSet::CpuSet()
+{
+}
 
-    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
-    DWORD byte_offset = 0;
-    while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
-    {
-        if (ptr->Relationship == RelationProcessorCore)
-        {
-            CpuSet smt_set;
-            smt_set.mask = ptr->ProcessorMask;
-            if (smt_set.num_enabled() > 1)
-            {
-                // this core is smt
-                smt_cpu_mask.mask |= smt_set.mask;
-            }
-        }
+void CpuSet::enable(int /* cpu */)
+{
+}
 
-        byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
-        ptr++;
-    }
+void CpuSet::disable(int /* cpu */)
+{
+}
 
-    free(buffer);
+void CpuSet::disable_all()
+{
+}
 
-    return smt_cpu_mask;
+bool CpuSet::is_enabled(int /* cpu */) const
+{
+    return true;
 }
 
-static std::vector<int> get_max_freq_mhz()
+int CpuSet::num_enabled() const
 {
-    typedef struct _PROCESSOR_POWER_INFORMATION
-    {
-        ULONG Number;
-        ULONG MaxMhz;
-        ULONG CurrentMhz;
-        ULONG MhzLimit;
-        ULONG MaxIdleState;
-        ULONG CurrentIdleState;
-    } PROCESSOR_POWER_INFORMATION, *PPROCESSOR_POWER_INFORMATION;
+    return get_cpu_count();
+}
+#endif
 
-    HMODULE powrprof = LoadLibrary(TEXT("powrprof.dll"));
+int cpu_support_arm_edsp()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    return 0;
+#else
+    return g_hwcaps & HWCAP_EDSP;
+#endif
+#elif __APPLE__
+#if __aarch64__
+    return 0;
+#else
+    return g_hw_cputype == CPU_TYPE_ARM;
+#endif
+#else
+    return 0;
+#endif
+}
 
-    typedef LONG(WINAPI * LPFN_CNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG);
-    LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(powrprof, "CallNtPowerInformation");
-    if (cnpi == NULL)
-    {
-        NCNN_LOGE("CallNtPowerInformation is not supported");
-        FreeLibrary(powrprof);
-        return std::vector<int>(g_cpucount, 0);
-    }
+int cpu_support_arm_neon()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    return g_hwcaps & HWCAP_ASIMD;
+#else
+    return g_hwcaps & HWCAP_NEON;
+#endif
+#elif __APPLE__
+#if __aarch64__
+    return g_hw_cputype == CPU_TYPE_ARM64;
+#else
+    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
+#endif
+#else
+    return 0;
+#endif
+}
 
-    DWORD return_length = sizeof(PROCESSOR_POWER_INFORMATION) * g_cpucount;
-    PPROCESSOR_POWER_INFORMATION buffer = (PPROCESSOR_POWER_INFORMATION)malloc(return_length);
+int cpu_support_arm_vfpv4()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    // neon always enable fma and fp16
+    return g_hwcaps & HWCAP_ASIMD;
+#else
+    return g_hwcaps & HWCAP_VFPv4;
+#endif
+#elif __APPLE__
+#if __aarch64__
+    return g_hw_cputype == CPU_TYPE_ARM64;
+#else
+    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
+#endif
+#else
+    return 0;
+#endif
+}
 
-    cnpi(ProcessorInformation, NULL, 0, buffer, return_length);
+int cpu_support_arm_asimdhp()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    return g_hwcaps & HWCAP_ASIMDHP;
+#else
+    return 0;
+#endif
+#elif __APPLE__
+#if __aarch64__
+    return g_hw_optional_arm_FEAT_FP16
+           || g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL
+           || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST
+           || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER
+           || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM
+           || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
+           || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
+#else
+    return 0;
+#endif
+#else
+    return 0;
+#endif
+}
 
-    std::vector<int> ret;
-    for (int i = 0; i < g_cpucount; i++)
-    {
-        ULONG max_mhz = buffer[i].MaxMhz;
-        ret.push_back(max_mhz);
-    }
+int cpu_support_arm_cpuid()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    return g_hwcaps & HWCAP_CPUID;
+#else
+    return 0;
+#endif
+#elif __APPLE__
+    return 0;
+#else
+    return 0;
+#endif
+}
 
-    free(buffer);
-    FreeLibrary(powrprof);
-    return ret;
+int cpu_support_arm_asimddp()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    return g_hwcaps & HWCAP_ASIMDDP;
+#else
+    return 0;
+#endif
+#elif __APPLE__
+#if __aarch64__
+    return g_hw_optional_arm_FEAT_DotProd
+           || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER
+           || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM
+           || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
+           || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
+#else
+    return 0;
+#endif
+#else
+    return 0;
+#endif
 }
 
-static int set_sched_affinity(const CpuSet& thread_affinity_mask)
+int cpu_support_arm_asimdfhm()
 {
-    DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
-    if (prev_mask == 0)
-    {
-        NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
-        return -1;
-    }
-
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    return g_hwcaps & HWCAP_ASIMDFHM;
+#else
+    return 0;
+#endif
+#elif __APPLE__
+#if __aarch64__
+    return g_hw_optional_arm_FEAT_FHM
+           || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER
+           || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM
+           || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
+           || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
+#else
+    return 0;
+#endif
+#else
     return 0;
+#endif
 }
-#endif // (defined _WIN32 && !(defined __MINGW32__))
 
-#if defined __ANDROID__ || defined __linux__
-static int get_max_freq_khz(int cpuid)
+int cpu_support_arm_bf16()
 {
-    // first try, for all possible cpu
-    char path[256];
-    sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
-
-    FILE* fp = fopen(path, "rb");
-
-    if (!fp)
-    {
-        // second try, for online cpu
-        sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
-        fp = fopen(path, "rb");
-
-        if (fp)
-        {
-            int max_freq_khz = 0;
-            while (!feof(fp))
-            {
-                int freq_khz = 0;
-                int nscan = fscanf(fp, "%d %*d", &freq_khz);
-                if (nscan != 1)
-                    break;
-
-                if (freq_khz > max_freq_khz)
-                    max_freq_khz = freq_khz;
-            }
-
-            fclose(fp);
-
-            if (max_freq_khz != 0)
-                return max_freq_khz;
-
-            fp = NULL;
-        }
-
-        if (!fp)
-        {
-            // third try, for online cpu
-            sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
-            fp = fopen(path, "rb");
-
-            if (!fp)
-                return -1;
-
-            int max_freq_khz = -1;
-            int nscan = fscanf(fp, "%d", &max_freq_khz);
-            if (nscan != 1)
-            {
-                NCNN_LOGE("fscanf cpuinfo_max_freq error %d", nscan);
-            }
-            fclose(fp);
-
-            return max_freq_khz;
-        }
-    }
-
-    int max_freq_khz = 0;
-    while (!feof(fp))
-    {
-        int freq_khz = 0;
-        int nscan = fscanf(fp, "%d %*d", &freq_khz);
-        if (nscan != 1)
-            break;
-
-        if (freq_khz > max_freq_khz)
-            max_freq_khz = freq_khz;
-    }
-
-    fclose(fp);
-
-    return max_freq_khz;
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    return g_hwcaps2 & HWCAP2_BF16;
+#else
+    return 0;
+#endif
+#elif __APPLE__
+#if __aarch64__
+    return g_hw_optional_arm_FEAT_BF16
+           || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
+           || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
+#else
+    return 0;
+#endif
+#else
+    return 0;
+#endif
 }
 
-static bool is_smt_cpu(int cpuid)
+int cpu_support_arm_i8mm()
 {
-    // https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-72
-    char path[256];
-    sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/core_cpus_list", cpuid);
-
-    FILE* fp = fopen(path, "rb");
-
-    if (!fp)
-    {
-        sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid);
-        fp = fopen(path, "rb");
-
-        if (!fp)
-            return false;
-    }
-
-    bool is_smt = false;
-    while (!feof(fp))
-    {
-        char ch = fgetc(fp);
-        if (ch == ',' || ch == '-')
-        {
-            is_smt = true;
-            break;
-        }
-    }
-
-    fclose(fp);
-
-    return is_smt;
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    return g_hwcaps2 & HWCAP2_I8MM;
+#else
+    return 0;
+#endif
+#elif __APPLE__
+#if __aarch64__
+    return g_hw_optional_arm_FEAT_I8MM
+           || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
+           || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH;
+#else
+    return 0;
+#endif
+#else
+    return 0;
+#endif
 }
 
-static int set_sched_affinity(const CpuSet& thread_affinity_mask)
+int cpu_support_arm_sve()
 {
-    // set affinity for thread
-#if defined(__BIONIC__)
-    pid_t pid = gettid();
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    return g_hwcaps & HWCAP_SVE;
 #else
-    pid_t pid = syscall(SYS_gettid);
+    return 0;
 #endif
-
-    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
-    if (syscallret)
-    {
-        NCNN_LOGE("syscall error %d", syscallret);
-        return -1;
-    }
-
+#elif __APPLE__
+#if __aarch64__
+    return 0; // no known apple cpu support armv8.6 sve
+#else
+    return 0;
+#endif
+#else
     return 0;
+#endif
 }
-#endif // defined __ANDROID__ || defined __linux__
 
-#if __APPLE__
-static int set_sched_affinity(const CpuSet& thread_affinity_mask)
+int cpu_support_arm_sve2()
 {
-    // https://developer.apple.com/library/archive/releasenotes/Performance/RN-AffinityAPI/index.html
-    // http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html
-    // https://gist.github.com/Coneko/4234842
-
-    // This is a quite outdated document. Apple will not allow developers to set CPU affinity.
-    // In OS X 10.5 it worked, later it became a suggestion to OS X, then in 10.10 or so (as well in later ones), macOS will ignore any affinity settings.
-    // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919   --- AmeAkio
-
-    int affinity_tag = THREAD_AFFINITY_TAG_NULL;
-    for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++)
-    {
-        if (thread_affinity_mask.is_enabled(i))
-        {
-            affinity_tag = i + 1;
-            break;
-        }
-    }
-
-    mach_port_t tid = pthread_mach_thread_np(pthread_self());
-
-    thread_affinity_policy_data_t policy_data;
-    policy_data.affinity_tag = affinity_tag;
-    int ret = thread_policy_set(tid, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy_data, THREAD_AFFINITY_POLICY_COUNT);
-    if (ret && ret != KERN_NOT_SUPPORTED)
-    {
-        NCNN_LOGE("thread_policy_set error %d", ret);
-        return -1;
-    }
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    return g_hwcaps2 & HWCAP2_SVE2;
+#else
+    return 0;
+#endif
+#elif __APPLE__
+#if __aarch64__
+    return 0; // no known apple cpu support armv8.6 sve2
+#else
+    return 0;
+#endif
+#else
+    return 0;
+#endif
+}
 
+int cpu_support_arm_svebf16()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    return g_hwcaps2 & HWCAP2_SVEBF16;
+#else
+    return 0;
+#endif
+#elif __APPLE__
+#if __aarch64__
+    return 0; // no known apple cpu support armv8.6 svebf16
+#else
     return 0;
-}
-#endif // __APPLE__
-
-static int g_powersave = 0;
-
-int get_cpu_powersave()
-{
-    return g_powersave;
+#endif
+#else
+    return 0;
+#endif
 }
 
-int set_cpu_powersave(int powersave)
+int cpu_support_arm_svei8mm()
 {
-    if (powersave < 0 || powersave > 2)
-    {
-        NCNN_LOGE("powersave %d not supported", powersave);
-        return -1;
-    }
-
-    const CpuSet& thread_affinity_mask = get_cpu_thread_affinity_mask(powersave);
-
-    int ret = set_cpu_thread_affinity(thread_affinity_mask);
-    if (ret != 0)
-        return ret;
-
-    g_powersave = powersave;
-
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    return g_hwcaps2 & HWCAP2_SVEI8MM;
+#else
+    return 0;
+#endif
+#elif __APPLE__
+#if __aarch64__
+    return 0; // no known apple cpu support armv8.6 svei8mm
+#else
+    return 0;
+#endif
+#else
     return 0;
+#endif
 }
 
-class cpu_thread_affinity_mask
-{
-public:
-    cpu_thread_affinity_mask();
-
-    CpuSet mask_all;
-    CpuSet mask_little;
-    CpuSet mask_big;
-};
-
-cpu_thread_affinity_mask::cpu_thread_affinity_mask()
+int cpu_support_arm_svef32mm()
 {
-    mask_all.disable_all();
-
-#if (defined _WIN32 && !(defined __MINGW32__))
-    // get max freq mhz for all cores
-    int max_freq_mhz_min = INT_MAX;
-    int max_freq_mhz_max = 0;
-    std::vector<int> cpu_max_freq_mhz = get_max_freq_mhz();
-    for (int i = 0; i < g_cpucount; i++)
-    {
-        int max_freq_mhz = cpu_max_freq_mhz[i];
-
-        // NCNN_LOGE("%d max freq = %d khz", i, max_freq_mhz);
-
-        if (max_freq_mhz > max_freq_mhz_max)
-            max_freq_mhz_max = max_freq_mhz;
-        if (max_freq_mhz < max_freq_mhz_min)
-            max_freq_mhz_min = max_freq_mhz;
-    }
-
-    int max_freq_mhz_medium = (max_freq_mhz_min + max_freq_mhz_max) / 2;
-    if (max_freq_mhz_medium == max_freq_mhz_max)
-    {
-        mask_little.disable_all();
-        mask_big = mask_all;
-        return;
-    }
-
-    CpuSet smt_cpu_mask = get_smt_cpu_mask();
-
-    for (int i = 0; i < g_cpucount; i++)
-    {
-        if (smt_cpu_mask.is_enabled(i))
-        {
-            // always treat smt core as big core
-            mask_big.enable(i);
-            continue;
-        }
-
-        if (cpu_max_freq_mhz[i] < max_freq_mhz_medium)
-            mask_little.enable(i);
-        else
-            mask_big.enable(i);
-    }
-#elif defined __ANDROID__ || defined __linux__
-    int max_freq_khz_min = INT_MAX;
-    int max_freq_khz_max = 0;
-    std::vector<int> cpu_max_freq_khz(g_cpucount);
-    for (int i = 0; i < g_cpucount; i++)
-    {
-        int max_freq_khz = get_max_freq_khz(i);
-
-        // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz);
-
-        cpu_max_freq_khz[i] = max_freq_khz;
-
-        if (max_freq_khz > max_freq_khz_max)
-            max_freq_khz_max = max_freq_khz;
-        if (max_freq_khz < max_freq_khz_min)
-            max_freq_khz_min = max_freq_khz;
-    }
-
-    int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2;
-    if (max_freq_khz_medium == max_freq_khz_max)
-    {
-        mask_little.disable_all();
-        mask_big = mask_all;
-        return;
-    }
-
-    for (int i = 0; i < g_cpucount; i++)
-    {
-        if (is_smt_cpu(i))
-        {
-            // always treat smt core as big core
-            mask_big.enable(i);
-            continue;
-        }
-
-        if (cpu_max_freq_khz[i] < max_freq_khz_medium)
-            mask_little.enable(i);
-        else
-            mask_big.enable(i);
-    }
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __aarch64__
+    return g_hwcaps2 & HWCAP2_SVEF32MM;
+#else
+    return 0;
+#endif
 #elif __APPLE__
-    int nperflevels = get_hw_capability("hw.nperflevels");
-    if (nperflevels == 1)
-    {
-        // smp models
-        mask_little.disable_all();
-        mask_big = mask_all;
-    }
-    else
-    {
-        // two or more clusters, level0 is the high-performance cluster
-        int perflevel0_logicalcpu = get_hw_capability("hw.perflevel0.logicalcpu_max");
-        for (int i = 0; i < perflevel0_logicalcpu; i++)
-        {
-            mask_big.enable(i);
-        }
-        for (int i = perflevel0_logicalcpu; i < g_cpucount; i++)
-        {
-            mask_little.enable(i);
-        }
-    }
+#if __aarch64__
+    return 0; // no known apple cpu support armv8.6 svef32mm
 #else
-    // TODO implement me for other platforms
-    mask_little.disable_all();
-    mask_big = mask_all;
+    return 0;
+#endif
+#else
+    return 0;
 #endif
 }
 
-static cpu_thread_affinity_mask g_thread_affinity_mask;
-
-const CpuSet& get_cpu_thread_affinity_mask(int powersave)
+int cpu_support_x86_avx()
 {
-    if (powersave == 0)
-        return g_thread_affinity_mask.mask_all;
-
-    if (powersave == 1)
-        return g_thread_affinity_mask.mask_little;
-
-    if (powersave == 2)
-        return g_thread_affinity_mask.mask_big;
-
-    NCNN_LOGE("powersave %d not supported", powersave);
-
-    // fallback to all cores anyway
-    return g_thread_affinity_mask.mask_all;
+    try_initialize_global_cpu_info();
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+    return g_cpu_support_x86_avx;
+#else
+    return 0;
+#endif
 }
 
-int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
+int cpu_support_x86_fma()
 {
-#if defined __ANDROID__ || defined __linux__ || (defined _WIN32 && !(defined __MINGW32__))
-    int num_threads = thread_affinity_mask.num_enabled();
-
-#ifdef _OPENMP
-    // set affinity for each thread
-    set_omp_num_threads(num_threads);
-    std::vector<int> ssarets(num_threads, 0);
-    #pragma omp parallel for num_threads(num_threads)
-    for (int i = 0; i < num_threads; i++)
-    {
-        ssarets[i] = set_sched_affinity(thread_affinity_mask);
-    }
-    for (int i = 0; i < num_threads; i++)
-    {
-        if (ssarets[i] != 0)
-            return -1;
-    }
+    try_initialize_global_cpu_info();
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+    return g_cpu_support_x86_fma;
 #else
-    int ssaret = set_sched_affinity(thread_affinity_mask);
-    if (ssaret != 0)
-        return -1;
+    return 0;
 #endif
+}
 
+int cpu_support_x86_xop()
+{
+    try_initialize_global_cpu_info();
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+    return g_cpu_support_x86_xop;
+#else
     return 0;
-#elif __APPLE__
-
-#ifdef _OPENMP
-    int num_threads = thread_affinity_mask.num_enabled();
-
-    // set affinity for each thread
-    set_omp_num_threads(num_threads);
-    std::vector<int> ssarets(num_threads, 0);
-    #pragma omp parallel for num_threads(num_threads)
-    for (int i = 0; i < num_threads; i++)
-    {
-        // assign one core for each thread
-        int core = -1 - i;
-        for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++)
-        {
-            if (thread_affinity_mask.is_enabled(j))
-            {
-                if (core == -1)
-                {
-                    core = j;
-                    break;
-                }
-                else
-                {
-                    core++;
-                }
-            }
-        }
-        CpuSet this_thread_affinity_mask;
-        if (core != -1 - i)
-        {
-            this_thread_affinity_mask.enable(core);
-        }
+#endif
+}
 
-        ssarets[i] = set_sched_affinity(this_thread_affinity_mask);
-    }
-    for (int i = 0; i < num_threads; i++)
-    {
-        if (ssarets[i] != 0)
-            return -1;
-    }
+int cpu_support_x86_f16c()
+{
+    try_initialize_global_cpu_info();
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+    return g_cpu_support_x86_f16c;
 #else
-    int ssaret = set_sched_affinity(thread_affinity_mask);
-    if (ssaret != 0)
-        return -1;
+    return 0;
 #endif
+}
 
-    return 0;
+int cpu_support_x86_avx2()
+{
+    try_initialize_global_cpu_info();
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+    return g_cpu_support_x86_avx2;
 #else
-    // TODO
-    (void)thread_affinity_mask;
-    return -1;
+    return 0;
 #endif
 }
 
-#if defined __ANDROID__ || defined __linux__
-#if __aarch64__
-union midr_info_t
+int cpu_support_x86_avx_vnni()
 {
-    struct __attribute__((packed))
-    {
-        unsigned int revision : 4;
-        unsigned int part : 12;
-        unsigned int architecture : 4;
-        unsigned int variant : 4;
-        unsigned int implementer : 8;
-    };
-    unsigned int midr;
-
-    midr_info_t(unsigned int _midr)
-        : midr(_midr)
-    {
-    }
-};
+    try_initialize_global_cpu_info();
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+    return g_cpu_support_x86_avx_vnni;
+#else
+    return 0;
+#endif
+}
 
-static unsigned int get_midr_from_sysfs(int cpuid)
+int cpu_support_x86_avx512()
 {
-    char path[256];
-    sprintf(path, "/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1", cpuid);
-
-    FILE* fp = fopen(path, "rb");
-    if (!fp)
-        return 0;
-
-    unsigned int midr_el1 = 0;
-    int nscan = fscanf(fp, "%x", &midr_el1);
-    if (nscan != 1)
-    {
-        // ignore
-    }
-
-    fclose(fp);
-
-    return midr_el1;
+    try_initialize_global_cpu_info();
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+    return g_cpu_support_x86_avx512;
+#else
+    return 0;
+#endif
 }
 
-static int get_midr_from_proc_cpuinfo(std::vector<unsigned int>& midrs)
+int cpu_support_x86_avx512_vnni()
 {
-    FILE* fp = fopen("/proc/cpuinfo", "rb");
-    if (!fp)
-        return -1;
-
-    midrs.resize(g_cpucount, 0);
-
-    int cpuid = -1;
-    midr_info_t midr_info(0);
-
-    char line[1024];
-    while (!feof(fp))
-    {
-        char* s = fgets(line, 1024, fp);
-        if (!s)
-            break;
-
-        if (memcmp(line, "processor", 9) == 0)
-        {
-            // processor       : 4
-            int id = -1;
-            int nscan = sscanf(line, "%*[^:]: %d", &id);
-            if (nscan != 1)
-                continue;
-
-            if (cpuid >= 0 && cpuid < g_cpucount)
-            {
-                if (midr_info.midr == 0)
-                {
-                    // shared midr
-                    midrs[cpuid] = (unsigned int)-1;
-                }
-                else
-                {
-                    // save midr and reset
-                    midrs[cpuid] = midr_info.midr;
-                    for (int i = 0; i < g_cpucount; i++)
-                    {
-                        if (midrs[i] == (unsigned int)-1)
-                            midrs[i] = midr_info.midr;
-                    }
-                }
+    try_initialize_global_cpu_info();
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+    return g_cpu_support_x86_avx512_vnni;
+#else
+    return 0;
+#endif
+}
 
-                midr_info.midr = 0;
-            }
+int cpu_support_x86_avx512_bf16()
+{
+    try_initialize_global_cpu_info();
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+    return g_cpu_support_x86_avx512_bf16;
+#else
+    return 0;
+#endif
+}
 
-            cpuid = id;
-        }
+int cpu_support_x86_avx512_fp16()
+{
+    try_initialize_global_cpu_info();
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+    return g_cpu_support_x86_avx512_fp16;
+#else
+    return 0;
+#endif
+}
 
-        if (cpuid == -1)
-            continue;
+int cpu_support_mips_msa()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __mips__
+    return g_hwcaps & HWCAP_MIPS_MSA;
+#else
+    return 0;
+#endif
+#else
+    return 0;
+#endif
+}
 
-        if (memcmp(line, "CPU implementer", 15) == 0)
-        {
-            // CPU implementer : 0x51
-            unsigned int id = 0;
-            int nscan = sscanf(line, "%*[^:]: %x", &id);
-            if (nscan != 1)
-                continue;
+int cpu_support_loongarch_lsx()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __loongarch64
+    return g_hwcaps & HWCAP_LOONGARCH_LSX;
+#else
+    return 0;
+#endif
+#else
+    return 0;
+#endif
+}
 
-            midr_info.implementer = id;
-        }
-        else if (memcmp(line, "CPU architecture", 16) == 0)
-        {
-            // CPU architecture: 8
-            int id = 0;
-            int nscan = sscanf(line, "%*[^:]: %d", &id);
-            if (nscan != 1)
-                continue;
+int cpu_support_loongarch_lasx()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __loongarch64
+    return g_hwcaps & HWCAP_LOONGARCH_LASX;
+#else
+    return 0;
+#endif
+#else
+    return 0;
+#endif
+}
 
-            midr_info.architecture = id;
-        }
-        else if (memcmp(line, "CPU variant", 11) == 0)
-        {
-            // CPU variant     : 0xd
-            int id = 0;
-            int nscan = sscanf(line, "%*[^:]: %x", &id);
-            if (nscan != 1)
-                continue;
+int cpu_support_loongson_mmi()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __mips__
+    return g_hwcaps & HWCAP_LOONGSON_MMI;
+#else
+    return 0;
+#endif
+#else
+    return 0;
+#endif
+}
 
-            midr_info.variant = id;
-        }
-        else if (memcmp(line, "CPU part", 8) == 0)
-        {
-            // CPU part        : 0x804
-            int id = 0;
-            int nscan = sscanf(line, "%*[^:]: %x", &id);
-            if (nscan != 1)
-                continue;
+int cpu_support_riscv_v()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __riscv
+    return g_hwcaps & COMPAT_HWCAP_ISA_V;
+#else
+    return 0;
+#endif
+#else
+    return 0;
+#endif
+}
 
-            midr_info.part = id;
-        }
-        else if (memcmp(line, "CPU revision", 12) == 0)
-        {
-            // CPU revision    : 14
-            int id = 0;
-            int nscan = sscanf(line, "%*[^:]: %d", &id);
-            if (nscan != 1)
-                continue;
+int cpu_support_riscv_zfh()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __riscv
+    // v + f does not imply zfh, but how to discover zfh properly ?
+    // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414
+    return g_hwcaps & COMPAT_HWCAP_ISA_V && g_hwcaps & COMPAT_HWCAP_ISA_F;
+#else
+    return 0;
+#endif
+#else
+    return 0;
+#endif
+}
 
-            midr_info.revision = id;
-        }
-    }
+int cpu_riscv_vlenb()
+{
+    try_initialize_global_cpu_info();
+#if __riscv
+    if (!cpu_support_riscv_v())
+        return 0;
 
-    fclose(fp);
+    int a = 0;
+    asm volatile(
+        ".word  0xc22026f3  \n" // csrr  a3, vlenb
+        "mv     %0, a3      \n"
+        : "=r"(a)
+        :
+        : "memory", "a3");
+    return a;
+#else
+    return 0;
+#endif
+}
 
-    if (cpuid >= 0 && cpuid < g_cpucount)
-    {
-        if (midr_info.midr == 0)
-        {
-            // shared midr
-            midrs[cpuid] = (unsigned int)-1;
-        }
-        else
-        {
-            // save midr and reset
-            midrs[cpuid] = midr_info.midr;
-            for (int i = 0; i < g_cpucount; i++)
-            {
-                if (midrs[i] == (unsigned int)-1)
-                    midrs[i] = midr_info.midr;
-            }
-        }
+int get_cpu_count()
+{
+    try_initialize_global_cpu_info();
+    return g_cpucount;
+}
 
-        midr_info.midr = 0;
-    }
+int get_little_cpu_count()
+{
+    try_initialize_global_cpu_info();
+    return get_cpu_thread_affinity_mask(1).num_enabled();
+}
 
-    // /proc/cpuinfo may only report little/online cores on old kernel
-    if (get_big_cpu_count() == get_cpu_count())
-    {
-        // assign the remaining unknown midrs for smp cpu
-        for (int i = 0; i < g_cpucount; i++)
-        {
-            if (midrs[i] == 0)
-                midrs[i] = midr_info.midr;
-        }
-    }
-    else
-    {
-        // clear the big core midrs for hmp cpu if they are the same as little cores
-        const CpuSet& little_cs = get_cpu_thread_affinity_mask(1);
-        const CpuSet& big_cs = get_cpu_thread_affinity_mask(2);
+int get_big_cpu_count()
+{
+    try_initialize_global_cpu_info();
+    int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled();
+    return big_cpu_count ? big_cpu_count : g_cpucount;
+}
 
-        unsigned int little_midr = 0;
-        for (int i = 0; i < g_cpucount; i++)
-        {
-            if (little_cs.is_enabled(i))
-            {
-                little_midr = midrs[i];
-                break;
-            }
-        }
+int get_physical_cpu_count()
+{
+    try_initialize_global_cpu_info();
+    return g_physical_cpucount;
+}
 
-        for (int i = 0; i < g_cpucount; i++)
-        {
-            if (big_cs.is_enabled(i))
-            {
-                if (midrs[i] == little_midr)
-                {
-                    midrs[i] = 0;
-                }
-            }
-        }
-    }
+int get_physical_little_cpu_count()
+{
+    try_initialize_global_cpu_info();
+    if (g_physical_cpucount == g_cpucount)
+        return get_little_cpu_count();
 
-    return 0;
+    return g_physical_cpucount * 2 - g_cpucount;
 }
 
-// return midr for the current running core
-static unsigned int get_midr_from_register()
+int get_physical_big_cpu_count()
 {
-    uint64_t midr;
-    asm volatile("mrs   %0, MIDR_EL1"
-                 : "=r"(midr));
+    try_initialize_global_cpu_info();
+    if (g_physical_cpucount == g_cpucount)
+        return get_big_cpu_count();
 
-    return (unsigned int)midr;
+    return g_cpucount - g_physical_cpucount;
 }
 
-static int get_sched_affinity(CpuSet& thread_affinity_mask)
+int get_cpu_level2_cache_size()
 {
-    // get affinity for thread
-#if defined(__BIONIC__)
-    pid_t pid = gettid();
-#else
-    pid_t pid = syscall(SYS_gettid);
-#endif
+    try_initialize_global_cpu_info();
+    return g_cpu_level2_cachesize;
+}
 
-    thread_affinity_mask.disable_all();
+int get_cpu_level3_cache_size()
+{
+    try_initialize_global_cpu_info();
+    return g_cpu_level3_cachesize;
+}
 
-    int syscallret = syscall(__NR_sched_getaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
-    if (syscallret)
+int get_cpu_powersave()
+{
+    try_initialize_global_cpu_info();
+    return g_powersave;
+}
+
+int set_cpu_powersave(int powersave)
+{
+    try_initialize_global_cpu_info();
+    if (powersave < 0 || powersave > 2)
     {
-        // handle get error silently
+        NCNN_LOGE("powersave %d not supported", powersave);
         return -1;
     }
 
+    const CpuSet& thread_affinity_mask = get_cpu_thread_affinity_mask(powersave);
+
+    int ret = set_cpu_thread_affinity(thread_affinity_mask);
+    if (ret != 0)
+        return ret;
+
+    g_powersave = powersave;
+
     return 0;
 }
 
-static int midr_is_a53_a55(unsigned int midr)
+const CpuSet& get_cpu_thread_affinity_mask(int powersave)
 {
-    // 0x 41 ? f d03 ? = arm cortex-a53
-    // 0x 51 ? f 801 ? = qcom kryo200 a53
-    // 0x 41 ? f d04 ? = arm cortex-a35
-    // 0x 41 ? f d05 ? = arm cortex-a55
-    // 0x 51 ? f 803 ? = qcom kryo300 a55
-    // 0x 51 ? f 805 ? = qcom kryo400 a55
+    try_initialize_global_cpu_info();
+    if (powersave == 0)
+        return g_cpu_affinity_mask_all;
 
-    midr_info_t midr_info(midr);
+    if (powersave == 1)
+        return g_cpu_affinity_mask_little;
 
-    return (midr_info.implementer == 0x41 && midr_info.part == 0xd03)
-           || (midr_info.implementer == 0x51 && midr_info.part == 0x801)
-           || (midr_info.implementer == 0x41 && midr_info.part == 0xd04)
-           || (midr_info.implementer == 0x41 && midr_info.part == 0xd05)
-           || (midr_info.implementer == 0x51 && midr_info.part == 0x803)
-           || (midr_info.implementer == 0x51 && midr_info.part == 0x805);
+    if (powersave == 2)
+        return g_cpu_affinity_mask_big;
+
+    NCNN_LOGE("powersave %d not supported", powersave);
+
+    // fallback to all cores anyway
+    return g_cpu_affinity_mask_all;
 }
 
-static int detect_cpu_is_arm_a53_a55()
+int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
 {
-    int a53_a55_cpu_count = 0;
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__ || (defined _WIN32 && !(defined __MINGW32__))
+#ifdef _OPENMP
+    int num_threads = thread_affinity_mask.num_enabled();
 
-    // first try, iterate /sys/devices/system/cpu/cpuX/regs/identification/midr_el1
-    bool sysfs_midr = true;
-    for (int i = 0; i < g_cpucount; i++)
+    // set affinity for each thread
+    set_omp_num_threads(num_threads);
+    std::vector<int> ssarets(num_threads, 0);
+    #pragma omp parallel for num_threads(num_threads)
+    for (int i = 0; i < num_threads; i++)
     {
-        unsigned int midr = 0;
+        ssarets[i] = set_sched_affinity(thread_affinity_mask);
+    }
+    for (int i = 0; i < num_threads; i++)
+    {
+        if (ssarets[i] != 0)
+            return -1;
+    }
+#else
+    int ssaret = set_sched_affinity(thread_affinity_mask);
+    if (ssaret != 0)
+        return -1;
+#endif
 
-        // for kernel 4.7+
-        midr = get_midr_from_sysfs(i);
-        if (midr == 0)
-        {
-            sysfs_midr = false;
-            break;
-        }
+    return 0;
+#elif __APPLE__
 
-        if (midr_is_a53_a55(midr))
-        {
-            a53_a55_cpu_count++;
-        }
-    }
+#ifdef _OPENMP
+    int num_threads = thread_affinity_mask.num_enabled();
 
-    if (!sysfs_midr)
+    // set affinity for each thread
+    set_omp_num_threads(num_threads);
+    std::vector<int> ssarets(num_threads, 0);
+    #pragma omp parallel for num_threads(num_threads)
+    for (int i = 0; i < num_threads; i++)
     {
-        // second try, collect midr from /proc/cpuinfo
-        std::vector<unsigned int> midrs;
-        int ret = get_midr_from_proc_cpuinfo(midrs);
-        if (ret == 0 && (int)midrs.size() == g_cpucount)
+        // assign one core for each thread
+        int core = -1 - i;
+        for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++)
         {
-            for (int i = 0; i < g_cpucount; i++)
+            if (thread_affinity_mask.is_enabled(j))
             {
-                if (midr_is_a53_a55(midrs[i]))
+                if (core == -1)
                 {
-                    a53_a55_cpu_count++;
+                    core = j;
+                    break;
+                }
+                else
+                {
+                    core++;
                 }
             }
         }
-        else
+        CpuSet this_thread_affinity_mask;
+        if (core != -1 - i)
         {
-            // third try, assume all aarch64 little cores are a53/a55
-            a53_a55_cpu_count = get_little_cpu_count();
+            this_thread_affinity_mask.enable(core);
         }
-    }
 
-    if (a53_a55_cpu_count == 0)
-        return 0; // all non a53/a55
-
-    if (a53_a55_cpu_count == g_cpucount)
-        return 1; // all a53/a55
+        ssarets[i] = set_sched_affinity(this_thread_affinity_mask);
+    }
+    for (int i = 0; i < num_threads; i++)
+    {
+        if (ssarets[i] != 0)
+            return -1;
+    }
+#else
+    int ssaret = set_sched_affinity(thread_affinity_mask);
+    if (ssaret != 0)
+        return -1;
+#endif
 
-    // little cores are a53/a55
-    return 2;
+    return 0;
+#else
+    // TODO
+    (void)thread_affinity_mask;
+    return -1;
+#endif
 }
 
-static int g_cpu_is_arm_a53_a55 = detect_cpu_is_arm_a53_a55();
-#endif // __aarch64__
-#endif // defined __ANDROID__ || defined __linux__
-
 int is_current_thread_running_on_a53_a55()
 {
+    try_initialize_global_cpu_info();
 #if defined __ANDROID__ || defined __linux__
 #if __aarch64__
     if (g_cpu_is_arm_a53_a55 == 0)
diff --git a/src/layer/arm/convolution_3x3_winograd.h b/src/layer/arm/convolution_3x3_winograd.h
index 681979b10..5b5872f9f 100644
--- a/src/layer/arm/convolution_3x3_winograd.h
+++ b/src/layer/arm/convolution_3x3_winograd.h
@@ -4464,6 +4464,9 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk(int M, int N, int K, int B,
     // resolve optimal tile size from cache size
     const int l2_cache_size_fp32 = (int)(get_cpu_level2_cache_size() / sizeof(float));
 
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
     // we shall take B into account for batched gemm, but that will be slower on arm in practice, why ?
     (void)B;
 
diff --git a/src/layer/arm/convolution_3x3_winograd_fp16s.h b/src/layer/arm/convolution_3x3_winograd_fp16s.h
index 3e5dee1ae..813b81299 100644
--- a/src/layer/arm/convolution_3x3_winograd_fp16s.h
+++ b/src/layer/arm/convolution_3x3_winograd_fp16s.h
@@ -1999,6 +1999,9 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk_fp16(int M, int N, int K, in
     // resolve optimal tile size from cache size
     const int l2_cache_size_fp16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short));
 
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
     // we shall take B into account for batched gemm, but that will be slower on arm in practice, why ?
     (void)B;
 
diff --git a/src/layer/arm/convolution_im2col_gemm.h b/src/layer/arm/convolution_im2col_gemm.h
index eeb6df6bb..45651ddae 100644
--- a/src/layer/arm/convolution_im2col_gemm.h
+++ b/src/layer/arm/convolution_im2col_gemm.h
@@ -5950,6 +5950,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk(int M, int N, int K, in
     // resolve optimal tile size from cache size
     const int l2_cache_size_fp32 = (int)(get_cpu_level2_cache_size() / sizeof(float));
 
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
     // solve K
     {
         // try not to split K
diff --git a/src/layer/arm/convolution_im2col_gemm_bf16s.h b/src/layer/arm/convolution_im2col_gemm_bf16s.h
index 1cb603e7d..f29420e97 100644
--- a/src/layer/arm/convolution_im2col_gemm_bf16s.h
+++ b/src/layer/arm/convolution_im2col_gemm_bf16s.h
@@ -5831,6 +5831,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_bf16s(int M, int N, int
     // resolve optimal tile size from cache size
     const int l2_cache_size_bf16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short));
 
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
     // solve K
     {
         // try not to split K
diff --git a/src/layer/arm/convolution_im2col_gemm_fp16s.h b/src/layer/arm/convolution_im2col_gemm_fp16s.h
index 360f05f41..a4cc82d70 100644
--- a/src/layer/arm/convolution_im2col_gemm_fp16s.h
+++ b/src/layer/arm/convolution_im2col_gemm_fp16s.h
@@ -3022,6 +3022,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_fp16sa(int M, int N, in
     // resolve optimal tile size from cache size
     const int l2_cache_size_fp16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short));
 
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
     // solve K
     {
         // try not to split K
diff --git a/src/layer/arm/gemm_arm.cpp b/src/layer/arm/gemm_arm.cpp
index 86079cd6b..e5cfede04 100644
--- a/src/layer/arm/gemm_arm.cpp
+++ b/src/layer/arm/gemm_arm.cpp
@@ -3662,7 +3662,11 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
 static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
 {
     // resolve optimal tile size from cache size
-    size_t l2_cache_size = get_cpu_level2_cache_size();
+    const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
     int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(float));
 
 #if __aarch64__
diff --git a/src/layer/arm/gemm_arm_asimdhp.cpp b/src/layer/arm/gemm_arm_asimdhp.cpp
index e73a82473..f2b352b80 100644
--- a/src/layer/arm/gemm_arm_asimdhp.cpp
+++ b/src/layer/arm/gemm_arm_asimdhp.cpp
@@ -2281,7 +2281,11 @@ static void gemm_transB_packed_tile_fp16sa(const Mat& AT_tile, const Mat& BT_til
 static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
 {
     // resolve optimal tile size from cache size
-    size_t l2_cache_size = get_cpu_level2_cache_size();
+    const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
     int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(__fp16));
 
     TILE_M = std::max(8, tile_size / 8 * 8);
diff --git a/src/layer/arm/gemm_bf16s_fp16s.h b/src/layer/arm/gemm_bf16s_fp16s.h
index 768200a6e..7f93d003c 100644
--- a/src/layer/arm/gemm_bf16s_fp16s.h
+++ b/src/layer/arm/gemm_bf16s_fp16s.h
@@ -1522,7 +1522,11 @@ static void transpose_unpack_output_tile_bf16_fp16(const Mat& topT, Mat& top_blo
 static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
 {
     // resolve optimal tile size from cache size
-    size_t l2_cache_size = get_cpu_level2_cache_size();
+    const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
     int tile_size = (int)sqrtf((float)l2_cache_size / (2 * sizeof(unsigned short) + sizeof(float)));
 
     TILE_M = std::max(8, tile_size / 8 * 8);
diff --git a/src/layer/x86/convolution_3x3_winograd.h b/src/layer/x86/convolution_3x3_winograd.h
index 0c273bed0..ca7751c29 100644
--- a/src/layer/x86/convolution_3x3_winograd.h
+++ b/src/layer/x86/convolution_3x3_winograd.h
@@ -1820,7 +1820,10 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, Mat&
 static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
 {
     // resolve optimal tile size from cache size
-    size_t l2_cache_size = get_cpu_level2_cache_size();
+    const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
 
     // solve M
     {
diff --git a/src/layer/x86/gemm_x86.cpp b/src/layer/x86/gemm_x86.cpp
index a85c802c2..8ee06d443 100644
--- a/src/layer/x86/gemm_x86.cpp
+++ b/src/layer/x86/gemm_x86.cpp
@@ -6699,7 +6699,11 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
 static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
 {
     // resolve optimal tile size from cache size
-    size_t l2_cache_size = get_cpu_level2_cache_size();
+    const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
     int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float));
 
 #if __AVX512F__