Support for >64 CPU systems

11 months ago · c475868e26
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -13,6 +13,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <algorithm>
 #include <vector>

 #ifdef _OPENMP
 #if NCNN_SIMPLEOMP
@@ -182,6 +184,7 @@ __attribute__((constructor)) void ncnn_kmp_env_initializer()
 static int g_cpucount;
 static int g_physical_cpucount;
 static int g_powersave;
 static int g_max_cpu_count = 0; // Maximum CPU count detected at runtime
 static ncnn::CpuSet g_cpu_affinity_mask_all;
 static ncnn::CpuSet g_cpu_affinity_mask_little;
 static ncnn::CpuSet g_cpu_affinity_mask_big;
@@ -916,24 +919,58 @@ static int get_cpucount()
 }

 #if defined __ANDROID__ || defined __linux__
 static int get_thread_siblings(int cpuid)
 static void get_thread_siblings(int cpuid, ncnn::CpuSet& siblings)
 {
    siblings.disable_all();

    char path[256];
    sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpuid);

    FILE* fp = 0; //fopen(path, "rb");
    if (fp)
    {
        int thread_siblings = -1;
        int nscan = fscanf(fp, "%x", &thread_siblings);
        if (nscan != 1)
        // Try to read hex mask directly (this path is currently disabled)
        char hex_str[256];
        int nscan = fscanf(fp, "%255s", hex_str);
        if (nscan == 1)
        {
            // ignore
            // Parse hex string into CpuSet
            int len = strlen(hex_str);
            if (hex_str[0] == '0' && hex_str[1] == 'x')
            {
                // Skip "0x" prefix
                len -= 2;
                memmove(hex_str, hex_str + 2, len + 1);
            }

            int ci = 0;
            for (int i = len - 1; i >= 0; i--)
            {
                char c = hex_str[i];
                int hex_val = 0;

                if (c >= '0' && c <= '9')
                    hex_val = c - '0';
                else if (c >= 'a' && c <= 'f')
                    hex_val = c - 'a' + 10;
                else if (c >= 'A' && c <= 'F')
                    hex_val = c - 'A' + 10;
                else
                    continue;

                if (hex_val & 1) siblings.enable(ci + 0);
                if (hex_val & 2) siblings.enable(ci + 1);
                if (hex_val & 4) siblings.enable(ci + 2);
                if (hex_val & 8) siblings.enable(ci + 3);

                ci += 4;
            }
        }

        fclose(fp);

        return thread_siblings;
        if (!siblings.is_empty())
            return;
    }

    // second try, parse from human-readable thread_siblings_list
@@ -942,8 +979,6 @@ static int get_thread_siblings(int cpuid)
    fp = fopen(path, "rb");
    if (fp)
    {
        int thread_siblings = -1;

        int id0;
        char sep;
        int id1;
@@ -951,36 +986,28 @@ static int get_thread_siblings(int cpuid)
        int nscan = fscanf(fp, "%d", &id0);
        if (nscan == 1)
        {
            thread_siblings = (1 << id0);
            siblings.enable(id0);

            while (fscanf(fp, "%c%d", &sep, &id1) == 2)
            {
                if (sep == ',')
                {
                    thread_siblings |= (1 << id1);
                    siblings.enable(id1);
                }
                if (sep == '-' && id0 < id1)
                {
                    for (int i = id0 + 1; i <= id1; i++)
                    {
                        thread_siblings |= (1 << i);
                        siblings.enable(i);
                    }
                }

                id0 = id1;
            }
        }
        else
        {
            // ignore
        }

        fclose(fp);

        return thread_siblings;
    }

    return -1;
 }
 #endif // defined __ANDROID__ || defined __linux__

@@ -1017,11 +1044,12 @@ static int get_physical_cpucount()

    free(buffer);
 #elif defined __ANDROID__ || defined __linux__
    std::vector<int> thread_set;
    std::vector<ncnn::CpuSet> thread_set;
    for (int i = 0; i < g_cpucount; i++)
    {
        int thread_siblings = get_thread_siblings(i);
        if (thread_siblings == -1)
        ncnn::CpuSet thread_siblings;
        get_thread_siblings(i, thread_siblings);
        if (thread_siblings.is_empty())
        {
            // ignore malformed one
            continue;
@@ -1030,7 +1058,18 @@ static int get_physical_cpucount()
        bool thread_siblings_exists = false;
        for (size_t j = 0; j < thread_set.size(); j++)
        {
            if (thread_set[j] == thread_siblings)
            // Compare CpuSets by checking if they have the same enabled CPUs
            bool same = true;
            int max_cpu = std::max(thread_siblings.max_cpu_id(), thread_set[j].max_cpu_id());
            for (int k = 0; k <= max_cpu; k++)
            {
                if (thread_siblings.is_enabled(k) != thread_set[j].is_enabled(k))
                {
                    same = false;
                    break;
                }
            }
            if (same)
            {
                thread_siblings_exists = true;
                break;
@@ -1153,11 +1192,24 @@ static int get_data_cache_size(int cpuid, int level)
        int ci = 0;
        for (int i = len - 1; i >= 0; i--)
        {
            char x = shared_cpu_map_str[i];
            if (x & 1) shared_cpu_map.enable(ci + 0);
            if (x & 2) shared_cpu_map.enable(ci + 1);
            if (x & 4) shared_cpu_map.enable(ci + 2);
            if (x & 8) shared_cpu_map.enable(ci + 3);
            char c = shared_cpu_map_str[i];
            int hex_val = 0;

            // Convert hex character to value
            if (c >= '0' && c <= '9')
                hex_val = c - '0';
            else if (c >= 'a' && c <= 'f')
                hex_val = c - 'a' + 10;
            else if (c >= 'A' && c <= 'F')
                hex_val = c - 'A' + 10;
            else
                continue; // Skip invalid characters

            // Set bits according to hex value
            if (hex_val & 1) shared_cpu_map.enable(ci + 0);
            if (hex_val & 2) shared_cpu_map.enable(ci + 1);
            if (hex_val & 4) shared_cpu_map.enable(ci + 2);
            if (hex_val & 8) shared_cpu_map.enable(ci + 3);

            ci += 4;
        }
@@ -1169,14 +1221,15 @@ static int get_data_cache_size(int cpuid, int level)
    // resolve physical cpu count in the shared_cpu_map
    int shared_physical_cpu_count = 0;
    {
        std::vector<int> thread_set;
        std::vector<ncnn::CpuSet> thread_set;
        for (int i = 0; i < g_cpucount; i++)
        {
            if (!shared_cpu_map.is_enabled(i))
                continue;

            int thread_siblings = get_thread_siblings(i);
            if (thread_siblings == -1)
            ncnn::CpuSet thread_siblings;
            get_thread_siblings(i, thread_siblings);
            if (thread_siblings.is_empty())
            {
                // ignore malformed one
                continue;
@@ -1185,7 +1238,18 @@ static int get_data_cache_size(int cpuid, int level)
            bool thread_siblings_exists = false;
            for (size_t j = 0; j < thread_set.size(); j++)
            {
                if (thread_set[j] == thread_siblings)
                // Compare CpuSets by checking if they have the same enabled CPUs
                bool same = true;
                int max_cpu = std::max(thread_siblings.max_cpu_id(), thread_set[j].max_cpu_id());
                for (int k = 0; k <= max_cpu; k++)
                {
                    if (thread_siblings.is_enabled(k) != thread_set[j].is_enabled(k))
                    {
                        same = false;
                        break;
                    }
                }
                if (same)
                {
                    thread_siblings_exists = true;
                    break;
@@ -1373,11 +1437,17 @@ static ncnn::CpuSet get_smt_cpu_mask()
        if (ptr->Relationship == RelationProcessorCore)
        {
            ncnn::CpuSet smt_set;
            smt_set.mask = ptr->ProcessorMask;
            smt_set.set_legacy_mask(ptr->ProcessorMask);
            if (smt_set.num_enabled() > 1)
            {
                // this core is smt
                smt_cpu_mask.mask |= smt_set.mask;
                // this core is smt - merge with existing smt_cpu_mask
                for (int i = 0; i < 64; i++) // ProcessorMask is limited to 64 bits
                {
                    if (smt_set.is_enabled(i))
                    {
                        smt_cpu_mask.enable(i);
                    }
                }
            }
        }

@@ -1432,14 +1502,73 @@ static std::vector<int> get_max_freq_mhz()

 // Pin the calling thread to the CPUs enabled in thread_affinity_mask (Windows).
 //
 // When the highest enabled CPU id is below 64 and the legacy mask is non-zero,
 // SetThreadAffinityMask is used. Otherwise SetThreadGroupAffinity is resolved
 // from kernel32.dll at runtime and applied to one processor group only: the
 // group holding the lowest enabled CPU. Enabled CPUs in other groups are not
 // applied.
 //
 // Returns 0 on success, -1 on failure (failures are reported via NCNN_LOGE).
 static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
 {
    const int max_cpu = thread_affinity_mask.max_cpu_id();

    // Legacy path: everything fits into a single affinity word.
    if (max_cpu < 64)
    {
        const ULONG_PTR legacy_mask = thread_affinity_mask.get_legacy_mask();
        if (legacy_mask != 0)
        {
            if (SetThreadAffinityMask(GetCurrentThread(), legacy_mask) == 0)
            {
                NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
                return -1;
            }
            return 0;
        }
        // a zero legacy mask falls through to the group-based path below
    }

    // Group-based path. The entry point is looked up dynamically instead of
    // being linked directly.
    typedef BOOL(WINAPI * LPFN_STGA)(HANDLE, const GROUP_AFFINITY*, GROUP_AFFINITY*);

    HMODULE kernel32 = GetModuleHandle(TEXT("kernel32.dll"));
    if (!kernel32)
    {
        NCNN_LOGE("Failed to get kernel32.dll handle");
        return -1;
    }

    LPFN_STGA SetThreadGroupAffinityFunc = (LPFN_STGA)GetProcAddress(kernel32, "SetThreadGroupAffinity");
    if (!SetThreadGroupAffinityFunc)
    {
        NCNN_LOGE("SetThreadGroupAffinity not available, >64 CPU affinity not supported");
        return -1;
    }

    // The lowest enabled CPU decides which group is used.
    int first_cpu = -1;
    for (int cpu = 0; cpu <= max_cpu; cpu++)
    {
        if (thread_affinity_mask.is_enabled(cpu))
        {
            first_cpu = cpu;
            break;
        }
    }

    if (first_cpu < 0)
    {
        NCNN_LOGE("No CPUs enabled in affinity mask");
        return -1;
    }

    // NOTE(review): this maps CPU id N to processor group N / 64, bit N % 64.
    // Confirm that matches how CPU ids are assigned elsewhere in this file.
    const int group = first_cpu / 64;

    GROUP_AFFINITY group_affinity = {0};
    group_affinity.Group = (WORD)group;
    for (int bit = first_cpu % 64; bit < 64; bit++)
    {
        if (thread_affinity_mask.is_enabled(group * 64 + bit))
        {
            group_affinity.Mask |= 1ULL << bit;
        }
    }

    GROUP_AFFINITY prev_affinity;
    if (!SetThreadGroupAffinityFunc(GetCurrentThread(), &group_affinity, &prev_affinity))
    {
        NCNN_LOGE("SetThreadGroupAffinity failed %d", GetLastError());
        return -1;
    }

    return 0;
 }
 #endif // defined _WIN32

@@ -1560,7 +1689,14 @@ static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
    pid_t pid = syscall(SYS_gettid);
 #endif

    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
    const cpu_set_t* cpuset = thread_affinity_mask.get_cpu_set();
    if (!cpuset)
    {
        NCNN_LOGE("Failed to get cpu_set from CpuSet");
        return -1;
    }

    int syscallret = syscall(__NR_sched_setaffinity, pid, CPU_ALLOC_SIZE(CPU_SETSIZE), cpuset);
    if (syscallret)
    {
        NCNN_LOGE("syscall error %d", syscallret);
@@ -1583,7 +1719,8 @@ static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
    // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919   --- AmeAkio

    int affinity_tag = THREAD_AFFINITY_TAG_NULL;
    for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++)
    int max_cpu = thread_affinity_mask.max_cpu_id();
    for (int i = 0; i <= max_cpu && i < 32; i++) // Apple policy is limited to 32 bits
    {
        if (thread_affinity_mask.is_enabled(i))
        {
@@ -2052,13 +2189,25 @@ static int get_sched_affinity(ncnn::CpuSet& thread_affinity_mask)

    thread_affinity_mask.disable_all();

    int syscallret = syscall(__NR_sched_getaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
    // Allocate a temporary cpu_set_t for the syscall
    cpu_set_t* temp_cpuset = CPU_ALLOC(CPU_SETSIZE);
    if (!temp_cpuset)
    {
        return -1;
    }

    int syscallret = syscall(__NR_sched_getaffinity, pid, CPU_ALLOC_SIZE(CPU_SETSIZE), temp_cpuset);
    if (syscallret)
    {
        CPU_FREE(temp_cpuset);
        // handle get error silently
        return -1;
    }

    // Copy the result to our CpuSet
    thread_affinity_mask.set_cpu_set(temp_cpuset);
    CPU_FREE(temp_cpuset);

    return 0;
 }

@@ -2149,6 +2298,10 @@ static void initialize_global_cpu_info()
    g_cpucount = get_cpucount();
    g_physical_cpucount = get_physical_cpucount();
    g_powersave = 0;

    // Set global max CPU count for CpuSet optimization
    g_max_cpu_count = g_cpucount;

    initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big);

 #if (defined _WIN32 && (__aarch64__ || __arm__)) || ((defined __ANDROID__ || defined __linux__) && __riscv)
@@ -2265,142 +2418,506 @@ static inline void try_initialize_global_cpu_info()

 namespace ncnn {

 #if defined _WIN32
 // Unified CpuSet implementation supporting >64 CPUs.
 //
 // Default constructor: builds an empty set. The fast 64-bit mask is zeroed, no
 // extended storage is allocated, and every platform-specific cache is marked
 // as not valid.
 CpuSet::CpuSet()
    : fast_mask(0)
    , extended_mask(nullptr)
    , extended_capacity(0)
    , use_extended(false)
 #if defined _WIN32
    // cached ULONG_PTR view served by get_legacy_mask()
    , legacy_mask_cache(0)
    , legacy_mask_valid(false)
 #endif
 #if defined __ANDROID__ || defined __linux__
    // cached cpu_set_t view served by get_cpu_set(); allocated on first use
    , cpu_set_cache(nullptr)
    , cpu_set_valid(false)
 #endif
 #if __APPLE__
    // cached 32-bit view served by get_legacy_policy()
    , legacy_policy_cache(0)
    , legacy_policy_valid(false)
 #endif
 {
    // Clears the mask(s) and invalidates the caches again; the members above
    // are already in that state at this point.
    disable_all();
 }

 void CpuSet::enable(int cpu)
 // Copy constructor: starts from an empty, cache-less state and then takes a
 // deep copy of `other` through copy_from(), so the two objects never share
 // the extended word array.
 CpuSet::CpuSet(const CpuSet& other)
    : fast_mask(0)
    , extended_mask(nullptr)
    , extended_capacity(0)
    , use_extended(false)
 #if defined _WIN32
    , legacy_mask_cache(0)
    , legacy_mask_valid(false)
 #endif
 #if defined __ANDROID__ || defined __linux__
    , cpu_set_cache(nullptr)
    , cpu_set_valid(false)
 #endif
 #if __APPLE__
    , legacy_policy_cache(0)
    , legacy_policy_valid(false)
 #endif
 {
    copy_from(other);
 }

 void CpuSet::disable(int cpu)
 // Copy assignment: replaces the contents of this set with a deep copy of
 // `other`. Self-assignment is a no-op, which matters because copy_from()
 // releases this object's extended storage before reading from `other`.
 CpuSet& CpuSet::operator=(const CpuSet& other)
 {
    if (this == &other)
        return *this;

    copy_from(other);
    return *this;
 }

 void CpuSet::disable_all()
 // Destructor: releases the extended word array and, on Android/Linux, the
 // cached cpu_set_t obtained from CPU_ALLOC.
 CpuSet::~CpuSet()
 {
    if (extended_mask)
    {
        free(extended_mask);
    }
 #if defined __ANDROID__ || defined __linux__
    if (cpu_set_cache)
    {
        CPU_FREE(cpu_set_cache);
    }
 #endif
 }

 bool CpuSet::is_enabled(int cpu) const
 void CpuSet::copy_from(const CpuSet& other)
 {
    return mask & ((ULONG_PTR)1 << cpu);
 }
    // Clean up existing state
    if (extended_mask)
    {
        free(extended_mask);
        extended_mask = nullptr;
    }
    extended_capacity = 0;

 int CpuSet::num_enabled() const
 {
    int num_enabled = 0;
    for (int i = 0; i < (int)sizeof(mask) * 8; i++)
    // Copy basic state
    fast_mask = other.fast_mask;
    use_extended = other.use_extended;

    // Copy extended mask if needed
    if (other.use_extended && other.extended_mask)
    {
        if (is_enabled(i))
            num_enabled++;
        extended_capacity = other.extended_capacity;
        extended_mask = (uint64_t*)malloc(extended_capacity * sizeof(uint64_t));
        if (extended_mask)
        {
            memcpy(extended_mask, other.extended_mask, extended_capacity * sizeof(uint64_t));
        }
    }

    return num_enabled;
    // Invalidate caches
 #if defined _WIN32
    legacy_mask_valid = false;
 #endif
 #if defined __ANDROID__ || defined __linux__
    cpu_set_valid = false;
    if (cpu_set_cache)
    {
        CPU_FREE(cpu_set_cache);
        cpu_set_cache = nullptr;
    }
 #endif
 #if __APPLE__
    legacy_policy_valid = false;
 #endif
 }
 #elif defined __ANDROID__ || defined __linux__
 CpuSet::CpuSet()

 void CpuSet::ensure_capacity(int cpu_id)
 {
    disable_all();
 }
    if (cpu_id < FAST_PATH_BITS && !use_extended)
    {
        return; // Fast path is sufficient
    }

    // Need to switch to extended mode
    if (!use_extended)
    {
        use_extended = true;
        // Calculate required capacity
        int required_words = (cpu_id / BITS_PER_WORD) + 1;
        extended_capacity = std::max(required_words, 2); // Minimum 2 words
        extended_mask = (uint64_t*)calloc(extended_capacity, sizeof(uint64_t));
        if (extended_mask)
        {
            // Copy fast_mask to extended_mask[0]
            extended_mask[0] = fast_mask;
        }
        return;
    }

    // Already in extended mode, check if we need more capacity
    int required_words = (cpu_id / BITS_PER_WORD) + 1;
    if (required_words > extended_capacity)
    {
        int new_capacity = std::max(required_words, extended_capacity * 2);
        uint64_t* new_mask = (uint64_t*)realloc(extended_mask, new_capacity * sizeof(uint64_t));
        if (new_mask)
        {
            // Zero out new memory
            memset(new_mask + extended_capacity, 0, (new_capacity - extended_capacity) * sizeof(uint64_t));
            extended_mask = new_mask;
            extended_capacity = new_capacity;
        }
    }
 }
 // Add `cpu` to the set. Negative ids are ignored. Storage is grown through
 // ensure_capacity(); if the bit still does not fit afterwards the call leaves
 // the mask unchanged. Cached platform views are invalidated either way.
 void CpuSet::enable(int cpu)
 {
    if (cpu < 0) return;

    ensure_capacity(cpu);

    if (use_extended)
    {
        const int word_idx = cpu / BITS_PER_WORD;
        if (extended_mask && word_idx < extended_capacity)
        {
            extended_mask[word_idx] |= 1ULL << (cpu % BITS_PER_WORD);
        }
    }
    else if (cpu < FAST_PATH_BITS)
    {
        fast_mask |= 1ULL << cpu;
    }

    // Invalidate caches
 #if defined _WIN32
    legacy_mask_valid = false;
 #endif
 #if defined __ANDROID__ || defined __linux__
    cpu_set_valid = false;
 #endif
 #if __APPLE__
    legacy_policy_valid = false;
 #endif
 }

 // Remove `cpu` from the set. Negative ids and ids beyond the current storage
 // are ignored (storage is never grown here). Cached platform views are
 // invalidated for every non-negative id.
 void CpuSet::disable(int cpu)
 {
    if (cpu < 0) return;

    if (use_extended)
    {
        const int word_idx = cpu / BITS_PER_WORD;
        if (extended_mask && word_idx < extended_capacity)
        {
            extended_mask[word_idx] &= ~(1ULL << (cpu % BITS_PER_WORD));
        }
    }
    else if (cpu < FAST_PATH_BITS)
    {
        fast_mask &= ~(1ULL << cpu);
    }

    // Invalidate caches
 #if defined _WIN32
    legacy_mask_valid = false;
 #endif
 #if defined __ANDROID__ || defined __linux__
    cpu_set_valid = false;
 #endif
 #if __APPLE__
    legacy_policy_valid = false;
 #endif
 }

 // Clear every bit. The extended word array, if any, is zeroed but kept
 // allocated, and the set stays in whichever mode (fast/extended) it was in.
 void CpuSet::disable_all()
 {
    fast_mask = 0;
    if (use_extended && extended_mask)
    {
        memset(extended_mask, 0, extended_capacity * sizeof(uint64_t));
    }

    // Invalidate caches
 #if defined _WIN32
    legacy_mask_valid = false;
 #endif
 #if defined __ANDROID__ || defined __linux__
    cpu_set_valid = false;
 #endif
 #if __APPLE__
    legacy_policy_valid = false;
 #endif
 }

 // Report whether `cpu` is in the set. Negative ids and ids beyond the current
 // storage are reported as not enabled.
 bool CpuSet::is_enabled(int cpu) const
 {
    if (cpu < 0) return false;

    if (use_extended)
    {
        const int word_idx = cpu / BITS_PER_WORD;
        if (extended_mask && word_idx < extended_capacity)
        {
            return (extended_mask[word_idx] & (1ULL << (cpu % BITS_PER_WORD))) != 0;
        }
        return false;
    }

    if (cpu < FAST_PATH_BITS)
    {
        return (fast_mask & (1ULL << cpu)) != 0;
    }

    return false;
 }
 #elif __APPLE__
 CpuSet::CpuSet()
 // Count the set bits in a 64-bit word.
 //
 // GCC/Clang use the compiler builtin. Every other compiler takes the portable
 // loop below rather than the MSVC __popcnt64 intrinsic, which is only
 // provided for x64 targets and compiles to a POPCNT instruction without
 // checking that the CPU has one.
 static int popcount64(uint64_t x)
 {
 #if defined(__GNUC__) || defined(__clang__)
    return __builtin_popcountll(x);
 #else
    int count = 0;
    while (x)
    {
        x &= x - 1; // clears the lowest set bit
        count++;
    }
    return count;
 #endif
 }

 void CpuSet::enable(int cpu)
 // Number of CPUs currently in the set.
 int CpuSet::num_enabled() const
 {
    if (!use_extended)
    {
        // Fast path: a single word
        return popcount64(fast_mask);
    }

    int count = 0;
    if (extended_mask)
    {
        // Extended path: sum over all allocated words
        for (int i = 0; i < extended_capacity; i++)
        {
            count += popcount64(extended_mask[i]);
        }
    }

    return count;
 }

 void CpuSet::disable(int cpu)
 // Highest CPU id in the set, or -1 when the set is empty.
 int CpuSet::max_cpu_id() const
 {
    if (!use_extended)
    {
        // Scan the fast mask from the top bit down.
        for (int bit = FAST_PATH_BITS - 1; bit >= 0; bit--)
        {
            if (fast_mask & (1ULL << bit))
                return bit;
        }
        return -1;
    }

    if (extended_mask)
    {
        // Walk the words from the highest down, skipping empty ones.
        for (int word = extended_capacity - 1; word >= 0; word--)
        {
            const uint64_t bits = extended_mask[word];
            if (bits == 0)
                continue;

            for (int bit = BITS_PER_WORD - 1; bit >= 0; bit--)
            {
                if (bits & (1ULL << bit))
                    return word * BITS_PER_WORD + bit;
            }
        }
    }

    return -1;
 }

 void CpuSet::disable_all()
 // True when no CPU is in the set. An extended set without a word array is
 // reported as empty.
 bool CpuSet::is_empty() const
 {
    if (!use_extended)
        return fast_mask == 0;

    if (extended_mask)
    {
        for (int i = 0; i < extended_capacity; i++)
        {
            if (extended_mask[i] != 0)
                return false;
        }
    }

    return true;
 }

 bool CpuSet::is_enabled(int cpu) const
 void CpuSet::set_range(int start_cpu, int end_cpu, bool enabled)
 {
    return policy & ((unsigned int)1 << cpu);
 }
    if (start_cpu < 0 || end_cpu < start_cpu) return;

 int CpuSet::num_enabled() const
    for (int cpu = start_cpu; cpu <= end_cpu; cpu++)
    {
        if (enabled)
            enable(cpu);
        else
            disable(cpu);
    }
 }
 // Platform-specific compatibility methods
 #if defined _WIN32
 // Low bits of the set as a ULONG_PTR affinity mask (Windows). CPUs whose id
 // does not fit into a ULONG_PTR are not represented. The value is computed
 // once and cached until the set is modified.
 ULONG_PTR CpuSet::get_legacy_mask() const
 {
    if (!legacy_mask_valid)
    {
        uint64_t low_word = 0;
        if (!use_extended)
        {
            low_word = fast_mask;
        }
        else if (extended_mask && extended_capacity > 0)
        {
            low_word = extended_mask[0];
        }

        // The conversion keeps exactly the low sizeof(ULONG_PTR) * 8 bits.
        // Building that mask by hand with 1ULL << (sizeof(ULONG_PTR) * 8) would
        // shift a 64-bit value by 64 on Win64, which is undefined behaviour.
        legacy_mask_cache = (ULONG_PTR)low_word;
        legacy_mask_valid = true;
    }

    return legacy_mask_cache;
 }
 #else
 CpuSet::CpuSet()

 // Replace the contents of the set with the CPUs whose bits are set in a
 // Windows-style ULONG_PTR affinity mask (bit i <-> CPU i).
 void CpuSet::set_legacy_mask(ULONG_PTR mask)
 {
    disable_all();

    // Consume the mask from the lowest bit upwards; bits are enabled in
    // ascending CPU order and the walk stops once no set bit is left.
    for (int cpu = 0; mask != 0; cpu++, mask >>= 1)
    {
        if (mask & 1)
        {
            enable(cpu);
        }
    }
 }
 #endif

 void CpuSet::enable(int /* cpu */)
 #if defined __ANDROID__ || defined __linux__
 // Return a cpu_set_t view of the set for the affinity syscalls.
 //
 // The view lives in a lazily allocated cache (CPU_ALLOC(CPU_SETSIZE)) that is
 // rebuilt whenever it has been invalidated. CPU ids at or above CPU_SETSIZE
 // are not represented. Returns nullptr when the cache cannot be allocated.
 const cpu_set_t* CpuSet::get_cpu_set() const
 {
    if (cpu_set_valid)
        return cpu_set_cache;

    if (!cpu_set_cache)
    {
        cpu_set_cache = CPU_ALLOC(CPU_SETSIZE);
        if (!cpu_set_cache)
            return nullptr;
    }

    const size_t set_bytes = CPU_ALLOC_SIZE(CPU_SETSIZE);
    CPU_ZERO_S(set_bytes, cpu_set_cache);

    if (!use_extended)
    {
        // Fast path: mirror the single 64-bit word.
        for (int cpu = 0; cpu < FAST_PATH_BITS && cpu < CPU_SETSIZE; cpu++)
        {
            if ((fast_mask >> cpu) & 1ULL)
            {
                CPU_SET_S(cpu, set_bytes, cpu_set_cache);
            }
        }
    }
    else if (extended_mask)
    {
        // Extended path: walk CPU ids directly, bounded by both the allocated
        // words and CPU_SETSIZE.
        for (int cpu = 0; cpu < CPU_SETSIZE && cpu / BITS_PER_WORD < extended_capacity; cpu++)
        {
            const uint64_t word = extended_mask[cpu / BITS_PER_WORD];
            if ((word >> (cpu % BITS_PER_WORD)) & 1ULL)
            {
                CPU_SET_S(cpu, set_bytes, cpu_set_cache);
            }
        }
    }

    cpu_set_valid = true;
    return cpu_set_cache;
 }

 void CpuSet::disable(int /* cpu */)
 // Non-const access to the cached cpu_set_t view (refreshed first if needed).
 // Returns nullptr when the cache cannot be allocated.
 // NOTE(review): nothing in this function copies writes made through the
 // returned pointer back into the internal mask; confirm callers use
 // set_cpu_set() when they need that.
 cpu_set_t* CpuSet::get_cpu_set_mutable()
 {
    // get_cpu_set() hands back the cache pointer itself, so dropping the
    // const qualifier yields the same object.
    return const_cast<cpu_set_t*>(get_cpu_set());
 }

 void CpuSet::disable_all()
 // Replace the contents of the set with the CPUs present in `cpuset`.
 // A null pointer leaves the set untouched. Ids 0 .. CPU_SETSIZE-1 are read.
 void CpuSet::set_cpu_set(const cpu_set_t* cpuset)
 {
    if (cpuset == nullptr)
        return;

    disable_all();

    // Mirror every member of the cpu_set_t into the internal representation.
    int cpu = 0;
    while (cpu < CPU_SETSIZE)
    {
        if (CPU_ISSET(cpu, cpuset))
            enable(cpu);
        cpu++;
    }
 }
 #endif

 bool CpuSet::is_enabled(int /* cpu */) const
 #if __APPLE__
 // Low 32 bits of the set as an unsigned int (Apple builds). CPUs with an id
 // of 32 or above are not represented. The value is computed once and cached
 // until the set is modified.
 unsigned int CpuSet::get_legacy_policy() const
 {
    if (!legacy_policy_valid)
    {
        uint64_t low_word = 0;
        if (!use_extended)
        {
            low_word = fast_mask;
        }
        else if (extended_mask && extended_capacity > 0)
        {
            low_word = extended_mask[0];
        }

        legacy_policy_cache = (unsigned int)(low_word & 0xFFFFFFFFU);
        legacy_policy_valid = true;
    }

    return legacy_policy_cache;
 }

 int CpuSet::num_enabled() const
 // Replace the contents of the set with the CPUs whose bits are set in a
 // 32-bit policy word (bit i <-> CPU i).
 void CpuSet::set_legacy_policy(unsigned int policy)
 {
    disable_all();

    for (int i = 0; i < 32; i++)
    {
        if (policy & (1U << i))
        {
            enable(i);
        }
    }
 }
 #endif

@@ -3065,7 +3582,8 @@ int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
    {
        // assign one core for each thread
        int core = -1 - i;
        for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++)
        int max_cpu = thread_affinity_mask.max_cpu_id();
        for (int j = 0; j <= max_cpu && j < 32; j++) // Apple policy is limited to 32 bits
        {
            if (thread_affinity_mask.is_enabled(j))
            {
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -22,21 +22,64 @@ class NCNN_EXPORT CpuSet
 {
 public:
    CpuSet();
    CpuSet(const CpuSet& other);
    CpuSet& operator=(const CpuSet& other);
    ~CpuSet();

    void enable(int cpu);
    void disable(int cpu);
    void disable_all();
    bool is_enabled(int cpu) const;
    int num_enabled() const;

 public:
    // New methods for >64 CPU support
    int max_cpu_id() const;
    bool is_empty() const;
    void set_range(int start_cpu, int end_cpu, bool enabled);

    // Platform-specific accessors for backward compatibility
 #if defined _WIN32
    ULONG_PTR get_legacy_mask() const;
    void set_legacy_mask(ULONG_PTR mask);
 #endif
 #if defined __ANDROID__ || defined __linux__
    const cpu_set_t* get_cpu_set() const;
    cpu_set_t* get_cpu_set_mutable();
    void set_cpu_set(const cpu_set_t* cpuset);
 #endif
 #if __APPLE__
    unsigned int get_legacy_policy() const;
    void set_legacy_policy(unsigned int policy);
 #endif

 private:
    void ensure_capacity(int cpu_id);
    void copy_from(const CpuSet& other);

    // Internal implementation details
    static const int FAST_PATH_BITS = 64;
    static const int BITS_PER_WORD = 64;

    // Fast path for systems with <= 64 CPUs
    uint64_t fast_mask;

    // Extended path for systems with > 64 CPUs
    uint64_t* extended_mask;
    int extended_capacity; // in number of uint64_t words
    bool use_extended;

    // Platform-specific storage for compatibility
 #if defined _WIN32
    ULONG_PTR mask;
    mutable ULONG_PTR legacy_mask_cache;
    mutable bool legacy_mask_valid;
 #endif
 #if defined __ANDROID__ || defined __linux__
    cpu_set_t cpu_set;
    mutable cpu_set_t* cpu_set_cache;
    mutable bool cpu_set_valid;
 #endif
 #if __APPLE__
    unsigned int policy;
    mutable unsigned int legacy_policy_cache;
    mutable bool legacy_policy_valid;
 #endif
 };

--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -60,9 +60,16 @@ endif()

 ncnn_add_test(c_api)
 ncnn_add_test(cpu)
 ncnn_add_test(cpu_large)
 ncnn_add_test(cpu_simulation)
 ncnn_add_test(expression)
 ncnn_add_test(paramdict)

 # Add validate_cpu_support test manually
 add_executable(test_validate_cpu_support validate_cpu_support.cpp)
 target_link_libraries(test_validate_cpu_support ncnn)
 add_test(NAME test_validate_cpu_support COMMAND test_validate_cpu_support)

 if(NCNN_VULKAN)
    ncnn_add_test(command)
 endif()
--- a/tests/test_cpu_large.cpp
+++ b/tests/test_cpu_large.cpp
@@ -0,0 +1,250 @@
 // Copyright 2024 Tencent
 // SPDX-License-Identifier: BSD-3-Clause

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 #include "cpu.h"

 // Exercises CpuSet with CPU IDs above 63: enable/disable, counting,
 // max_cpu_id, set_range, copy construction and copy assignment.
 // Returns 0 when every check passes and 1 at the first failed check
 // (after printing a diagnostic to stderr).
 static int test_cpuset_large()
 {
    printf("Testing CpuSet with >64 CPUs...\n");

    ncnn::CpuSet set;

    // Sample IDs sitting on both sides of several multiples of 64
    const int test_cpus[] = {0, 63, 64, 65, 127, 128, 255, 256, 511, 512, 1023};
    const int num_test_cpus = sizeof(test_cpus) / sizeof(test_cpus[0]);
    const int* const cpus_end = test_cpus + num_test_cpus;

    // A freshly constructed set must not report any sample CPU as enabled
    for (const int* p = test_cpus; p != cpus_end; ++p)
    {
        if (!set.is_enabled(*p))
            continue;

        fprintf(stderr, "CPU %d should be disabled initially\n", *p);
        return 1;
    }

    const int initial_count = set.num_enabled();
    if (initial_count != 0)
    {
        fprintf(stderr, "Initially no CPUs should be enabled\n");
        return 1;
    }

    const bool initially_empty = set.is_empty();
    if (!initially_empty)
    {
        fprintf(stderr, "Initially CpuSet should be empty\n");
        return 1;
    }

    // Switch every sample CPU on ...
    for (const int* p = test_cpus; p != cpus_end; ++p)
    {
        set.enable(*p);
    }

    // ... and make sure each one reads back as enabled
    for (const int* p = test_cpus; p != cpus_end; ++p)
    {
        if (set.is_enabled(*p))
            continue;

        fprintf(stderr, "CPU %d should be enabled\n", *p);
        return 1;
    }

    const int count_after_enable = set.num_enabled();
    if (count_after_enable != num_test_cpus)
    {
        fprintf(stderr, "Expected %d enabled CPUs, got %d\n", num_test_cpus, count_after_enable);
        return 1;
    }

    if (set.is_empty())
    {
        fprintf(stderr, "CpuSet should not be empty after enabling CPUs\n");
        return 1;
    }

    // The highest enabled ID is the last sample
    const int highest_cpu = set.max_cpu_id();
    if (highest_cpu != 1023)
    {
        fprintf(stderr, "Expected max CPU ID 1023, got %d\n", highest_cpu);
        return 1;
    }

    // Disabling one CPU clears it and lowers the count by one
    const int first_cpu = test_cpus[0];
    set.disable(first_cpu);
    if (set.is_enabled(first_cpu))
    {
        fprintf(stderr, "CPU %d should be disabled after disable()\n", first_cpu);
        return 1;
    }

    const int expected_after_disable = num_test_cpus - 1;
    const int count_after_disable = set.num_enabled();
    if (count_after_disable != expected_after_disable)
    {
        fprintf(stderr, "Expected %d enabled CPUs after disable, got %d\n",
                expected_after_disable, count_after_disable);
        return 1;
    }

    // set_range on an inclusive range that lies entirely above CPU 63
    const int range_first = 100;
    const int range_last = 200;
    set.disable_all();
    set.set_range(range_first, range_last, true);

    const int expected_range_count = range_last - range_first + 1;
    const int count_in_range = set.num_enabled();
    if (count_in_range != expected_range_count)
    {
        fprintf(stderr, "Expected %d CPUs in range [100,200], got %d\n",
                expected_range_count, count_in_range);
        return 1;
    }

    int cpu = range_first;
    while (cpu <= range_last)
    {
        if (!set.is_enabled(cpu))
        {
            fprintf(stderr, "CPU %d should be enabled in range [100,200]\n", cpu);
            return 1;
        }
        cpu++;
    }

    // Copy construction must reproduce the state of every CPU in [0, 1023]
    ncnn::CpuSet set_copy(set);
    if (set_copy.num_enabled() != set.num_enabled())
    {
        fprintf(stderr, "Copy constructor failed: different num_enabled\n");
        return 1;
    }

    for (int id = 0; id < 1024; id++)
    {
        if (set_copy.is_enabled(id) == set.is_enabled(id))
            continue;

        fprintf(stderr, "Copy constructor failed: CPU %d state differs\n", id);
        return 1;
    }

    // Assignment must replace prior content (CPU 999 was set beforehand)
    ncnn::CpuSet set_assigned;
    set_assigned.enable(999);
    set_assigned = set;

    if (set_assigned.num_enabled() != set.num_enabled())
    {
        fprintf(stderr, "Assignment operator failed: different num_enabled\n");
        return 1;
    }

    for (int id = 0; id < 1024; id++)
    {
        if (set_assigned.is_enabled(id) == set.is_enabled(id))
            continue;

        fprintf(stderr, "Assignment operator failed: CPU %d state differs\n", id);
        return 1;
    }

    printf("CpuSet large CPU test passed!\n");
    return 0;
 }

 // Tests CpuSet at its boundaries: CPU 0, exactly 64 CPUs, the 65th CPU
 // (first ID past the 64-bit fast path), negative IDs, and a very large ID.
 // Returns 0 when every check passes and 1 at the first failed check
 // (after printing a diagnostic to stderr).
 static int test_cpuset_boundary()
 {
    printf("Testing CpuSet boundary conditions...\n");

    ncnn::CpuSet set;

    // Test CPU ID 0
    set.enable(0);
    if (!set.is_enabled(0))
    {
        fprintf(stderr, "CPU 0 should be enabled\n");
        return 1;
    }

    // Test exactly 64 CPUs (boundary between fast and extended path)
    set.disable_all();
    for (int i = 0; i < 64; i++)
    {
        set.enable(i);
    }

    if (set.num_enabled() != 64)
    {
        fprintf(stderr, "Expected 64 enabled CPUs, got %d\n", set.num_enabled());
        return 1;
    }

    // Test 65th CPU (should trigger extended mode)
    set.enable(64);
    if (!set.is_enabled(64))
    {
        fprintf(stderr, "CPU 64 should be enabled\n");
        return 1;
    }

    // The 64 CPUs enabled before the switch must still be counted
    if (set.num_enabled() != 65)
    {
        fprintf(stderr, "Expected 65 enabled CPUs, got %d\n", set.num_enabled());
        return 1;
    }

    // Test negative CPU ID (should be ignored): besides not crashing,
    // neither call may change the set of enabled CPUs
    set.enable(-1);
    if (set.num_enabled() != 65)
    {
        fprintf(stderr, "enable(-1) should be ignored, got %d enabled CPUs\n", set.num_enabled());
        return 1;
    }

    set.disable(-1);
    if (set.num_enabled() != 65)
    {
        fprintf(stderr, "disable(-1) should be ignored, got %d enabled CPUs\n", set.num_enabled());
        return 1;
    }

    // Test very large CPU ID
    set.enable(10000);
    if (!set.is_enabled(10000))
    {
        fprintf(stderr, "CPU 10000 should be enabled\n");
        return 1;
    }

    // Growing the set for CPU 10000 must keep the 65 CPUs enabled earlier
    if (set.num_enabled() != 66)
    {
        fprintf(stderr, "Expected 66 enabled CPUs, got %d\n", set.num_enabled());
        return 1;
    }

    printf("CpuSet boundary test passed!\n");
    return 0;
 }

 // Builds a large, sparse CpuSet (every even CPU below 2048) and checks that
 // both the set and a copy of it report the expected number of enabled CPUs.
 // No timing is measured here; only the counts are verified.
 // Returns 0 on success, 1 on the first failed check.
 static int test_cpuset_performance()
 {
    printf("Testing CpuSet performance with large CPU sets...\n");

    ncnn::CpuSet set;

    const int max_cpu = 2048;
    const int expected_count = max_cpu / 2;

    // Enable CPUs 0, 2, 4, ... in ascending order
    for (int k = 0; k < expected_count; k++)
    {
        set.enable(2 * k);
    }

    // The set itself must hold exactly half of the IDs below max_cpu
    const int enabled_in_set = set.num_enabled();
    if (enabled_in_set != expected_count)
    {
        fprintf(stderr, "Expected %d enabled CPUs, got %d\n", expected_count, enabled_in_set);
        return 1;
    }

    // A copy of the large set must report the same count
    ncnn::CpuSet set_copy(set);
    const int enabled_in_copy = set_copy.num_enabled();
    if (enabled_in_copy != expected_count)
    {
        fprintf(stderr, "Copy failed: expected %d enabled CPUs, got %d\n",
                expected_count, enabled_in_copy);
        return 1;
    }

    printf("CpuSet performance test passed!\n");
    return 0;
 }

 // Runs the CpuSet test groups in order and stops at the first one that
 // reports failure. Exit status is 0 when all pass, 1 otherwise.
 int main()
 {
    if (test_cpuset_large())
        return 1;

    if (test_cpuset_boundary())
        return 1;

    if (test_cpuset_performance())
        return 1;

    return 0;
 }