get cpu l2 cache size and resolve gemm tile size (#4411)

* get cpu l2 cache size and resolve gemm tile size * optimize constant tile K * fix per-core l2 cache detection, better macos cpu cluster topology discovery
3 years ago · 18fbaebe68
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -1361,6 +1361,131 @@ int get_physical_big_cpu_count()
    return g_cpucount - g_physical_cpucount;
 }

 // Detect the level-2 cache size in bytes.
 //
 // windows: the largest level-2 entry reported by GetLogicalProcessorInformation
 // linux:   sysconf(_SC_LEVEL2_CACHE_SIZE), when the libc provides that name
 // macos:   the perflevel0 cluster l2 size divided by the cpus sharing it
 //
 // When nothing usable is detected (size <= 0) a per-architecture guess is
 // returned instead, so the result is always positive.
 static int get_cpu_level2_cachesize()
 {
    int size = 0;
 #if (defined _WIN32 && !(defined __MINGW32__))
    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
    if (glpi != NULL)
    {
        // the first call only queries the required buffer length
        DWORD return_length = 0;
        glpi(NULL, &return_length);

        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
        if (return_length > 0)
            buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);

        // only walk the records when both the allocation and the query succeeded,
        // otherwise the buffer content is NULL or uninitialized
        if (buffer != NULL && glpi(buffer, &return_length))
        {
            PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
            DWORD byte_offset = 0;
            while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
            {
                if (ptr->Relationship == RelationCache)
                {
                    PCACHE_DESCRIPTOR Cache = &ptr->Cache;
                    if (Cache->Level == 2)
                    {
                        size = std::max(size, (int)Cache->Size);
                    }
                }

                byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
                ptr++;
            }
        }

        // free(NULL) is a no-op
        free(buffer);
    }
 #elif defined __linux__
 #if defined(_SC_LEVEL2_CACHE_SIZE)
    // sysconf returns long and may report 0 or -1 when the size is unknown,
    // keep size at zero in that case so the fallback below kicks in
    long sc_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
    if (sc_size > 0)
        size = (int)sc_size;
 #endif
 #elif __APPLE__
    // perflevel 0 is the higher performance cluster
    int cpusperl2 = get_hw_capability("hw.perflevel0.cpusperl2");
    int l2cachesize = get_hw_capability("hw.perflevel0.l2cachesize");
    size = cpusperl2 > 1 ? l2cachesize / cpusperl2 : l2cachesize;
 #endif

    // fallback to a common value
    if (size <= 0)
    {
 #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
        // newer instruction sets imply a newer cpu generation with a larger l2
        size = 64 * 1024;
        if (cpu_support_x86_avx())
            size = 128 * 1024;
        if (cpu_support_x86_avx2())
            size = 256 * 1024;
        if (cpu_support_x86_avx512())
            size = 1024 * 1024;
 #elif __aarch64__
        size = 256 * 1024;
 #elif __arm__
        size = 128 * 1024;
 #else
        // is 64k still too large here ?
        size = 64 * 1024;
 #endif
    }

    return size;
 }

 // Detect the level-3 cache size in bytes.
 //
 // windows: the largest level-3 entry reported by GetLogicalProcessorInformation
 // linux:   sysconf(_SC_LEVEL3_CACHE_SIZE), when the libc provides that name
 // macos:   the perflevel0 cluster l3 size shared among all its cpus
 //
 // There is no fallback guess: zero is returned when there is no l3 cache
 // or when it cannot be detected. The result is never negative.
 static int get_cpu_level3_cachesize()
 {
    int size = 0;
 #if (defined _WIN32 && !(defined __MINGW32__))
    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
    if (glpi != NULL)
    {
        // the first call only queries the required buffer length
        DWORD return_length = 0;
        glpi(NULL, &return_length);

        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
        if (return_length > 0)
            buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);

        // only walk the records when both the allocation and the query succeeded,
        // otherwise the buffer content is NULL or uninitialized
        if (buffer != NULL && glpi(buffer, &return_length))
        {
            PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
            DWORD byte_offset = 0;
            while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
            {
                if (ptr->Relationship == RelationCache)
                {
                    PCACHE_DESCRIPTOR Cache = &ptr->Cache;
                    if (Cache->Level == 3)
                    {
                        size = std::max(size, (int)Cache->Size);
                    }
                }

                byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
                ptr++;
            }
        }

        // free(NULL) is a no-op
        free(buffer);
    }
 #elif defined __linux__
 #if defined(_SC_LEVEL3_CACHE_SIZE)
    // sysconf returns long and may report -1 on error,
    // never let a negative value escape as a cache size
    long sc_size = sysconf(_SC_LEVEL3_CACHE_SIZE);
    if (sc_size > 0)
        size = (int)sc_size;
 #endif
 #elif __APPLE__
    // perflevel 0 is the higher performance cluster
    // get the size shared among all cpus
    size = get_hw_capability("hw.perflevel0.l3cachesize");
 #endif

    // l3 cache size can be zero, but must not be negative
    if (size < 0)
        size = 0;

    return size;
 }

 // cache sizes are detected once, when these file-scope statics are initialized,
 // and then served from these variables by the public getters
 static int g_cpu_level2_cachesize = get_cpu_level2_cachesize();
 static int g_cpu_level3_cachesize = get_cpu_level3_cachesize();

 // Return the level-2 cache size stored in g_cpu_level2_cachesize,
 // which is filled from get_cpu_level2_cachesize() during static initialization.
 int get_cpu_level2_cache_size()
 {
    return g_cpu_level2_cachesize;
 }

 // Return the level-3 cache size stored in g_cpu_level3_cachesize,
 // which is filled from get_cpu_level3_cachesize() during static initialization.
 // The value can be zero.
 int get_cpu_level3_cache_size()
 {
    return g_cpu_level3_cachesize;
 }

 #if (defined _WIN32 && !(defined __MINGW32__))
 static CpuSet get_smt_cpu_mask()
 {
@@ -1737,90 +1862,26 @@ static int setup_thread_affinity_masks()
            g_thread_affinity_mask_big.enable(i);
    }
 #elif __APPLE__
    // affinity info from cpu model
    // TODO find a general way to get per-core frequency on macos
    if (g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL)
    int nperflevels = get_hw_capability("hw.nperflevels");
    if (nperflevels == 1)
    {
        // 2 + 4
        g_thread_affinity_mask_big.enable(0);
        g_thread_affinity_mask_big.enable(1);
        g_thread_affinity_mask_little.enable(2);
        g_thread_affinity_mask_little.enable(3);
        g_thread_affinity_mask_little.enable(4);
        g_thread_affinity_mask_little.enable(5);
        // smp models
        g_thread_affinity_mask_little.disable_all();
        g_thread_affinity_mask_big = g_thread_affinity_mask_all;
    }
    else if (g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST
             || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER
             || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM
             || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
             || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH)
    else
    {
        int cpu_count = get_cpu_count();
        if (cpu_count == 6)
        {
            // 2 + 4
            g_thread_affinity_mask_big.enable(0);
            g_thread_affinity_mask_big.enable(1);
            g_thread_affinity_mask_little.enable(2);
            g_thread_affinity_mask_little.enable(3);
            g_thread_affinity_mask_little.enable(4);
            g_thread_affinity_mask_little.enable(5);
        }
        else if (cpu_count == 8)
        {
            // 4 + 4
            g_thread_affinity_mask_big.enable(0);
            g_thread_affinity_mask_big.enable(1);
            g_thread_affinity_mask_big.enable(2);
            g_thread_affinity_mask_big.enable(3);
            g_thread_affinity_mask_little.enable(4);
            g_thread_affinity_mask_little.enable(5);
            g_thread_affinity_mask_little.enable(6);
            g_thread_affinity_mask_little.enable(7);
        }
        else if (cpu_count == 10)
        // two or more clusters, level0 is the high-performance cluster
        int perflevel0_logicalcpu = get_hw_capability("hw.perflevel0.logicalcpu_max");
        for (int i = 0; i < perflevel0_logicalcpu; i++)
        {
            // 8 + 2
            g_thread_affinity_mask_big.enable(0);
            g_thread_affinity_mask_big.enable(1);
            g_thread_affinity_mask_big.enable(2);
            g_thread_affinity_mask_big.enable(3);
            g_thread_affinity_mask_big.enable(4);
            g_thread_affinity_mask_big.enable(5);
            g_thread_affinity_mask_big.enable(6);
            g_thread_affinity_mask_big.enable(7);
            g_thread_affinity_mask_little.enable(8);
            g_thread_affinity_mask_little.enable(9);
            g_thread_affinity_mask_big.enable(i);
        }
        else if (cpu_count == 20)
        for (int i = perflevel0_logicalcpu; i < g_cpucount; i++)
        {
            // 16 + 4
            g_thread_affinity_mask_big.enable(0);
            g_thread_affinity_mask_big.enable(1);
            g_thread_affinity_mask_big.enable(2);
            g_thread_affinity_mask_big.enable(3);
            g_thread_affinity_mask_big.enable(4);
            g_thread_affinity_mask_big.enable(5);
            g_thread_affinity_mask_big.enable(6);
            g_thread_affinity_mask_big.enable(7);
            g_thread_affinity_mask_big.enable(8);
            g_thread_affinity_mask_big.enable(9);
            g_thread_affinity_mask_big.enable(10);
            g_thread_affinity_mask_big.enable(11);
            g_thread_affinity_mask_big.enable(12);
            g_thread_affinity_mask_big.enable(13);
            g_thread_affinity_mask_big.enable(14);
            g_thread_affinity_mask_big.enable(15);
            g_thread_affinity_mask_little.enable(16);
            g_thread_affinity_mask_little.enable(17);
            g_thread_affinity_mask_little.enable(i);
        }
    }
    else
    {
        // smp models
        g_thread_affinity_mask_little.disable_all();
        g_thread_affinity_mask_big = g_thread_affinity_mask_all;
    }
 #else
    // TODO implement me for other platforms
    g_thread_affinity_mask_little.disable_all();
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -126,6 +126,10 @@ NCNN_EXPORT int get_physical_cpu_count();
 NCNN_EXPORT int get_physical_little_cpu_count();
 NCNN_EXPORT int get_physical_big_cpu_count();

 // cpu cache sizes in bytes: l2 falls back to a 64k ~ 1M guess when detection fails, l3 can be zero
 NCNN_EXPORT int get_cpu_level2_cache_size();
 NCNN_EXPORT int get_cpu_level3_cache_size();

 // bind all threads on little clusters if powersave enabled
 // affects HMP arch cpu like ARM big.LITTLE
 // only implemented on android at the moment
--- a/src/layer/x86/gemm_x86.cpp
+++ b/src/layer/x86/gemm_x86.cpp
@@ -5892,24 +5892,60 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons

 static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
 {
    // TODO do not hardcode
    // resolve optimal tile size from cache size
    size_t l2_cache_size = get_cpu_level2_cache_size();
    int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float));

 #if __AVX512F__
    TILE_M = tile_size / 16 * 16;
    TILE_N = tile_size / 4 * 4;
    TILE_K = tile_size / 4 * 4;
 #elif __AVX__
    TILE_M = tile_size / 8 * 8;
    TILE_N = tile_size / 2 * 2;
    TILE_K = tile_size / 4 * 4;
 #elif __SSE2__
    TILE_M = tile_size / 4 * 4;
    TILE_N = tile_size / 2 * 2;
    TILE_K = tile_size / 2 * 2;
 #else
    TILE_M = tile_size / 2 * 2;
    TILE_N = tile_size / 2 * 2;
    TILE_K = tile_size / 1 * 1;
 #endif

    if (K > 0)
    {
        int nn_K = (K + TILE_K - 1) / TILE_K;
 #if __AVX512F__
        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 3) / 4 * 4);
 #elif __AVX__
        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 3) / 4 * 4);
 #elif __SSE2__
        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2);
 #else
        TILE_K = std::min(TILE_K, (K + nn_K - 1) / nn_K);
 #endif

        if (nn_K == 1)
        {
            tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K);

 #if __AVX512F__
    TILE_M = 16 * 8;
    TILE_N = 4 * 32;
    TILE_K = 4 * 32;
            TILE_M = tile_size / 16 * 16;
            TILE_N = tile_size / 4 * 4;
 #elif __AVX__
    TILE_M = 8 * 16;
    TILE_N = 4 * 32;
    TILE_K = 2 * 64;
            TILE_M = tile_size / 8 * 8;
            TILE_N = tile_size / 2 * 2;
 #elif __SSE2__
    TILE_M = 4 * 16;
    TILE_N = 2 * 32;
    TILE_K = 2 * 32;
            TILE_M = tile_size / 4 * 4;
            TILE_N = tile_size / 2 * 2;
 #else
    TILE_M = 2 * 8;
    TILE_N = 2 * 8;
    TILE_K = 1 * 16;
            TILE_M = tile_size / 2 * 2;
            TILE_N = tile_size / 2 * 2;
 #endif
        }
    }

    TILE_M *= std::min(nT, get_physical_cpu_count());

@@ -5933,7 +5969,7 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
 #if __AVX512F__
        TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 3) / 4 * 4);
 #elif __AVX__
        TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 3) / 4 * 4);
        TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 1) / 2 * 2);
 #elif __SSE2__
        TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 1) / 2 * 2);
 #else
@@ -5941,20 +5977,6 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
 #endif
    }

    if (K > 0)
    {
        int nn_K = (K + TILE_K - 1) / TILE_K;
 #if __AVX512F__
        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 3) / 4 * 4);
 #elif __AVX__
        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2);
 #elif __SSE2__
        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2);
 #else
        TILE_K = std::min(TILE_K, (K + nn_K - 1) / nn_K);
 #endif
    }

    if (nT > 1)
    {
 #if __AVX512F__