From 18fbaebe68f167eca5b3e92774ba7c8d7337d9ee Mon Sep 17 00:00:00 2001
From: nihui <nihuini@tencent.com>
Date: Wed, 14 Dec 2022 08:49:28 +0800
Subject: [PATCH] get cpu l2 cache size and resolve gemm tile size (#4411)

* get cpu l2 cache size and resolve gemm tile size

* optimize constant tile K

* fix per-core l2 cache detection, better macos cpu cluster topology discovery
---
 src/cpu.cpp                | 213 ++++++++++++++++++++++++-------------
 src/cpu.h                  |   4 +
 src/layer/x86/gemm_x86.cpp |  78 +++++++++-----
 3 files changed, 191 insertions(+), 104 deletions(-)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index 85c65335c..014944ea1 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -1361,6 +1361,131 @@ int get_physical_big_cpu_count()
     return g_cpucount - g_physical_cpucount;
 }
 
+// query the l2 cache size in bytes from the operating system
+// when the query yields nothing usable, a common value is guessed from the
+// architecture and simd support, so the returned size is always positive
+static int get_cpu_level2_cachesize()
+{
+    int size = 0;
+#if (defined _WIN32 && !(defined __MINGW32__))
+    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
+    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
+    if (glpi != NULL)
+    {
+        // the first call only reports the required buffer length
+        DWORD return_length = 0;
+        glpi(NULL, &return_length);
+
+        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
+
+        // walk the records only when the allocation and the query both succeeded
+        if (buffer != NULL && glpi(buffer, &return_length))
+        {
+            const DWORD count = return_length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+            for (DWORD i = 0; i < count; i++)
+            {
+                // keep the largest l2 cache among all reported ones
+                if (buffer[i].Relationship == RelationCache && buffer[i].Cache.Level == 2)
+                    size = std::max(size, (int)buffer[i].Cache.Size);
+            }
+        }
+
+        free(buffer);
+    }
+#elif defined __linux__
+    // a non-positive sysconf result (error or unknown size) ends up in the fallback below
+    size = (int)sysconf(_SC_LEVEL2_CACHE_SIZE);
+#elif __APPLE__
+    // perflevel 0 is the higher performance cluster
+    // divide the l2 cache by the number of cpus sharing it to get the per-core part
+    int cpusperl2 = get_hw_capability("hw.perflevel0.cpusperl2");
+    int l2cachesize = get_hw_capability("hw.perflevel0.l2cachesize");
+    size = cpusperl2 > 1 ? l2cachesize / cpusperl2 : l2cachesize;
+#endif
+
+    // fallback to a common value
+    if (size <= 0)
+    {
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+        size = 64 * 1024;
+        if (cpu_support_x86_avx())
+            size = 128 * 1024;
+        if (cpu_support_x86_avx2())
+            size = 256 * 1024;
+        if (cpu_support_x86_avx512())
+            size = 1024 * 1024;
+#elif __aarch64__
+        size = 256 * 1024;
+#elif __arm__
+        size = 128 * 1024;
+#else
+        // is 64k still too large here ?
+        size = 64 * 1024;
+#endif
+    }
+
+    return size;
+}
+
+// query the l3 cache size in bytes from the operating system
+// returns zero when no l3 cache is reported or the size cannot be determined
+static int get_cpu_level3_cachesize()
+{
+    int size = 0;
+#if (defined _WIN32 && !(defined __MINGW32__))
+    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
+    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
+    if (glpi != NULL)
+    {
+        // the first call only reports the required buffer length
+        DWORD return_length = 0;
+        glpi(NULL, &return_length);
+
+        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
+
+        // walk the records only when the allocation and the query both succeeded
+        if (buffer != NULL && glpi(buffer, &return_length))
+        {
+            const DWORD count = return_length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+            for (DWORD i = 0; i < count; i++)
+            {
+                // keep the largest l3 cache among all reported ones
+                if (buffer[i].Relationship == RelationCache && buffer[i].Cache.Level == 3)
+                    size = std::max(size, (int)buffer[i].Cache.Size);
+            }
+        }
+
+        free(buffer);
+    }
+#elif defined __linux__
+    // sysconf returns -1 on error, which is clamped to zero below
+    size = (int)sysconf(_SC_LEVEL3_CACHE_SIZE);
+#elif __APPLE__
+    // perflevel 0 is the higher performance cluster
+    // get the size shared among all cpus
+    size = get_hw_capability("hw.perflevel0.l3cachesize");
+#endif
+
+    // l3 cache size can be zero, but a failed query must not leak out as a negative size
+    if (size < 0)
+        size = 0;
+
+    return size;
+}
+
+static int g_cpu_level2_cachesize = get_cpu_level2_cachesize(); // resolved once during static initialization
+static int g_cpu_level3_cachesize = get_cpu_level3_cachesize(); // resolved once during static initialization
+
+int get_cpu_level2_cache_size()
+{
+    return g_cpu_level2_cachesize; // cached detection result, no system query per call
+}
+
+int get_cpu_level3_cache_size()
+{
+    return g_cpu_level3_cachesize; // cached detection result, no system query per call
+}
+
 #if (defined _WIN32 && !(defined __MINGW32__))
 static CpuSet get_smt_cpu_mask()
 {
@@ -1737,90 +1862,26 @@ static int setup_thread_affinity_masks()
             g_thread_affinity_mask_big.enable(i);
     }
 #elif __APPLE__
-    // affinity info from cpu model
-    // TODO find a general way to get per-core frequency on macos
-    if (g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL)
+    int nperflevels = get_hw_capability("hw.nperflevels");
+    if (nperflevels == 1)
     {
-        // 2 + 4
-        g_thread_affinity_mask_big.enable(0);
-        g_thread_affinity_mask_big.enable(1);
-        g_thread_affinity_mask_little.enable(2);
-        g_thread_affinity_mask_little.enable(3);
-        g_thread_affinity_mask_little.enable(4);
-        g_thread_affinity_mask_little.enable(5);
+        // smp models
+        g_thread_affinity_mask_little.disable_all();
+        g_thread_affinity_mask_big = g_thread_affinity_mask_all;
     }
-    else if (g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST
-             || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER
-             || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM
-             || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
-             || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH)
+    else
     {
-        int cpu_count = get_cpu_count();
-        if (cpu_count == 6)
-        {
-            // 2 + 4
-            g_thread_affinity_mask_big.enable(0);
-            g_thread_affinity_mask_big.enable(1);
-            g_thread_affinity_mask_little.enable(2);
-            g_thread_affinity_mask_little.enable(3);
-            g_thread_affinity_mask_little.enable(4);
-            g_thread_affinity_mask_little.enable(5);
-        }
-        else if (cpu_count == 8)
-        {
-            // 4 + 4
-            g_thread_affinity_mask_big.enable(0);
-            g_thread_affinity_mask_big.enable(1);
-            g_thread_affinity_mask_big.enable(2);
-            g_thread_affinity_mask_big.enable(3);
-            g_thread_affinity_mask_little.enable(4);
-            g_thread_affinity_mask_little.enable(5);
-            g_thread_affinity_mask_little.enable(6);
-            g_thread_affinity_mask_little.enable(7);
-        }
-        else if (cpu_count == 10)
+        // two or more clusters, level0 is the high-performance cluster
+        int perflevel0_logicalcpu = get_hw_capability("hw.perflevel0.logicalcpu_max");
+        for (int i = 0; i < perflevel0_logicalcpu; i++)
         {
-            // 8 + 2
-            g_thread_affinity_mask_big.enable(0);
-            g_thread_affinity_mask_big.enable(1);
-            g_thread_affinity_mask_big.enable(2);
-            g_thread_affinity_mask_big.enable(3);
-            g_thread_affinity_mask_big.enable(4);
-            g_thread_affinity_mask_big.enable(5);
-            g_thread_affinity_mask_big.enable(6);
-            g_thread_affinity_mask_big.enable(7);
-            g_thread_affinity_mask_little.enable(8);
-            g_thread_affinity_mask_little.enable(9);
+            g_thread_affinity_mask_big.enable(i);
         }
-        else if (cpu_count == 20)
+        for (int i = perflevel0_logicalcpu; i < g_cpucount; i++)
         {
-            // 16 + 4
-            g_thread_affinity_mask_big.enable(0);
-            g_thread_affinity_mask_big.enable(1);
-            g_thread_affinity_mask_big.enable(2);
-            g_thread_affinity_mask_big.enable(3);
-            g_thread_affinity_mask_big.enable(4);
-            g_thread_affinity_mask_big.enable(5);
-            g_thread_affinity_mask_big.enable(6);
-            g_thread_affinity_mask_big.enable(7);
-            g_thread_affinity_mask_big.enable(8);
-            g_thread_affinity_mask_big.enable(9);
-            g_thread_affinity_mask_big.enable(10);
-            g_thread_affinity_mask_big.enable(11);
-            g_thread_affinity_mask_big.enable(12);
-            g_thread_affinity_mask_big.enable(13);
-            g_thread_affinity_mask_big.enable(14);
-            g_thread_affinity_mask_big.enable(15);
-            g_thread_affinity_mask_little.enable(16);
-            g_thread_affinity_mask_little.enable(17);
+            g_thread_affinity_mask_little.enable(i);
         }
     }
-    else
-    {
-        // smp models
-        g_thread_affinity_mask_little.disable_all();
-        g_thread_affinity_mask_big = g_thread_affinity_mask_all;
-    }
 #else
     // TODO implement me for other platforms
     g_thread_affinity_mask_little.disable_all();
diff --git a/src/cpu.h b/src/cpu.h
index 0f748f33d..d03e7e8b3 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -126,6 +126,10 @@ NCNN_EXPORT int get_physical_cpu_count();
 NCNN_EXPORT int get_physical_little_cpu_count();
 NCNN_EXPORT int get_physical_big_cpu_count();
 
+// cache sizes are reported in bytes, cpu l2 varies from 64k to 1M, but l3 can be zero
+NCNN_EXPORT int get_cpu_level2_cache_size();
+NCNN_EXPORT int get_cpu_level3_cache_size();
+
 // bind all threads on little clusters if powersave enabled
 // affects HMP arch cpu like ARM big.LITTLE
 // only implemented on android at the moment
diff --git a/src/layer/x86/gemm_x86.cpp b/src/layer/x86/gemm_x86.cpp
index 067033f43..230d3f271 100644
--- a/src/layer/x86/gemm_x86.cpp
+++ b/src/layer/x86/gemm_x86.cpp
@@ -5892,24 +5892,60 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
 
 static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
 {
-    // TODO do not hardcode
+    // resolve optimal tile size from cache size
+    size_t l2_cache_size = get_cpu_level2_cache_size();
+    int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float));
+
+#if __AVX512F__
+    TILE_M = tile_size / 16 * 16;
+    TILE_N = tile_size / 4 * 4;
+    TILE_K = tile_size / 4 * 4;
+#elif __AVX__
+    TILE_M = tile_size / 8 * 8;
+    TILE_N = tile_size / 2 * 2;
+    TILE_K = tile_size / 4 * 4;
+#elif __SSE2__
+    TILE_M = tile_size / 4 * 4;
+    TILE_N = tile_size / 2 * 2;
+    TILE_K = tile_size / 2 * 2;
+#else
+    TILE_M = tile_size / 2 * 2;
+    TILE_N = tile_size / 2 * 2;
+    TILE_K = tile_size / 1 * 1;
+#endif
+
+    if (K > 0)
+    {
+        int nn_K = (K + TILE_K - 1) / TILE_K;
+#if __AVX512F__
+        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 3) / 4 * 4);
+#elif __AVX__
+        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 3) / 4 * 4);
+#elif __SSE2__
+        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2);
+#else
+        TILE_K = std::min(TILE_K, (K + nn_K - 1) / nn_K);
+#endif
+
+        if (nn_K == 1)
+        {
+            tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K);
+
 #if __AVX512F__
-    TILE_M = 16 * 8;
-    TILE_N = 4 * 32;
-    TILE_K = 4 * 32;
+            TILE_M = tile_size / 16 * 16;
+            TILE_N = tile_size / 4 * 4;
 #elif __AVX__
-    TILE_M = 8 * 16;
-    TILE_N = 4 * 32;
-    TILE_K = 2 * 64;
+            TILE_M = tile_size / 8 * 8;
+            TILE_N = tile_size / 2 * 2;
 #elif __SSE2__
-    TILE_M = 4 * 16;
-    TILE_N = 2 * 32;
-    TILE_K = 2 * 32;
+            TILE_M = tile_size / 4 * 4;
+            TILE_N = tile_size / 2 * 2;
 #else
-    TILE_M = 2 * 8;
-    TILE_N = 2 * 8;
-    TILE_K = 1 * 16;
+            TILE_M = tile_size / 2 * 2;
+            TILE_N = tile_size / 2 * 2;
 #endif
+        }
+    }
 
     TILE_M *= std::min(nT, get_physical_cpu_count());
 
@@ -5933,7 +5969,7 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
 #if __AVX512F__
         TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 3) / 4 * 4);
 #elif __AVX__
-        TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 3) / 4 * 4);
+        TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 1) / 2 * 2);
 #elif __SSE2__
         TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 1) / 2 * 2);
 #else
@@ -5941,20 +5977,6 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
 #endif
     }
 
-    if (K > 0)
-    {
-        int nn_K = (K + TILE_K - 1) / TILE_K;
-#if __AVX512F__
-        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 3) / 4 * 4);
-#elif __AVX__
-        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2);
-#elif __SSE2__
-        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2);
-#else
-        TILE_K = std::min(TILE_K, (K + nn_K - 1) / nn_K);
-#endif
-    }
-
     if (nT > 1)
     {
 #if __AVX512F__