From 18fbaebe68f167eca5b3e92774ba7c8d7337d9ee Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 14 Dec 2022 08:49:28 +0800 Subject: [PATCH] get cpu l2 cache size and resolve gemm tile size (#4411) * get cpu l2 cache size and resolve gemm tile size * optimize constant tile K * fix per-core l2 cache detection, better macos cpu cluster topology discovery --- src/cpu.cpp | 213 ++++++++++++++++++++++++------------- src/cpu.h | 4 + src/layer/x86/gemm_x86.cpp | 78 +++++++++----- 3 files changed, 191 insertions(+), 104 deletions(-) diff --git a/src/cpu.cpp b/src/cpu.cpp index 85c65335c..014944ea1 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -1361,6 +1361,131 @@ int get_physical_big_cpu_count() return g_cpucount - g_physical_cpucount; } +static int get_cpu_level2_cachesize() +{ + int size = 0; +#if (defined _WIN32 && !(defined __MINGW32__)) + typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); + if (glpi != NULL) + { + DWORD return_length = 0; + glpi(NULL, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); + glpi(buffer, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; + DWORD byte_offset = 0; + while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) + { + if (ptr->Relationship == RelationCache) + { + PCACHE_DESCRIPTOR Cache = &ptr->Cache; + if (Cache->Level == 2) + { + size = std::max(size, (int)Cache->Size); + } + } + + byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + ptr++; + } + + free(buffer); + } +#elif defined __linux__ + size = sysconf(_SC_LEVEL2_CACHE_SIZE); +#elif __APPLE__ + // perflevel 0 is the higher performance cluster + int cpusperl2 = get_hw_capability("hw.perflevel0.cpusperl2"); + int l2cachesize = get_hw_capability("hw.perflevel0.l2cachesize"); + size = cpusperl2 > 1 ? l2cachesize / cpusperl2 : l2cachesize; +#endif + + // fallback to a common value + if (size <= 0) + { +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + size = 64 * 1024; + if (cpu_support_x86_avx()) + size = 128 * 1024; + if (cpu_support_x86_avx2()) + size = 256 * 1024; + if (cpu_support_x86_avx512()) + size = 1024 * 1024; +#elif __aarch64__ + size = 256 * 1024; +#elif __arm__ + size = 128 * 1024; +#else + // is 64k still too large here ? + size = 64 * 1024; +#endif + } + + return size; +} + +static int get_cpu_level3_cachesize() +{ + int size = 0; +#if (defined _WIN32 && !(defined __MINGW32__)) + typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); + if (glpi != NULL) + { + DWORD return_length = 0; + glpi(NULL, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); + glpi(buffer, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; + DWORD byte_offset = 0; + while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) + { + if (ptr->Relationship == RelationCache) + { + PCACHE_DESCRIPTOR Cache = &ptr->Cache; + if (Cache->Level == 3) + { + size = std::max(size, (int)Cache->Size); + } + } + + byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + ptr++; + } + + free(buffer); + } +#elif defined __linux__ + size = sysconf(_SC_LEVEL3_CACHE_SIZE); +#elif __APPLE__ + // perflevel 0 is the higher performance cluster + // get the size shared among all cpus + size = get_hw_capability("hw.perflevel0.l3cachesize"); +#endif + + // l3 cache size can be zero + + return size; +} + +static int g_cpu_level2_cachesize = get_cpu_level2_cachesize(); +static int g_cpu_level3_cachesize = get_cpu_level3_cachesize(); + +int get_cpu_level2_cache_size() +{ + return g_cpu_level2_cachesize; +} + +int get_cpu_level3_cache_size() +{ + return g_cpu_level3_cachesize; +} + #if (defined _WIN32 && !(defined __MINGW32__)) static CpuSet get_smt_cpu_mask() { @@ -1737,90 +1862,26 @@ static int setup_thread_affinity_masks() g_thread_affinity_mask_big.enable(i); } #elif __APPLE__ - // affinity info from cpu model - // TODO find a general way to get per-core frequency on macos - if (g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL) + int nperflevels = get_hw_capability("hw.nperflevels"); + if (nperflevels == 1) { - // 2 + 4 - g_thread_affinity_mask_big.enable(0); - g_thread_affinity_mask_big.enable(1); - g_thread_affinity_mask_little.enable(2); - g_thread_affinity_mask_little.enable(3); - g_thread_affinity_mask_little.enable(4); - g_thread_affinity_mask_little.enable(5); + // smp models + g_thread_affinity_mask_little.disable_all(); + g_thread_affinity_mask_big = g_thread_affinity_mask_all; } - else if (g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST - || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER - || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM - || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD - || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH) + else { - int cpu_count = get_cpu_count(); - if (cpu_count == 6) - { - // 2 + 4 - g_thread_affinity_mask_big.enable(0); - g_thread_affinity_mask_big.enable(1); - g_thread_affinity_mask_little.enable(2); - g_thread_affinity_mask_little.enable(3); - g_thread_affinity_mask_little.enable(4); - g_thread_affinity_mask_little.enable(5); - } - else if (cpu_count == 8) - { - // 4 + 4 - g_thread_affinity_mask_big.enable(0); - g_thread_affinity_mask_big.enable(1); - g_thread_affinity_mask_big.enable(2); - g_thread_affinity_mask_big.enable(3); - g_thread_affinity_mask_little.enable(4); - g_thread_affinity_mask_little.enable(5); - g_thread_affinity_mask_little.enable(6); - g_thread_affinity_mask_little.enable(7); - } - else if (cpu_count == 10) + // two or more clusters, level0 is the high-performance cluster + int perflevel0_logicalcpu = get_hw_capability("hw.perflevel0.logicalcpu_max"); + for (int i = 0; i < perflevel0_logicalcpu; i++) { - // 8 + 2 - g_thread_affinity_mask_big.enable(0); - g_thread_affinity_mask_big.enable(1); - g_thread_affinity_mask_big.enable(2); - g_thread_affinity_mask_big.enable(3); - g_thread_affinity_mask_big.enable(4); - g_thread_affinity_mask_big.enable(5); - g_thread_affinity_mask_big.enable(6); - g_thread_affinity_mask_big.enable(7); - g_thread_affinity_mask_little.enable(8); - g_thread_affinity_mask_little.enable(9); + g_thread_affinity_mask_big.enable(i); } - else if (cpu_count == 20) + for (int i = perflevel0_logicalcpu; i < g_cpucount; i++) { - // 16 + 4 - g_thread_affinity_mask_big.enable(0); - g_thread_affinity_mask_big.enable(1); - g_thread_affinity_mask_big.enable(2); - g_thread_affinity_mask_big.enable(3); - g_thread_affinity_mask_big.enable(4); - g_thread_affinity_mask_big.enable(5); - g_thread_affinity_mask_big.enable(6); - g_thread_affinity_mask_big.enable(7); - g_thread_affinity_mask_big.enable(8); - g_thread_affinity_mask_big.enable(9); - g_thread_affinity_mask_big.enable(10); - g_thread_affinity_mask_big.enable(11); - g_thread_affinity_mask_big.enable(12); - g_thread_affinity_mask_big.enable(13); - g_thread_affinity_mask_big.enable(14); - g_thread_affinity_mask_big.enable(15); - g_thread_affinity_mask_little.enable(16); - g_thread_affinity_mask_little.enable(17); + g_thread_affinity_mask_little.enable(i); } } - else - { - // smp models - g_thread_affinity_mask_little.disable_all(); - g_thread_affinity_mask_big = g_thread_affinity_mask_all; - } #else // TODO implement me for other platforms g_thread_affinity_mask_little.disable_all(); diff --git a/src/cpu.h b/src/cpu.h index 0f748f33d..d03e7e8b3 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -126,6 +126,10 @@ NCNN_EXPORT int get_physical_cpu_count(); NCNN_EXPORT int get_physical_little_cpu_count(); NCNN_EXPORT int get_physical_big_cpu_count(); +// cpu l2 varies from 64k to 1M, but l3 can be zero +NCNN_EXPORT int get_cpu_level2_cache_size(); +NCNN_EXPORT int get_cpu_level3_cache_size(); + // bind all threads on little clusters if powersave enabled // affects HMP arch cpu like ARM big.LITTLE // only implemented on android at the moment diff --git a/src/layer/x86/gemm_x86.cpp b/src/layer/x86/gemm_x86.cpp index 067033f43..230d3f271 100644 --- a/src/layer/x86/gemm_x86.cpp +++ b/src/layer/x86/gemm_x86.cpp @@ -5892,24 +5892,60 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) { - // TODO do not hardcode + // resolve optimal tile size from cache size + size_t l2_cache_size = get_cpu_level2_cache_size(); + int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float)); + +#if __AVX512F__ + TILE_M = tile_size / 16 * 16; + TILE_N = tile_size / 4 * 4; + TILE_K = tile_size / 4 * 4; +#elif __AVX__ + TILE_M = tile_size / 8 * 8; + TILE_N = tile_size / 2 * 2; + TILE_K = tile_size / 4 * 4; +#elif __SSE2__ + TILE_M = tile_size / 4 * 4; + TILE_N = tile_size / 2 * 2; + TILE_K = tile_size / 2 * 2; +#else + TILE_M = tile_size / 2 * 2; + TILE_N = tile_size / 2 * 2; + TILE_K = tile_size / 1 * 1; +#endif + + if (K > 0) + { + int nn_K = (K + TILE_K - 1) / TILE_K; +#if __AVX512F__ + TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 3) / 4 * 4); +#elif __AVX__ + TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 3) / 4 * 4); +#elif __SSE2__ + TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2); +#else + TILE_K = std::min(TILE_K, (K + nn_K - 1) / nn_K); +#endif + + if (nn_K == 1) + { + tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K); + #if __AVX512F__ - TILE_M = 16 * 8; - TILE_N = 4 * 32; - TILE_K = 4 * 32; + TILE_M = tile_size / 16 * 16; + TILE_N = tile_size / 4 * 4; #elif __AVX__ - TILE_M = 8 * 16; - TILE_N = 4 * 32; - TILE_K = 2 * 64; + TILE_M = tile_size / 8 * 8; + TILE_N = tile_size / 2 * 2; #elif __SSE2__ - TILE_M = 4 * 16; - TILE_N = 2 * 32; - TILE_K = 2 * 32; + TILE_M = tile_size / 4 * 4; + TILE_N = tile_size / 2 * 2; #else - TILE_M = 2 * 8; - TILE_N = 2 * 8; - TILE_K = 1 * 16; + TILE_M = tile_size / 2 * 2; + TILE_N = tile_size / 2 * 2; #endif + } + } TILE_M *= std::min(nT, get_physical_cpu_count()); @@ -5933,7 +5969,7 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, #if __AVX512F__ TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 3) / 4 * 4); #elif __AVX__ - TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 3) / 4 * 4); + TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 1) / 2 * 2); #elif __SSE2__ TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 1) / 2 * 2); #else @@ -5941,20 +5977,6 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, #endif } - if (K > 0) - { - int nn_K = (K + TILE_K - 1) / TILE_K; -#if __AVX512F__ - TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 3) / 4 * 4); -#elif __AVX__ - TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2); -#elif __SSE2__ - TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2); -#else - TILE_K = std::min(TILE_K, (K + nn_K - 1) / nn_K); -#endif - } - if (nT > 1) { #if __AVX512F__