Browse Source

get cpu l2 cache size and resolve gemm tile size (#4411)

* get cpu l2 cache size and resolve gemm tile size

* optimize constant tile K

* fix per-core l2 cache detection, better macos cpu cluster topology discovery
tags/20230223
nihui GitHub 3 years ago
parent
commit
18fbaebe68
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 191 additions and 104 deletions
  1. +137
    -76
      src/cpu.cpp
  2. +4
    -0
      src/cpu.h
  3. +50
    -28
      src/layer/x86/gemm_x86.cpp

+ 137
- 76
src/cpu.cpp View File

@@ -1361,6 +1361,131 @@ int get_physical_big_cpu_count()
return g_cpucount - g_physical_cpucount;
}

// Detect the level 2 cache size, in bytes, usable by a single cpu core.
//
//   windows : largest level 2 descriptor from GetLogicalProcessorInformation
//   linux   : sysconf(_SC_LEVEL2_CACHE_SIZE) when the libc headers provide it
//   macos   : hw.perflevel0.l2cachesize divided by hw.perflevel0.cpusperl2
//
// The result is always positive: whenever the platform query is unavailable,
// fails or reports a non-positive value, a per-architecture default is used.
static int get_cpu_level2_cachesize()
{
    int size = 0;
#if (defined _WIN32 && !(defined __MINGW32__))
    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
    if (glpi != NULL)
    {
        // the first call is only used to learn the required buffer length
        DWORD return_length = 0;
        glpi(NULL, &return_length);

        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
        if (return_length > 0)
            buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);

        // walk the records only when both the allocation and the query succeeded,
        // otherwise the buffer is NULL or holds uninitialized bytes
        if (buffer != NULL && glpi(buffer, &return_length))
        {
            const DWORD record_count = return_length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
            for (DWORD i = 0; i < record_count; i++)
            {
                if (buffer[i].Relationship != RelationCache)
                    continue;

                const CACHE_DESCRIPTOR& cache = buffer[i].Cache;
                if (cache.Level == 2)
                {
                    size = std::max(size, (int)cache.Size);
                }
            }
        }

        // free(NULL) is a no-op, so this is safe on every path
        free(buffer);
    }
#elif defined __linux__
    // _SC_LEVEL2_CACHE_SIZE is not a POSIX-mandated name, only query it when declared
    // a failing or unsupported query leaves size non-positive and triggers the fallback
#ifdef _SC_LEVEL2_CACHE_SIZE
    size = (int)sysconf(_SC_LEVEL2_CACHE_SIZE);
#endif
#elif __APPLE__
    // perflevel 0 is the higher performance cluster
    // the reported l2 is divided by the number of cpus per l2 to get a per-core share
    int cpusperl2 = get_hw_capability("hw.perflevel0.cpusperl2");
    int l2cachesize = get_hw_capability("hw.perflevel0.l2cachesize");
    size = cpusperl2 > 1 ? l2cachesize / cpusperl2 : l2cachesize;
#endif

    // fallback to a common value
    if (size <= 0)
    {
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
        // pick a larger default for each newer instruction set the cpu reports
        size = 64 * 1024;
        if (cpu_support_x86_avx())
            size = 128 * 1024;
        if (cpu_support_x86_avx2())
            size = 256 * 1024;
        if (cpu_support_x86_avx512())
            size = 1024 * 1024;
#elif __aarch64__
        size = 256 * 1024;
#elif __arm__
        size = 128 * 1024;
#else
        // is 64k still too large here ?
        size = 64 * 1024;
#endif
    }

    return size;
}

// Detect the level 3 cache size in bytes.
//
//   windows : largest level 3 descriptor from GetLogicalProcessorInformation
//   linux   : sysconf(_SC_LEVEL3_CACHE_SIZE) when the libc headers provide it
//   macos   : hw.perflevel0.l3cachesize, the size shared among all cpus
//
// Returns 0 when there is no level 3 cache or it cannot be detected.
// Unlike level 2 there is no fallback value, and the result is never negative.
static int get_cpu_level3_cachesize()
{
    int size = 0;
#if (defined _WIN32 && !(defined __MINGW32__))
    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
    if (glpi != NULL)
    {
        // the first call is only used to learn the required buffer length
        DWORD return_length = 0;
        glpi(NULL, &return_length);

        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
        if (return_length > 0)
            buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);

        // walk the records only when both the allocation and the query succeeded,
        // otherwise the buffer is NULL or holds uninitialized bytes
        if (buffer != NULL && glpi(buffer, &return_length))
        {
            const DWORD record_count = return_length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
            for (DWORD i = 0; i < record_count; i++)
            {
                if (buffer[i].Relationship != RelationCache)
                    continue;

                const CACHE_DESCRIPTOR& cache = buffer[i].Cache;
                if (cache.Level == 3)
                {
                    size = std::max(size, (int)cache.Size);
                }
            }
        }

        // free(NULL) is a no-op, so this is safe on every path
        free(buffer);
    }
#elif defined __linux__
    // _SC_LEVEL3_CACHE_SIZE is not a POSIX-mandated name, only query it when declared
#ifdef _SC_LEVEL3_CACHE_SIZE
    size = (int)sysconf(_SC_LEVEL3_CACHE_SIZE);
#endif
#elif __APPLE__
    // perflevel 0 is the higher performance cluster
    // get the size shared among all cpus
    size = get_hw_capability("hw.perflevel0.l3cachesize");
#endif

    // l3 cache size can be zero
    // a failed query may report a negative value, treat it as "no l3" as well
    if (size < 0)
        size = 0;

    return size;
}

// cache sizes are probed a single time while this file's statics are initialized
static int g_cpu_level2_cachesize = get_cpu_level2_cachesize();
static int g_cpu_level3_cachesize = get_cpu_level3_cachesize();

// public accessor for the stored level 2 cache size
int get_cpu_level2_cache_size()
{
    const int l2_size = g_cpu_level2_cachesize;
    return l2_size;
}

// public accessor for the stored level 3 cache size
int get_cpu_level3_cache_size()
{
    const int l3_size = g_cpu_level3_cachesize;
    return l3_size;
}

#if (defined _WIN32 && !(defined __MINGW32__))
static CpuSet get_smt_cpu_mask()
{
@@ -1737,90 +1862,26 @@ static int setup_thread_affinity_masks()
g_thread_affinity_mask_big.enable(i);
}
#elif __APPLE__
// affinity info from cpu model
// TODO find a general way to get per-core frequency on macos
if (g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL)
int nperflevels = get_hw_capability("hw.nperflevels");
if (nperflevels == 1)
{
// 2 + 4
g_thread_affinity_mask_big.enable(0);
g_thread_affinity_mask_big.enable(1);
g_thread_affinity_mask_little.enable(2);
g_thread_affinity_mask_little.enable(3);
g_thread_affinity_mask_little.enable(4);
g_thread_affinity_mask_little.enable(5);
// smp models
g_thread_affinity_mask_little.disable_all();
g_thread_affinity_mask_big = g_thread_affinity_mask_all;
}
else if (g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST
|| g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER
|| g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM
|| g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD
|| g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH)
else
{
int cpu_count = get_cpu_count();
if (cpu_count == 6)
{
// 2 + 4
g_thread_affinity_mask_big.enable(0);
g_thread_affinity_mask_big.enable(1);
g_thread_affinity_mask_little.enable(2);
g_thread_affinity_mask_little.enable(3);
g_thread_affinity_mask_little.enable(4);
g_thread_affinity_mask_little.enable(5);
}
else if (cpu_count == 8)
{
// 4 + 4
g_thread_affinity_mask_big.enable(0);
g_thread_affinity_mask_big.enable(1);
g_thread_affinity_mask_big.enable(2);
g_thread_affinity_mask_big.enable(3);
g_thread_affinity_mask_little.enable(4);
g_thread_affinity_mask_little.enable(5);
g_thread_affinity_mask_little.enable(6);
g_thread_affinity_mask_little.enable(7);
}
else if (cpu_count == 10)
// two or more clusters, level0 is the high-performance cluster
int perflevel0_logicalcpu = get_hw_capability("hw.perflevel0.logicalcpu_max");
for (int i = 0; i < perflevel0_logicalcpu; i++)
{
// 8 + 2
g_thread_affinity_mask_big.enable(0);
g_thread_affinity_mask_big.enable(1);
g_thread_affinity_mask_big.enable(2);
g_thread_affinity_mask_big.enable(3);
g_thread_affinity_mask_big.enable(4);
g_thread_affinity_mask_big.enable(5);
g_thread_affinity_mask_big.enable(6);
g_thread_affinity_mask_big.enable(7);
g_thread_affinity_mask_little.enable(8);
g_thread_affinity_mask_little.enable(9);
g_thread_affinity_mask_big.enable(i);
}
else if (cpu_count == 20)
for (int i = perflevel0_logicalcpu; i < g_cpucount; i++)
{
// 16 + 4
g_thread_affinity_mask_big.enable(0);
g_thread_affinity_mask_big.enable(1);
g_thread_affinity_mask_big.enable(2);
g_thread_affinity_mask_big.enable(3);
g_thread_affinity_mask_big.enable(4);
g_thread_affinity_mask_big.enable(5);
g_thread_affinity_mask_big.enable(6);
g_thread_affinity_mask_big.enable(7);
g_thread_affinity_mask_big.enable(8);
g_thread_affinity_mask_big.enable(9);
g_thread_affinity_mask_big.enable(10);
g_thread_affinity_mask_big.enable(11);
g_thread_affinity_mask_big.enable(12);
g_thread_affinity_mask_big.enable(13);
g_thread_affinity_mask_big.enable(14);
g_thread_affinity_mask_big.enable(15);
g_thread_affinity_mask_little.enable(16);
g_thread_affinity_mask_little.enable(17);
g_thread_affinity_mask_little.enable(i);
}
}
else
{
// smp models
g_thread_affinity_mask_little.disable_all();
g_thread_affinity_mask_big = g_thread_affinity_mask_all;
}
#else
// TODO implement me for other platforms
g_thread_affinity_mask_little.disable_all();


+ 4
- 0
src/cpu.h View File

@@ -126,6 +126,10 @@ NCNN_EXPORT int get_physical_cpu_count();
NCNN_EXPORT int get_physical_little_cpu_count();
NCNN_EXPORT int get_physical_big_cpu_count();

// cpu cache size in bytes, detected once and then returned from a stored value
// cpu l2 varies from 64k to 1M, but l3 can be zero
// l2 falls back to a per-architecture default when it cannot be detected
NCNN_EXPORT int get_cpu_level2_cache_size();
NCNN_EXPORT int get_cpu_level3_cache_size();

// bind all threads on little clusters if powersave enabled
// affects HMP arch cpu like ARM big.LITTLE
// only implemented on android at the moment


+ 50
- 28
src/layer/x86/gemm_x86.cpp View File

@@ -5892,24 +5892,60 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons

static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
// TODO do not hardcode
// resolve optimal tile size from cache size
size_t l2_cache_size = get_cpu_level2_cache_size();
int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float));

#if __AVX512F__
TILE_M = tile_size / 16 * 16;
TILE_N = tile_size / 4 * 4;
TILE_K = tile_size / 4 * 4;
#elif __AVX__
TILE_M = tile_size / 8 * 8;
TILE_N = tile_size / 2 * 2;
TILE_K = tile_size / 4 * 4;
#elif __SSE2__
TILE_M = tile_size / 4 * 4;
TILE_N = tile_size / 2 * 2;
TILE_K = tile_size / 2 * 2;
#else
TILE_M = tile_size / 2 * 2;
TILE_N = tile_size / 2 * 2;
TILE_K = tile_size / 1 * 1;
#endif

if (K > 0)
{
int nn_K = (K + TILE_K - 1) / TILE_K;
#if __AVX512F__
TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 3) / 4 * 4);
#elif __AVX__
TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 3) / 4 * 4);
#elif __SSE2__
TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2);
#else
TILE_K = std::min(TILE_K, (K + nn_K - 1) / nn_K);
#endif

if (nn_K == 1)
{
tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K);

#if __AVX512F__
TILE_M = 16 * 8;
TILE_N = 4 * 32;
TILE_K = 4 * 32;
TILE_M = tile_size / 16 * 16;
TILE_N = tile_size / 4 * 4;
#elif __AVX__
TILE_M = 8 * 16;
TILE_N = 4 * 32;
TILE_K = 2 * 64;
TILE_M = tile_size / 8 * 8;
TILE_N = tile_size / 2 * 2;
#elif __SSE2__
TILE_M = 4 * 16;
TILE_N = 2 * 32;
TILE_K = 2 * 32;
TILE_M = tile_size / 4 * 4;
TILE_N = tile_size / 2 * 2;
#else
TILE_M = 2 * 8;
TILE_N = 2 * 8;
TILE_K = 1 * 16;
TILE_M = tile_size / 2 * 2;
TILE_N = tile_size / 2 * 2;
#endif
}
}

TILE_M *= std::min(nT, get_physical_cpu_count());

@@ -5933,7 +5969,7 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
#if __AVX512F__
TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 3) / 4 * 4);
#elif __AVX__
TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 3) / 4 * 4);
TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 1) / 2 * 2);
#elif __SSE2__
TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 1) / 2 * 2);
#else
@@ -5941,20 +5977,6 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
#endif
}

if (K > 0)
{
int nn_K = (K + TILE_K - 1) / TILE_K;
#if __AVX512F__
TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 3) / 4 * 4);
#elif __AVX__
TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2);
#elif __SSE2__
TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2);
#else
TILE_K = std::min(TILE_K, (K + nn_K - 1) / nn_K);
#endif
}

if (nT > 1)
{
#if __AVX512F__


Loading…
Cancel
Save