force global cpu info initialization (#4725)

* force global cpu info initialization
* sanitize zero nT
3 years ago · 2b87dc2cf7
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
--- a/src/layer/arm/convolution_3x3_winograd.h
+++ b/src/layer/arm/convolution_3x3_winograd.h
@@ -4464,6 +4464,9 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk(int M, int N, int K, int B,
    // resolve optimal tile size from cache size
    const int l2_cache_size_fp32 = (int)(get_cpu_level2_cache_size() / sizeof(float));

    if (nT == 0)
        nT = get_physical_big_cpu_count();

    // we shall take B into account for batched gemm, but that will be slower on arm in practice, why ?
    (void)B;

--- a/src/layer/arm/convolution_3x3_winograd_fp16s.h
+++ b/src/layer/arm/convolution_3x3_winograd_fp16s.h
@@ -1999,6 +1999,9 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk_fp16(int M, int N, int K, in
    // resolve optimal tile size from cache size
    const int l2_cache_size_fp16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short));

    if (nT == 0)
        nT = get_physical_big_cpu_count();

    // we shall take B into account for batched gemm, but that will be slower on arm in practice, why ?
    (void)B;

--- a/src/layer/arm/convolution_im2col_gemm.h
+++ b/src/layer/arm/convolution_im2col_gemm.h
@@ -5950,6 +5950,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk(int M, int N, int K, in
    // resolve optimal tile size from cache size
    const int l2_cache_size_fp32 = (int)(get_cpu_level2_cache_size() / sizeof(float));

    if (nT == 0)
        nT = get_physical_big_cpu_count();

    // solve K
    {
        // try not to split K
--- a/src/layer/arm/convolution_im2col_gemm_bf16s.h
+++ b/src/layer/arm/convolution_im2col_gemm_bf16s.h
@@ -5831,6 +5831,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_bf16s(int M, int N, int
    // resolve optimal tile size from cache size
    const int l2_cache_size_bf16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short));

    if (nT == 0)
        nT = get_physical_big_cpu_count();

    // solve K
    {
        // try not to split K
--- a/src/layer/arm/convolution_im2col_gemm_fp16s.h
+++ b/src/layer/arm/convolution_im2col_gemm_fp16s.h
@@ -3022,6 +3022,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_fp16sa(int M, int N, in
    // resolve optimal tile size from cache size
    const int l2_cache_size_fp16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short));

    if (nT == 0)
        nT = get_physical_big_cpu_count();

    // solve K
    {
        // try not to split K
--- a/src/layer/arm/gemm_arm.cpp
+++ b/src/layer/arm/gemm_arm.cpp
@@ -3662,7 +3662,11 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
 static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
 {
    // resolve optimal tile size from cache size
-    size_t l2_cache_size = get_cpu_level2_cache_size();
+    const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
    int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(float));

 #if __aarch64__
--- a/src/layer/arm/gemm_arm_asimdhp.cpp
+++ b/src/layer/arm/gemm_arm_asimdhp.cpp
@@ -2281,7 +2281,11 @@ static void gemm_transB_packed_tile_fp16sa(const Mat& AT_tile, const Mat& BT_til
 static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
 {
    // resolve optimal tile size from cache size
-    size_t l2_cache_size = get_cpu_level2_cache_size();
+    const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
    int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(__fp16));

    TILE_M = std::max(8, tile_size / 8 * 8);
--- a/src/layer/arm/gemm_bf16s_fp16s.h
+++ b/src/layer/arm/gemm_bf16s_fp16s.h
@@ -1522,7 +1522,11 @@ static void transpose_unpack_output_tile_bf16_fp16(const Mat& topT, Mat& top_blo
 static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
 {
    // resolve optimal tile size from cache size
-    size_t l2_cache_size = get_cpu_level2_cache_size();
+    const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
    int tile_size = (int)sqrtf((float)l2_cache_size / (2 * sizeof(unsigned short) + sizeof(float)));

    TILE_M = std::max(8, tile_size / 8 * 8);
--- a/src/layer/x86/convolution_3x3_winograd.h
+++ b/src/layer/x86/convolution_3x3_winograd.h
@@ -1820,7 +1820,10 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, Mat&
 static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
 {
    // resolve optimal tile size from cache size
-    size_t l2_cache_size = get_cpu_level2_cache_size();
+    const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();

    // solve M
    {
--- a/src/layer/x86/gemm_x86.cpp
+++ b/src/layer/x86/gemm_x86.cpp
@@ -6699,7 +6699,11 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
 static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
 {
    // resolve optimal tile size from cache size
-    size_t l2_cache_size = get_cpu_level2_cache_size();
+    const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+    if (nT == 0)
+        nT = get_physical_big_cpu_count();
+
    int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float));

 #if __AVX512F__