Fix divide-by-zero in get_optimal_tile_mnk when the computed tile size rounds down to zero (#4610)

3 years ago · f34becf6fc
--- a/src/layer/arm/convolution_3x3_winograd.h
+++ b/src/layer/arm/convolution_3x3_winograd.h
@@ -4479,11 +4479,11 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk(int M, int N, int K, int B,
 #endif

 #if __aarch64__
        TILE_K = tile_size / 8 * 8;
        TILE_K = std::max(8, tile_size / 8 * 8);
 #elif __ARM_NEON
        TILE_K = tile_size / 4 * 4;
        TILE_K = std::max(4, tile_size / 4 * 4);
 #else
        TILE_K = tile_size / 2 * 2;
        TILE_K = std::max(2, tile_size / 2 * 2);
 #endif

        int nn_K = (K + TILE_K - 1) / TILE_K;
--- a/src/layer/arm/convolution_3x3_winograd_fp16s.h
+++ b/src/layer/arm/convolution_3x3_winograd_fp16s.h
@@ -2007,7 +2007,7 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk_fp16(int M, int N, int K, in
        // try not to split K
        int tile_size = (l2_cache_size_fp16 - 32) / 12;

        TILE_K = tile_size / 8 * 8;
        TILE_K = std::max(8, tile_size / 8 * 8);

        int nn_K = (K + TILE_K - 1) / TILE_K;
        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 7) / 8 * 8);
--- a/src/layer/arm/convolution_im2col_gemm.h
+++ b/src/layer/arm/convolution_im2col_gemm.h
@@ -5962,11 +5962,11 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk(int M, int N, int K, in
 #endif

 #if __aarch64__
        TILE_K = tile_size / 8 * 8;
        TILE_K = std::max(8, tile_size / 8 * 8);
 #elif __ARM_NEON
        TILE_K = tile_size / 4 * 4;
        TILE_K = std::max(4, tile_size / 4 * 4);
 #else
        TILE_K = tile_size / 2 * 2;
        TILE_K = std::max(2, tile_size / 2 * 2);
 #endif

        int nn_K = (K + TILE_K - 1) / TILE_K;
--- a/src/layer/arm/convolution_im2col_gemm_bf16s.h
+++ b/src/layer/arm/convolution_im2col_gemm_bf16s.h
@@ -5843,11 +5843,11 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_bf16s(int M, int N, int
 #endif

 #if __aarch64__
        TILE_K = tile_size / 8 * 8;
        TILE_K = std::max(8, tile_size / 8 * 8);
 #elif __ARM_NEON
        TILE_K = tile_size / 4 * 4;
        TILE_K = std::max(4, tile_size / 4 * 4);
 #else
        TILE_K = tile_size / 2 * 2;
        TILE_K = std::max(2, tile_size / 2 * 2);
 #endif

        int nn_K = (K + TILE_K - 1) / TILE_K;
--- a/src/layer/arm/convolution_im2col_gemm_fp16s.h
+++ b/src/layer/arm/convolution_im2col_gemm_fp16s.h
@@ -3027,7 +3027,7 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_fp16sa(int M, int N, in
        // try not to split K
        int tile_size = (l2_cache_size_fp16 - 32) / 12;

        TILE_K = tile_size / 8 * 8;
        TILE_K = std::max(8, tile_size / 8 * 8);

        int nn_K = (K + TILE_K - 1) / TILE_K;
        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 7) / 8 * 8);
--- a/src/layer/arm/gemm_arm.cpp
+++ b/src/layer/arm/gemm_arm.cpp
@@ -3666,17 +3666,17 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c
    int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float));

 #if __aarch64__
    TILE_M = tile_size / 8 * 8;
    TILE_N = tile_size / 4 * 4;
    TILE_K = tile_size / 8 * 8;
    TILE_M = std::max(8, tile_size / 8 * 8);
    TILE_N = std::max(4, tile_size / 4 * 4);
    TILE_K = std::max(8, tile_size / 8 * 8);
 #elif __ARM_NEON
    TILE_M = tile_size / 4 * 4;
    TILE_N = tile_size / 4 * 4;
    TILE_K = tile_size / 4 * 4;
    TILE_M = std::max(4, tile_size / 4 * 4);
    TILE_N = std::max(4, tile_size / 4 * 4);
    TILE_K = std::max(4, tile_size / 4 * 4);
 #else
    TILE_M = tile_size / 2 * 2;
    TILE_N = tile_size;
    TILE_K = tile_size / 2 * 2;
    TILE_M = std::max(2, tile_size / 2 * 2);
    TILE_N = std::max(1, tile_size);
    TILE_K = std::max(2, tile_size / 2 * 2);
 #endif

    if (K > 0)
@@ -3695,14 +3695,14 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c
            tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K);

 #if __aarch64__
            TILE_M = tile_size / 8 * 8;
            TILE_N = tile_size / 4 * 4;
            TILE_M = std::max(8, tile_size / 8 * 8);
            TILE_N = std::max(4, tile_size / 4 * 4);
 #elif __ARM_NEON
            TILE_M = tile_size / 4 * 4;
            TILE_N = tile_size / 4 * 4;
            TILE_M = std::max(4, tile_size / 4 * 4);
            TILE_N = std::max(4, tile_size / 4 * 4);
 #else
            TILE_M = tile_size / 2 * 2;
            TILE_N = tile_size;
            TILE_M = std::max(2, tile_size / 2 * 2);
            TILE_N = std::max(1, tile_size);
 #endif
        }
    }
--- a/src/layer/arm/gemm_arm_asimdhp.cpp
+++ b/src/layer/arm/gemm_arm_asimdhp.cpp
@@ -2284,9 +2284,9 @@ static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M
    size_t l2_cache_size = get_cpu_level2_cache_size();
    int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(__fp16));

    TILE_M = tile_size / 8 * 8;
    TILE_N = tile_size / 4 * 4;
    TILE_K = tile_size / 8 * 8;
    TILE_M = std::max(8, tile_size / 8 * 8);
    TILE_N = std::max(4, tile_size / 4 * 4);
    TILE_K = std::max(8, tile_size / 8 * 8);

    if (K > 0)
    {
@@ -2297,8 +2297,8 @@ static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M
        {
            tile_size = (int)((float)l2_cache_size / 2 / sizeof(__fp16) / TILE_K);

            TILE_M = tile_size / 8 * 8;
            TILE_N = tile_size / 4 * 4;
            TILE_M = std::max(8, tile_size / 8 * 8);
            TILE_N = std::max(4, tile_size / 4 * 4);
        }
    }

--- a/src/layer/arm/gemm_bf16s_fp16s.h
+++ b/src/layer/arm/gemm_bf16s_fp16s.h
@@ -1525,9 +1525,9 @@ static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_T
    size_t l2_cache_size = get_cpu_level2_cache_size();
    int tile_size = (int)sqrt((float)l2_cache_size / (2 * sizeof(unsigned short) + sizeof(float)));

    TILE_M = tile_size / 8 * 8;
    TILE_N = tile_size / 4 * 4;
    TILE_K = tile_size / 8 * 8;
    TILE_M = std::max(8, tile_size / 8 * 8);
    TILE_N = std::max(4, tile_size / 4 * 4);
    TILE_K = std::max(8, tile_size / 8 * 8);

    if (K > 0)
    {
@@ -1538,8 +1538,8 @@ static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_T
        {
            tile_size = (int)((float)l2_cache_size / 2 / sizeof(unsigned short) / TILE_K);

            TILE_M = tile_size / 8 * 8;
            TILE_N = tile_size / 4 * 4;
            TILE_M = std::max(8, tile_size / 8 * 8);
            TILE_N = std::max(4, tile_size / 4 * 4);
        }
    }

--- a/src/layer/x86/convolution_3x3_winograd.h
+++ b/src/layer/x86/convolution_3x3_winograd.h
@@ -1827,13 +1827,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
        int tile_size = (int)sqrt((float)l2_cache_size / sizeof(float) / 3);

 #if __AVX512F__
        TILE_M = tile_size / 16 * 16;
        TILE_M = std::max(16, tile_size / 16 * 16);
 #elif __AVX__
        TILE_M = tile_size / 8 * 8;
        TILE_M = std::max(8, tile_size / 8 * 8);
 #elif __SSE2__
        TILE_M = tile_size / 4 * 4;
        TILE_M = std::max(4, tile_size / 4 * 4);
 #else
        TILE_M = tile_size / 2 * 2;
        TILE_M = std::max(2, tile_size / 2 * 2);
 #endif

        TILE_M *= std::min(nT, get_physical_cpu_count());
@@ -1868,13 +1868,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
        int tile_size = (int)(sqrt((float)l2_cache_size / sizeof(float)) - TILE_M);

 #if __AVX512F__
        TILE_K = tile_size / 16 * 16;
        TILE_K = std::max(16, tile_size / 16 * 16);
 #elif __AVX__
        TILE_K = tile_size / 8 * 8;
        TILE_K = std::max(8, tile_size / 8 * 8);
 #elif __SSE2__
        TILE_K = tile_size / 4 * 4;
        TILE_K = std::max(4, tile_size / 4 * 4);
 #else
        TILE_K = tile_size / 2 * 2;
        TILE_K = std::max(2, tile_size / 2 * 2);
 #endif

        int nn_K = (K + TILE_K - 1) / TILE_K;
@@ -1894,13 +1894,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
        int tile_size = (int)(((float)l2_cache_size / sizeof(float) - TILE_M * TILE_K) / (TILE_M + TILE_K));

 #if __AVX512F__
        TILE_N = tile_size / 4 * 4;
        TILE_N = std::max(4, tile_size / 4 * 4);
 #elif __AVX__
        TILE_N = tile_size / 4 * 4;
        TILE_N = std::max(4, tile_size / 4 * 4);
 #elif __SSE2__
        TILE_N = tile_size / 4 * 4;
        TILE_N = std::max(4, tile_size / 4 * 4);
 #else
        TILE_N = tile_size;
        TILE_N = std::max(1, tile_size);
 #endif

        int nn_N = (N + TILE_N - 1) / TILE_N;
--- a/src/layer/x86/gemm_x86.cpp
+++ b/src/layer/x86/gemm_x86.cpp
@@ -6703,21 +6703,21 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c
    int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float));

 #if __AVX512F__
    TILE_M = tile_size / 16 * 16;
    TILE_N = tile_size / 4 * 4;
    TILE_K = tile_size / 16 * 16;
    TILE_M = std::max(16, tile_size / 16 * 16);
    TILE_N = std::max(4, tile_size / 4 * 4);
    TILE_K = std::max(16, tile_size / 16 * 16);
 #elif __AVX__
    TILE_M = tile_size / 8 * 8;
    TILE_N = tile_size / 4 * 4;
    TILE_K = tile_size / 8 * 8;
    TILE_M = std::max(8, tile_size / 8 * 8);
    TILE_N = std::max(4, tile_size / 4 * 4);
    TILE_K = std::max(8, tile_size / 8 * 8);
 #elif __SSE2__
    TILE_M = tile_size / 4 * 4;
    TILE_N = tile_size / 4 * 4;
    TILE_K = tile_size / 4 * 4;
    TILE_M = std::max(4, tile_size / 4 * 4);
    TILE_N = std::max(4, tile_size / 4 * 4);
    TILE_K = std::max(4, tile_size / 4 * 4);
 #else
    TILE_M = tile_size / 2 * 2;
    TILE_N = tile_size;
    TILE_K = tile_size / 2 * 2;
    TILE_M = std::max(2, tile_size / 2 * 2);
    TILE_N = std::max(1, tile_size);
    TILE_K = std::max(2, tile_size / 2 * 2);
 #endif

    if (K > 0)
@@ -6738,17 +6738,17 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c
            tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K);

 #if __AVX512F__
            TILE_M = tile_size / 16 * 16;
            TILE_N = tile_size / 4 * 4;
            TILE_M = std::max(16, tile_size / 16 * 16);
            TILE_N = std::max(4, tile_size / 4 * 4);
 #elif __AVX__
            TILE_M = tile_size / 8 * 8;
            TILE_N = tile_size / 4 * 4;
            TILE_M = std::max(8, tile_size / 8 * 8);
            TILE_N = std::max(4, tile_size / 4 * 4);
 #elif __SSE2__
            TILE_M = tile_size / 4 * 4;
            TILE_N = tile_size / 4 * 4;
            TILE_M = std::max(4, tile_size / 4 * 4);
            TILE_N = std::max(4, tile_size / 4 * 4);
 #else
            TILE_M = tile_size / 2 * 2;
            TILE_N = tile_size;
            TILE_M = std::max(2, tile_size / 2 * 2);
            TILE_N = std::max(1, tile_size);
 #endif
        }
    }