From f34becf6fc7679dcc82ab2b7a7e06daf11f85164 Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 7 Apr 2023 19:29:36 +0800 Subject: [PATCH] fix divide by zero in get optimal tile size mnk (#4610) --- src/layer/arm/convolution_3x3_winograd.h | 6 +-- .../arm/convolution_3x3_winograd_fp16s.h | 2 +- src/layer/arm/convolution_im2col_gemm.h | 6 +-- src/layer/arm/convolution_im2col_gemm_bf16s.h | 6 +-- src/layer/arm/convolution_im2col_gemm_fp16s.h | 2 +- src/layer/arm/gemm_arm.cpp | 30 +++++++------- src/layer/arm/gemm_arm_asimdhp.cpp | 10 ++--- src/layer/arm/gemm_bf16s_fp16s.h | 10 ++--- src/layer/x86/convolution_3x3_winograd.h | 24 +++++------ src/layer/x86/gemm_x86.cpp | 40 +++++++++---------- 10 files changed, 68 insertions(+), 68 deletions(-) diff --git a/src/layer/arm/convolution_3x3_winograd.h b/src/layer/arm/convolution_3x3_winograd.h index 7207d84b1..681979b10 100644 --- a/src/layer/arm/convolution_3x3_winograd.h +++ b/src/layer/arm/convolution_3x3_winograd.h @@ -4479,11 +4479,11 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk(int M, int N, int K, int B, #endif #if __aarch64__ - TILE_K = tile_size / 8 * 8; + TILE_K = std::max(8, tile_size / 8 * 8); #elif __ARM_NEON - TILE_K = tile_size / 4 * 4; + TILE_K = std::max(4, tile_size / 4 * 4); #else - TILE_K = tile_size / 2 * 2; + TILE_K = std::max(2, tile_size / 2 * 2); #endif int nn_K = (K + TILE_K - 1) / TILE_K; diff --git a/src/layer/arm/convolution_3x3_winograd_fp16s.h b/src/layer/arm/convolution_3x3_winograd_fp16s.h index fc7adb622..3e5dee1ae 100644 --- a/src/layer/arm/convolution_3x3_winograd_fp16s.h +++ b/src/layer/arm/convolution_3x3_winograd_fp16s.h @@ -2007,7 +2007,7 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk_fp16(int M, int N, int K, in // try not to split K int tile_size = (l2_cache_size_fp16 - 32) / 12; - TILE_K = tile_size / 8 * 8; + TILE_K = std::max(8, tile_size / 8 * 8); int nn_K = (K + TILE_K - 1) / TILE_K; TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 7) / 8 * 8); diff --git a/src/layer/arm/convolution_im2col_gemm.h b/src/layer/arm/convolution_im2col_gemm.h index 5cf274340..eeb6df6bb 100644 --- a/src/layer/arm/convolution_im2col_gemm.h +++ b/src/layer/arm/convolution_im2col_gemm.h @@ -5962,11 +5962,11 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk(int M, int N, int K, in #endif #if __aarch64__ - TILE_K = tile_size / 8 * 8; + TILE_K = std::max(8, tile_size / 8 * 8); #elif __ARM_NEON - TILE_K = tile_size / 4 * 4; + TILE_K = std::max(4, tile_size / 4 * 4); #else - TILE_K = tile_size / 2 * 2; + TILE_K = std::max(2, tile_size / 2 * 2); #endif int nn_K = (K + TILE_K - 1) / TILE_K; diff --git a/src/layer/arm/convolution_im2col_gemm_bf16s.h b/src/layer/arm/convolution_im2col_gemm_bf16s.h index 83cc0d3d2..1cb603e7d 100644 --- a/src/layer/arm/convolution_im2col_gemm_bf16s.h +++ b/src/layer/arm/convolution_im2col_gemm_bf16s.h @@ -5843,11 +5843,11 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_bf16s(int M, int N, int #endif #if __aarch64__ - TILE_K = tile_size / 8 * 8; + TILE_K = std::max(8, tile_size / 8 * 8); #elif __ARM_NEON - TILE_K = tile_size / 4 * 4; + TILE_K = std::max(4, tile_size / 4 * 4); #else - TILE_K = tile_size / 2 * 2; + TILE_K = std::max(2, tile_size / 2 * 2); #endif int nn_K = (K + TILE_K - 1) / TILE_K; diff --git a/src/layer/arm/convolution_im2col_gemm_fp16s.h b/src/layer/arm/convolution_im2col_gemm_fp16s.h index de158c35b..360f05f41 100644 --- a/src/layer/arm/convolution_im2col_gemm_fp16s.h +++ b/src/layer/arm/convolution_im2col_gemm_fp16s.h @@ -3027,7 +3027,7 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_fp16sa(int M, int N, in // try not to split K int tile_size = (l2_cache_size_fp16 - 32) / 12; - TILE_K = tile_size / 8 * 8; + TILE_K = std::max(8, tile_size / 8 * 8); int nn_K = (K + TILE_K - 1) / TILE_K; TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 7) / 8 * 8); diff --git a/src/layer/arm/gemm_arm.cpp b/src/layer/arm/gemm_arm.cpp index af40fb694..20dd68c92 100644 --- a/src/layer/arm/gemm_arm.cpp +++ b/src/layer/arm/gemm_arm.cpp @@ -3666,17 +3666,17 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float)); #if __aarch64__ - TILE_M = tile_size / 8 * 8; - TILE_N = tile_size / 4 * 4; - TILE_K = tile_size / 8 * 8; + TILE_M = std::max(8, tile_size / 8 * 8); + TILE_N = std::max(4, tile_size / 4 * 4); + TILE_K = std::max(8, tile_size / 8 * 8); #elif __ARM_NEON - TILE_M = tile_size / 4 * 4; - TILE_N = tile_size / 4 * 4; - TILE_K = tile_size / 4 * 4; + TILE_M = std::max(4, tile_size / 4 * 4); + TILE_N = std::max(4, tile_size / 4 * 4); + TILE_K = std::max(4, tile_size / 4 * 4); #else - TILE_M = tile_size / 2 * 2; - TILE_N = tile_size; - TILE_K = tile_size / 2 * 2; + TILE_M = std::max(2, tile_size / 2 * 2); + TILE_N = std::max(1, tile_size); + TILE_K = std::max(2, tile_size / 2 * 2); #endif if (K > 0) @@ -3695,14 +3695,14 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K); #if __aarch64__ - TILE_M = tile_size / 8 * 8; - TILE_N = tile_size / 4 * 4; + TILE_M = std::max(8, tile_size / 8 * 8); + TILE_N = std::max(4, tile_size / 4 * 4); #elif __ARM_NEON - TILE_M = tile_size / 4 * 4; - TILE_N = tile_size / 4 * 4; + TILE_M = std::max(4, tile_size / 4 * 4); + TILE_N = std::max(4, tile_size / 4 * 4); #else - TILE_M = tile_size / 2 * 2; - TILE_N = tile_size; + TILE_M = std::max(2, tile_size / 2 * 2); + TILE_N = std::max(1, tile_size); #endif } } diff --git a/src/layer/arm/gemm_arm_asimdhp.cpp b/src/layer/arm/gemm_arm_asimdhp.cpp index d0261d67d..f40c1e610 100644 --- a/src/layer/arm/gemm_arm_asimdhp.cpp +++ b/src/layer/arm/gemm_arm_asimdhp.cpp @@ -2284,9 +2284,9 @@ static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M size_t l2_cache_size = get_cpu_level2_cache_size(); int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(__fp16)); - TILE_M = tile_size / 8 * 8; - TILE_N = tile_size / 4 * 4; - TILE_K = tile_size / 8 * 8; + TILE_M = std::max(8, tile_size / 8 * 8); + TILE_N = std::max(4, tile_size / 4 * 4); + TILE_K = std::max(8, tile_size / 8 * 8); if (K > 0) { @@ -2297,8 +2297,8 @@ static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M { tile_size = (int)((float)l2_cache_size / 2 / sizeof(__fp16) / TILE_K); - TILE_M = tile_size / 8 * 8; - TILE_N = tile_size / 4 * 4; + TILE_M = std::max(8, tile_size / 8 * 8); + TILE_N = std::max(4, tile_size / 4 * 4); } } diff --git a/src/layer/arm/gemm_bf16s_fp16s.h b/src/layer/arm/gemm_bf16s_fp16s.h index 540bcf4de..a61f9647b 100644 --- a/src/layer/arm/gemm_bf16s_fp16s.h +++ b/src/layer/arm/gemm_bf16s_fp16s.h @@ -1525,9 +1525,9 @@ static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_T size_t l2_cache_size = get_cpu_level2_cache_size(); int tile_size = (int)sqrt((float)l2_cache_size / (2 * sizeof(unsigned short) + sizeof(float))); - TILE_M = tile_size / 8 * 8; - TILE_N = tile_size / 4 * 4; - TILE_K = tile_size / 8 * 8; + TILE_M = std::max(8, tile_size / 8 * 8); + TILE_N = std::max(4, tile_size / 4 * 4); + TILE_K = std::max(8, tile_size / 8 * 8); if (K > 0) { @@ -1538,8 +1538,8 @@ static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_T { tile_size = (int)((float)l2_cache_size / 2 / sizeof(unsigned short) / TILE_K); - TILE_M = tile_size / 8 * 8; - TILE_N = tile_size / 4 * 4; + TILE_M = std::max(8, tile_size / 8 * 8); + TILE_N = std::max(4, tile_size / 4 * 4); } } diff --git a/src/layer/x86/convolution_3x3_winograd.h b/src/layer/x86/convolution_3x3_winograd.h index 481acd90c..0c273bed0 100644 --- a/src/layer/x86/convolution_3x3_winograd.h +++ b/src/layer/x86/convolution_3x3_winograd.h @@ -1827,13 +1827,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, int tile_size = (int)sqrt((float)l2_cache_size / sizeof(float) / 3); #if __AVX512F__ - TILE_M = tile_size / 16 * 16; + TILE_M = std::max(16, tile_size / 16 * 16); #elif __AVX__ - TILE_M = tile_size / 8 * 8; + TILE_M = std::max(8, tile_size / 8 * 8); #elif __SSE2__ - TILE_M = tile_size / 4 * 4; + TILE_M = std::max(4, tile_size / 4 * 4); #else - TILE_M = tile_size / 2 * 2; + TILE_M = std::max(2, tile_size / 2 * 2); #endif TILE_M *= std::min(nT, get_physical_cpu_count()); @@ -1868,13 +1868,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, int tile_size = (int)(sqrt((float)l2_cache_size / sizeof(float)) - TILE_M); #if __AVX512F__ - TILE_K = tile_size / 16 * 16; + TILE_K = std::max(16, tile_size / 16 * 16); #elif __AVX__ - TILE_K = tile_size / 8 * 8; + TILE_K = std::max(8, tile_size / 8 * 8); #elif __SSE2__ - TILE_K = tile_size / 4 * 4; + TILE_K = std::max(4, tile_size / 4 * 4); #else - TILE_K = tile_size / 2 * 2; + TILE_K = std::max(2, tile_size / 2 * 2); #endif int nn_K = (K + TILE_K - 1) / TILE_K; @@ -1894,13 +1894,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, int tile_size = (int)(((float)l2_cache_size / sizeof(float) - TILE_M * TILE_K) / (TILE_M + TILE_K)); #if __AVX512F__ - TILE_N = tile_size / 4 * 4; + TILE_N = std::max(4, tile_size / 4 * 4); #elif __AVX__ - TILE_N = tile_size / 4 * 4; + TILE_N = std::max(4, tile_size / 4 * 4); #elif __SSE2__ - TILE_N = tile_size / 4 * 4; + TILE_N = std::max(4, tile_size / 4 * 4); #else - TILE_N = tile_size; + TILE_N = std::max(1, tile_size); #endif int nn_N = (N + TILE_N - 1) / TILE_N; diff --git a/src/layer/x86/gemm_x86.cpp b/src/layer/x86/gemm_x86.cpp index d90da9209..9ffa1d27c 100644 --- a/src/layer/x86/gemm_x86.cpp +++ b/src/layer/x86/gemm_x86.cpp @@ -6703,21 +6703,21 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float)); #if __AVX512F__ - TILE_M = tile_size / 16 * 16; - TILE_N = tile_size / 4 * 4; - TILE_K = tile_size / 16 * 16; + TILE_M = std::max(16, tile_size / 16 * 16); + TILE_N = std::max(4, tile_size / 4 * 4); + TILE_K = std::max(16, tile_size / 16 * 16); #elif __AVX__ - TILE_M = tile_size / 8 * 8; - TILE_N = tile_size / 4 * 4; - TILE_K = tile_size / 8 * 8; + TILE_M = std::max(8, tile_size / 8 * 8); + TILE_N = std::max(4, tile_size / 4 * 4); + TILE_K = std::max(8, tile_size / 8 * 8); #elif __SSE2__ - TILE_M = tile_size / 4 * 4; - TILE_N = tile_size / 4 * 4; - TILE_K = tile_size / 4 * 4; + TILE_M = std::max(4, tile_size / 4 * 4); + TILE_N = std::max(4, tile_size / 4 * 4); + TILE_K = std::max(4, tile_size / 4 * 4); #else - TILE_M = tile_size / 2 * 2; - TILE_N = tile_size; - TILE_K = tile_size / 2 * 2; + TILE_M = std::max(2, tile_size / 2 * 2); + TILE_N = std::max(1, tile_size); + TILE_K = std::max(2, tile_size / 2 * 2); #endif if (K > 0) @@ -6738,17 +6738,17 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K); #if __AVX512F__ - TILE_M = tile_size / 16 * 16; - TILE_N = tile_size / 4 * 4; + TILE_M = std::max(16, tile_size / 16 * 16); + TILE_N = std::max(4, tile_size / 4 * 4); #elif __AVX__ - TILE_M = tile_size / 8 * 8; - TILE_N = tile_size / 4 * 4; + TILE_M = std::max(8, tile_size / 8 * 8); + TILE_N = std::max(4, tile_size / 4 * 4); #elif __SSE2__ - TILE_M = tile_size / 4 * 4; - TILE_N = tile_size / 4 * 4; + TILE_M = std::max(4, tile_size / 4 * 4); + TILE_N = std::max(4, tile_size / 4 * 4); #else - TILE_M = tile_size / 2 * 2; - TILE_N = tile_size; + TILE_M = std::max(2, tile_size / 2 * 2); + TILE_N = std::max(1, tile_size); #endif } }