From f34becf6fc7679dcc82ab2b7a7e06daf11f85164 Mon Sep 17 00:00:00 2001
From: nihui <nihuini@tencent.com>
Date: Fri, 7 Apr 2023 19:29:36 +0800
Subject: [PATCH] fix divide by zero in get optimal tile size mnk (#4610)

---
 src/layer/arm/convolution_3x3_winograd.h      |  6 +--
 .../arm/convolution_3x3_winograd_fp16s.h      |  2 +-
 src/layer/arm/convolution_im2col_gemm.h       |  6 +--
 src/layer/arm/convolution_im2col_gemm_bf16s.h |  6 +--
 src/layer/arm/convolution_im2col_gemm_fp16s.h |  2 +-
 src/layer/arm/gemm_arm.cpp                    | 30 +++++++-------
 src/layer/arm/gemm_arm_asimdhp.cpp            | 10 ++---
 src/layer/arm/gemm_bf16s_fp16s.h              | 10 ++---
 src/layer/x86/convolution_3x3_winograd.h      | 24 +++++------
 src/layer/x86/gemm_x86.cpp                    | 40 +++++++++----------
 10 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/src/layer/arm/convolution_3x3_winograd.h b/src/layer/arm/convolution_3x3_winograd.h
index 7207d84b1..681979b10 100644
--- a/src/layer/arm/convolution_3x3_winograd.h
+++ b/src/layer/arm/convolution_3x3_winograd.h
@@ -4479,11 +4479,11 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk(int M, int N, int K, int B,
 #endif
 
 #if __aarch64__
-        TILE_K = tile_size / 8 * 8;
+        TILE_K = std::max(8, tile_size / 8 * 8);
 #elif __ARM_NEON
-        TILE_K = tile_size / 4 * 4;
+        TILE_K = std::max(4, tile_size / 4 * 4);
 #else
-        TILE_K = tile_size / 2 * 2;
+        TILE_K = std::max(2, tile_size / 2 * 2);
 #endif
 
         int nn_K = (K + TILE_K - 1) / TILE_K;
diff --git a/src/layer/arm/convolution_3x3_winograd_fp16s.h b/src/layer/arm/convolution_3x3_winograd_fp16s.h
index fc7adb622..3e5dee1ae 100644
--- a/src/layer/arm/convolution_3x3_winograd_fp16s.h
+++ b/src/layer/arm/convolution_3x3_winograd_fp16s.h
@@ -2007,7 +2007,7 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk_fp16(int M, int N, int K, in
         // try not to split K
         int tile_size = (l2_cache_size_fp16 - 32) / 12;
 
-        TILE_K = tile_size / 8 * 8;
+        TILE_K = std::max(8, tile_size / 8 * 8);
 
         int nn_K = (K + TILE_K - 1) / TILE_K;
         TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 7) / 8 * 8);
diff --git a/src/layer/arm/convolution_im2col_gemm.h b/src/layer/arm/convolution_im2col_gemm.h
index 5cf274340..eeb6df6bb 100644
--- a/src/layer/arm/convolution_im2col_gemm.h
+++ b/src/layer/arm/convolution_im2col_gemm.h
@@ -5962,11 +5962,11 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk(int M, int N, int K, in
 #endif
 
 #if __aarch64__
-        TILE_K = tile_size / 8 * 8;
+        TILE_K = std::max(8, tile_size / 8 * 8);
 #elif __ARM_NEON
-        TILE_K = tile_size / 4 * 4;
+        TILE_K = std::max(4, tile_size / 4 * 4);
 #else
-        TILE_K = tile_size / 2 * 2;
+        TILE_K = std::max(2, tile_size / 2 * 2);
 #endif
 
         int nn_K = (K + TILE_K - 1) / TILE_K;
diff --git a/src/layer/arm/convolution_im2col_gemm_bf16s.h b/src/layer/arm/convolution_im2col_gemm_bf16s.h
index 83cc0d3d2..1cb603e7d 100644
--- a/src/layer/arm/convolution_im2col_gemm_bf16s.h
+++ b/src/layer/arm/convolution_im2col_gemm_bf16s.h
@@ -5843,11 +5843,11 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_bf16s(int M, int N, int
 #endif
 
 #if __aarch64__
-        TILE_K = tile_size / 8 * 8;
+        TILE_K = std::max(8, tile_size / 8 * 8);
 #elif __ARM_NEON
-        TILE_K = tile_size / 4 * 4;
+        TILE_K = std::max(4, tile_size / 4 * 4);
 #else
-        TILE_K = tile_size / 2 * 2;
+        TILE_K = std::max(2, tile_size / 2 * 2);
 #endif
 
         int nn_K = (K + TILE_K - 1) / TILE_K;
diff --git a/src/layer/arm/convolution_im2col_gemm_fp16s.h b/src/layer/arm/convolution_im2col_gemm_fp16s.h
index de158c35b..360f05f41 100644
--- a/src/layer/arm/convolution_im2col_gemm_fp16s.h
+++ b/src/layer/arm/convolution_im2col_gemm_fp16s.h
@@ -3027,7 +3027,7 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_fp16sa(int M, int N, in
         // try not to split K
         int tile_size = (l2_cache_size_fp16 - 32) / 12;
 
-        TILE_K = tile_size / 8 * 8;
+        TILE_K = std::max(8, tile_size / 8 * 8);
 
         int nn_K = (K + TILE_K - 1) / TILE_K;
         TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 7) / 8 * 8);
diff --git a/src/layer/arm/gemm_arm.cpp b/src/layer/arm/gemm_arm.cpp
index af40fb694..20dd68c92 100644
--- a/src/layer/arm/gemm_arm.cpp
+++ b/src/layer/arm/gemm_arm.cpp
@@ -3666,17 +3666,17 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c
     int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float));
 
 #if __aarch64__
-    TILE_M = tile_size / 8 * 8;
-    TILE_N = tile_size / 4 * 4;
-    TILE_K = tile_size / 8 * 8;
+    TILE_M = std::max(8, tile_size / 8 * 8);
+    TILE_N = std::max(4, tile_size / 4 * 4);
+    TILE_K = std::max(8, tile_size / 8 * 8);
 #elif __ARM_NEON
-    TILE_M = tile_size / 4 * 4;
-    TILE_N = tile_size / 4 * 4;
-    TILE_K = tile_size / 4 * 4;
+    TILE_M = std::max(4, tile_size / 4 * 4);
+    TILE_N = std::max(4, tile_size / 4 * 4);
+    TILE_K = std::max(4, tile_size / 4 * 4);
 #else
-    TILE_M = tile_size / 2 * 2;
-    TILE_N = tile_size;
-    TILE_K = tile_size / 2 * 2;
+    TILE_M = std::max(2, tile_size / 2 * 2);
+    TILE_N = std::max(1, tile_size);
+    TILE_K = std::max(2, tile_size / 2 * 2);
 #endif
 
     if (K > 0)
@@ -3695,14 +3695,14 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c
             tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K);
 
 #if __aarch64__
-            TILE_M = tile_size / 8 * 8;
-            TILE_N = tile_size / 4 * 4;
+            TILE_M = std::max(8, tile_size / 8 * 8);
+            TILE_N = std::max(4, tile_size / 4 * 4);
 #elif __ARM_NEON
-            TILE_M = tile_size / 4 * 4;
-            TILE_N = tile_size / 4 * 4;
+            TILE_M = std::max(4, tile_size / 4 * 4);
+            TILE_N = std::max(4, tile_size / 4 * 4);
 #else
-            TILE_M = tile_size / 2 * 2;
-            TILE_N = tile_size;
+            TILE_M = std::max(2, tile_size / 2 * 2);
+            TILE_N = std::max(1, tile_size);
 #endif
         }
     }
diff --git a/src/layer/arm/gemm_arm_asimdhp.cpp b/src/layer/arm/gemm_arm_asimdhp.cpp
index d0261d67d..f40c1e610 100644
--- a/src/layer/arm/gemm_arm_asimdhp.cpp
+++ b/src/layer/arm/gemm_arm_asimdhp.cpp
@@ -2284,9 +2284,9 @@ static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M
     size_t l2_cache_size = get_cpu_level2_cache_size();
     int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(__fp16));
 
-    TILE_M = tile_size / 8 * 8;
-    TILE_N = tile_size / 4 * 4;
-    TILE_K = tile_size / 8 * 8;
+    TILE_M = std::max(8, tile_size / 8 * 8);
+    TILE_N = std::max(4, tile_size / 4 * 4);
+    TILE_K = std::max(8, tile_size / 8 * 8);
 
     if (K > 0)
     {
@@ -2297,8 +2297,8 @@ static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M
         {
             tile_size = (int)((float)l2_cache_size / 2 / sizeof(__fp16) / TILE_K);
 
-            TILE_M = tile_size / 8 * 8;
-            TILE_N = tile_size / 4 * 4;
+            TILE_M = std::max(8, tile_size / 8 * 8);
+            TILE_N = std::max(4, tile_size / 4 * 4);
         }
     }
 
diff --git a/src/layer/arm/gemm_bf16s_fp16s.h b/src/layer/arm/gemm_bf16s_fp16s.h
index 540bcf4de..a61f9647b 100644
--- a/src/layer/arm/gemm_bf16s_fp16s.h
+++ b/src/layer/arm/gemm_bf16s_fp16s.h
@@ -1525,9 +1525,9 @@ static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_T
     size_t l2_cache_size = get_cpu_level2_cache_size();
     int tile_size = (int)sqrt((float)l2_cache_size / (2 * sizeof(unsigned short) + sizeof(float)));
 
-    TILE_M = tile_size / 8 * 8;
-    TILE_N = tile_size / 4 * 4;
-    TILE_K = tile_size / 8 * 8;
+    TILE_M = std::max(8, tile_size / 8 * 8);
+    TILE_N = std::max(4, tile_size / 4 * 4);
+    TILE_K = std::max(8, tile_size / 8 * 8);
 
     if (K > 0)
     {
@@ -1538,8 +1538,8 @@ static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_T
         {
             tile_size = (int)((float)l2_cache_size / 2 / sizeof(unsigned short) / TILE_K);
 
-            TILE_M = tile_size / 8 * 8;
-            TILE_N = tile_size / 4 * 4;
+            TILE_M = std::max(8, tile_size / 8 * 8);
+            TILE_N = std::max(4, tile_size / 4 * 4);
         }
     }
 
diff --git a/src/layer/x86/convolution_3x3_winograd.h b/src/layer/x86/convolution_3x3_winograd.h
index 481acd90c..0c273bed0 100644
--- a/src/layer/x86/convolution_3x3_winograd.h
+++ b/src/layer/x86/convolution_3x3_winograd.h
@@ -1827,13 +1827,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
         int tile_size = (int)sqrt((float)l2_cache_size / sizeof(float) / 3);
 
 #if __AVX512F__
-        TILE_M = tile_size / 16 * 16;
+        TILE_M = std::max(16, tile_size / 16 * 16);
 #elif __AVX__
-        TILE_M = tile_size / 8 * 8;
+        TILE_M = std::max(8, tile_size / 8 * 8);
 #elif __SSE2__
-        TILE_M = tile_size / 4 * 4;
+        TILE_M = std::max(4, tile_size / 4 * 4);
 #else
-        TILE_M = tile_size / 2 * 2;
+        TILE_M = std::max(2, tile_size / 2 * 2);
 #endif
 
         TILE_M *= std::min(nT, get_physical_cpu_count());
@@ -1868,13 +1868,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
         int tile_size = (int)(sqrt((float)l2_cache_size / sizeof(float)) - TILE_M);
 
 #if __AVX512F__
-        TILE_K = tile_size / 16 * 16;
+        TILE_K = std::max(16, tile_size / 16 * 16);
 #elif __AVX__
-        TILE_K = tile_size / 8 * 8;
+        TILE_K = std::max(8, tile_size / 8 * 8);
 #elif __SSE2__
-        TILE_K = tile_size / 4 * 4;
+        TILE_K = std::max(4, tile_size / 4 * 4);
 #else
-        TILE_K = tile_size / 2 * 2;
+        TILE_K = std::max(2, tile_size / 2 * 2);
 #endif
 
         int nn_K = (K + TILE_K - 1) / TILE_K;
@@ -1894,13 +1894,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
         int tile_size = (int)(((float)l2_cache_size / sizeof(float) - TILE_M * TILE_K) / (TILE_M + TILE_K));
 
 #if __AVX512F__
-        TILE_N = tile_size / 4 * 4;
+        TILE_N = std::max(4, tile_size / 4 * 4);
 #elif __AVX__
-        TILE_N = tile_size / 4 * 4;
+        TILE_N = std::max(4, tile_size / 4 * 4);
 #elif __SSE2__
-        TILE_N = tile_size / 4 * 4;
+        TILE_N = std::max(4, tile_size / 4 * 4);
 #else
-        TILE_N = tile_size;
+        TILE_N = std::max(1, tile_size);
 #endif
 
         int nn_N = (N + TILE_N - 1) / TILE_N;
diff --git a/src/layer/x86/gemm_x86.cpp b/src/layer/x86/gemm_x86.cpp
index d90da9209..9ffa1d27c 100644
--- a/src/layer/x86/gemm_x86.cpp
+++ b/src/layer/x86/gemm_x86.cpp
@@ -6703,21 +6703,21 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c
     int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float));
 
 #if __AVX512F__
-    TILE_M = tile_size / 16 * 16;
-    TILE_N = tile_size / 4 * 4;
-    TILE_K = tile_size / 16 * 16;
+    TILE_M = std::max(16, tile_size / 16 * 16);
+    TILE_N = std::max(4, tile_size / 4 * 4);
+    TILE_K = std::max(16, tile_size / 16 * 16);
 #elif __AVX__
-    TILE_M = tile_size / 8 * 8;
-    TILE_N = tile_size / 4 * 4;
-    TILE_K = tile_size / 8 * 8;
+    TILE_M = std::max(8, tile_size / 8 * 8);
+    TILE_N = std::max(4, tile_size / 4 * 4);
+    TILE_K = std::max(8, tile_size / 8 * 8);
 #elif __SSE2__
-    TILE_M = tile_size / 4 * 4;
-    TILE_N = tile_size / 4 * 4;
-    TILE_K = tile_size / 4 * 4;
+    TILE_M = std::max(4, tile_size / 4 * 4);
+    TILE_N = std::max(4, tile_size / 4 * 4);
+    TILE_K = std::max(4, tile_size / 4 * 4);
 #else
-    TILE_M = tile_size / 2 * 2;
-    TILE_N = tile_size;
-    TILE_K = tile_size / 2 * 2;
+    TILE_M = std::max(2, tile_size / 2 * 2);
+    TILE_N = std::max(1, tile_size);
+    TILE_K = std::max(2, tile_size / 2 * 2);
 #endif
 
     if (K > 0)
@@ -6738,17 +6738,17 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c
             tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K);
 
 #if __AVX512F__
-            TILE_M = tile_size / 16 * 16;
-            TILE_N = tile_size / 4 * 4;
+            TILE_M = std::max(16, tile_size / 16 * 16);
+            TILE_N = std::max(4, tile_size / 4 * 4);
 #elif __AVX__
-            TILE_M = tile_size / 8 * 8;
-            TILE_N = tile_size / 4 * 4;
+            TILE_M = std::max(8, tile_size / 8 * 8);
+            TILE_N = std::max(4, tile_size / 4 * 4);
 #elif __SSE2__
-            TILE_M = tile_size / 4 * 4;
-            TILE_N = tile_size / 4 * 4;
+            TILE_M = std::max(4, tile_size / 4 * 4);
+            TILE_N = std::max(4, tile_size / 4 * 4);
 #else
-            TILE_M = tile_size / 2 * 2;
-            TILE_N = tile_size;
+            TILE_M = std::max(2, tile_size / 2 * 2);
+            TILE_N = std::max(1, tile_size);
 #endif
         }
     }