Browse Source

Fix divide-by-zero in the get_optimal_tile_mnk tile-size computations (#4610)

tags/20230517
nihui GitHub 3 years ago
parent
commit
f34becf6fc
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 68 additions and 68 deletions
  1. +3
    -3
      src/layer/arm/convolution_3x3_winograd.h
  2. +1
    -1
      src/layer/arm/convolution_3x3_winograd_fp16s.h
  3. +3
    -3
      src/layer/arm/convolution_im2col_gemm.h
  4. +3
    -3
      src/layer/arm/convolution_im2col_gemm_bf16s.h
  5. +1
    -1
      src/layer/arm/convolution_im2col_gemm_fp16s.h
  6. +15
    -15
      src/layer/arm/gemm_arm.cpp
  7. +5
    -5
      src/layer/arm/gemm_arm_asimdhp.cpp
  8. +5
    -5
      src/layer/arm/gemm_bf16s_fp16s.h
  9. +12
    -12
      src/layer/x86/convolution_3x3_winograd.h
  10. +20
    -20
      src/layer/x86/gemm_x86.cpp

+ 3
- 3
src/layer/arm/convolution_3x3_winograd.h View File

@@ -4479,11 +4479,11 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk(int M, int N, int K, int B,
#endif

#if __aarch64__
TILE_K = tile_size / 8 * 8;
TILE_K = std::max(8, tile_size / 8 * 8);
#elif __ARM_NEON
TILE_K = tile_size / 4 * 4;
TILE_K = std::max(4, tile_size / 4 * 4);
#else
TILE_K = tile_size / 2 * 2;
TILE_K = std::max(2, tile_size / 2 * 2);
#endif

int nn_K = (K + TILE_K - 1) / TILE_K;


+ 1
- 1
src/layer/arm/convolution_3x3_winograd_fp16s.h View File

@@ -2007,7 +2007,7 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk_fp16(int M, int N, int K, in
// try not to split K
int tile_size = (l2_cache_size_fp16 - 32) / 12;

TILE_K = tile_size / 8 * 8;
TILE_K = std::max(8, tile_size / 8 * 8);

int nn_K = (K + TILE_K - 1) / TILE_K;
TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 7) / 8 * 8);


+ 3
- 3
src/layer/arm/convolution_im2col_gemm.h View File

@@ -5962,11 +5962,11 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk(int M, int N, int K, in
#endif

#if __aarch64__
TILE_K = tile_size / 8 * 8;
TILE_K = std::max(8, tile_size / 8 * 8);
#elif __ARM_NEON
TILE_K = tile_size / 4 * 4;
TILE_K = std::max(4, tile_size / 4 * 4);
#else
TILE_K = tile_size / 2 * 2;
TILE_K = std::max(2, tile_size / 2 * 2);
#endif

int nn_K = (K + TILE_K - 1) / TILE_K;


+ 3
- 3
src/layer/arm/convolution_im2col_gemm_bf16s.h View File

@@ -5843,11 +5843,11 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_bf16s(int M, int N, int
#endif

#if __aarch64__
TILE_K = tile_size / 8 * 8;
TILE_K = std::max(8, tile_size / 8 * 8);
#elif __ARM_NEON
TILE_K = tile_size / 4 * 4;
TILE_K = std::max(4, tile_size / 4 * 4);
#else
TILE_K = tile_size / 2 * 2;
TILE_K = std::max(2, tile_size / 2 * 2);
#endif

int nn_K = (K + TILE_K - 1) / TILE_K;


+ 1
- 1
src/layer/arm/convolution_im2col_gemm_fp16s.h View File

@@ -3027,7 +3027,7 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_fp16sa(int M, int N, in
// try not to split K
int tile_size = (l2_cache_size_fp16 - 32) / 12;

TILE_K = tile_size / 8 * 8;
TILE_K = std::max(8, tile_size / 8 * 8);

int nn_K = (K + TILE_K - 1) / TILE_K;
TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 7) / 8 * 8);


+ 15
- 15
src/layer/arm/gemm_arm.cpp View File

@@ -3666,17 +3666,17 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c
int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float));

#if __aarch64__
TILE_M = tile_size / 8 * 8;
TILE_N = tile_size / 4 * 4;
TILE_K = tile_size / 8 * 8;
TILE_M = std::max(8, tile_size / 8 * 8);
TILE_N = std::max(4, tile_size / 4 * 4);
TILE_K = std::max(8, tile_size / 8 * 8);
#elif __ARM_NEON
TILE_M = tile_size / 4 * 4;
TILE_N = tile_size / 4 * 4;
TILE_K = tile_size / 4 * 4;
TILE_M = std::max(4, tile_size / 4 * 4);
TILE_N = std::max(4, tile_size / 4 * 4);
TILE_K = std::max(4, tile_size / 4 * 4);
#else
TILE_M = tile_size / 2 * 2;
TILE_N = tile_size;
TILE_K = tile_size / 2 * 2;
TILE_M = std::max(2, tile_size / 2 * 2);
TILE_N = std::max(1, tile_size);
TILE_K = std::max(2, tile_size / 2 * 2);
#endif

if (K > 0)
@@ -3695,14 +3695,14 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c
tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K);

#if __aarch64__
TILE_M = tile_size / 8 * 8;
TILE_N = tile_size / 4 * 4;
TILE_M = std::max(8, tile_size / 8 * 8);
TILE_N = std::max(4, tile_size / 4 * 4);
#elif __ARM_NEON
TILE_M = tile_size / 4 * 4;
TILE_N = tile_size / 4 * 4;
TILE_M = std::max(4, tile_size / 4 * 4);
TILE_N = std::max(4, tile_size / 4 * 4);
#else
TILE_M = tile_size / 2 * 2;
TILE_N = tile_size;
TILE_M = std::max(2, tile_size / 2 * 2);
TILE_N = std::max(1, tile_size);
#endif
}
}


+ 5
- 5
src/layer/arm/gemm_arm_asimdhp.cpp View File

@@ -2284,9 +2284,9 @@ static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M
size_t l2_cache_size = get_cpu_level2_cache_size();
int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(__fp16));

TILE_M = tile_size / 8 * 8;
TILE_N = tile_size / 4 * 4;
TILE_K = tile_size / 8 * 8;
TILE_M = std::max(8, tile_size / 8 * 8);
TILE_N = std::max(4, tile_size / 4 * 4);
TILE_K = std::max(8, tile_size / 8 * 8);

if (K > 0)
{
@@ -2297,8 +2297,8 @@ static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M
{
tile_size = (int)((float)l2_cache_size / 2 / sizeof(__fp16) / TILE_K);

TILE_M = tile_size / 8 * 8;
TILE_N = tile_size / 4 * 4;
TILE_M = std::max(8, tile_size / 8 * 8);
TILE_N = std::max(4, tile_size / 4 * 4);
}
}



+ 5
- 5
src/layer/arm/gemm_bf16s_fp16s.h View File

@@ -1525,9 +1525,9 @@ static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_T
size_t l2_cache_size = get_cpu_level2_cache_size();
int tile_size = (int)sqrt((float)l2_cache_size / (2 * sizeof(unsigned short) + sizeof(float)));

TILE_M = tile_size / 8 * 8;
TILE_N = tile_size / 4 * 4;
TILE_K = tile_size / 8 * 8;
TILE_M = std::max(8, tile_size / 8 * 8);
TILE_N = std::max(4, tile_size / 4 * 4);
TILE_K = std::max(8, tile_size / 8 * 8);

if (K > 0)
{
@@ -1538,8 +1538,8 @@ static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_T
{
tile_size = (int)((float)l2_cache_size / 2 / sizeof(unsigned short) / TILE_K);

TILE_M = tile_size / 8 * 8;
TILE_N = tile_size / 4 * 4;
TILE_M = std::max(8, tile_size / 8 * 8);
TILE_N = std::max(4, tile_size / 4 * 4);
}
}



+ 12
- 12
src/layer/x86/convolution_3x3_winograd.h View File

@@ -1827,13 +1827,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
int tile_size = (int)sqrt((float)l2_cache_size / sizeof(float) / 3);

#if __AVX512F__
TILE_M = tile_size / 16 * 16;
TILE_M = std::max(16, tile_size / 16 * 16);
#elif __AVX__
TILE_M = tile_size / 8 * 8;
TILE_M = std::max(8, tile_size / 8 * 8);
#elif __SSE2__
TILE_M = tile_size / 4 * 4;
TILE_M = std::max(4, tile_size / 4 * 4);
#else
TILE_M = tile_size / 2 * 2;
TILE_M = std::max(2, tile_size / 2 * 2);
#endif

TILE_M *= std::min(nT, get_physical_cpu_count());
@@ -1868,13 +1868,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
int tile_size = (int)(sqrt((float)l2_cache_size / sizeof(float)) - TILE_M);

#if __AVX512F__
TILE_K = tile_size / 16 * 16;
TILE_K = std::max(16, tile_size / 16 * 16);
#elif __AVX__
TILE_K = tile_size / 8 * 8;
TILE_K = std::max(8, tile_size / 8 * 8);
#elif __SSE2__
TILE_K = tile_size / 4 * 4;
TILE_K = std::max(4, tile_size / 4 * 4);
#else
TILE_K = tile_size / 2 * 2;
TILE_K = std::max(2, tile_size / 2 * 2);
#endif

int nn_K = (K + TILE_K - 1) / TILE_K;
@@ -1894,13 +1894,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N,
int tile_size = (int)(((float)l2_cache_size / sizeof(float) - TILE_M * TILE_K) / (TILE_M + TILE_K));

#if __AVX512F__
TILE_N = tile_size / 4 * 4;
TILE_N = std::max(4, tile_size / 4 * 4);
#elif __AVX__
TILE_N = tile_size / 4 * 4;
TILE_N = std::max(4, tile_size / 4 * 4);
#elif __SSE2__
TILE_N = tile_size / 4 * 4;
TILE_N = std::max(4, tile_size / 4 * 4);
#else
TILE_N = tile_size;
TILE_N = std::max(1, tile_size);
#endif

int nn_N = (N + TILE_N - 1) / TILE_N;


+ 20
- 20
src/layer/x86/gemm_x86.cpp View File

@@ -6703,21 +6703,21 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c
int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float));

#if __AVX512F__
TILE_M = tile_size / 16 * 16;
TILE_N = tile_size / 4 * 4;
TILE_K = tile_size / 16 * 16;
TILE_M = std::max(16, tile_size / 16 * 16);
TILE_N = std::max(4, tile_size / 4 * 4);
TILE_K = std::max(16, tile_size / 16 * 16);
#elif __AVX__
TILE_M = tile_size / 8 * 8;
TILE_N = tile_size / 4 * 4;
TILE_K = tile_size / 8 * 8;
TILE_M = std::max(8, tile_size / 8 * 8);
TILE_N = std::max(4, tile_size / 4 * 4);
TILE_K = std::max(8, tile_size / 8 * 8);
#elif __SSE2__
TILE_M = tile_size / 4 * 4;
TILE_N = tile_size / 4 * 4;
TILE_K = tile_size / 4 * 4;
TILE_M = std::max(4, tile_size / 4 * 4);
TILE_N = std::max(4, tile_size / 4 * 4);
TILE_K = std::max(4, tile_size / 4 * 4);
#else
TILE_M = tile_size / 2 * 2;
TILE_N = tile_size;
TILE_K = tile_size / 2 * 2;
TILE_M = std::max(2, tile_size / 2 * 2);
TILE_N = std::max(1, tile_size);
TILE_K = std::max(2, tile_size / 2 * 2);
#endif

if (K > 0)
@@ -6738,17 +6738,17 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c
tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K);

#if __AVX512F__
TILE_M = tile_size / 16 * 16;
TILE_N = tile_size / 4 * 4;
TILE_M = std::max(16, tile_size / 16 * 16);
TILE_N = std::max(4, tile_size / 4 * 4);
#elif __AVX__
TILE_M = tile_size / 8 * 8;
TILE_N = tile_size / 4 * 4;
TILE_M = std::max(8, tile_size / 8 * 8);
TILE_N = std::max(4, tile_size / 4 * 4);
#elif __SSE2__
TILE_M = tile_size / 4 * 4;
TILE_N = tile_size / 4 * 4;
TILE_M = std::max(4, tile_size / 4 * 4);
TILE_N = std::max(4, tile_size / 4 * 4);
#else
TILE_M = tile_size / 2 * 2;
TILE_N = tile_size;
TILE_M = std::max(2, tile_size / 2 * 2);
TILE_N = std::max(1, tile_size);
#endif
}
}


Loading…
Cancel
Save