| @@ -4479,11 +4479,11 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk(int M, int N, int K, int B, | |||
| #endif | |||
| #if __aarch64__ | |||
| TILE_K = tile_size / 8 * 8; | |||
| TILE_K = std::max(8, tile_size / 8 * 8); | |||
| #elif __ARM_NEON | |||
| TILE_K = tile_size / 4 * 4; | |||
| TILE_K = std::max(4, tile_size / 4 * 4); | |||
| #else | |||
| TILE_K = tile_size / 2 * 2; | |||
| TILE_K = std::max(2, tile_size / 2 * 2); | |||
| #endif | |||
| int nn_K = (K + TILE_K - 1) / TILE_K; | |||
| @@ -2007,7 +2007,7 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk_fp16(int M, int N, int K, in | |||
| // try not to split K | |||
| int tile_size = (l2_cache_size_fp16 - 32) / 12; | |||
| TILE_K = tile_size / 8 * 8; | |||
| TILE_K = std::max(8, tile_size / 8 * 8); | |||
| int nn_K = (K + TILE_K - 1) / TILE_K; | |||
| TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 7) / 8 * 8); | |||
| @@ -5962,11 +5962,11 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk(int M, int N, int K, in | |||
| #endif | |||
| #if __aarch64__ | |||
| TILE_K = tile_size / 8 * 8; | |||
| TILE_K = std::max(8, tile_size / 8 * 8); | |||
| #elif __ARM_NEON | |||
| TILE_K = tile_size / 4 * 4; | |||
| TILE_K = std::max(4, tile_size / 4 * 4); | |||
| #else | |||
| TILE_K = tile_size / 2 * 2; | |||
| TILE_K = std::max(2, tile_size / 2 * 2); | |||
| #endif | |||
| int nn_K = (K + TILE_K - 1) / TILE_K; | |||
| @@ -5843,11 +5843,11 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_bf16s(int M, int N, int | |||
| #endif | |||
| #if __aarch64__ | |||
| TILE_K = tile_size / 8 * 8; | |||
| TILE_K = std::max(8, tile_size / 8 * 8); | |||
| #elif __ARM_NEON | |||
| TILE_K = tile_size / 4 * 4; | |||
| TILE_K = std::max(4, tile_size / 4 * 4); | |||
| #else | |||
| TILE_K = tile_size / 2 * 2; | |||
| TILE_K = std::max(2, tile_size / 2 * 2); | |||
| #endif | |||
| int nn_K = (K + TILE_K - 1) / TILE_K; | |||
| @@ -3027,7 +3027,7 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_fp16sa(int M, int N, in | |||
| // try not to split K | |||
| int tile_size = (l2_cache_size_fp16 - 32) / 12; | |||
| TILE_K = tile_size / 8 * 8; | |||
| TILE_K = std::max(8, tile_size / 8 * 8); | |||
| int nn_K = (K + TILE_K - 1) / TILE_K; | |||
| TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 7) / 8 * 8); | |||
| @@ -3666,17 +3666,17 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c | |||
| int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float)); | |||
| #if __aarch64__ | |||
| TILE_M = tile_size / 8 * 8; | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_K = tile_size / 8 * 8; | |||
| TILE_M = std::max(8, tile_size / 8 * 8); | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| TILE_K = std::max(8, tile_size / 8 * 8); | |||
| #elif __ARM_NEON | |||
| TILE_M = tile_size / 4 * 4; | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_K = tile_size / 4 * 4; | |||
| TILE_M = std::max(4, tile_size / 4 * 4); | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| TILE_K = std::max(4, tile_size / 4 * 4); | |||
| #else | |||
| TILE_M = tile_size / 2 * 2; | |||
| TILE_N = tile_size; | |||
| TILE_K = tile_size / 2 * 2; | |||
| TILE_M = std::max(2, tile_size / 2 * 2); | |||
| TILE_N = std::max(1, tile_size); | |||
| TILE_K = std::max(2, tile_size / 2 * 2); | |||
| #endif | |||
| if (K > 0) | |||
| @@ -3695,14 +3695,14 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c | |||
| tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K); | |||
| #if __aarch64__ | |||
| TILE_M = tile_size / 8 * 8; | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_M = std::max(8, tile_size / 8 * 8); | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| #elif __ARM_NEON | |||
| TILE_M = tile_size / 4 * 4; | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_M = std::max(4, tile_size / 4 * 4); | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| #else | |||
| TILE_M = tile_size / 2 * 2; | |||
| TILE_N = tile_size; | |||
| TILE_M = std::max(2, tile_size / 2 * 2); | |||
| TILE_N = std::max(1, tile_size); | |||
| #endif | |||
| } | |||
| } | |||
| @@ -2284,9 +2284,9 @@ static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M | |||
| size_t l2_cache_size = get_cpu_level2_cache_size(); | |||
| int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(__fp16)); | |||
| TILE_M = tile_size / 8 * 8; | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_K = tile_size / 8 * 8; | |||
| TILE_M = std::max(8, tile_size / 8 * 8); | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| TILE_K = std::max(8, tile_size / 8 * 8); | |||
| if (K > 0) | |||
| { | |||
| @@ -2297,8 +2297,8 @@ static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M | |||
| { | |||
| tile_size = (int)((float)l2_cache_size / 2 / sizeof(__fp16) / TILE_K); | |||
| TILE_M = tile_size / 8 * 8; | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_M = std::max(8, tile_size / 8 * 8); | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| } | |||
| } | |||
| @@ -1525,9 +1525,9 @@ static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_T | |||
| size_t l2_cache_size = get_cpu_level2_cache_size(); | |||
| int tile_size = (int)sqrt((float)l2_cache_size / (2 * sizeof(unsigned short) + sizeof(float))); | |||
| TILE_M = tile_size / 8 * 8; | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_K = tile_size / 8 * 8; | |||
| TILE_M = std::max(8, tile_size / 8 * 8); | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| TILE_K = std::max(8, tile_size / 8 * 8); | |||
| if (K > 0) | |||
| { | |||
| @@ -1538,8 +1538,8 @@ static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_T | |||
| { | |||
| tile_size = (int)((float)l2_cache_size / 2 / sizeof(unsigned short) / TILE_K); | |||
| TILE_M = tile_size / 8 * 8; | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_M = std::max(8, tile_size / 8 * 8); | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| } | |||
| } | |||
| @@ -1827,13 +1827,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, | |||
| int tile_size = (int)sqrt((float)l2_cache_size / sizeof(float) / 3); | |||
| #if __AVX512F__ | |||
| TILE_M = tile_size / 16 * 16; | |||
| TILE_M = std::max(16, tile_size / 16 * 16); | |||
| #elif __AVX__ | |||
| TILE_M = tile_size / 8 * 8; | |||
| TILE_M = std::max(8, tile_size / 8 * 8); | |||
| #elif __SSE2__ | |||
| TILE_M = tile_size / 4 * 4; | |||
| TILE_M = std::max(4, tile_size / 4 * 4); | |||
| #else | |||
| TILE_M = tile_size / 2 * 2; | |||
| TILE_M = std::max(2, tile_size / 2 * 2); | |||
| #endif | |||
| TILE_M *= std::min(nT, get_physical_cpu_count()); | |||
| @@ -1868,13 +1868,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, | |||
| int tile_size = (int)(sqrt((float)l2_cache_size / sizeof(float)) - TILE_M); | |||
| #if __AVX512F__ | |||
| TILE_K = tile_size / 16 * 16; | |||
| TILE_K = std::max(16, tile_size / 16 * 16); | |||
| #elif __AVX__ | |||
| TILE_K = tile_size / 8 * 8; | |||
| TILE_K = std::max(8, tile_size / 8 * 8); | |||
| #elif __SSE2__ | |||
| TILE_K = tile_size / 4 * 4; | |||
| TILE_K = std::max(4, tile_size / 4 * 4); | |||
| #else | |||
| TILE_K = tile_size / 2 * 2; | |||
| TILE_K = std::max(2, tile_size / 2 * 2); | |||
| #endif | |||
| int nn_K = (K + TILE_K - 1) / TILE_K; | |||
| @@ -1894,13 +1894,13 @@ static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, | |||
| int tile_size = (int)(((float)l2_cache_size / sizeof(float) - TILE_M * TILE_K) / (TILE_M + TILE_K)); | |||
| #if __AVX512F__ | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| #elif __AVX__ | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| #elif __SSE2__ | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| #else | |||
| TILE_N = tile_size; | |||
| TILE_N = std::max(1, tile_size); | |||
| #endif | |||
| int nn_N = (N + TILE_N - 1) / TILE_N; | |||
| @@ -6703,21 +6703,21 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c | |||
| int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float)); | |||
| #if __AVX512F__ | |||
| TILE_M = tile_size / 16 * 16; | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_K = tile_size / 16 * 16; | |||
| TILE_M = std::max(16, tile_size / 16 * 16); | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| TILE_K = std::max(16, tile_size / 16 * 16); | |||
| #elif __AVX__ | |||
| TILE_M = tile_size / 8 * 8; | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_K = tile_size / 8 * 8; | |||
| TILE_M = std::max(8, tile_size / 8 * 8); | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| TILE_K = std::max(8, tile_size / 8 * 8); | |||
| #elif __SSE2__ | |||
| TILE_M = tile_size / 4 * 4; | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_K = tile_size / 4 * 4; | |||
| TILE_M = std::max(4, tile_size / 4 * 4); | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| TILE_K = std::max(4, tile_size / 4 * 4); | |||
| #else | |||
| TILE_M = tile_size / 2 * 2; | |||
| TILE_N = tile_size; | |||
| TILE_K = tile_size / 2 * 2; | |||
| TILE_M = std::max(2, tile_size / 2 * 2); | |||
| TILE_N = std::max(1, tile_size); | |||
| TILE_K = std::max(2, tile_size / 2 * 2); | |||
| #endif | |||
| if (K > 0) | |||
| @@ -6738,17 +6738,17 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c | |||
| tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K); | |||
| #if __AVX512F__ | |||
| TILE_M = tile_size / 16 * 16; | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_M = std::max(16, tile_size / 16 * 16); | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| #elif __AVX__ | |||
| TILE_M = tile_size / 8 * 8; | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_M = std::max(8, tile_size / 8 * 8); | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| #elif __SSE2__ | |||
| TILE_M = tile_size / 4 * 4; | |||
| TILE_N = tile_size / 4 * 4; | |||
| TILE_M = std::max(4, tile_size / 4 * 4); | |||
| TILE_N = std::max(4, tile_size / 4 * 4); | |||
| #else | |||
| TILE_M = tile_size / 2 * 2; | |||
| TILE_N = tile_size; | |||
| TILE_M = std::max(2, tile_size / 2 * 2); | |||
| TILE_N = std::max(1, tile_size); | |||
| #endif | |||
| } | |||
| } | |||