* force global cpu info initialization
* sanitize zero nT
tags/20230517
| @@ -4464,6 +4464,9 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk(int M, int N, int K, int B, | |||||
| // resolve optimal tile size from cache size | // resolve optimal tile size from cache size | ||||
| const int l2_cache_size_fp32 = (int)(get_cpu_level2_cache_size() / sizeof(float)); | const int l2_cache_size_fp32 = (int)(get_cpu_level2_cache_size() / sizeof(float)); | ||||
| if (nT == 0) | |||||
| nT = get_physical_big_cpu_count(); | |||||
| // we shall take B into account for batched gemm, but that will be slower on arm in practice, why ? | // we shall take B into account for batched gemm, but that will be slower on arm in practice, why ? | ||||
| (void)B; | (void)B; | ||||
| @@ -1999,6 +1999,9 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk_fp16(int M, int N, int K, in | |||||
| // resolve optimal tile size from cache size | // resolve optimal tile size from cache size | ||||
| const int l2_cache_size_fp16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short)); | const int l2_cache_size_fp16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short)); | ||||
| if (nT == 0) | |||||
| nT = get_physical_big_cpu_count(); | |||||
| // we shall take B into account for batched gemm, but that will be slower on arm in practice, why ? | // we shall take B into account for batched gemm, but that will be slower on arm in practice, why ? | ||||
| (void)B; | (void)B; | ||||
| @@ -5950,6 +5950,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk(int M, int N, int K, in | |||||
| // resolve optimal tile size from cache size | // resolve optimal tile size from cache size | ||||
| const int l2_cache_size_fp32 = (int)(get_cpu_level2_cache_size() / sizeof(float)); | const int l2_cache_size_fp32 = (int)(get_cpu_level2_cache_size() / sizeof(float)); | ||||
| if (nT == 0) | |||||
| nT = get_physical_big_cpu_count(); | |||||
| // solve K | // solve K | ||||
| { | { | ||||
| // try not to split K | // try not to split K | ||||
| @@ -5831,6 +5831,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_bf16s(int M, int N, int | |||||
| // resolve optimal tile size from cache size | // resolve optimal tile size from cache size | ||||
| const int l2_cache_size_bf16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short)); | const int l2_cache_size_bf16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short)); | ||||
| if (nT == 0) | |||||
| nT = get_physical_big_cpu_count(); | |||||
| // solve K | // solve K | ||||
| { | { | ||||
| // try not to split K | // try not to split K | ||||
| @@ -3022,6 +3022,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_fp16sa(int M, int N, in | |||||
| // resolve optimal tile size from cache size | // resolve optimal tile size from cache size | ||||
| const int l2_cache_size_fp16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short)); | const int l2_cache_size_fp16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short)); | ||||
| if (nT == 0) | |||||
| nT = get_physical_big_cpu_count(); | |||||
| // solve K | // solve K | ||||
| { | { | ||||
| // try not to split K | // try not to split K | ||||
| @@ -3662,7 +3662,11 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons | |||||
| static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) | static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) | ||||
| { | { | ||||
| // resolve optimal tile size from cache size | // resolve optimal tile size from cache size | ||||
| size_t l2_cache_size = get_cpu_level2_cache_size(); | |||||
| const size_t l2_cache_size = get_cpu_level2_cache_size(); | |||||
| if (nT == 0) | |||||
| nT = get_physical_big_cpu_count(); | |||||
| int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(float)); | int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(float)); | ||||
| #if __aarch64__ | #if __aarch64__ | ||||
| @@ -2281,7 +2281,11 @@ static void gemm_transB_packed_tile_fp16sa(const Mat& AT_tile, const Mat& BT_til | |||||
| static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) | static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) | ||||
| { | { | ||||
| // resolve optimal tile size from cache size | // resolve optimal tile size from cache size | ||||
| size_t l2_cache_size = get_cpu_level2_cache_size(); | |||||
| const size_t l2_cache_size = get_cpu_level2_cache_size(); | |||||
| if (nT == 0) | |||||
| nT = get_physical_big_cpu_count(); | |||||
| int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(__fp16)); | int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(__fp16)); | ||||
| TILE_M = std::max(8, tile_size / 8 * 8); | TILE_M = std::max(8, tile_size / 8 * 8); | ||||
| @@ -1522,7 +1522,11 @@ static void transpose_unpack_output_tile_bf16_fp16(const Mat& topT, Mat& top_blo | |||||
| static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) | static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) | ||||
| { | { | ||||
| // resolve optimal tile size from cache size | // resolve optimal tile size from cache size | ||||
| size_t l2_cache_size = get_cpu_level2_cache_size(); | |||||
| const size_t l2_cache_size = get_cpu_level2_cache_size(); | |||||
| if (nT == 0) | |||||
| nT = get_physical_big_cpu_count(); | |||||
| int tile_size = (int)sqrtf((float)l2_cache_size / (2 * sizeof(unsigned short) + sizeof(float))); | int tile_size = (int)sqrtf((float)l2_cache_size / (2 * sizeof(unsigned short) + sizeof(float))); | ||||
| TILE_M = std::max(8, tile_size / 8 * 8); | TILE_M = std::max(8, tile_size / 8 * 8); | ||||
| @@ -1820,7 +1820,10 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, Mat& | |||||
| static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) | static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) | ||||
| { | { | ||||
| // resolve optimal tile size from cache size | // resolve optimal tile size from cache size | ||||
| size_t l2_cache_size = get_cpu_level2_cache_size(); | |||||
| const size_t l2_cache_size = get_cpu_level2_cache_size(); | |||||
| if (nT == 0) | |||||
| nT = get_physical_big_cpu_count(); | |||||
| // solve M | // solve M | ||||
| { | { | ||||
| @@ -6699,7 +6699,11 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons | |||||
| static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) | static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) | ||||
| { | { | ||||
| // resolve optimal tile size from cache size | // resolve optimal tile size from cache size | ||||
| size_t l2_cache_size = get_cpu_level2_cache_size(); | |||||
| const size_t l2_cache_size = get_cpu_level2_cache_size(); | |||||
| if (nT == 0) | |||||
| nT = get_physical_big_cpu_count(); | |||||
| int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float)); | int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float)); | ||||
| #if __AVX512F__ | #if __AVX512F__ | ||||