Browse Source

force global cpu info initialization (#4725)

* force global cpu info initialization

* sanitize zero nT
tags/20230517
nihui GitHub 3 years ago
parent
commit
2b87dc2cf7
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 1944 additions and 1787 deletions
  1. +1905
    -1782
      src/cpu.cpp
  2. +3
    -0
      src/layer/arm/convolution_3x3_winograd.h
  3. +3
    -0
      src/layer/arm/convolution_3x3_winograd_fp16s.h
  4. +3
    -0
      src/layer/arm/convolution_im2col_gemm.h
  5. +3
    -0
      src/layer/arm/convolution_im2col_gemm_bf16s.h
  6. +3
    -0
      src/layer/arm/convolution_im2col_gemm_fp16s.h
  7. +5
    -1
      src/layer/arm/gemm_arm.cpp
  8. +5
    -1
      src/layer/arm/gemm_arm_asimdhp.cpp
  9. +5
    -1
      src/layer/arm/gemm_bf16s_fp16s.h
  10. +4
    -1
      src/layer/x86/convolution_3x3_winograd.h
  11. +5
    -1
      src/layer/x86/gemm_x86.cpp

+ 1905
- 1782
src/cpu.cpp
File diff suppressed because it is too large
View File


+ 3
- 0
src/layer/arm/convolution_3x3_winograd.h View File

@@ -4464,6 +4464,9 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk(int M, int N, int K, int B,
// resolve optimal tile size from cache size
const int l2_cache_size_fp32 = (int)(get_cpu_level2_cache_size() / sizeof(float));

if (nT == 0)
nT = get_physical_big_cpu_count();

// we shall take B into account for batched gemm, but that will be slower on arm in practice, why ?
(void)B;



+ 3
- 0
src/layer/arm/convolution_3x3_winograd_fp16s.h View File

@@ -1999,6 +1999,9 @@ static void conv3x3s1_winograd_get_optimal_tile_mnk_fp16(int M, int N, int K, in
// resolve optimal tile size from cache size
const int l2_cache_size_fp16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short));

if (nT == 0)
nT = get_physical_big_cpu_count();

// we shall take B into account for batched gemm, but that will be slower on arm in practice, why ?
(void)B;



+ 3
- 0
src/layer/arm/convolution_im2col_gemm.h View File

@@ -5950,6 +5950,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk(int M, int N, int K, in
// resolve optimal tile size from cache size
const int l2_cache_size_fp32 = (int)(get_cpu_level2_cache_size() / sizeof(float));

if (nT == 0)
nT = get_physical_big_cpu_count();

// solve K
{
// try not to split K


+ 3
- 0
src/layer/arm/convolution_im2col_gemm_bf16s.h View File

@@ -5831,6 +5831,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_bf16s(int M, int N, int
// resolve optimal tile size from cache size
const int l2_cache_size_bf16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short));

if (nT == 0)
nT = get_physical_big_cpu_count();

// solve K
{
// try not to split K


+ 3
- 0
src/layer/arm/convolution_im2col_gemm_fp16s.h View File

@@ -3022,6 +3022,9 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_fp16sa(int M, int N, in
// resolve optimal tile size from cache size
const int l2_cache_size_fp16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short));

if (nT == 0)
nT = get_physical_big_cpu_count();

// solve K
{
// try not to split K


+ 5
- 1
src/layer/arm/gemm_arm.cpp View File

@@ -3662,7 +3662,11 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
// resolve optimal tile size from cache size
- size_t l2_cache_size = get_cpu_level2_cache_size();
+ const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+ if (nT == 0)
+ nT = get_physical_big_cpu_count();
+
int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(float));

#if __aarch64__


+ 5
- 1
src/layer/arm/gemm_arm_asimdhp.cpp View File

@@ -2281,7 +2281,11 @@ static void gemm_transB_packed_tile_fp16sa(const Mat& AT_tile, const Mat& BT_til
static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
// resolve optimal tile size from cache size
- size_t l2_cache_size = get_cpu_level2_cache_size();
+ const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+ if (nT == 0)
+ nT = get_physical_big_cpu_count();
+
int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(__fp16));

TILE_M = std::max(8, tile_size / 8 * 8);


+ 5
- 1
src/layer/arm/gemm_bf16s_fp16s.h View File

@@ -1522,7 +1522,11 @@ static void transpose_unpack_output_tile_bf16_fp16(const Mat& topT, Mat& top_blo
static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
// resolve optimal tile size from cache size
- size_t l2_cache_size = get_cpu_level2_cache_size();
+ const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+ if (nT == 0)
+ nT = get_physical_big_cpu_count();
+
int tile_size = (int)sqrtf((float)l2_cache_size / (2 * sizeof(unsigned short) + sizeof(float)));

TILE_M = std::max(8, tile_size / 8 * 8);


+ 4
- 1
src/layer/x86/convolution_3x3_winograd.h View File

@@ -1820,7 +1820,10 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, Mat&
static void get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
// resolve optimal tile size from cache size
- size_t l2_cache_size = get_cpu_level2_cache_size();
+ const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+ if (nT == 0)
+ nT = get_physical_big_cpu_count();

// solve M
{


+ 5
- 1
src/layer/x86/gemm_x86.cpp View File

@@ -6699,7 +6699,11 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
// resolve optimal tile size from cache size
- size_t l2_cache_size = get_cpu_level2_cache_size();
+ const size_t l2_cache_size = get_cpu_level2_cache_size();
+
+ if (nT == 0)
+ nT = get_physical_big_cpu_count();
+
int tile_size = (int)sqrt((float)l2_cache_size / 3 / sizeof(float));

#if __AVX512F__


Loading…
Cancel
Save