Browse Source

arm handle allocation failures (#5490)

tags/20240820
nihui GitHub 1 year ago
parent
commit
debc33fee2
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
30 changed files with 468 additions and 66 deletions
  1. +8
    -0
      src/layer/arm/concat_arm.cpp
  2. +33
    -3
      src/layer/arm/convolution_3x3_winograd.h
  3. +33
    -3
      src/layer/arm/convolution_3x3_winograd_bf16s.h
  4. +33
    -3
      src/layer/arm/convolution_3x3_winograd_fp16s.h
  5. +22
    -2
      src/layer/arm/convolution_3x3_winograd_int8.h
  6. +26
    -11
      src/layer/arm/convolution_arm.cpp
  7. +2
    -2
      src/layer/arm/convolution_arm_asimddp.cpp
  8. +9
    -4
      src/layer/arm/convolution_arm_asimdhp.cpp
  9. +2
    -2
      src/layer/arm/convolution_arm_i8mm.cpp
  10. +9
    -1
      src/layer/arm/convolution_im2col_gemm.h
  11. +9
    -1
      src/layer/arm/convolution_im2col_gemm_bf16s.h
  12. +9
    -1
      src/layer/arm/convolution_im2col_gemm_fp16s.h
  13. +13
    -7
      src/layer/arm/convolution_im2col_gemm_int8.h
  14. +23
    -3
      src/layer/arm/convolutiondepthwise_arm.cpp
  15. +20
    -8
      src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
  16. +6
    -0
      src/layer/arm/crop_arm.cpp
  17. +3
    -1
      src/layer/arm/deconvolution_arm.cpp
  18. +3
    -1
      src/layer/arm/deconvolution_arm_asimdhp.cpp
  19. +14
    -2
      src/layer/arm/deconvolutiondepthwise_arm.cpp
  20. +14
    -2
      src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
  21. +48
    -0
      src/layer/arm/gemm_arm.cpp
  22. +26
    -0
      src/layer/arm/gemm_arm_asimdhp.cpp
  23. +24
    -0
      src/layer/arm/gemm_arm_vfpv4.cpp
  24. +10
    -0
      src/layer/arm/innerproduct_arm.cpp
  25. +2
    -0
      src/layer/arm/innerproduct_arm_asimdhp.cpp
  26. +2
    -0
      src/layer/arm/innerproduct_arm_vfpv4.cpp
  27. +40
    -8
      src/layer/arm/multiheadattention_arm.cpp
  28. +6
    -0
      src/layer/arm/padding_arm.cpp
  29. +11
    -1
      src/layer/arm/shufflechannel_arm.cpp
  30. +8
    -0
      src/layer/arm/slice_arm.cpp

+ 8
- 0
src/layer/arm/concat_arm.cpp View File

@@ -159,6 +159,8 @@ int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
if (elempack < out_elempack)
{
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
if (top_blob.empty())
return -100;
}
}

@@ -284,6 +286,8 @@ int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
if (elempack < out_elempack)
{
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
if (top_blob.empty())
return -100;
}
}

@@ -617,6 +621,8 @@ int Concat_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::v
if (elempack < out_elempack)
{
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
if (top_blob.empty())
return -100;
}
}

@@ -816,6 +822,8 @@ int Concat_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::v
if (elempack < out_elempack)
{
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
if (top_blob.empty())
return -100;
}
}



+ 33
- 3
src/layer/arm/convolution_3x3_winograd.h View File

@@ -5578,7 +5578,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile(const Mat& top_til
}
}

static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
static int conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
@@ -5605,12 +5605,16 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -5634,6 +5638,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -5659,6 +5665,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma
}

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -5688,6 +5696,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma
conv3x3s1_winograd23_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);
}
}

return 0;
}

static inline void conv3x3s1_winograd43_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
@@ -7256,7 +7266,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile(const Mat& top_til
}
}

static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
static int conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
@@ -7283,12 +7293,16 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -7312,6 +7326,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -7337,6 +7353,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma
}

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -7366,6 +7384,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma
conv3x3s1_winograd43_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);
}
}

return 0;
}

static inline void conv3x3s1_winograd63_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
@@ -9292,7 +9312,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile(const Mat& top_til
}
}

static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
static int conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
@@ -9319,12 +9339,16 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -9348,6 +9372,8 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -9373,6 +9399,8 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma
}

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -9402,4 +9430,6 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma
conv3x3s1_winograd63_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);
}
}

return 0;
}

+ 33
- 3
src/layer/arm/convolution_3x3_winograd_bf16s.h View File

@@ -920,7 +920,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile_bf16s(const Mat& t
}
}

static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
static int conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
@@ -947,12 +947,16 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -976,6 +980,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -1001,6 +1007,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co
}

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -1030,6 +1038,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co
conv3x3s1_winograd23_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj);
}
}

return 0;
}

static inline void conv3x3s1_winograd43_transform_input_tile_bf16s(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
@@ -2497,7 +2507,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_bf16s(const Mat& t
}
}

static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
static int conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
@@ -2524,12 +2534,16 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -2553,6 +2567,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -2578,6 +2594,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co
}

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -2607,6 +2625,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co
conv3x3s1_winograd43_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj);
}
}

return 0;
}

static inline void conv3x3s1_winograd63_transform_input_tile_bf16s(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
@@ -4428,7 +4448,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile_bf16s(const Mat& t
}
}

static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
static int conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
@@ -4455,12 +4475,16 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -4484,6 +4508,8 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -4509,6 +4535,8 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co
}

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -4538,4 +4566,6 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co
conv3x3s1_winograd63_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj);
}
}

return 0;
}

+ 33
- 3
src/layer/arm/convolution_3x3_winograd_fp16s.h View File

@@ -2916,7 +2916,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile_fp16sa(const Mat&
}
}

static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
static int conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
@@ -2943,12 +2943,16 @@ static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -2972,6 +2976,8 @@ static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -2997,6 +3003,8 @@ static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
}

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 2u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -3026,6 +3034,8 @@ static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
conv3x3s1_winograd23_transform_output_tile_fp16sa(top_tile, top_blob, bias, i, max_ii, j, max_jj);
}
}

return 0;
}

static inline void conv3x3s1_winograd43_transform_kernel_tile_fp16sa(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
@@ -4262,7 +4272,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_fp16sa(const Mat&
}
}

static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
static int conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
@@ -4289,12 +4299,16 @@ static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -4318,6 +4332,8 @@ static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -4343,6 +4359,8 @@ static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
}

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 2u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -4372,6 +4390,8 @@ static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
conv3x3s1_winograd43_transform_output_tile_fp16sa(top_tile, top_blob, bias, i, max_ii, j, max_jj);
}
}

return 0;
}

static inline void conv3x3s1_winograd63_transform_kernel_tile_fp16sa(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
@@ -5873,7 +5893,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile_fp16sa(const Mat&
}
}

static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
static int conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
@@ -5900,12 +5920,16 @@ static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -5929,6 +5953,8 @@ static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -5954,6 +5980,8 @@ static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
}

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 2u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -5983,4 +6011,6 @@ static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
conv3x3s1_winograd63_transform_output_tile_fp16sa(top_tile, top_blob, bias, i, max_ii, j, max_jj);
}
}

return 0;
}

+ 22
- 2
src/layer/arm/convolution_3x3_winograd_int8.h View File

@@ -4257,7 +4257,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile_int8(const Mat& to
}
}

static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
static int conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
@@ -4284,12 +4284,16 @@ static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -4313,6 +4317,8 @@ static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

// #pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -4340,6 +4346,8 @@ static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat
bottom_blob.release();

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -4369,6 +4377,8 @@ static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat
conv3x3s1_winograd23_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj);
}
}

return 0;
}

static inline void conv3x3s1_winograd43_transform_kernel_tile_int8(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
@@ -5604,7 +5614,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_int8(const Mat& to
}
}

static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
static int conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
@@ -5631,12 +5641,16 @@ static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -5660,6 +5674,8 @@ static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -5687,6 +5703,8 @@ static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat
bottom_blob.release();

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -5716,4 +5734,6 @@ static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat
conv3x3s1_winograd43_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj);
}
}

return 0;
}

+ 26
- 11
src/layer/arm/convolution_arm.cpp View File

@@ -488,22 +488,25 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT);
}

int ret = 0;
if (prefer_winograd23)
{
conv3x3s1_winograd23(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt);
ret = conv3x3s1_winograd23(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt);
}
else if (prefer_winograd43)
{
conv3x3s1_winograd43(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt);
ret = conv3x3s1_winograd43(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt);
}
else if (prefer_winograd63)
{
conv3x3s1_winograd63(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt);
ret = conv3x3s1_winograd63(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt);
}
else
{
// should never reach here
}
if (ret != 0)
return ret;

if (activation)
{
@@ -559,7 +562,9 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
}

convolution_im2col_gemm(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
int ret = convolution_im2col_gemm(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
if (ret != 0)
return ret;

if (activation)
{
@@ -1072,22 +1077,25 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const
NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT);
}

int ret = 0;
if (prefer_winograd23)
{
conv3x3s1_winograd23_bf16s(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt);
ret = conv3x3s1_winograd23_bf16s(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt);
}
else if (prefer_winograd43)
{
conv3x3s1_winograd43_bf16s(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt);
ret = conv3x3s1_winograd43_bf16s(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt);
}
else if (prefer_winograd63)
{
conv3x3s1_winograd63_bf16s(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt);
ret = conv3x3s1_winograd63_bf16s(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt);
}
else
{
// should never reach here
}
if (ret != 0)
return ret;

if (activation)
{
@@ -1143,7 +1151,9 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const
NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
}

convolution_im2col_gemm_bf16s(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
int ret = convolution_im2col_gemm_bf16s(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
if (ret != 0)
return ret;

if (activation)
{
@@ -1307,6 +1317,8 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con
Option opt_q = opt;
opt_q.blob_allocator = opt.workspace_allocator;
quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
if (bottom_blob_int8.empty())
return -100;
}

// NCNN_LOGE("Convolution_arm input %d x %d ksize=%d %d stride=%d %d", w, h, kernel_w, kernel_h, stride_w, stride_h);
@@ -1381,21 +1393,24 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con
NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
}

int ret = 0;
if (opt.use_winograd_convolution && prefer_winograd)
{
if (opt.use_winograd43_convolution && !weight_winograd43_data.empty())
conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt);
ret = conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt);
else
conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt);
ret = conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
ret = convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
}
else
{
convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
if (ret != 0)
return ret;

bottom_blob_bordered.release();



+ 2
- 2
src/layer/arm/convolution_arm_asimddp.cpp View File

@@ -37,9 +37,9 @@ void convolution_im2col_gemm_transform_kernel_int8_asimddp(const Mat& kernel, Ma
convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt);
}

void convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
int convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
return convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
}

} // namespace ncnn

+ 9
- 4
src/layer/arm/convolution_arm_asimdhp.cpp View File

@@ -372,22 +372,25 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT);
}

int ret = 0;
if (prefer_winograd23)
{
conv3x3s1_winograd23_fp16sa(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data_fp16, _nT, opt);
ret = conv3x3s1_winograd23_fp16sa(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data_fp16, _nT, opt);
}
else if (prefer_winograd43)
{
conv3x3s1_winograd43_fp16sa(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data_fp16, _nT, opt);
ret = conv3x3s1_winograd43_fp16sa(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data_fp16, _nT, opt);
}
else if (prefer_winograd63)
{
conv3x3s1_winograd63_fp16sa(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, _nT, opt);
ret = conv3x3s1_winograd63_fp16sa(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, _nT, opt);
}
else
{
// should never reach here
}
if (ret != 0)
return ret;

if (activation)
{
@@ -471,7 +474,9 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
}

convolution_im2col_gemm_fp16sa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
int ret = convolution_im2col_gemm_fp16sa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
if (ret != 0)
return ret;

if (activation)
{


+ 2
- 2
src/layer/arm/convolution_arm_i8mm.cpp View File

@@ -37,9 +37,9 @@ void convolution_im2col_gemm_transform_kernel_int8_i8mm(const Mat& kernel, Mat&
convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt);
}

void convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
int convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
return convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
}

} // namespace ncnn

+ 9
- 1
src/layer/arm/convolution_im2col_gemm.h View File

@@ -6829,7 +6829,7 @@ static void convolution_im2col_gemm_transform_kernel(const Mat& kernel, Mat& AT,
}
}

static void convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
static int convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
const int maxk = kernel_w * kernel_h;

@@ -6847,6 +6847,8 @@ static void convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

@@ -6870,7 +6872,11 @@ static void convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const

Mat topT_tileX;
if (K > TILE_K)
{
topT_tileX.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT_tileX.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -6901,4 +6907,6 @@ static void convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const
}
}
}

return 0;
}

+ 9
- 1
src/layer/arm/convolution_im2col_gemm_bf16s.h View File

@@ -6010,7 +6010,7 @@ static void convolution_im2col_gemm_transform_kernel_bf16s(const Mat& kernel, Ma
}
}

static void convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
static int convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
const int maxk = kernel_w * kernel_h;

@@ -6028,6 +6028,8 @@ static void convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob,
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

@@ -6051,7 +6053,11 @@ static void convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob,

Mat topT_tileX;
if (K > TILE_K)
{
topT_tileX.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT_tileX.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -6082,4 +6088,6 @@ static void convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob,
}
}
}

return 0;
}

+ 9
- 1
src/layer/arm/convolution_im2col_gemm_fp16s.h View File

@@ -3145,7 +3145,7 @@ static void convolution_im2col_gemm_transform_kernel_fp16sa(const Mat& kernel, M
}
}

static void convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
static int convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
// NCNN_LOGE("convolution_im2col_gemm_fp16sa %p %p %p %p", bottom_blob.data, top_blob.data, AT.data, bias.data);
const int maxk = kernel_w * kernel_h;
@@ -3164,6 +3164,8 @@ static void convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

@@ -3187,7 +3189,11 @@ static void convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob

Mat topT_tileX;
if (K > TILE_K)
{
topT_tileX.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator);
if (topT_tileX.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -3218,4 +3224,6 @@ static void convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob
}
}
}

return 0;
}

+ 13
- 7
src/layer/arm/convolution_im2col_gemm_int8.h View File

@@ -14,12 +14,12 @@

#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
void convolution_im2col_gemm_transform_kernel_int8_i8mm(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt);
void convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt);
int convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
void convolution_im2col_gemm_transform_kernel_int8_asimddp(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt);
void convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt);
int convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt);
#endif

static void convolution_im2col_pack_A_tile_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
@@ -10944,21 +10944,19 @@ static void convolution_im2col_gemm_transform_kernel_int8(const Mat& kernel, Mat
}
}

static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
static int convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
if (ncnn::cpu_support_arm_i8mm())
{
convolution_im2col_gemm_int8_i8mm(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
return;
return convolution_im2col_gemm_int8_i8mm(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
if (ncnn::cpu_support_arm_asimddp())
{
convolution_im2col_gemm_int8_asimddp(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
return;
return convolution_im2col_gemm_int8_asimddp(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
}
#endif

@@ -10978,6 +10976,8 @@ static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob,
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 1u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

@@ -11001,7 +11001,11 @@ static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob,

Mat topT_tileX;
if (K > TILE_K)
{
topT_tileX.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT_tileX.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -11032,4 +11036,6 @@ static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob,
}
}
}

return 0;
}

+ 23
- 3
src/layer/arm/convolutiondepthwise_arm.cpp View File

@@ -539,6 +539,8 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
Option opt_p = opt;
opt_p.blob_allocator = opt.workspace_allocator;
convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p);
if (bottom_blob_bordered_unpacked.empty())
return -100;
}

Mat top_blob_unpacked = top_blob;
@@ -560,13 +562,17 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
opt_g.blob_allocator = top_blob_unpacked.allocator;

// forward
op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
if (ret != 0)
return ret;
}

// packing
if (out_g_elempack == 1 && out_elempack == 4)
{
convert_packing(top_blob_unpacked, top_blob, 4, opt);
if (top_blob.empty())
return -100;
}
else
{
@@ -959,6 +965,8 @@ int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blo
Option opt_p = opt;
opt_p.blob_allocator = opt.workspace_allocator;
convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p);
if (bottom_blob_bordered_unpacked.empty())
return -100;
}

Mat top_blob_unpacked = top_blob;
@@ -980,13 +988,17 @@ int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blo
opt_g.blob_allocator = top_blob_unpacked.allocator;

// forward
op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
if (ret != 0)
return ret;
}

// packing
if (out_g_elempack == 1 && out_elempack == 4)
{
convert_packing(top_blob_unpacked, top_blob, 4, opt);
if (top_blob.empty())
return -100;
}
else
{
@@ -1073,6 +1085,8 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_
Option opt_q = opt;
opt_q.blob_allocator = opt.workspace_allocator;
quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q);
if (bottom_blob_int8.empty())
return -100;
}

Mat bottom_blob_bordered;
@@ -1537,6 +1551,8 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_
Option opt_p = opt;
opt_p.blob_allocator = opt.workspace_allocator;
convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
if (bottom_blob_bordered_unpacked.empty())
return -100;
}

Mat top_blob_unpacked = top_blob;
@@ -1558,13 +1574,17 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_
opt_g.blob_allocator = top_blob_unpacked.allocator;

// forward
op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
if (ret != 0)
return ret;
}

// packing
if (out_g_elempack < out_elempack)
{
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
if (top_blob.empty())
return -100;
}
else
{


+ 20
- 8
src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp View File

@@ -260,17 +260,19 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo

// unpacking
Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
if (elempack == 4 && g_elempack == 1)
if (elempack > g_elempack)
{
Option opt_p = opt;
opt_p.blob_allocator = opt.workspace_allocator;
convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p);
convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
if (bottom_blob_bordered_unpacked.empty())
return -100;
}

Mat top_blob_unpacked = top_blob;
if (out_g_elempack == 1 && out_elempack == 4)
if (out_g_elempack < out_elempack)
{
top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
if (top_blob_unpacked.empty())
return -100;
}
@@ -286,13 +288,17 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo
opt_g.blob_allocator = top_blob_unpacked.allocator;

// forward
op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
if (ret != 0)
return ret;
}

// packing
if (out_g_elempack == 1 && out_elempack == 4)
if (out_g_elempack < out_elempack)
{
convert_packing(top_blob_unpacked, top_blob, 4, opt);
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
if (top_blob.empty())
return -100;
}
else
{
@@ -603,6 +609,8 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
Option opt_p = opt;
opt_p.blob_allocator = opt.workspace_allocator;
convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
if (bottom_blob_bordered_unpacked.empty())
return -100;
}

Mat top_blob_unpacked = top_blob;
@@ -624,13 +632,17 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
opt_g.blob_allocator = top_blob_unpacked.allocator;

// forward
op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
if (ret != 0)
return ret;
}

// packing
if (out_g_elempack < out_elempack)
{
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
if (top_blob.empty())
return -100;
}
else
{


+ 6
- 0
src/layer/arm/crop_arm.cpp View File

@@ -443,6 +443,8 @@ int Crop_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
opt_pack1.blob_allocator = opt.workspace_allocator;

convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
if (bottom_blob_unpacked.empty())
return -100;
}

return Crop::forward(bottom_blob_unpacked, top_blob, opt);
@@ -780,6 +782,8 @@ int Crop_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
opt_pack1.blob_allocator = opt.workspace_allocator;

convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
if (bottom_blob_unpacked.empty())
return -100;
}

Mat reference_blob_unpacked = reference_blob;
@@ -789,6 +793,8 @@ int Crop_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
opt_pack1.blob_allocator = opt.workspace_allocator;

convert_packing(reference_blob, reference_blob_unpacked, 1, opt_pack1);
if (reference_blob_unpacked.empty())
return -100;
}

std::vector<Mat> bottom_blobs_unpacked(2);


+ 3
- 1
src/layer/arm/deconvolution_arm.cpp View File

@@ -305,7 +305,9 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
Mat top_col2im;
Option opt_b = opt;
opt_b.blob_allocator = top_blob_bordered.allocator;
gemm->forward(bottom_blob_2, top_col2im, opt_b);
int ret = gemm->forward(bottom_blob_2, top_col2im, opt_b);
if (ret != 0)
return ret;

{
// col2im


+ 3
- 1
src/layer/arm/deconvolution_arm_asimdhp.cpp View File

@@ -556,7 +556,9 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
Mat top_col2im;
Option opt_b = opt;
opt_b.blob_allocator = top_blob_bordered.allocator;
gemm->forward(bottom_blob_2, top_col2im, opt_b);
int ret = gemm->forward(bottom_blob_2, top_col2im, opt_b);
if (ret != 0)
return ret;

{
// col2im


+ 14
- 2
src/layer/arm/deconvolutiondepthwise_arm.cpp View File

@@ -418,6 +418,8 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, c
Option opt_p = opt;
opt_p.blob_allocator = opt.workspace_allocator;
convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
if (bottom_blob_unpacked.empty())
return -100;
}

Mat top_blob_bordered_unpacked = top_blob_bordered;
@@ -439,13 +441,17 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, c
opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

// forward
op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
if (ret != 0)
return ret;
}

// packing
if (out_g_elempack == 1 && out_elempack == 4)
{
convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
if (top_blob_bordered.empty())
return -100;
}
else
{
@@ -791,6 +797,8 @@ int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_b
Option opt_p = opt;
opt_p.blob_allocator = opt.workspace_allocator;
convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
if (bottom_blob_unpacked.empty())
return -100;
}

Mat top_blob_bordered_unpacked = top_blob_bordered;
@@ -812,13 +820,17 @@ int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_b
opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

// forward
op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
if (ret != 0)
return ret;
}

// packing
if (out_g_elempack == 1 && out_elempack == 4)
{
convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
if (top_blob_bordered.empty())
return -100;
}
else
{


+ 14
- 2
src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp View File

@@ -329,6 +329,8 @@ int DeconvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_b
Option opt_p = opt;
opt_p.blob_allocator = opt.workspace_allocator;
convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
if (bottom_blob_unpacked.empty())
return -100;
}

Mat top_blob_bordered_unpacked = top_blob_bordered;
@@ -350,13 +352,17 @@ int DeconvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_b
opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

// forward
op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
if (ret != 0)
return ret;
}

// packing
if (out_g_elempack == 1 && out_elempack == 4)
{
convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
if (top_blob_bordered.empty())
return -100;
}
else
{
@@ -622,6 +628,8 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_
Option opt_p = opt;
opt_p.blob_allocator = opt.workspace_allocator;
convert_packing(bottom_blob, bottom_blob_unpacked, g_elempack, opt_p);
if (bottom_blob_unpacked.empty())
return -100;
}

Mat top_blob_bordered_unpacked = top_blob_bordered;
@@ -643,13 +651,17 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_
opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

// forward
op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
if (ret != 0)
return ret;
}

// packing
if (out_g_elempack < out_elempack)
{
convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt);
if (top_blob_bordered.empty())
return -100;
}
else
{


+ 48
- 0
src/layer/arm/gemm_arm.cpp View File

@@ -3801,7 +3801,11 @@ static int gemm_arm(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int
int nn_K = (K + TILE_K - 1) / TILE_K;

Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 4u, opt.workspace_allocator);
if (ATX.empty())
return -100;
Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

@@ -3832,7 +3836,11 @@ static int gemm_arm(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)
@@ -3913,6 +3921,8 @@ static int gemm_AT_arm(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob,
int nn_K = (K + TILE_K - 1) / TILE_K;

Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

@@ -3943,7 +3953,11 @@ static int gemm_AT_arm(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob,

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)
@@ -4007,10 +4021,16 @@ static int gemm_BT_arm(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob,
// int nn_N = (N + TILE_N - 1) / TILE_N;

Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 4u, opt.workspace_allocator);
if (ATX.empty())
return -100;

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)
@@ -4089,7 +4109,11 @@ static int gemm_AT_BT_arm(const Mat& AT, const Mat& BT, const Mat& C, Mat& top_b

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)
@@ -4512,7 +4536,11 @@ static int gemm_arm_bf16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo
int nn_K = (K + TILE_K - 1) / TILE_K;

Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator);
if (ATX.empty())
return -100;
Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

@@ -4543,7 +4571,11 @@ static int gemm_arm_bf16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)
@@ -4625,6 +4657,8 @@ static int gemm_AT_arm_bf16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top
int nn_K = (K + TILE_K - 1) / TILE_K;

Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

@@ -4655,7 +4689,11 @@ static int gemm_AT_arm_bf16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)
@@ -4720,10 +4758,16 @@ static int gemm_BT_arm_bf16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top
// int nn_N = (N + TILE_N - 1) / TILE_N;

Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator);
if (ATX.empty())
return -100;

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)
@@ -4803,7 +4847,11 @@ static int gemm_AT_BT_arm_bf16s(const Mat& AT, const Mat& BT, const Mat& C, Mat&

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)


+ 26
- 0
src/layer/arm/gemm_arm_asimdhp.cpp View File

@@ -2360,7 +2360,11 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl
int nn_K = (K + TILE_K - 1) / TILE_K;

Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator);
if (ATX.empty())
return -100;
Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

@@ -2391,7 +2395,11 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)
@@ -2472,6 +2480,8 @@ static int gemm_AT_arm_fp16sa(const Mat& AT, const Mat& B, const Mat& C, Mat& to
int nn_K = (K + TILE_K - 1) / TILE_K;

Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

@@ -2502,7 +2512,11 @@ static int gemm_AT_arm_fp16sa(const Mat& AT, const Mat& B, const Mat& C, Mat& to

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)
@@ -2566,10 +2580,16 @@ static int gemm_BT_arm_fp16sa(const Mat& A, const Mat& BT, const Mat& C, Mat& to
// int nn_N = (N + TILE_N - 1) / TILE_N;

Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator);
if (ATX.empty())
return -100;

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)
@@ -2648,7 +2668,11 @@ static int gemm_AT_BT_arm_fp16sa(const Mat& AT, const Mat& BT, const Mat& C, Mat

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)
@@ -2910,6 +2934,8 @@ int Gemm_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<M
{
Mat CT_data;
CT_data.create_like(C, opt.workspace_allocator);
if (CT_data.empty())
return -100;

const int size = C.total() * C.elempack;
const __fp16* ptr = C;


+ 24
- 0
src/layer/arm/gemm_arm_vfpv4.cpp View File

@@ -47,7 +47,11 @@ static int gemm_arm_fp16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo
int nn_K = (K + TILE_K - 1) / TILE_K;

Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator);
if (ATX.empty())
return -100;
Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

@@ -78,7 +82,11 @@ static int gemm_arm_fp16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)
@@ -160,6 +168,8 @@ static int gemm_AT_arm_fp16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top
int nn_K = (K + TILE_K - 1) / TILE_K;

Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

@@ -190,7 +200,11 @@ static int gemm_AT_arm_fp16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)
@@ -255,10 +269,16 @@ static int gemm_BT_arm_fp16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top
// int nn_N = (N + TILE_N - 1) / TILE_N;

Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator);
if (ATX.empty())
return -100;

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)
@@ -338,7 +358,11 @@ static int gemm_AT_BT_arm_fp16s(const Mat& AT, const Mat& BT, const Mat& C, Mat&

Mat topT;
if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
{
topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (topT.empty())
return -100;
}

#pragma omp parallel for num_threads(nT)
for (int ppi = 0; ppi < nn_M; ppi++)


+ 10
- 0
src/layer/arm/innerproduct_arm.cpp View File

@@ -415,6 +415,8 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
opt_flatten.blob_allocator = opt.workspace_allocator;

flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
if (bottom_blob_flattened.empty())
return -100;
}

size_t elemsize = bottom_blob_flattened.elemsize;
@@ -1064,6 +1066,8 @@ int InnerProduct_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const
opt_flatten.blob_allocator = opt.workspace_allocator;

flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
if (bottom_blob_flattened.empty())
return -100;
}

size_t elemsize = bottom_blob_flattened.elemsize;
@@ -1278,6 +1282,8 @@ int InnerProduct_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, co
Option opt_q = opt;
opt_q.blob_allocator = opt.workspace_allocator;
quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
if (bottom_blob_int8.empty())
return -100;
}

if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input)
@@ -1287,6 +1293,8 @@ int InnerProduct_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, co
Option opt_unpack = opt;
opt_unpack.blob_allocator = opt.workspace_allocator;
convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack);
if (bottom_blob_int8_unpacked.empty())
return -100;

int h = bottom_blob_int8_unpacked.h;

@@ -1684,6 +1692,8 @@ int InnerProduct_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, co
Option opt_flatten = opt;
opt_flatten.blob_allocator = opt.workspace_allocator;
flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten);
if (bottom_blob_int8_flattened.empty())
return -100;
}

// int elempack = bottom_blob_int8_flattened.elempack;


+ 2
- 0
src/layer/arm/innerproduct_arm_asimdhp.cpp View File

@@ -480,6 +480,8 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
opt_flatten.blob_allocator = opt.workspace_allocator;

flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
if (bottom_blob_flattened.empty())
return -100;
}

size_t elemsize = bottom_blob_flattened.elemsize;


+ 2
- 0
src/layer/arm/innerproduct_arm_vfpv4.cpp View File

@@ -75,6 +75,8 @@ int InnerProduct_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const
opt_flatten.blob_allocator = opt.workspace_allocator;

flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
if (bottom_blob_flattened.empty())
return -100;
}

size_t elemsize = bottom_blob_flattened.elemsize;


+ 40
- 8
src/layer/arm/multiheadattention_arm.cpp View File

@@ -292,9 +292,11 @@ int MultiHeadAttention_arm::forward(const std::vector<Mat>& bottom_blobs, std::v
opt.use_bf16_storage &= support_bf16_storage;

Mat attn_mask_blob_unpacked;
if (attn_mask_blob.elempack != 1)
if (attn_mask && attn_mask_blob.elempack != 1)
{
convert_packing(attn_mask_blob, attn_mask_blob_unpacked, 1, opt);
if (attn_mask_blob_unpacked.empty())
return -100;
}
else
{
@@ -310,12 +312,21 @@ int MultiHeadAttention_arm::forward(const std::vector<Mat>& bottom_blobs, std::v
size_t elemsize = q_blob.elemsize / q_blob.elempack;

Mat q_affine;
q_gemm->forward(q_blob, q_affine, opt);
int retq = q_gemm->forward(q_blob, q_affine, opt);
if (retq != 0)
return retq;

Mat k_affine;
k_gemm->forward(k_blob, k_affine, opt);
int retk = k_gemm->forward(k_blob, k_affine, opt);
if (retk != 0)
return retk;

Mat qk_cross(dst_seqlen, src_seqlen * num_heads, elemsize, opt.blob_allocator);
if (qk_cross.empty())
return -100;

std::vector<int> retqks;
retqks.resize(num_heads);
#pragma omp parallel for num_threads(opt.num_threads)
for (int i = 0; i < num_heads; i++)
{
@@ -331,18 +342,32 @@ int MultiHeadAttention_arm::forward(const std::vector<Mat>& bottom_blobs, std::v
qk_top_blobs[0] = qk_cross.row_range(i * src_seqlen, src_seqlen);
Option opt1 = opt;
opt1.num_threads = 1;
qk_gemm->forward(qk_bottom_blobs, qk_top_blobs, opt1);
retqks[i] = qk_gemm->forward(qk_bottom_blobs, qk_top_blobs, opt1);
}
for (int i = 0; i < num_heads; i++)
{
if (retqks[i] != 0)
return retqks[i];
}

q_affine.release();
k_affine.release();

qk_softmax->forward_inplace(qk_cross, opt);
int retqk = qk_softmax->forward_inplace(qk_cross, opt);
if (retqk != 0)
return retqk;

Mat v_affine;
v_gemm->forward(v_blob, v_affine, opt);
int retv = v_gemm->forward(v_blob, v_affine, opt);
if (retv != 0)
return retv;

Mat qkv_cross(src_seqlen, embed_dim_per_head * num_heads, elemsize, opt.blob_allocator);
if (qkv_cross.empty())
return -100;

std::vector<int> retqkvs;
retqkvs.resize(num_heads);
#pragma omp parallel for num_threads(opt.num_threads)
for (int i = 0; i < num_heads; i++)
{
@@ -353,12 +378,19 @@ int MultiHeadAttention_arm::forward(const std::vector<Mat>& bottom_blobs, std::v
qkv_top_blobs[0] = qkv_cross.row_range(i * embed_dim_per_head, embed_dim_per_head);
Option opt1 = opt;
opt1.num_threads = 1;
qkv_gemm->forward(qkv_bottom_blobs, qkv_top_blobs, opt1);
retqkvs[i] = qkv_gemm->forward(qkv_bottom_blobs, qkv_top_blobs, opt1);
}
for (int i = 0; i < num_heads; i++)
{
if (retqkvs[i] != 0)
return retqkvs[i];
}

v_affine.release();

o_gemm->forward(qkv_cross, top_blobs[0], opt);
int reto = o_gemm->forward(qkv_cross, top_blobs[0], opt);
if (reto != 0)
return reto;

return 0;
}


+ 6
- 0
src/layer/arm/padding_arm.cpp View File

@@ -238,6 +238,8 @@ int Padding_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
opt_pack1.blob_allocator = opt.workspace_allocator;

convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
if (bottom_blob_unpacked.empty())
return -100;
}

return Padding::forward(bottom_blob_unpacked, top_blob, opt);
@@ -616,6 +618,8 @@ int Padding_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
opt_pack1.blob_allocator = opt.workspace_allocator;

convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
if (bottom_blob_unpacked.empty())
return -100;
}

return Padding::forward(bottom_blob_unpacked, top_blob, opt);
@@ -770,6 +774,8 @@ int Padding_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio
opt_pack1.blob_allocator = opt.workspace_allocator;

convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
if (bottom_blob_unpacked.empty())
return -100;
}

return Padding::forward(bottom_blob_unpacked, top_blob, opt);


+ 11
- 1
src/layer/arm/shufflechannel_arm.cpp View File

@@ -1,4 +1,4 @@
// Tencent is pleased to support the open source community by making ncnn available.
// // Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
@@ -143,6 +143,8 @@ int ShuffleChannel_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opt

Mat bottom_blob_unpacked;
convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack);
if (bottom_blob_unpacked.empty())
return -100;

Mat top_blob_unpacked;
int ret = ShuffleChannel::forward(bottom_blob_unpacked, top_blob_unpacked, opt_pack);
@@ -389,6 +391,8 @@ int ShuffleChannel_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blo

Mat bottom_blob_unpacked;
convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack);
if (bottom_blob_unpacked.empty())
return -100;

Mat top_blob_unpacked;
int ret = ShuffleChannel::forward(bottom_blob_unpacked, top_blob_unpacked, opt_pack);
@@ -396,6 +400,8 @@ int ShuffleChannel_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blo
return ret;

convert_packing(top_blob_unpacked, top_blob, elempack, opt);
if (top_blob.empty())
return -100;

return 0;
}
@@ -618,6 +624,8 @@ int ShuffleChannel_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blo

Mat bottom_blob_unpacked;
convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack);
if (bottom_blob_unpacked.empty())
return -100;

Mat top_blob_unpacked;
int ret = ShuffleChannel::forward(bottom_blob_unpacked, top_blob_unpacked, opt_pack);
@@ -625,6 +633,8 @@ int ShuffleChannel_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blo
return ret;

convert_packing(top_blob_unpacked, top_blob, elempack, opt);
if (top_blob.empty())
return -100;

return 0;
}


+ 8
- 0
src/layer/arm/slice_arm.cpp View File

@@ -167,6 +167,8 @@ int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
if (elempack > out_elempack)
{
convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt);
if (bottom_blob_unpacked.empty())
return -100;
}

const float* ptr = bottom_blob_unpacked;
@@ -331,6 +333,8 @@ int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
if (elempack > out_elempack)
{
convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt);
if (bottom_blob_unpacked.empty())
return -100;
}

int p = 0;
@@ -704,6 +708,8 @@ int Slice_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::ve
if (elempack > out_elempack)
{
convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt);
if (bottom_blob_unpacked.empty())
return -100;
}

const unsigned short* ptr = bottom_blob_unpacked;
@@ -934,6 +940,8 @@ int Slice_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::ve
if (elempack > out_elempack)
{
convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt);
if (bottom_blob_unpacked.empty())
return -100;
}

int p = 0;


Loading…
Cancel
Save