From debc33fee27acc44f2691897e2677ff2d87bbb39 Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 7 Jun 2024 11:45:52 +0800 Subject: [PATCH] arm handle allocation failures (#5490) --- src/layer/arm/concat_arm.cpp | 8 ++++ src/layer/arm/convolution_3x3_winograd.h | 36 ++++++++++++-- .../arm/convolution_3x3_winograd_bf16s.h | 36 ++++++++++++-- .../arm/convolution_3x3_winograd_fp16s.h | 36 ++++++++++++-- src/layer/arm/convolution_3x3_winograd_int8.h | 24 +++++++++- src/layer/arm/convolution_arm.cpp | 37 +++++++++----- src/layer/arm/convolution_arm_asimddp.cpp | 4 +- src/layer/arm/convolution_arm_asimdhp.cpp | 13 +++-- src/layer/arm/convolution_arm_i8mm.cpp | 4 +- src/layer/arm/convolution_im2col_gemm.h | 10 +++- src/layer/arm/convolution_im2col_gemm_bf16s.h | 10 +++- src/layer/arm/convolution_im2col_gemm_fp16s.h | 10 +++- src/layer/arm/convolution_im2col_gemm_int8.h | 20 +++++--- src/layer/arm/convolutiondepthwise_arm.cpp | 26 ++++++++-- .../arm/convolutiondepthwise_arm_asimdhp.cpp | 28 +++++++---- src/layer/arm/crop_arm.cpp | 6 +++ src/layer/arm/deconvolution_arm.cpp | 4 +- src/layer/arm/deconvolution_arm_asimdhp.cpp | 4 +- src/layer/arm/deconvolutiondepthwise_arm.cpp | 16 ++++++- .../deconvolutiondepthwise_arm_asimdhp.cpp | 16 ++++++- src/layer/arm/gemm_arm.cpp | 48 +++++++++++++++++++ src/layer/arm/gemm_arm_asimdhp.cpp | 26 ++++++++++ src/layer/arm/gemm_arm_vfpv4.cpp | 24 ++++++++++ src/layer/arm/innerproduct_arm.cpp | 10 ++++ src/layer/arm/innerproduct_arm_asimdhp.cpp | 2 + src/layer/arm/innerproduct_arm_vfpv4.cpp | 2 + src/layer/arm/multiheadattention_arm.cpp | 48 +++++++++++++++---- src/layer/arm/padding_arm.cpp | 6 +++ src/layer/arm/shufflechannel_arm.cpp | 12 ++++- src/layer/arm/slice_arm.cpp | 8 ++++ 30 files changed, 468 insertions(+), 66 deletions(-) diff --git a/src/layer/arm/concat_arm.cpp b/src/layer/arm/concat_arm.cpp index 481bc13e4..5028a534c 100644 --- a/src/layer/arm/concat_arm.cpp +++ b/src/layer/arm/concat_arm.cpp @@ -159,6 +159,8 @@ int Concat_arm::forward(const std::vector& bottom_blobs, std::vector& if (elempack < out_elempack) { convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + if (top_blob.empty()) + return -100; } } @@ -284,6 +286,8 @@ int Concat_arm::forward(const std::vector& bottom_blobs, std::vector& if (elempack < out_elempack) { convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + if (top_blob.empty()) + return -100; } } @@ -617,6 +621,8 @@ int Concat_arm::forward_bf16s_fp16s(const std::vector& bottom_blobs, std::v if (elempack < out_elempack) { convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + if (top_blob.empty()) + return -100; } } @@ -816,6 +822,8 @@ int Concat_arm::forward_bf16s_fp16s(const std::vector& bottom_blobs, std::v if (elempack < out_elempack) { convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + if (top_blob.empty()) + return -100; } } diff --git a/src/layer/arm/convolution_3x3_winograd.h b/src/layer/arm/convolution_3x3_winograd.h index 291f6907f..56277f039 100644 --- a/src/layer/arm/convolution_3x3_winograd.h +++ b/src/layer/arm/convolution_3x3_winograd.h @@ -5578,7 +5578,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile(const Mat& top_til } } -static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) +static int conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) { int outw = top_blob.w; int outh = top_blob.h; @@ -5605,12 +5605,16 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; if (nT > 1 && nn_NK < nT) { Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); + if (B_tile.empty()) + return -100; for (int ppjk = 0; ppjk < nn_NK; ppjk++) { @@ -5634,6 +5638,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma else { Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); + if (B_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppjk = 0; ppjk < nn_NK; ppjk++) @@ -5659,6 +5665,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma } Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (top_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -5688,6 +5696,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma conv3x3s1_winograd23_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj); } } + + return 0; } static inline void conv3x3s1_winograd43_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) @@ -7256,7 +7266,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile(const Mat& top_til } } -static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) +static int conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) { int outw = top_blob.w; int outh = top_blob.h; @@ -7283,12 +7293,16 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; if (nT > 1 && nn_NK < nT) { Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); + if (B_tile.empty()) + return -100; for (int ppjk = 0; ppjk < nn_NK; ppjk++) { @@ -7312,6 +7326,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma else { Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); + if (B_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppjk = 0; ppjk < nn_NK; ppjk++) @@ -7337,6 +7353,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma } Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (top_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -7366,6 +7384,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma conv3x3s1_winograd43_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj); } } + + return 0; } static inline void conv3x3s1_winograd63_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) @@ -9292,7 +9312,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile(const Mat& top_til } } -static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) +static int conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) { int outw = top_blob.w; int outh = top_blob.h; @@ -9319,12 +9339,16 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; if (nT > 1 && nn_NK < nT) { Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); + if (B_tile.empty()) + return -100; for (int ppjk = 0; ppjk < nn_NK; ppjk++) { @@ -9348,6 +9372,8 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma else { Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); + if (B_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppjk = 0; ppjk < nn_NK; ppjk++) @@ -9373,6 +9399,8 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma } Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (top_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -9402,4 +9430,6 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma conv3x3s1_winograd63_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj); } } + + return 0; } diff --git a/src/layer/arm/convolution_3x3_winograd_bf16s.h b/src/layer/arm/convolution_3x3_winograd_bf16s.h index 58ddceebd..79debb126 100644 --- a/src/layer/arm/convolution_3x3_winograd_bf16s.h +++ b/src/layer/arm/convolution_3x3_winograd_bf16s.h @@ -920,7 +920,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile_bf16s(const Mat& t } } -static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) +static int conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) { int outw = top_blob.w; int outh = top_blob.h; @@ -947,12 +947,16 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; if (nT > 1 && nn_NK < nT) { Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); + if (B_tile.empty()) + return -100; for (int ppjk = 0; ppjk < nn_NK; ppjk++) { @@ -976,6 +980,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co else { Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); + if (B_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppjk = 0; ppjk < nn_NK; ppjk++) @@ -1001,6 +1007,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co } Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (top_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -1030,6 +1038,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co conv3x3s1_winograd23_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj); } } + + return 0; } static inline void conv3x3s1_winograd43_transform_input_tile_bf16s(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT) @@ -2497,7 +2507,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_bf16s(const Mat& t } } -static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) +static int conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) { int outw = top_blob.w; int outh = top_blob.h; @@ -2524,12 +2534,16 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; if (nT > 1 && nn_NK < nT) { Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); + if (B_tile.empty()) + return -100; for (int ppjk = 0; ppjk < nn_NK; ppjk++) { @@ -2553,6 +2567,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co else { Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); + if (B_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppjk = 0; ppjk < nn_NK; ppjk++) @@ -2578,6 +2594,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co } Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (top_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -2607,6 +2625,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co conv3x3s1_winograd43_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj); } } + + return 0; } static inline void conv3x3s1_winograd63_transform_input_tile_bf16s(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT) @@ -4428,7 +4448,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile_bf16s(const Mat& t } } -static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) +static int conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) { int outw = top_blob.w; int outh = top_blob.h; @@ -4455,12 +4475,16 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; if (nT > 1 && nn_NK < nT) { Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); + if (B_tile.empty()) + return -100; for (int ppjk = 0; ppjk < nn_NK; ppjk++) { @@ -4484,6 +4508,8 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co else { Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); + if (B_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppjk = 0; ppjk < nn_NK; ppjk++) @@ -4509,6 +4535,8 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co } Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (top_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -4538,4 +4566,6 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co conv3x3s1_winograd63_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj); } } + + return 0; } diff --git a/src/layer/arm/convolution_3x3_winograd_fp16s.h b/src/layer/arm/convolution_3x3_winograd_fp16s.h index 7332d61a7..97b1a1dd8 100644 --- a/src/layer/arm/convolution_3x3_winograd_fp16s.h +++ b/src/layer/arm/convolution_3x3_winograd_fp16s.h @@ -2916,7 +2916,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile_fp16sa(const Mat& } } -static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) +static int conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) { int outw = top_blob.w; int outh = top_blob.h; @@ -2943,12 +2943,16 @@ static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, c // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; if (nT > 1 && nn_NK < nT) { Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator); + if (B_tile.empty()) + return -100; for (int ppjk = 0; ppjk < nn_NK; ppjk++) { @@ -2972,6 +2976,8 @@ static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, c else { Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator); + if (B_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppjk = 0; ppjk < nn_NK; ppjk++) @@ -2997,6 +3003,8 @@ static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, c } Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 2u, opt.workspace_allocator); + if (top_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -3026,6 +3034,8 @@ static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, c conv3x3s1_winograd23_transform_output_tile_fp16sa(top_tile, top_blob, bias, i, max_ii, j, max_jj); } } + + return 0; } static inline void conv3x3s1_winograd43_transform_kernel_tile_fp16sa(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) @@ -4262,7 +4272,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_fp16sa(const Mat& } } -static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) +static int conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) { int outw = top_blob.w; int outh = top_blob.h; @@ -4289,12 +4299,16 @@ static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, c // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; if (nT > 1 && nn_NK < nT) { Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator); + if (B_tile.empty()) + return -100; for (int ppjk = 0; ppjk < nn_NK; ppjk++) { @@ -4318,6 +4332,8 @@ static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, c else { Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator); + if (B_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppjk = 0; ppjk < nn_NK; ppjk++) @@ -4343,6 +4359,8 @@ static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, c } Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 2u, opt.workspace_allocator); + if (top_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -4372,6 +4390,8 @@ static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, c conv3x3s1_winograd43_transform_output_tile_fp16sa(top_tile, top_blob, bias, i, max_ii, j, max_jj); } } + + return 0; } static inline void conv3x3s1_winograd63_transform_kernel_tile_fp16sa(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) @@ -5873,7 +5893,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile_fp16sa(const Mat& } } -static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) +static int conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) { int outw = top_blob.w; int outh = top_blob.h; @@ -5900,12 +5920,16 @@ static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, c // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; if (nT > 1 && nn_NK < nT) { Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator); + if (B_tile.empty()) + return -100; for (int ppjk = 0; ppjk < nn_NK; ppjk++) { @@ -5929,6 +5953,8 @@ static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, c else { Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator); + if (B_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppjk = 0; ppjk < nn_NK; ppjk++) @@ -5954,6 +5980,8 @@ static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, c } Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 2u, opt.workspace_allocator); + if (top_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -5983,4 +6011,6 @@ static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, c conv3x3s1_winograd63_transform_output_tile_fp16sa(top_tile, top_blob, bias, i, max_ii, j, max_jj); } } + + return 0; } diff --git a/src/layer/arm/convolution_3x3_winograd_int8.h b/src/layer/arm/convolution_3x3_winograd_int8.h index ab108b3f0..ec533b131 100644 --- a/src/layer/arm/convolution_3x3_winograd_int8.h +++ b/src/layer/arm/convolution_3x3_winograd_int8.h @@ -4257,7 +4257,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile_int8(const Mat& to } } -static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) +static int conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) { int outw = top_blob.w; int outh = top_blob.h; @@ -4284,12 +4284,16 @@ static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; if (nT > 1 && nn_NK < nT) { Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator); + if (B_tile.empty()) + return -100; for (int ppjk = 0; ppjk < nn_NK; ppjk++) { @@ -4313,6 +4317,8 @@ static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat else { Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator); + if (B_tileX.empty()) + return -100; // #pragma omp parallel for num_threads(nT) for (int ppjk = 0; ppjk < nn_NK; ppjk++) @@ -4340,6 +4346,8 @@ static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat bottom_blob.release(); Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (top_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -4369,6 +4377,8 @@ static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat conv3x3s1_winograd23_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj); } } + + return 0; } static inline void conv3x3s1_winograd43_transform_kernel_tile_int8(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) @@ -5604,7 +5614,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_int8(const Mat& to } } -static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) +static int conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) { int outw = top_blob.w; int outh = top_blob.h; @@ -5631,12 +5641,16 @@ static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; if (nT > 1 && nn_NK < nT) { Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator); + if (B_tile.empty()) + return -100; for (int ppjk = 0; ppjk < nn_NK; ppjk++) { @@ -5660,6 +5674,8 @@ static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat else { Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator); + if (B_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppjk = 0; ppjk < nn_NK; ppjk++) @@ -5687,6 +5703,8 @@ static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat bottom_blob.release(); Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (top_tileX.empty()) + return -100; #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -5716,4 +5734,6 @@ static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat conv3x3s1_winograd43_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj); } } + + return 0; } diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp index 4198eeeb7..16624574b 100644 --- a/src/layer/arm/convolution_arm.cpp +++ b/src/layer/arm/convolution_arm.cpp @@ -488,22 +488,25 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT); } + int ret = 0; if (prefer_winograd23) { - conv3x3s1_winograd23(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt); + ret = conv3x3s1_winograd23(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt); } else if (prefer_winograd43) { - conv3x3s1_winograd43(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt); + ret = conv3x3s1_winograd43(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt); } else if (prefer_winograd63) { - conv3x3s1_winograd63(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt); + ret = conv3x3s1_winograd63(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt); } else { // should never reach here } + if (ret != 0) + return ret; if (activation) { @@ -559,7 +562,9 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); } - convolution_im2col_gemm(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); + int ret = convolution_im2col_gemm(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); + if (ret != 0) + return ret; if (activation) { @@ -1072,22 +1077,25 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT); } + int ret = 0; if (prefer_winograd23) { - conv3x3s1_winograd23_bf16s(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt); + ret = conv3x3s1_winograd23_bf16s(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt); } else if (prefer_winograd43) { - conv3x3s1_winograd43_bf16s(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt); + ret = conv3x3s1_winograd43_bf16s(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt); } else if (prefer_winograd63) { - conv3x3s1_winograd63_bf16s(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt); + ret = conv3x3s1_winograd63_bf16s(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt); } else { // should never reach here } + if (ret != 0) + return ret; if (activation) { @@ -1143,7 +1151,9 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); } - convolution_im2col_gemm_bf16s(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); + int ret = convolution_im2col_gemm_bf16s(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); + if (ret != 0) + return ret; if (activation) { @@ -1307,6 +1317,8 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con Option opt_q = opt; opt_q.blob_allocator = opt.workspace_allocator; quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + if (bottom_blob_int8.empty()) + return -100; } // NCNN_LOGE("Convolution_arm input %d x %d ksize=%d %d stride=%d %d", w, h, kernel_w, kernel_h, stride_w, stride_h); @@ -1381,21 +1393,24 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); } + int ret = 0; if (opt.use_winograd_convolution && prefer_winograd) { if (opt.use_winograd43_convolution && !weight_winograd43_data.empty()) - conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt); + ret = conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt); else - conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt); + ret = conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt); } else if (opt.use_sgemm_convolution) { - convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); + ret = convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); } else { convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); } + if (ret != 0) + return ret; bottom_blob_bordered.release(); diff --git a/src/layer/arm/convolution_arm_asimddp.cpp b/src/layer/arm/convolution_arm_asimddp.cpp index e2f36fdd5..d600e1b06 100644 --- a/src/layer/arm/convolution_arm_asimddp.cpp +++ b/src/layer/arm/convolution_arm_asimddp.cpp @@ -37,9 +37,9 @@ void convolution_im2col_gemm_transform_kernel_int8_asimddp(const Mat& kernel, Ma convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt); } -void convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) +int convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) { - convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); + return convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); } } // namespace ncnn diff --git a/src/layer/arm/convolution_arm_asimdhp.cpp b/src/layer/arm/convolution_arm_asimdhp.cpp index b1a98ea22..c7762020e 100644 --- a/src/layer/arm/convolution_arm_asimdhp.cpp +++ b/src/layer/arm/convolution_arm_asimdhp.cpp @@ -372,22 +372,25 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT); } + int ret = 0; if (prefer_winograd23) { - conv3x3s1_winograd23_fp16sa(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data_fp16, _nT, opt); + ret = conv3x3s1_winograd23_fp16sa(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data_fp16, _nT, opt); } else if (prefer_winograd43) { - conv3x3s1_winograd43_fp16sa(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data_fp16, _nT, opt); + ret = conv3x3s1_winograd43_fp16sa(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data_fp16, _nT, opt); } else if (prefer_winograd63) { - conv3x3s1_winograd63_fp16sa(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, _nT, opt); + ret = conv3x3s1_winograd63_fp16sa(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, _nT, opt); } else { // should never reach here } + if (ret != 0) + return ret; if (activation) { @@ -471,7 +474,9 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); } - convolution_im2col_gemm_fp16sa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); + int ret = convolution_im2col_gemm_fp16sa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); + if (ret != 0) + return ret; if (activation) { diff --git a/src/layer/arm/convolution_arm_i8mm.cpp b/src/layer/arm/convolution_arm_i8mm.cpp index a814261f9..0bfe3c65c 100644 --- a/src/layer/arm/convolution_arm_i8mm.cpp +++ b/src/layer/arm/convolution_arm_i8mm.cpp @@ -37,9 +37,9 @@ void convolution_im2col_gemm_transform_kernel_int8_i8mm(const Mat& kernel, Mat& convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt); } -void convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) +int convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) { - convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); + return convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); } } // namespace ncnn diff --git a/src/layer/arm/convolution_im2col_gemm.h b/src/layer/arm/convolution_im2col_gemm.h index 45651ddae..af501efa2 100644 --- a/src/layer/arm/convolution_im2col_gemm.h +++ b/src/layer/arm/convolution_im2col_gemm.h @@ -6829,7 +6829,7 @@ static void convolution_im2col_gemm_transform_kernel(const Mat& kernel, Mat& AT, } } -static void convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) +static int convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) { const int maxk = kernel_w * kernel_h; @@ -6847,6 +6847,8 @@ static void convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; @@ -6870,7 +6872,11 @@ static void convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const Mat topT_tileX; if (K > TILE_K) + { topT_tileX.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT_tileX.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -6901,4 +6907,6 @@ static void convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const } } } + + return 0; } diff --git a/src/layer/arm/convolution_im2col_gemm_bf16s.h b/src/layer/arm/convolution_im2col_gemm_bf16s.h index f29420e97..82319d058 100644 --- a/src/layer/arm/convolution_im2col_gemm_bf16s.h +++ b/src/layer/arm/convolution_im2col_gemm_bf16s.h @@ -6010,7 +6010,7 @@ static void convolution_im2col_gemm_transform_kernel_bf16s(const Mat& kernel, Ma } } -static void convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) +static int convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) { const int maxk = kernel_w * kernel_h; @@ -6028,6 +6028,8 @@ static void convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob, // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; @@ -6051,7 +6053,11 @@ static void convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob, Mat topT_tileX; if (K > TILE_K) + { topT_tileX.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT_tileX.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -6082,4 +6088,6 @@ static void convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob, } } } + + return 0; } diff --git a/src/layer/arm/convolution_im2col_gemm_fp16s.h b/src/layer/arm/convolution_im2col_gemm_fp16s.h index a4cc82d70..8e499b3e7 100644 --- a/src/layer/arm/convolution_im2col_gemm_fp16s.h +++ b/src/layer/arm/convolution_im2col_gemm_fp16s.h @@ -3145,7 +3145,7 @@ static void convolution_im2col_gemm_transform_kernel_fp16sa(const Mat& kernel, M } } -static void convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) +static int convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) { // NCNN_LOGE("convolution_im2col_gemm_fp16sa %p %p %p %p", bottom_blob.data, top_blob.data, AT.data, bias.data); const int maxk = kernel_w * kernel_h; @@ -3164,6 +3164,8 @@ static void convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; @@ -3187,7 +3189,11 @@ static void convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob Mat topT_tileX; if (K > TILE_K) + { topT_tileX.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator); + if (topT_tileX.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -3218,4 +3224,6 @@ static void convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob } } } + + return 0; } diff --git a/src/layer/arm/convolution_im2col_gemm_int8.h b/src/layer/arm/convolution_im2col_gemm_int8.h index 171809fb3..72e95d95e 100644 --- a/src/layer/arm/convolution_im2col_gemm_int8.h +++ b/src/layer/arm/convolution_im2col_gemm_int8.h @@ -14,12 +14,12 @@ #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 void convolution_im2col_gemm_transform_kernel_int8_i8mm(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt); -void convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt); +int convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt); #endif #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8 void convolution_im2col_gemm_transform_kernel_int8_asimddp(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt); -void convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt); +int convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt); #endif static void convolution_im2col_pack_A_tile_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk) @@ -10944,21 +10944,19 @@ static void convolution_im2col_gemm_transform_kernel_int8(const Mat& kernel, Mat } } -static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) +static int convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) { #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 if (ncnn::cpu_support_arm_i8mm()) { - convolution_im2col_gemm_int8_i8mm(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); - return; + return convolution_im2col_gemm_int8_i8mm(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); } #endif #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8 if (ncnn::cpu_support_arm_asimddp()) { - convolution_im2col_gemm_int8_asimddp(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); - return; + return convolution_im2col_gemm_int8_asimddp(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); } #endif @@ -10978,6 +10976,8 @@ static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 1u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; @@ -11001,7 +11001,11 @@ static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, Mat topT_tileX; if (K > TILE_K) + { topT_tileX.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT_tileX.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppj = 0; ppj < nn_M; ppj++) @@ -11032,4 +11036,6 @@ static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, } } } + + return 0; } diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp index aa1a211fe..4aa1bba31 100644 --- a/src/layer/arm/convolutiondepthwise_arm.cpp +++ b/src/layer/arm/convolutiondepthwise_arm.cpp @@ -539,6 +539,8 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con Option opt_p = opt; opt_p.blob_allocator = opt.workspace_allocator; convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p); + if (bottom_blob_bordered_unpacked.empty()) + return -100; } Mat top_blob_unpacked = top_blob; @@ -560,13 +562,17 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con opt_g.blob_allocator = top_blob_unpacked.allocator; // forward - op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + if (ret != 0) + return ret; } // packing if (out_g_elempack == 1 && out_elempack == 4) { convert_packing(top_blob_unpacked, top_blob, 4, opt); + if (top_blob.empty()) + return -100; } else { @@ -959,6 +965,8 @@ int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blo Option opt_p = opt; opt_p.blob_allocator = opt.workspace_allocator; convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p); + if (bottom_blob_bordered_unpacked.empty()) + return -100; } Mat top_blob_unpacked = top_blob; @@ -980,13 +988,17 @@ int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blo opt_g.blob_allocator = top_blob_unpacked.allocator; // forward - op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + if (ret != 0) + return ret; } // packing if (out_g_elempack == 1 && out_elempack == 4) { convert_packing(top_blob_unpacked, top_blob, 4, opt); + if (top_blob.empty()) + return -100; } else { @@ -1073,6 +1085,8 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_ Option opt_q = opt; opt_q.blob_allocator = opt.workspace_allocator; quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q); + if (bottom_blob_int8.empty()) + return -100; } Mat bottom_blob_bordered; @@ -1537,6 +1551,8 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_ Option opt_p = opt; opt_p.blob_allocator = opt.workspace_allocator; convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); + if (bottom_blob_bordered_unpacked.empty()) + return -100; } Mat top_blob_unpacked = top_blob; @@ -1558,13 +1574,17 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_ opt_g.blob_allocator = top_blob_unpacked.allocator; // forward - op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + if (ret != 0) + return ret; } // packing if (out_g_elempack < out_elempack) { convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + if (top_blob.empty()) + return -100; } else { diff --git a/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp b/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp index cfea9f2a0..c21e32e45 100644 --- a/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp +++ b/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp @@ -260,17 +260,19 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo // unpacking Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; - if (elempack == 4 && g_elempack == 1) + if (elempack > g_elempack) { Option opt_p = opt; opt_p.blob_allocator = opt.workspace_allocator; - convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p); + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); + if (bottom_blob_bordered_unpacked.empty()) + return -100; } Mat top_blob_unpacked = top_blob; - if (out_g_elempack == 1 && out_elempack == 4) + if (out_g_elempack < out_elempack) { - top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); + top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); if (top_blob_unpacked.empty()) return -100; } @@ -286,13 +288,17 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo opt_g.blob_allocator = top_blob_unpacked.allocator; // forward - op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + if (ret != 0) + return ret; } // packing - if (out_g_elempack == 1 && out_elempack == 4) + if (out_g_elempack < out_elempack) { - convert_packing(top_blob_unpacked, top_blob, 4, opt); + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + if (top_blob.empty()) + return -100; } else { @@ -603,6 +609,8 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl Option opt_p = opt; opt_p.blob_allocator = opt.workspace_allocator; convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); + if (bottom_blob_bordered_unpacked.empty()) + return -100; } Mat top_blob_unpacked = top_blob; @@ -624,13 +632,17 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl opt_g.blob_allocator = top_blob_unpacked.allocator; // forward - op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + if (ret != 0) + return ret; } // packing if (out_g_elempack < out_elempack) { convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + if (top_blob.empty()) + return -100; } else { diff --git a/src/layer/arm/crop_arm.cpp b/src/layer/arm/crop_arm.cpp index f55fa5458..e6163e4ed 100644 --- a/src/layer/arm/crop_arm.cpp +++ b/src/layer/arm/crop_arm.cpp @@ -443,6 +443,8 @@ int Crop_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) opt_pack1.blob_allocator = opt.workspace_allocator; convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + if (bottom_blob_unpacked.empty()) + return -100; } return Crop::forward(bottom_blob_unpacked, top_blob, opt); @@ -780,6 +782,8 @@ int Crop_arm::forward(const std::vector& bottom_blobs, std::vector& to opt_pack1.blob_allocator = opt.workspace_allocator; convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + if (bottom_blob_unpacked.empty()) + return -100; } Mat reference_blob_unpacked = reference_blob; @@ -789,6 +793,8 @@ int Crop_arm::forward(const std::vector& bottom_blobs, std::vector& to opt_pack1.blob_allocator = opt.workspace_allocator; convert_packing(reference_blob, reference_blob_unpacked, 1, opt_pack1); + if (reference_blob_unpacked.empty()) + return -100; } std::vector bottom_blobs_unpacked(2); diff --git a/src/layer/arm/deconvolution_arm.cpp b/src/layer/arm/deconvolution_arm.cpp index c06532a66..e4d5c1715 100644 --- a/src/layer/arm/deconvolution_arm.cpp +++ b/src/layer/arm/deconvolution_arm.cpp @@ -305,7 +305,9 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opti Mat top_col2im; Option opt_b = opt; opt_b.blob_allocator = top_blob_bordered.allocator; - gemm->forward(bottom_blob_2, top_col2im, opt_b); + int ret = gemm->forward(bottom_blob_2, top_col2im, opt_b); + if (ret != 0) + return ret; { // col2im diff --git a/src/layer/arm/deconvolution_arm_asimdhp.cpp b/src/layer/arm/deconvolution_arm_asimdhp.cpp index 9cb7df463..69bee4d79 100644 --- a/src/layer/arm/deconvolution_arm_asimdhp.cpp +++ b/src/layer/arm/deconvolution_arm_asimdhp.cpp @@ -556,7 +556,9 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con Mat top_col2im; Option opt_b = opt; opt_b.blob_allocator = top_blob_bordered.allocator; - gemm->forward(bottom_blob_2, top_col2im, opt_b); + int ret = gemm->forward(bottom_blob_2, top_col2im, opt_b); + if (ret != 0) + return ret; { // col2im diff --git a/src/layer/arm/deconvolutiondepthwise_arm.cpp b/src/layer/arm/deconvolutiondepthwise_arm.cpp index 133d5158f..7baf4f720 100644 --- a/src/layer/arm/deconvolutiondepthwise_arm.cpp +++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp @@ -418,6 +418,8 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, c Option opt_p = opt; opt_p.blob_allocator = opt.workspace_allocator; convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p); + if (bottom_blob_unpacked.empty()) + return -100; } Mat top_blob_bordered_unpacked = top_blob_bordered; @@ -439,13 +441,17 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, c opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; // forward - op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); + int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); + if (ret != 0) + return ret; } // packing if (out_g_elempack == 1 && out_elempack == 4) { convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt); + if (top_blob_bordered.empty()) + return -100; } else { @@ -791,6 +797,8 @@ int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_b Option opt_p = opt; opt_p.blob_allocator = opt.workspace_allocator; convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p); + if (bottom_blob_unpacked.empty()) + return -100; } Mat top_blob_bordered_unpacked = top_blob_bordered; @@ -812,13 +820,17 @@ int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_b opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; // forward - op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); + int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); + if (ret != 0) + return ret; } // packing if (out_g_elempack == 1 && out_elempack == 4) { convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt); + if (top_blob_bordered.empty()) + return -100; } else { diff --git a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp index 73b428ebf..8e8f274f7 100644 --- a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp +++ b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp @@ -329,6 +329,8 @@ int DeconvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_b Option opt_p = opt; opt_p.blob_allocator = opt.workspace_allocator; convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p); + if (bottom_blob_unpacked.empty()) + return -100; } Mat top_blob_bordered_unpacked = top_blob_bordered; @@ -350,13 +352,17 @@ int DeconvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_b opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; // forward - op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); + int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); + if (ret != 0) + return ret; } // packing if (out_g_elempack == 1 && out_elempack == 4) { convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt); + if (top_blob_bordered.empty()) + return -100; } else { @@ -622,6 +628,8 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_ Option opt_p = opt; opt_p.blob_allocator = opt.workspace_allocator; convert_packing(bottom_blob, bottom_blob_unpacked, g_elempack, opt_p); + if (bottom_blob_unpacked.empty()) + return -100; } Mat top_blob_bordered_unpacked = top_blob_bordered; @@ -643,13 +651,17 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_ opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; // forward - op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); + int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); + if (ret != 0) + return ret; } // packing if (out_g_elempack < out_elempack) { convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt); + if (top_blob_bordered.empty()) + return -100; } else { diff --git a/src/layer/arm/gemm_arm.cpp b/src/layer/arm/gemm_arm.cpp index e88ca1cc6..e798680e2 100644 --- a/src/layer/arm/gemm_arm.cpp +++ b/src/layer/arm/gemm_arm.cpp @@ -3801,7 +3801,11 @@ static int gemm_arm(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int int nn_K = (K + TILE_K - 1) / TILE_K; Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 4u, opt.workspace_allocator); + if (ATX.empty()) + return -100; Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; @@ -3832,7 +3836,11 @@ static int gemm_arm(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) @@ -3913,6 +3921,8 @@ static int gemm_AT_arm(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob, int nn_K = (K + TILE_K - 1) / TILE_K; Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; @@ -3943,7 +3953,11 @@ static int gemm_AT_arm(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob, Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) @@ -4007,10 +4021,16 @@ static int gemm_BT_arm(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob, // int nn_N = (N + TILE_N - 1) / TILE_N; Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 4u, opt.workspace_allocator); + if (ATX.empty()) + return -100; Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) @@ -4089,7 +4109,11 @@ static int gemm_AT_BT_arm(const Mat& AT, const Mat& BT, const Mat& C, Mat& top_b Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) @@ -4512,7 +4536,11 @@ static int gemm_arm_bf16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo int nn_K = (K + TILE_K - 1) / TILE_K; Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator); + if (ATX.empty()) + return -100; Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; @@ -4543,7 +4571,11 @@ static int gemm_arm_bf16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) @@ -4625,6 +4657,8 @@ static int gemm_AT_arm_bf16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top int nn_K = (K + TILE_K - 1) / TILE_K; Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; @@ -4655,7 +4689,11 @@ static int gemm_AT_arm_bf16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) @@ -4720,10 +4758,16 @@ static int gemm_BT_arm_bf16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top // int nn_N = (N + TILE_N - 1) / TILE_N; Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator); + if (ATX.empty()) + return -100; Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) @@ -4803,7 +4847,11 @@ static int gemm_AT_BT_arm_bf16s(const Mat& AT, const Mat& BT, const Mat& C, Mat& Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) diff --git a/src/layer/arm/gemm_arm_asimdhp.cpp b/src/layer/arm/gemm_arm_asimdhp.cpp index f3140cb04..cb0aa87e4 100644 --- a/src/layer/arm/gemm_arm_asimdhp.cpp +++ b/src/layer/arm/gemm_arm_asimdhp.cpp @@ -2360,7 +2360,11 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl int nn_K = (K + TILE_K - 1) / TILE_K; Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator); + if (ATX.empty()) + return -100; Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; @@ -2391,7 +2395,11 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) @@ -2472,6 +2480,8 @@ static int gemm_AT_arm_fp16sa(const Mat& AT, const Mat& B, const Mat& C, Mat& to int nn_K = (K + TILE_K - 1) / TILE_K; Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; @@ -2502,7 +2512,11 @@ static int gemm_AT_arm_fp16sa(const Mat& AT, const Mat& B, const Mat& C, Mat& to Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) @@ -2566,10 +2580,16 @@ static int gemm_BT_arm_fp16sa(const Mat& A, const Mat& BT, const Mat& C, Mat& to // int nn_N = (N + TILE_N - 1) / TILE_N; Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator); + if (ATX.empty()) + return -100; Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) @@ -2648,7 +2668,11 @@ static int gemm_AT_BT_arm_fp16sa(const Mat& AT, const Mat& BT, const Mat& C, Mat Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) @@ -2910,6 +2934,8 @@ int Gemm_arm::forward_fp16sa(const std::vector& bottom_blobs, std::vector TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) @@ -160,6 +168,8 @@ static int gemm_AT_arm_fp16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top int nn_K = (K + TILE_K - 1) / TILE_K; Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + if (BT.empty()) + return -100; const int nn_NK = nn_N * nn_K; @@ -190,7 +200,11 @@ static int gemm_AT_arm_fp16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) @@ -255,10 +269,16 @@ static int gemm_BT_arm_fp16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top // int nn_N = (N + TILE_N - 1) / TILE_N; Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator); + if (ATX.empty()) + return -100; Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) @@ -338,7 +358,11 @@ static int gemm_AT_BT_arm_fp16s(const Mat& AT, const Mat& BT, const Mat& C, Mat& Mat topT; if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + { topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + if (topT.empty()) + return -100; + } #pragma omp parallel for num_threads(nT) for (int ppi = 0; ppi < nn_M; ppi++) diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp index 2d3bafab8..99e93d6fb 100644 --- a/src/layer/arm/innerproduct_arm.cpp +++ b/src/layer/arm/innerproduct_arm.cpp @@ -415,6 +415,8 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio opt_flatten.blob_allocator = opt.workspace_allocator; flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + if (bottom_blob_flattened.empty()) + return -100; } size_t elemsize = bottom_blob_flattened.elemsize; @@ -1064,6 +1066,8 @@ int InnerProduct_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const opt_flatten.blob_allocator = opt.workspace_allocator; flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + if (bottom_blob_flattened.empty()) + return -100; } size_t elemsize = bottom_blob_flattened.elemsize; @@ -1278,6 +1282,8 @@ int InnerProduct_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, co Option opt_q = opt; opt_q.blob_allocator = opt.workspace_allocator; quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + if (bottom_blob_int8.empty()) + return -100; } if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input) @@ -1287,6 +1293,8 @@ int InnerProduct_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, co Option opt_unpack = opt; opt_unpack.blob_allocator = opt.workspace_allocator; convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack); + if (bottom_blob_int8_unpacked.empty()) + return -100; int h = bottom_blob_int8_unpacked.h; @@ -1684,6 +1692,8 @@ int InnerProduct_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, co Option opt_flatten = opt; opt_flatten.blob_allocator = opt.workspace_allocator; flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten); + if (bottom_blob_int8_flattened.empty()) + return -100; } // int elempack = bottom_blob_int8_flattened.elempack; diff --git a/src/layer/arm/innerproduct_arm_asimdhp.cpp b/src/layer/arm/innerproduct_arm_asimdhp.cpp index de475d5ca..e29dea2a3 100644 --- a/src/layer/arm/innerproduct_arm_asimdhp.cpp +++ b/src/layer/arm/innerproduct_arm_asimdhp.cpp @@ -480,6 +480,8 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons opt_flatten.blob_allocator = opt.workspace_allocator; flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + if (bottom_blob_flattened.empty()) + return -100; } size_t elemsize = bottom_blob_flattened.elemsize; diff --git a/src/layer/arm/innerproduct_arm_vfpv4.cpp b/src/layer/arm/innerproduct_arm_vfpv4.cpp index 306d37ad7..063cb00d1 100644 --- a/src/layer/arm/innerproduct_arm_vfpv4.cpp +++ b/src/layer/arm/innerproduct_arm_vfpv4.cpp @@ -75,6 +75,8 @@ int InnerProduct_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const opt_flatten.blob_allocator = opt.workspace_allocator; flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + if (bottom_blob_flattened.empty()) + return -100; } size_t elemsize = bottom_blob_flattened.elemsize; diff --git a/src/layer/arm/multiheadattention_arm.cpp b/src/layer/arm/multiheadattention_arm.cpp index 37323a225..a9493f414 100644 --- a/src/layer/arm/multiheadattention_arm.cpp +++ b/src/layer/arm/multiheadattention_arm.cpp @@ -292,9 +292,11 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v opt.use_bf16_storage &= support_bf16_storage; Mat attn_mask_blob_unpacked; - if (attn_mask_blob.elempack != 1) + if (attn_mask && attn_mask_blob.elempack != 1) { convert_packing(attn_mask_blob, attn_mask_blob_unpacked, 1, opt); + if (attn_mask_blob_unpacked.empty()) + return -100; } else { @@ -310,12 +312,21 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v size_t elemsize = q_blob.elemsize / q_blob.elempack; Mat q_affine; - q_gemm->forward(q_blob, q_affine, opt); + int retq = q_gemm->forward(q_blob, q_affine, opt); + if (retq != 0) + return retq; Mat k_affine; - k_gemm->forward(k_blob, k_affine, opt); + int retk = k_gemm->forward(k_blob, k_affine, opt); + if (retk != 0) + return retk; Mat qk_cross(dst_seqlen, src_seqlen * num_heads, elemsize, opt.blob_allocator); + if (qk_cross.empty()) + return -100; + + std::vector retqks; + retqks.resize(num_heads); #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < num_heads; i++) { @@ -331,18 +342,32 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v qk_top_blobs[0] = qk_cross.row_range(i * src_seqlen, src_seqlen); Option opt1 = opt; opt1.num_threads = 1; - qk_gemm->forward(qk_bottom_blobs, qk_top_blobs, opt1); + retqks[i] = qk_gemm->forward(qk_bottom_blobs, qk_top_blobs, opt1); + } + for (int i = 0; i < num_heads; i++) + { + if (retqks[i] != 0) + return retqks[i]; } q_affine.release(); k_affine.release(); - qk_softmax->forward_inplace(qk_cross, opt); + int retqk = qk_softmax->forward_inplace(qk_cross, opt); + if (retqk != 0) + return retqk; Mat v_affine; - v_gemm->forward(v_blob, v_affine, opt); + int retv = v_gemm->forward(v_blob, v_affine, opt); + if (retv != 0) + return retv; Mat qkv_cross(src_seqlen, embed_dim_per_head * num_heads, elemsize, opt.blob_allocator); + if (qkv_cross.empty()) + return -100; + + std::vector retqkvs; + retqkvs.resize(num_heads); #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < num_heads; i++) { @@ -353,12 +378,19 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v qkv_top_blobs[0] = qkv_cross.row_range(i * embed_dim_per_head, embed_dim_per_head); Option opt1 = opt; opt1.num_threads = 1; - qkv_gemm->forward(qkv_bottom_blobs, qkv_top_blobs, opt1); + retqkvs[i] = qkv_gemm->forward(qkv_bottom_blobs, qkv_top_blobs, opt1); + } + for (int i = 0; i < num_heads; i++) + { + if (retqkvs[i] != 0) + return retqkvs[i]; } v_affine.release(); - o_gemm->forward(qkv_cross, top_blobs[0], opt); + int reto = o_gemm->forward(qkv_cross, top_blobs[0], opt); + if (reto != 0) + return reto; return 0; } diff --git a/src/layer/arm/padding_arm.cpp b/src/layer/arm/padding_arm.cpp index 3a2463911..2258e5a59 100644 --- a/src/layer/arm/padding_arm.cpp +++ b/src/layer/arm/padding_arm.cpp @@ -238,6 +238,8 @@ int Padding_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op opt_pack1.blob_allocator = opt.workspace_allocator; convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + if (bottom_blob_unpacked.empty()) + return -100; } return Padding::forward(bottom_blob_unpacked, top_blob, opt); @@ -616,6 +618,8 @@ int Padding_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons opt_pack1.blob_allocator = opt.workspace_allocator; convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + if (bottom_blob_unpacked.empty()) + return -100; } return Padding::forward(bottom_blob_unpacked, top_blob, opt); @@ -770,6 +774,8 @@ int Padding_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio opt_pack1.blob_allocator = opt.workspace_allocator; convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + if (bottom_blob_unpacked.empty()) + return -100; } return Padding::forward(bottom_blob_unpacked, top_blob, opt); diff --git a/src/layer/arm/shufflechannel_arm.cpp b/src/layer/arm/shufflechannel_arm.cpp index db6142b1d..571e98316 100644 --- a/src/layer/arm/shufflechannel_arm.cpp +++ b/src/layer/arm/shufflechannel_arm.cpp @@ -1,4 +1,4 @@ -// Tencent is pleased to support the open source community by making ncnn available. +// // Tencent is pleased to support the open source community by making ncnn available. // // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. // @@ -143,6 +143,8 @@ int ShuffleChannel_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opt Mat bottom_blob_unpacked; convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack); + if (bottom_blob_unpacked.empty()) + return -100; Mat top_blob_unpacked; int ret = ShuffleChannel::forward(bottom_blob_unpacked, top_blob_unpacked, opt_pack); @@ -389,6 +391,8 @@ int ShuffleChannel_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blo Mat bottom_blob_unpacked; convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack); + if (bottom_blob_unpacked.empty()) + return -100; Mat top_blob_unpacked; int ret = ShuffleChannel::forward(bottom_blob_unpacked, top_blob_unpacked, opt_pack); @@ -396,6 +400,8 @@ int ShuffleChannel_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blo return ret; convert_packing(top_blob_unpacked, top_blob, elempack, opt); + if (top_blob.empty()) + return -100; return 0; } @@ -618,6 +624,8 @@ int ShuffleChannel_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blo Mat bottom_blob_unpacked; convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack); + if (bottom_blob_unpacked.empty()) + return -100; Mat top_blob_unpacked; int ret = ShuffleChannel::forward(bottom_blob_unpacked, top_blob_unpacked, opt_pack); @@ -625,6 +633,8 @@ int ShuffleChannel_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blo return ret; convert_packing(top_blob_unpacked, top_blob, elempack, opt); + if (top_blob.empty()) + return -100; return 0; } diff --git a/src/layer/arm/slice_arm.cpp b/src/layer/arm/slice_arm.cpp index a1c04e3f7..5fdf20994 100644 --- a/src/layer/arm/slice_arm.cpp +++ b/src/layer/arm/slice_arm.cpp @@ -167,6 +167,8 @@ int Slice_arm::forward(const std::vector& bottom_blobs, std::vector& t if (elempack > out_elempack) { convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + if (bottom_blob_unpacked.empty()) + return -100; } const float* ptr = bottom_blob_unpacked; @@ -331,6 +333,8 @@ int Slice_arm::forward(const std::vector& bottom_blobs, std::vector& t if (elempack > out_elempack) { convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + if (bottom_blob_unpacked.empty()) + return -100; } int p = 0; @@ -704,6 +708,8 @@ int Slice_arm::forward_bf16s_fp16s(const std::vector& bottom_blobs, std::ve if (elempack > out_elempack) { convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + if (bottom_blob_unpacked.empty()) + return -100; } const unsigned short* ptr = bottom_blob_unpacked; @@ -934,6 +940,8 @@ int Slice_arm::forward_bf16s_fp16s(const std::vector& bottom_blobs, std::ve if (elempack > out_elempack) { convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + if (bottom_blob_unpacked.empty()) + return -100; } int p = 0;