| @@ -159,6 +159,8 @@ int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| if (elempack < out_elempack) | |||
| { | |||
| convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| } | |||
| @@ -284,6 +286,8 @@ int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| if (elempack < out_elempack) | |||
| { | |||
| convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| } | |||
| @@ -617,6 +621,8 @@ int Concat_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::v | |||
| if (elempack < out_elempack) | |||
| { | |||
| convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| } | |||
| @@ -816,6 +822,8 @@ int Concat_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::v | |||
| if (elempack < out_elempack) | |||
| { | |||
| convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| } | |||
| @@ -5578,7 +5578,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile(const Mat& top_til | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| static int conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| { | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| @@ -5605,12 +5605,16 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| if (nT > 1 && nn_NK < nT) | |||
| { | |||
| Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); | |||
| if (B_tile.empty()) | |||
| return -100; | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| { | |||
| @@ -5634,6 +5638,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| else | |||
| { | |||
| Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); | |||
| if (B_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| @@ -5659,6 +5665,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| } | |||
| Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (top_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -5688,6 +5696,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| conv3x3s1_winograd23_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| static inline void conv3x3s1_winograd43_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) | |||
| @@ -7256,7 +7266,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile(const Mat& top_til | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| static int conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| { | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| @@ -7283,12 +7293,16 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| if (nT > 1 && nn_NK < nT) | |||
| { | |||
| Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); | |||
| if (B_tile.empty()) | |||
| return -100; | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| { | |||
| @@ -7312,6 +7326,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| else | |||
| { | |||
| Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); | |||
| if (B_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| @@ -7337,6 +7353,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| } | |||
| Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (top_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -7366,6 +7384,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| conv3x3s1_winograd43_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| static inline void conv3x3s1_winograd63_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) | |||
| @@ -9292,7 +9312,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile(const Mat& top_til | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| static int conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| { | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| @@ -9319,12 +9339,16 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| if (nT > 1 && nn_NK < nT) | |||
| { | |||
| Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); | |||
| if (B_tile.empty()) | |||
| return -100; | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| { | |||
| @@ -9348,6 +9372,8 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| else | |||
| { | |||
| Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); | |||
| if (B_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| @@ -9373,6 +9399,8 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| } | |||
| Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (top_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -9402,4 +9430,6 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| conv3x3s1_winograd63_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -920,7 +920,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile_bf16s(const Mat& t | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| static int conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| { | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| @@ -947,12 +947,16 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| if (nT > 1 && nn_NK < nT) | |||
| { | |||
| Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); | |||
| if (B_tile.empty()) | |||
| return -100; | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| { | |||
| @@ -976,6 +980,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co | |||
| else | |||
| { | |||
| Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); | |||
| if (B_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| @@ -1001,6 +1007,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co | |||
| } | |||
| Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (top_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -1030,6 +1038,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co | |||
| conv3x3s1_winograd23_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| static inline void conv3x3s1_winograd43_transform_input_tile_bf16s(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT) | |||
| @@ -2497,7 +2507,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_bf16s(const Mat& t | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| static int conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| { | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| @@ -2524,12 +2534,16 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| if (nT > 1 && nn_NK < nT) | |||
| { | |||
| Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); | |||
| if (B_tile.empty()) | |||
| return -100; | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| { | |||
| @@ -2553,6 +2567,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co | |||
| else | |||
| { | |||
| Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); | |||
| if (B_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| @@ -2578,6 +2594,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co | |||
| } | |||
| Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (top_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -2607,6 +2625,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co | |||
| conv3x3s1_winograd43_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| static inline void conv3x3s1_winograd63_transform_input_tile_bf16s(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT) | |||
| @@ -4428,7 +4448,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile_bf16s(const Mat& t | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| static int conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| { | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| @@ -4455,12 +4475,16 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| if (nT > 1 && nn_NK < nT) | |||
| { | |||
| Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); | |||
| if (B_tile.empty()) | |||
| return -100; | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| { | |||
| @@ -4484,6 +4508,8 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co | |||
| else | |||
| { | |||
| Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); | |||
| if (B_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| @@ -4509,6 +4535,8 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co | |||
| } | |||
| Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (top_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -4538,4 +4566,6 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co | |||
| conv3x3s1_winograd63_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -2916,7 +2916,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile_fp16sa(const Mat& | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| static int conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| { | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| @@ -2943,12 +2943,16 @@ static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, c | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| if (nT > 1 && nn_NK < nT) | |||
| { | |||
| Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator); | |||
| if (B_tile.empty()) | |||
| return -100; | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| { | |||
| @@ -2972,6 +2976,8 @@ static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, c | |||
| else | |||
| { | |||
| Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator); | |||
| if (B_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| @@ -2997,6 +3003,8 @@ static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, c | |||
| } | |||
| Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 2u, opt.workspace_allocator); | |||
| if (top_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -3026,6 +3034,8 @@ static void conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, c | |||
| conv3x3s1_winograd23_transform_output_tile_fp16sa(top_tile, top_blob, bias, i, max_ii, j, max_jj); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| static inline void conv3x3s1_winograd43_transform_kernel_tile_fp16sa(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) | |||
| @@ -4262,7 +4272,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_fp16sa(const Mat& | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| static int conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| { | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| @@ -4289,12 +4299,16 @@ static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, c | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| if (nT > 1 && nn_NK < nT) | |||
| { | |||
| Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator); | |||
| if (B_tile.empty()) | |||
| return -100; | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| { | |||
| @@ -4318,6 +4332,8 @@ static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, c | |||
| else | |||
| { | |||
| Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator); | |||
| if (B_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| @@ -4343,6 +4359,8 @@ static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, c | |||
| } | |||
| Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 2u, opt.workspace_allocator); | |||
| if (top_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -4372,6 +4390,8 @@ static void conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, c | |||
| conv3x3s1_winograd43_transform_output_tile_fp16sa(top_tile, top_blob, bias, i, max_ii, j, max_jj); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| static inline void conv3x3s1_winograd63_transform_kernel_tile_fp16sa(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) | |||
| @@ -5873,7 +5893,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile_fp16sa(const Mat& | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| static int conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt) | |||
| { | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| @@ -5900,12 +5920,16 @@ static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, c | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| if (nT > 1 && nn_NK < nT) | |||
| { | |||
| Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator); | |||
| if (B_tile.empty()) | |||
| return -100; | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| { | |||
| @@ -5929,6 +5953,8 @@ static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, c | |||
| else | |||
| { | |||
| Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator); | |||
| if (B_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| @@ -5954,6 +5980,8 @@ static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, c | |||
| } | |||
| Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 2u, opt.workspace_allocator); | |||
| if (top_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -5983,4 +6011,6 @@ static void conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, c | |||
| conv3x3s1_winograd63_transform_output_tile_fp16sa(top_tile, top_blob, bias, i, max_ii, j, max_jj); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -4257,7 +4257,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile_int8(const Mat& to | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) | |||
| static int conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) | |||
| { | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| @@ -4284,12 +4284,16 @@ static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| if (nT > 1 && nn_NK < nT) | |||
| { | |||
| Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator); | |||
| if (B_tile.empty()) | |||
| return -100; | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| { | |||
| @@ -4313,6 +4317,8 @@ static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat | |||
| else | |||
| { | |||
| Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator); | |||
| if (B_tileX.empty()) | |||
| return -100; | |||
| // #pragma omp parallel for num_threads(nT) | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| @@ -4340,6 +4346,8 @@ static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat | |||
| bottom_blob.release(); | |||
| Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (top_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -4369,6 +4377,8 @@ static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat | |||
| conv3x3s1_winograd23_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| static inline void conv3x3s1_winograd43_transform_kernel_tile_int8(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) | |||
| @@ -5604,7 +5614,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_int8(const Mat& to | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) | |||
| static int conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) | |||
| { | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| @@ -5631,12 +5641,16 @@ static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| if (nT > 1 && nn_NK < nT) | |||
| { | |||
| Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator); | |||
| if (B_tile.empty()) | |||
| return -100; | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| { | |||
| @@ -5660,6 +5674,8 @@ static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat | |||
| else | |||
| { | |||
| Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator); | |||
| if (B_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppjk = 0; ppjk < nn_NK; ppjk++) | |||
| @@ -5687,6 +5703,8 @@ static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat | |||
| bottom_blob.release(); | |||
| Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (top_tileX.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -5716,4 +5734,6 @@ static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat | |||
| conv3x3s1_winograd43_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -488,22 +488,25 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT); | |||
| } | |||
| int ret = 0; | |||
| if (prefer_winograd23) | |||
| { | |||
| conv3x3s1_winograd23(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt); | |||
| ret = conv3x3s1_winograd23(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt); | |||
| } | |||
| else if (prefer_winograd43) | |||
| { | |||
| conv3x3s1_winograd43(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt); | |||
| ret = conv3x3s1_winograd43(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt); | |||
| } | |||
| else if (prefer_winograd63) | |||
| { | |||
| conv3x3s1_winograd63(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt); | |||
| ret = conv3x3s1_winograd63(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt); | |||
| } | |||
| else | |||
| { | |||
| // should never reach here | |||
| } | |||
| if (ret != 0) | |||
| return ret; | |||
| if (activation) | |||
| { | |||
| @@ -559,7 +562,9 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); | |||
| } | |||
| convolution_im2col_gemm(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); | |||
| int ret = convolution_im2col_gemm(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| if (activation) | |||
| { | |||
| @@ -1072,22 +1077,25 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const | |||
| NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT); | |||
| } | |||
| int ret = 0; | |||
| if (prefer_winograd23) | |||
| { | |||
| conv3x3s1_winograd23_bf16s(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt); | |||
| ret = conv3x3s1_winograd23_bf16s(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt); | |||
| } | |||
| else if (prefer_winograd43) | |||
| { | |||
| conv3x3s1_winograd43_bf16s(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt); | |||
| ret = conv3x3s1_winograd43_bf16s(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt); | |||
| } | |||
| else if (prefer_winograd63) | |||
| { | |||
| conv3x3s1_winograd63_bf16s(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt); | |||
| ret = conv3x3s1_winograd63_bf16s(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt); | |||
| } | |||
| else | |||
| { | |||
| // should never reach here | |||
| } | |||
| if (ret != 0) | |||
| return ret; | |||
| if (activation) | |||
| { | |||
| @@ -1143,7 +1151,9 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const | |||
| NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); | |||
| } | |||
| convolution_im2col_gemm_bf16s(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); | |||
| int ret = convolution_im2col_gemm_bf16s(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| if (activation) | |||
| { | |||
| @@ -1307,6 +1317,8 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con | |||
| Option opt_q = opt; | |||
| opt_q.blob_allocator = opt.workspace_allocator; | |||
| quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); | |||
| if (bottom_blob_int8.empty()) | |||
| return -100; | |||
| } | |||
| // NCNN_LOGE("Convolution_arm input %d x %d ksize=%d %d stride=%d %d", w, h, kernel_w, kernel_h, stride_w, stride_h); | |||
| @@ -1381,21 +1393,24 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con | |||
| NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); | |||
| } | |||
| int ret = 0; | |||
| if (opt.use_winograd_convolution && prefer_winograd) | |||
| { | |||
| if (opt.use_winograd43_convolution && !weight_winograd43_data.empty()) | |||
| conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt); | |||
| ret = conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt); | |||
| else | |||
| conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt); | |||
| ret = conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); | |||
| ret = convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); | |||
| } | |||
| else | |||
| { | |||
| convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| if (ret != 0) | |||
| return ret; | |||
| bottom_blob_bordered.release(); | |||
| @@ -37,9 +37,9 @@ void convolution_im2col_gemm_transform_kernel_int8_asimddp(const Mat& kernel, Ma | |||
| convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt); | |||
| } | |||
| void convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| int convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| { | |||
| convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); | |||
| return convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); | |||
| } | |||
| } // namespace ncnn | |||
| @@ -372,22 +372,25 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const | |||
| NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT); | |||
| } | |||
| int ret = 0; | |||
| if (prefer_winograd23) | |||
| { | |||
| conv3x3s1_winograd23_fp16sa(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data_fp16, _nT, opt); | |||
| ret = conv3x3s1_winograd23_fp16sa(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data_fp16, _nT, opt); | |||
| } | |||
| else if (prefer_winograd43) | |||
| { | |||
| conv3x3s1_winograd43_fp16sa(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data_fp16, _nT, opt); | |||
| ret = conv3x3s1_winograd43_fp16sa(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data_fp16, _nT, opt); | |||
| } | |||
| else if (prefer_winograd63) | |||
| { | |||
| conv3x3s1_winograd63_fp16sa(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, _nT, opt); | |||
| ret = conv3x3s1_winograd63_fp16sa(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, _nT, opt); | |||
| } | |||
| else | |||
| { | |||
| // should never reach here | |||
| } | |||
| if (ret != 0) | |||
| return ret; | |||
| if (activation) | |||
| { | |||
| @@ -471,7 +474,9 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const | |||
| NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); | |||
| } | |||
| convolution_im2col_gemm_fp16sa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); | |||
| int ret = convolution_im2col_gemm_fp16sa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| if (activation) | |||
| { | |||
| @@ -37,9 +37,9 @@ void convolution_im2col_gemm_transform_kernel_int8_i8mm(const Mat& kernel, Mat& | |||
| convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt); | |||
| } | |||
| void convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| int convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| { | |||
| convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); | |||
| return convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); | |||
| } | |||
| } // namespace ncnn | |||
| @@ -6829,7 +6829,7 @@ static void convolution_im2col_gemm_transform_kernel(const Mat& kernel, Mat& AT, | |||
| } | |||
| } | |||
| static void convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| static int convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| { | |||
| const int maxk = kernel_w * kernel_h; | |||
| @@ -6847,6 +6847,8 @@ static void convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| @@ -6870,7 +6872,11 @@ static void convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const | |||
| Mat topT_tileX; | |||
| if (K > TILE_K) | |||
| { | |||
| topT_tileX.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT_tileX.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -6901,4 +6907,6 @@ static void convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -6010,7 +6010,7 @@ static void convolution_im2col_gemm_transform_kernel_bf16s(const Mat& kernel, Ma | |||
| } | |||
| } | |||
| static void convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| static int convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| { | |||
| const int maxk = kernel_w * kernel_h; | |||
| @@ -6028,6 +6028,8 @@ static void convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob, | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| @@ -6051,7 +6053,11 @@ static void convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob, | |||
| Mat topT_tileX; | |||
| if (K > TILE_K) | |||
| { | |||
| topT_tileX.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT_tileX.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -6082,4 +6088,6 @@ static void convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob, | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -3145,7 +3145,7 @@ static void convolution_im2col_gemm_transform_kernel_fp16sa(const Mat& kernel, M | |||
| } | |||
| } | |||
| static void convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| static int convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| { | |||
| // NCNN_LOGE("convolution_im2col_gemm_fp16sa %p %p %p %p", bottom_blob.data, top_blob.data, AT.data, bias.data); | |||
| const int maxk = kernel_w * kernel_h; | |||
| @@ -3164,6 +3164,8 @@ static void convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| @@ -3187,7 +3189,11 @@ static void convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob | |||
| Mat topT_tileX; | |||
| if (K > TILE_K) | |||
| { | |||
| topT_tileX.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator); | |||
| if (topT_tileX.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -3218,4 +3224,6 @@ static void convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -14,12 +14,12 @@ | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 | |||
| void convolution_im2col_gemm_transform_kernel_int8_i8mm(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt); | |||
| void convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt); | |||
| int convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt); | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8 | |||
| void convolution_im2col_gemm_transform_kernel_int8_asimddp(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt); | |||
| void convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt); | |||
| int convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt); | |||
| #endif | |||
| static void convolution_im2col_pack_A_tile_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk) | |||
| @@ -10944,21 +10944,19 @@ static void convolution_im2col_gemm_transform_kernel_int8(const Mat& kernel, Mat | |||
| } | |||
| } | |||
| static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| static int convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| { | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 | |||
| if (ncnn::cpu_support_arm_i8mm()) | |||
| { | |||
| convolution_im2col_gemm_int8_i8mm(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); | |||
| return; | |||
| return convolution_im2col_gemm_int8_i8mm(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8 | |||
| if (ncnn::cpu_support_arm_asimddp()) | |||
| { | |||
| convolution_im2col_gemm_int8_asimddp(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); | |||
| return; | |||
| return convolution_im2col_gemm_int8_asimddp(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); | |||
| } | |||
| #endif | |||
| @@ -10978,6 +10976,8 @@ static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, | |||
| // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); | |||
| Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 1u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| @@ -11001,7 +11001,11 @@ static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, | |||
| Mat topT_tileX; | |||
| if (K > TILE_K) | |||
| { | |||
| topT_tileX.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT_tileX.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppj = 0; ppj < nn_M; ppj++) | |||
| @@ -11032,4 +11036,6 @@ static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -539,6 +539,8 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| Option opt_p = opt; | |||
| opt_p.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p); | |||
| if (bottom_blob_bordered_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| Mat top_blob_unpacked = top_blob; | |||
| @@ -560,13 +562,17 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| opt_g.blob_allocator = top_blob_unpacked.allocator; | |||
| // forward | |||
| op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); | |||
| int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| // packing | |||
| if (out_g_elempack == 1 && out_elempack == 4) | |||
| { | |||
| convert_packing(top_blob_unpacked, top_blob, 4, opt); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| @@ -959,6 +965,8 @@ int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blo | |||
| Option opt_p = opt; | |||
| opt_p.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p); | |||
| if (bottom_blob_bordered_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| Mat top_blob_unpacked = top_blob; | |||
| @@ -980,13 +988,17 @@ int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blo | |||
| opt_g.blob_allocator = top_blob_unpacked.allocator; | |||
| // forward | |||
| op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); | |||
| int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| // packing | |||
| if (out_g_elempack == 1 && out_elempack == 4) | |||
| { | |||
| convert_packing(top_blob_unpacked, top_blob, 4, opt); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| @@ -1073,6 +1085,8 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_ | |||
| Option opt_q = opt; | |||
| opt_q.blob_allocator = opt.workspace_allocator; | |||
| quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q); | |||
| if (bottom_blob_int8.empty()) | |||
| return -100; | |||
| } | |||
| Mat bottom_blob_bordered; | |||
| @@ -1537,6 +1551,8 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_ | |||
| Option opt_p = opt; | |||
| opt_p.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); | |||
| if (bottom_blob_bordered_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| Mat top_blob_unpacked = top_blob; | |||
| @@ -1558,13 +1574,17 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_ | |||
| opt_g.blob_allocator = top_blob_unpacked.allocator; | |||
| // forward | |||
| op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); | |||
| int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| // packing | |||
| if (out_g_elempack < out_elempack) | |||
| { | |||
| convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| @@ -260,17 +260,19 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo | |||
| // unpacking | |||
| Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; | |||
| if (elempack == 4 && g_elempack == 1) | |||
| if (elempack > g_elempack) | |||
| { | |||
| Option opt_p = opt; | |||
| opt_p.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p); | |||
| convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); | |||
| if (bottom_blob_bordered_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| Mat top_blob_unpacked = top_blob; | |||
| if (out_g_elempack == 1 && out_elempack == 4) | |||
| if (out_g_elempack < out_elempack) | |||
| { | |||
| top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); | |||
| top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); | |||
| if (top_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| @@ -286,13 +288,17 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo | |||
| opt_g.blob_allocator = top_blob_unpacked.allocator; | |||
| // forward | |||
| op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); | |||
| int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| // packing | |||
| if (out_g_elempack == 1 && out_elempack == 4) | |||
| if (out_g_elempack < out_elempack) | |||
| { | |||
| convert_packing(top_blob_unpacked, top_blob, 4, opt); | |||
| convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| @@ -603,6 +609,8 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl | |||
| Option opt_p = opt; | |||
| opt_p.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); | |||
| if (bottom_blob_bordered_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| Mat top_blob_unpacked = top_blob; | |||
| @@ -624,13 +632,17 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl | |||
| opt_g.blob_allocator = top_blob_unpacked.allocator; | |||
| // forward | |||
| op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); | |||
| int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| // packing | |||
| if (out_g_elempack < out_elempack) | |||
| { | |||
| convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| @@ -443,6 +443,8 @@ int Crop_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) | |||
| opt_pack1.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| return Crop::forward(bottom_blob_unpacked, top_blob, opt); | |||
| @@ -780,6 +782,8 @@ int Crop_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to | |||
| opt_pack1.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| Mat reference_blob_unpacked = reference_blob; | |||
| @@ -789,6 +793,8 @@ int Crop_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to | |||
| opt_pack1.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(reference_blob, reference_blob_unpacked, 1, opt_pack1); | |||
| if (reference_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| std::vector<Mat> bottom_blobs_unpacked(2); | |||
| @@ -305,7 +305,9 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| Mat top_col2im; | |||
| Option opt_b = opt; | |||
| opt_b.blob_allocator = top_blob_bordered.allocator; | |||
| gemm->forward(bottom_blob_2, top_col2im, opt_b); | |||
| int ret = gemm->forward(bottom_blob_2, top_col2im, opt_b); | |||
| if (ret != 0) | |||
| return ret; | |||
| { | |||
| // col2im | |||
| @@ -556,7 +556,9 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| Mat top_col2im; | |||
| Option opt_b = opt; | |||
| opt_b.blob_allocator = top_blob_bordered.allocator; | |||
| gemm->forward(bottom_blob_2, top_col2im, opt_b); | |||
| int ret = gemm->forward(bottom_blob_2, top_col2im, opt_b); | |||
| if (ret != 0) | |||
| return ret; | |||
| { | |||
| // col2im | |||
| @@ -418,6 +418,8 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, c | |||
| Option opt_p = opt; | |||
| opt_p.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| Mat top_blob_bordered_unpacked = top_blob_bordered; | |||
| @@ -439,13 +441,17 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, c | |||
| opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; | |||
| // forward | |||
| op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); | |||
| int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| // packing | |||
| if (out_g_elempack == 1 && out_elempack == 4) | |||
| { | |||
| convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| @@ -791,6 +797,8 @@ int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_b | |||
| Option opt_p = opt; | |||
| opt_p.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| Mat top_blob_bordered_unpacked = top_blob_bordered; | |||
| @@ -812,13 +820,17 @@ int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_b | |||
| opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; | |||
| // forward | |||
| op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); | |||
| int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| // packing | |||
| if (out_g_elempack == 1 && out_elempack == 4) | |||
| { | |||
| convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| @@ -329,6 +329,8 @@ int DeconvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_b | |||
| Option opt_p = opt; | |||
| opt_p.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| Mat top_blob_bordered_unpacked = top_blob_bordered; | |||
| @@ -350,13 +352,17 @@ int DeconvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_b | |||
| opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; | |||
| // forward | |||
| op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); | |||
| int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| // packing | |||
| if (out_g_elempack == 1 && out_elempack == 4) | |||
| { | |||
| convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| @@ -622,6 +628,8 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_ | |||
| Option opt_p = opt; | |||
| opt_p.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, g_elempack, opt_p); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| Mat top_blob_bordered_unpacked = top_blob_bordered; | |||
| @@ -643,13 +651,17 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_ | |||
| opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; | |||
| // forward | |||
| op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); | |||
| int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| // packing | |||
| if (out_g_elempack < out_elempack) | |||
| { | |||
| convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| @@ -3801,7 +3801,11 @@ static int gemm_arm(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int | |||
| int nn_K = (K + TILE_K - 1) / TILE_K; | |||
| Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 4u, opt.workspace_allocator); | |||
| if (ATX.empty()) | |||
| return -100; | |||
| Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| @@ -3832,7 +3836,11 @@ static int gemm_arm(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -3913,6 +3921,8 @@ static int gemm_AT_arm(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob, | |||
| int nn_K = (K + TILE_K - 1) / TILE_K; | |||
| Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| @@ -3943,7 +3953,11 @@ static int gemm_AT_arm(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob, | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -4007,10 +4021,16 @@ static int gemm_BT_arm(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob, | |||
| // int nn_N = (N + TILE_N - 1) / TILE_N; | |||
| Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 4u, opt.workspace_allocator); | |||
| if (ATX.empty()) | |||
| return -100; | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -4089,7 +4109,11 @@ static int gemm_AT_BT_arm(const Mat& AT, const Mat& BT, const Mat& C, Mat& top_b | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -4512,7 +4536,11 @@ static int gemm_arm_bf16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo | |||
| int nn_K = (K + TILE_K - 1) / TILE_K; | |||
| Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator); | |||
| if (ATX.empty()) | |||
| return -100; | |||
| Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| @@ -4543,7 +4571,11 @@ static int gemm_arm_bf16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -4625,6 +4657,8 @@ static int gemm_AT_arm_bf16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top | |||
| int nn_K = (K + TILE_K - 1) / TILE_K; | |||
| Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| @@ -4655,7 +4689,11 @@ static int gemm_AT_arm_bf16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -4720,10 +4758,16 @@ static int gemm_BT_arm_bf16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top | |||
| // int nn_N = (N + TILE_N - 1) / TILE_N; | |||
| Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator); | |||
| if (ATX.empty()) | |||
| return -100; | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -4803,7 +4847,11 @@ static int gemm_AT_BT_arm_bf16s(const Mat& AT, const Mat& BT, const Mat& C, Mat& | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -2360,7 +2360,11 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl | |||
| int nn_K = (K + TILE_K - 1) / TILE_K; | |||
| Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator); | |||
| if (ATX.empty()) | |||
| return -100; | |||
| Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| @@ -2391,7 +2395,11 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -2472,6 +2480,8 @@ static int gemm_AT_arm_fp16sa(const Mat& AT, const Mat& B, const Mat& C, Mat& to | |||
| int nn_K = (K + TILE_K - 1) / TILE_K; | |||
| Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| @@ -2502,7 +2512,11 @@ static int gemm_AT_arm_fp16sa(const Mat& AT, const Mat& B, const Mat& C, Mat& to | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -2566,10 +2580,16 @@ static int gemm_BT_arm_fp16sa(const Mat& A, const Mat& BT, const Mat& C, Mat& to | |||
| // int nn_N = (N + TILE_N - 1) / TILE_N; | |||
| Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator); | |||
| if (ATX.empty()) | |||
| return -100; | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -2648,7 +2668,11 @@ static int gemm_AT_BT_arm_fp16sa(const Mat& AT, const Mat& BT, const Mat& C, Mat | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -2910,6 +2934,8 @@ int Gemm_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<M | |||
| { | |||
| Mat CT_data; | |||
| CT_data.create_like(C, opt.workspace_allocator); | |||
| if (CT_data.empty()) | |||
| return -100; | |||
| const int size = C.total() * C.elempack; | |||
| const __fp16* ptr = C; | |||
| @@ -47,7 +47,11 @@ static int gemm_arm_fp16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo | |||
| int nn_K = (K + TILE_K - 1) / TILE_K; | |||
| Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator); | |||
| if (ATX.empty()) | |||
| return -100; | |||
| Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| @@ -78,7 +82,11 @@ static int gemm_arm_fp16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -160,6 +168,8 @@ static int gemm_AT_arm_fp16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top | |||
| int nn_K = (K + TILE_K - 1) / TILE_K; | |||
| Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); | |||
| if (BT.empty()) | |||
| return -100; | |||
| const int nn_NK = nn_N * nn_K; | |||
| @@ -190,7 +200,11 @@ static int gemm_AT_arm_fp16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -255,10 +269,16 @@ static int gemm_BT_arm_fp16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top | |||
| // int nn_N = (N + TILE_N - 1) / TILE_N; | |||
| Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator); | |||
| if (ATX.empty()) | |||
| return -100; | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -338,7 +358,11 @@ static int gemm_AT_BT_arm_fp16s(const Mat& AT, const Mat& BT, const Mat& C, Mat& | |||
| Mat topT; | |||
| if (K > TILE_K || broadcast_type_C == 3 || output_transpose) | |||
| { | |||
| topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); | |||
| if (topT.empty()) | |||
| return -100; | |||
| } | |||
| #pragma omp parallel for num_threads(nT) | |||
| for (int ppi = 0; ppi < nn_M; ppi++) | |||
| @@ -415,6 +415,8 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio | |||
| opt_flatten.blob_allocator = opt.workspace_allocator; | |||
| flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); | |||
| if (bottom_blob_flattened.empty()) | |||
| return -100; | |||
| } | |||
| size_t elemsize = bottom_blob_flattened.elemsize; | |||
| @@ -1064,6 +1066,8 @@ int InnerProduct_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const | |||
| opt_flatten.blob_allocator = opt.workspace_allocator; | |||
| flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); | |||
| if (bottom_blob_flattened.empty()) | |||
| return -100; | |||
| } | |||
| size_t elemsize = bottom_blob_flattened.elemsize; | |||
| @@ -1278,6 +1282,8 @@ int InnerProduct_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, co | |||
| Option opt_q = opt; | |||
| opt_q.blob_allocator = opt.workspace_allocator; | |||
| quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); | |||
| if (bottom_blob_int8.empty()) | |||
| return -100; | |||
| } | |||
| if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input) | |||
| @@ -1287,6 +1293,8 @@ int InnerProduct_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, co | |||
| Option opt_unpack = opt; | |||
| opt_unpack.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack); | |||
| if (bottom_blob_int8_unpacked.empty()) | |||
| return -100; | |||
| int h = bottom_blob_int8_unpacked.h; | |||
| @@ -1684,6 +1692,8 @@ int InnerProduct_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, co | |||
| Option opt_flatten = opt; | |||
| opt_flatten.blob_allocator = opt.workspace_allocator; | |||
| flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten); | |||
| if (bottom_blob_int8_flattened.empty()) | |||
| return -100; | |||
| } | |||
| // int elempack = bottom_blob_int8_flattened.elempack; | |||
| @@ -480,6 +480,8 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons | |||
| opt_flatten.blob_allocator = opt.workspace_allocator; | |||
| flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); | |||
| if (bottom_blob_flattened.empty()) | |||
| return -100; | |||
| } | |||
| size_t elemsize = bottom_blob_flattened.elemsize; | |||
| @@ -75,6 +75,8 @@ int InnerProduct_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const | |||
| opt_flatten.blob_allocator = opt.workspace_allocator; | |||
| flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); | |||
| if (bottom_blob_flattened.empty()) | |||
| return -100; | |||
| } | |||
| size_t elemsize = bottom_blob_flattened.elemsize; | |||
| @@ -292,9 +292,11 @@ int MultiHeadAttention_arm::forward(const std::vector<Mat>& bottom_blobs, std::v | |||
| opt.use_bf16_storage &= support_bf16_storage; | |||
| Mat attn_mask_blob_unpacked; | |||
| if (attn_mask_blob.elempack != 1) | |||
| if (attn_mask && attn_mask_blob.elempack != 1) | |||
| { | |||
| convert_packing(attn_mask_blob, attn_mask_blob_unpacked, 1, opt); | |||
| if (attn_mask_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| @@ -310,12 +312,21 @@ int MultiHeadAttention_arm::forward(const std::vector<Mat>& bottom_blobs, std::v | |||
| size_t elemsize = q_blob.elemsize / q_blob.elempack; | |||
| Mat q_affine; | |||
| q_gemm->forward(q_blob, q_affine, opt); | |||
| int retq = q_gemm->forward(q_blob, q_affine, opt); | |||
| if (retq != 0) | |||
| return retq; | |||
| Mat k_affine; | |||
| k_gemm->forward(k_blob, k_affine, opt); | |||
| int retk = k_gemm->forward(k_blob, k_affine, opt); | |||
| if (retk != 0) | |||
| return retk; | |||
| Mat qk_cross(dst_seqlen, src_seqlen * num_heads, elemsize, opt.blob_allocator); | |||
| if (qk_cross.empty()) | |||
| return -100; | |||
| std::vector<int> retqks; | |||
| retqks.resize(num_heads); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i = 0; i < num_heads; i++) | |||
| { | |||
| @@ -331,18 +342,32 @@ int MultiHeadAttention_arm::forward(const std::vector<Mat>& bottom_blobs, std::v | |||
| qk_top_blobs[0] = qk_cross.row_range(i * src_seqlen, src_seqlen); | |||
| Option opt1 = opt; | |||
| opt1.num_threads = 1; | |||
| qk_gemm->forward(qk_bottom_blobs, qk_top_blobs, opt1); | |||
| retqks[i] = qk_gemm->forward(qk_bottom_blobs, qk_top_blobs, opt1); | |||
| } | |||
| for (int i = 0; i < num_heads; i++) | |||
| { | |||
| if (retqks[i] != 0) | |||
| return retqks[i]; | |||
| } | |||
| q_affine.release(); | |||
| k_affine.release(); | |||
| qk_softmax->forward_inplace(qk_cross, opt); | |||
| int retqk = qk_softmax->forward_inplace(qk_cross, opt); | |||
| if (retqk != 0) | |||
| return retqk; | |||
| Mat v_affine; | |||
| v_gemm->forward(v_blob, v_affine, opt); | |||
| int retv = v_gemm->forward(v_blob, v_affine, opt); | |||
| if (retv != 0) | |||
| return retv; | |||
| Mat qkv_cross(src_seqlen, embed_dim_per_head * num_heads, elemsize, opt.blob_allocator); | |||
| if (qkv_cross.empty()) | |||
| return -100; | |||
| std::vector<int> retqkvs; | |||
| retqkvs.resize(num_heads); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i = 0; i < num_heads; i++) | |||
| { | |||
| @@ -353,12 +378,19 @@ int MultiHeadAttention_arm::forward(const std::vector<Mat>& bottom_blobs, std::v | |||
| qkv_top_blobs[0] = qkv_cross.row_range(i * embed_dim_per_head, embed_dim_per_head); | |||
| Option opt1 = opt; | |||
| opt1.num_threads = 1; | |||
| qkv_gemm->forward(qkv_bottom_blobs, qkv_top_blobs, opt1); | |||
| retqkvs[i] = qkv_gemm->forward(qkv_bottom_blobs, qkv_top_blobs, opt1); | |||
| } | |||
| for (int i = 0; i < num_heads; i++) | |||
| { | |||
| if (retqkvs[i] != 0) | |||
| return retqkvs[i]; | |||
| } | |||
| v_affine.release(); | |||
| o_gemm->forward(qkv_cross, top_blobs[0], opt); | |||
| int reto = o_gemm->forward(qkv_cross, top_blobs[0], opt); | |||
| if (reto != 0) | |||
| return reto; | |||
| return 0; | |||
| } | |||
| @@ -238,6 +238,8 @@ int Padding_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op | |||
| opt_pack1.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| return Padding::forward(bottom_blob_unpacked, top_blob, opt); | |||
| @@ -616,6 +618,8 @@ int Padding_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons | |||
| opt_pack1.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| return Padding::forward(bottom_blob_unpacked, top_blob, opt); | |||
| @@ -770,6 +774,8 @@ int Padding_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio | |||
| opt_pack1.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| return Padding::forward(bottom_blob_unpacked, top_blob, opt); | |||
| @@ -1,4 +1,4 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| @@ -143,6 +143,8 @@ int ShuffleChannel_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opt | |||
| Mat bottom_blob_unpacked; | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| Mat top_blob_unpacked; | |||
| int ret = ShuffleChannel::forward(bottom_blob_unpacked, top_blob_unpacked, opt_pack); | |||
| @@ -389,6 +391,8 @@ int ShuffleChannel_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blo | |||
| Mat bottom_blob_unpacked; | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| Mat top_blob_unpacked; | |||
| int ret = ShuffleChannel::forward(bottom_blob_unpacked, top_blob_unpacked, opt_pack); | |||
| @@ -396,6 +400,8 @@ int ShuffleChannel_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blo | |||
| return ret; | |||
| convert_packing(top_blob_unpacked, top_blob, elempack, opt); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| return 0; | |||
| } | |||
| @@ -618,6 +624,8 @@ int ShuffleChannel_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blo | |||
| Mat bottom_blob_unpacked; | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| Mat top_blob_unpacked; | |||
| int ret = ShuffleChannel::forward(bottom_blob_unpacked, top_blob_unpacked, opt_pack); | |||
| @@ -625,6 +633,8 @@ int ShuffleChannel_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blo | |||
| return ret; | |||
| convert_packing(top_blob_unpacked, top_blob, elempack, opt); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| return 0; | |||
| } | |||
| @@ -167,6 +167,8 @@ int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t | |||
| if (elempack > out_elempack) | |||
| { | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| const float* ptr = bottom_blob_unpacked; | |||
| @@ -331,6 +333,8 @@ int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t | |||
| if (elempack > out_elempack) | |||
| { | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| int p = 0; | |||
| @@ -704,6 +708,8 @@ int Slice_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::ve | |||
| if (elempack > out_elempack) | |||
| { | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| const unsigned short* ptr = bottom_blob_unpacked; | |||
| @@ -934,6 +940,8 @@ int Slice_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::ve | |||
| if (elempack > out_elempack) | |||
| { | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); | |||
| if (bottom_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| int p = 0; | |||