diff --git a/dnn/src/fallback/conv_bias/im2col/algos.cpp b/dnn/src/fallback/conv_bias/im2col/algos.cpp index dc1c785d..21651d0d 100644 --- a/dnn/src/fallback/conv_bias/im2col/algos.cpp +++ b/dnn/src/fallback/conv_bias/im2col/algos.cpp @@ -10,12 +10,12 @@ */ #include "src/fallback/conv_bias/im2col/algos.h" +#include "src/fallback/conv_bias/im2col/factory.h" #include "megdnn/opr_param_defs.h" #include "src/common/opr_delegate.h" #include "src/fallback/conv_bias/common.h" #include "src/fallback/conv_bias/opr_impl.h" #include "src/fallback/conv_bias/winograd/strategy.h" -#include "src/fallback/convolution/img2col_helper.h" #include "src/naive/convolution/helper.h" #if MEGDNN_X86 #include "src/x86/conv_bias/postprocess_helper.h" @@ -25,7 +25,7 @@ MIDOUT_DECL(megdnn_fallback_im2col) using namespace megdnn; using namespace fallback; - +using namespace im2col; #if MEGDNN_X86 using namespace x86; #endif @@ -39,557 +39,287 @@ struct Im2colBundelIndex { static constexpr size_t BUNDLE_PADDING_INDEX = 0_z; static constexpr size_t BUNDLE_PACKA_INDEX = 1_z; static constexpr size_t BUNDLE_THREAD_INDEX = 2_z; - static constexpr size_t THREAD_BUNDLE_PACKB_INDEX = 0_z; - static constexpr size_t THREAD_BUNDLE_IM2COL_INDEX = 1_z; - static constexpr size_t THREAD_BUNDLE_MATMUL_DST_INDEX = 2_z; - static constexpr size_t THREAD_BUNDLE_BIAS_INDEX = 3_z; - static constexpr size_t THREAD_BUNDLE_COMPUTE_INDEX = 4_z; -}; - -/*! 
- * *\brief PtrGetter is get the im2col needed ptr according to the provided - * *conditions - */ -class PtrGetter { -public: - template - static inline dtype* get_matmul_dst_ptr( - const ConvBiasImpl::NCBKernParam& param, - const WorkspaceBundle& bundle_thread, size_t bundle_id, - size_t oc_cur_index, size_t OHW, bool is_dst_8bit, - bool ohw_bigger_ohwblock, size_t batch_id, size_t group_id) { - if (is_dst_8bit || !ohw_bigger_ohwblock) { - return static_cast(bundle_thread.get(bundle_id)); - } else { - dtype* dst = - param.dst(batch_id, group_id) + oc_cur_index * OHW; - return static_cast(dst); - } - } - - template - static inline bias_ctype* get_bias_temp_ptr( - const ConvBiasImpl::NCBKernParam& param, - const WorkspaceBundle& bundle_thread) { - bias_ctype* bias_tmp_ptr = - param.bias_mode == megdnn::BiasMode::BIAS - ? static_cast(bundle_thread.get( - Im2colBundelIndex::THREAD_BUNDLE_BIAS_INDEX)) - : nullptr; - return bias_tmp_ptr; - } - - template - static inline dtype* get_bundle_offset_byte_ptr( - const WorkspaceBundle& bundle, size_t bundle_id, size_t offset) { - return reinterpret_cast( - reinterpret_cast(bundle.get(bundle_id)) + offset); - } }; using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode; //! 
Process one input channel copy padding -template static void copy_padding_kern(WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& param, - ConvBiasImpl::NCBKernIndex ncb_index) { - UNPACK_CONV_F32_NCB_KERN_SIZES(param); - MEGDNN_MARK_USED_VAR(N); - MEGDNN_MARK_USED_VAR(OC); - MEGDNN_MARK_USED_VAR(OH); - MEGDNN_MARK_USED_VAR(OW); - MEGDNN_MARK_USED_VAR(FH); - MEGDNN_MARK_USED_VAR(FW); - MEGDNN_MARK_USED_VAR(SH); - MEGDNN_MARK_USED_VAR(SW); + const ConvBiasImpl::NCBKernIndex& ncb_index, + StrategyBase* im2colstrategy) { + im2colstrategy->copy_padding_kern(bundle, param, ncb_index); +} - size_t IW2 = IW + 2 * PW; - size_t IH2 = IH + 2 * PH; - size_t group_id = ncb_index.ndrange_id[0]; - size_t batch_id = ncb_index.ndrange_id[1]; - size_t channel_id = ncb_index.ndrange_id[2]; - - size_t padding_group_size = IH2 * IW2 * IC; - size_t workspace_channel_offset = IH2 * IW2 * channel_id; - size_t workspace_group_offset = group_id * padding_group_size; - size_t workspace_batch_offset = - param.filter_meta.group * batch_id * padding_group_size; - bundle.set(param.workspace_ptr); - - src_ctype src_zp = static_cast(0); - if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) { - src_zp = param.src_type.param().zero_point; - } - src_ctype* src = const_cast( - param.src(batch_id, group_id, channel_id)); - src_ctype* src2; - src2 = static_cast( - bundle.get(Im2colBundelIndex::BUNDLE_PADDING_INDEX)) + - workspace_group_offset + workspace_batch_offset + - workspace_channel_offset; - src_ctype* src2_ptr = src2; - const src_ctype* src_ptr = src; - if (PH != 0) { - std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2); - src2_ptr += PH * IW2; - } - rep(ih, IH) { - if (PW != 0) - rep(pw, PW) * (src2_ptr++) = src_zp; - std::memcpy(src2_ptr, src_ptr, sizeof(src_ctype) * IW); - src2_ptr += IW; - src_ptr += IW; - if (PW != 0) - rep(pw, PW) * (src2_ptr++) = src_zp; - } - if (PH != 0) { - std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2); - src2_ptr += PH * IW2; - } 
-}; +//! packA_kern +static void packA_kern(WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernSizeParam matmulparam, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, + StrategyBase* im2colstrategy) { + im2colstrategy->packA_kern(bundle, param, matmulparam, matmul_algo, + ncb_index); +} /*! * *\brief Im2colKerns collects all the im2col kerns in it */ -#define COPY_BIAS() \ - const bias_ctype* bias_ptr = static_cast( \ - param.bias(batch_id, group_id)); \ - bias_ctype* bias_temp_ptr = \ - PtrGetter::get_bias_temp_ptr(param, bundle_thread); \ - if (param.bias_mode == megdnn::BiasMode::BIAS) { \ - bias_ctype* copy_dst = bias_temp_ptr; \ - const bias_ctype* copy_src = \ - bias_ptr + oc_cur_index * OH * OW + ohw_cur_index; \ - for (size_t oc = oc_cur_index; oc < oc_end_index; oc++) { \ - std::memcpy(copy_dst, copy_src, \ - sizeof(bias_ctype) * output_block_size); \ - copy_dst += output_block_size; \ - copy_src += OH * OW; \ - } \ - } - -#define IM2COL() \ - src_ctype* im2col_dst = nullptr; \ - src_ctype* no_padding_src = \ - const_cast(param.src(batch_id, group_id)) + \ - ohw_cur_index; \ - if (!special_1x1) { \ - size_t padding_group_size = IH2 * IW2 * IC * sizeof(src_ctype); \ - src_ctype* src2 = PtrGetter::get_bundle_offset_byte_ptr( \ - bundle, Im2colBundelIndex::BUNDLE_PADDING_INDEX, \ - (ncb_index.ndrange_id[0] + \ - param.filter_meta.group * ncb_index.ndrange_id[1]) * \ - padding_group_size); \ - if (PH == 0 && PW == 0) { \ - src2 = const_cast( \ - param.src(batch_id, group_id)); \ - } \ - im2col_dst = static_cast(bundle_thread.get( \ - Im2colBundelIndex::THREAD_BUNDLE_IM2COL_INDEX)); \ - if (SH == 1 && SW == 1) { \ - if (is_xcorr) { \ - img2col(src2, im2col_dst, OC, OH, OW, IC, IH2, IW2, FH, \ - FW, ohw_cur_index, output_block_size); \ - } else { \ - img2col(src2, im2col_dst, OC, OH, OW, IC, IH2, IW2, FH, \ - FW, ohw_cur_index, output_block_size); \ 
- } \ - } else { \ - if (is_xcorr) { \ - img2col_stride(src2, im2col_dst, OC, OH, OW, IC, IH2, \ - IW2, FH, FW, SH, SW, ohw_cur_index, \ - output_block_size); \ - } else { \ - img2col_stride(src2, im2col_dst, OC, OH, OW, IC, IH2, \ - IW2, FH, FW, SH, SW, ohw_cur_index, \ - output_block_size); \ - } \ - } \ - } - -#define POSTPROCESS_AND_COPYDST() \ - PostProcess::run( \ - matmul_dst, \ - param.bias_mode == megdnn::BiasMode::BIAS \ - ? bias_temp_ptr \ - : const_cast(bias_ptr + oc_cur_index), \ - matmul_dst, param.bias_mode, param.nonlineMode, param.bias_type, \ - param.dst_type, 1_z, output_block_oc_size, 1_z, \ - output_block_size); \ - if (!skip_copy_dst) { \ - dst_ctype* dst_tmp_ptr = reinterpret_cast(matmul_dst); \ - dst_ctype* dst = param.dst(batch_id, group_id) + \ - oc_cur_index * OHW + ohw_cur_index; \ - for (size_t oc = 0; oc < output_block_oc_size; oc++) { \ - std::memcpy(dst, dst_tmp_ptr, \ - sizeof(dst_ctype) * output_block_size); \ - dst_tmp_ptr += output_block_size; \ - dst += OHW; \ - } \ - } - -#define PREPAR_MATMUL_DATA() \ - size_t packA_per_oc_block_size = \ - round_up(matmul_param.K, matmul_algo->get_inner_block_size().k) * \ - oc_tile_size * matmul_algo->get_packA_type_size(); \ - size_t packA_group_size = \ - matmul_algo->get_bundle(matmul_param).get_size(0); \ - src_ctype* a_panel = PtrGetter::get_bundle_offset_byte_ptr( \ - bundle, Im2colBundelIndex::BUNDLE_PACKA_INDEX, \ - ncb_index.ndrange_id[0] * packA_group_size + \ - ncb_index.ndrange_id[3] * packA_per_oc_block_size); \ - src_ctype* b_panel = PtrGetter::get_bundle_offset_byte_ptr( \ - bundle_thread, Im2colBundelIndex::THREAD_BUNDLE_PACKB_INDEX, 0); \ - /*In pack mode, the matmul dst and im2col dst is the same workspace*/ \ - bias_ctype* matmul_dst = PtrGetter::get_matmul_dst_ptr( \ - param, bundle_thread, \ - Im2colBundelIndex::THREAD_BUNDLE_IM2COL_INDEX, oc_cur_index, OHW, \ - is_dst_8bit, is_ohw_size_bigger, batch_id, group_id); - -#define MATMUL_COMPUTE() \ - auto matmul_kern_naked = 
matmul_algo->get_kern_naked(matmul_param); \ - matmul_param.M = output_block_oc_size; \ - matmul_param.N = output_block_size; \ - matmul_param.LDB = special_1x1 ? OH * OW : output_block_size; \ - matmul_param.LDC = output_block_size; \ - matmul_param.A_ptr = a_panel; \ - matmul_param.B_ptr = im2col_dst ? im2col_dst : no_padding_src; \ - matmul_param.C_ptr = matmul_dst; \ - matmul_algo->pack_B(matmul_param, b_panel, 0, output_block_size); \ - matmul_kern_naked(matmul_param, a_panel, b_panel); - template class Im2colKerns; template <> class Im2colKerns { public: - //! packA kern - template - static void packA_kern(WorkspaceBundle bundle, - const ConvBiasImpl::NCBKernParam& param, - fallback::MatrixMulImpl::KernSizeParam matmulparam, - fallback::MatrixMulImpl::AlgoBase* matmul_algo, - ConvBiasImpl::NCBKernIndex ncb_index) { - bundle.set(param.workspace_ptr); - fallback::MatrixMulImpl::KernParam matmul_param; - size_t group_id = ncb_index.ndrange_id[0]; - static_cast(matmul_param) = - matmulparam; - size_t packA_group_size = - matmul_algo->get_bundle(matmul_param).get_size(0); - size_t packed_per_oc_block_size = - round_up(matmul_param.K, - matmul_algo->get_inner_block_size().k) * - matmul_algo->get_inner_block_size().m * - matmul_algo->get_packA_type_size(); - size_t a_panel_offset = - ncb_index.ndrange_id[2] * packed_per_oc_block_size; - int8_t* a_panel = static_cast(bundle.get( - Im2colBundelIndex::BUNDLE_PACKA_INDEX)) + - group_id * packA_group_size + a_panel_offset; - matmul_param.A_ptr = - const_cast(param.filter(group_id)); - matmul_algo->pack_A(matmul_param, a_panel, ncb_index.ndrange_id[2], - matmul_algo->get_inner_block_size().m); - }; - //! 
conv kernel - template static void kerns( WorkspaceBundle bundle, WorkspaceBundle bundle_thread, const ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, fallback::MatrixMulImpl::AlgoBase* matmul_algo, + StrategyParam strategyparam, fallback::ConvBiasImpl::NCBKernIndex ncb_index, - size_t ohw_tile_size, size_t oc_tile_size) { - auto is_xcorr = !param.filter_meta.should_flip; - UNPACK_CONV_F32_NCB_KERN_SIZES(param); - MEGDNN_MARK_USED_VAR(N); - auto IH2 = IH + 2 * PH; - auto IW2 = IW + 2 * PW; - size_t OHW = OH * OW; - size_t group_id = ncb_index.ndrange_id[0]; - size_t batch_id = ncb_index.ndrange_id[1]; + size_t ohw_tile_size, StrategyBase* im2colstrategy) { + size_t OC = param.filter_meta.ocpg; size_t output_block_size = std::min( - ohw_tile_size, OHW - ncb_index.ndrange_id[2] * ohw_tile_size); + ohw_tile_size, + strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size); size_t output_block_oc_size = std::min( - oc_tile_size, OC - ncb_index.ndrange_id[3] * oc_tile_size); - - //! misc flags - bool special_1x1 = (FH == 1 && FW == 1 && SH == 1 && SW == 1 && - PH == 0 && PW == 0); - bool is_dst_8bit = - (param.src_type.enumv() == DTypeEnum::QuantizedS8 && - param.dst_type.enumv() == DTypeEnum::QuantizedS8) || - (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && - param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); - bool is_ohw_size_bigger = (ohw_tile_size >= OHW); - bool skip_copy_dst = is_ohw_size_bigger && !is_dst_8bit; - - //! 
misc index - size_t ohw_cur_index = ncb_index.ndrange_id[2] * ohw_tile_size; - size_t oc_cur_index = ncb_index.ndrange_id[3] * oc_tile_size; - size_t oc_end_index = oc_cur_index + output_block_oc_size; + strategyparam.oc_tile_size, + OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); + + strategyparam.batch_id = ncb_index.ndrange_id[0]; + strategyparam.group_id = ncb_index.ndrange_id[1]; + strategyparam.oc_cur_index = + ncb_index.ndrange_id[3] * + strategyparam.oc_tile_size; + strategyparam.oc_end_index = strategyparam.oc_cur_index + + output_block_oc_size; + strategyparam.ohw_cur_index = + ncb_index.ndrange_id[2] * ohw_tile_size; + strategyparam.output_block_oc_size = output_block_oc_size; + strategyparam.output_block_size = output_block_size; bundle.set(param.workspace_ptr); - bundle_thread.set(PtrGetter::get_bundle_offset_byte_ptr( - bundle, Im2colBundelIndex::BUNDLE_THREAD_INDEX, - bundle_thread.total_size_in_bytes() * ncb_index.thread_id)); - + bundle_thread.set( + static_cast( + bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + + bundle_thread.total_size_in_bytes() * ncb_index.thread_id); fallback::MatrixMulImpl::KernParam matmul_param; static_cast(matmul_param) = matmul_kernsize_param; - matmul_param.workspace_ptr = bundle_thread.get( - Im2colBundelIndex::THREAD_BUNDLE_COMPUTE_INDEX); - //! 1.Copy bias if need - COPY_BIAS(); + //! 1.Im2col + im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param, + matmul_param, matmul_algo); - //! 2.Im2col - IM2COL(); + //! 2.packb and matmul compute + im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread, + matmul_param, matmul_algo, ncb_index); - //! 3.packb and matmul compute - PREPAR_MATMUL_DATA(); - MATMUL_COMPUTE(); + //! 3.postprocess and copy dst if need + im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread); + } - //! 
4.postprocess and copy dst if need - POSTPROCESS_AND_COPYDST(); -#undef PREPAR_MATMUL_DATA -#undef MATMUL_COMPUTE + WorkspaceBundle get_thread_bundle( + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, + MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, + size_t oc_tile_size) { + size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], + FW = param.filter_meta.spatial[1]; + + size_t im2col = 0, packb = 0, bias_temp = 0; + bool default_pack = matmul_algo->packmode() == Pack_Mode::DEFAULT; + megdnn_assert(default_pack, "only support default packa"); + size_t im2col_dst_size = + IC * FH * FW * ohw_tile_size * sizeof(param.src_type); + size_t matmul_dst_size = + oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + //! matmul_dst and im2col_dst use the same memory + WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param); + packb = wb.get_size(1); + im2col = std::max(im2col_dst_size, matmul_dst_size); + if (param.bias_mode == megdnn::BiasMode::BIAS) { + bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + } + return {nullptr, {packb, im2col, bias_temp}}; } }; -#define PREPAR_MATMUL_DATA() \ - bias_ctype* matmul_dst = nullptr; \ - src_ctype* b_panel = nullptr; \ - size_t packA_group_size = \ - bundle.get_size(Im2colBundelIndex::BUNDLE_PACKA_INDEX) / \ - param.filter_meta.group; \ - size_t a_panel_offset = ncb_index.ndrange_id[3] * \ - matmul_algo->get_bundle(matmul_param).get_size(0); \ - \ - src_ctype* a_panel = PtrGetter::get_bundle_offset_byte_ptr( \ - bundle, Im2colBundelIndex::BUNDLE_PACKA_INDEX, \ - group_id * packA_group_size + a_panel_offset); \ - matmul_dst = PtrGetter::get_matmul_dst_ptr( \ - param, bundle_thread, \ - Im2colBundelIndex::THREAD_BUNDLE_MATMUL_DST_INDEX, oc_cur_index, \ - OHW, is_dst_8bit, is_ohw_size_bigger, batch_id, group_id); - -#define MATMUL_COMPUTE() \ - auto matmul_kern_naked = matmul_algo->get_kern_naked(matmul_param); \ 
- matmul_param.M = output_block_oc_size; \ - matmul_param.N = output_block_size; \ - matmul_param.LDB = special_1x1 ? OH * OW : output_block_size; \ - matmul_param.LDC = output_block_size; \ - matmul_param.A_ptr = a_panel; \ - matmul_param.B_ptr = im2col_dst ? im2col_dst : no_padding_src; \ - matmul_param.C_ptr = matmul_dst; \ - matmul_kern_naked(matmul_param, a_panel, b_panel); - template <> class Im2colKerns { public: - //! packA kern - template - static void packA_kern(WorkspaceBundle bundle, - const ConvBiasImpl::NCBKernParam& param, - fallback::MatrixMulImpl::KernSizeParam matmulparam, - fallback::MatrixMulImpl::AlgoBase* matmul_algo, - ConvBiasImpl::NCBKernIndex ncb_index) { - bundle.set(param.workspace_ptr); - fallback::MatrixMulImpl::KernParam matmul_param; - static_cast(matmul_param) = - matmulparam; - size_t OC = param.filter_meta.ocpg; - size_t oc_tile_size = matmul_param.M; - size_t group_id = ncb_index.ndrange_id[0]; - size_t output_block_oc_size = std::min( - oc_tile_size, OC - ncb_index.ndrange_id[2] * oc_tile_size); - size_t oc_cur_index = ncb_index.ndrange_id[2] * oc_tile_size; - size_t packA_group_size = - bundle.get_size(Im2colBundelIndex::BUNDLE_PACKA_INDEX) / - param.filter_meta.group; - size_t a_panel_offset = - ncb_index.ndrange_id[2] * - matmul_algo->get_bundle(matmul_param).get_size(0); - int8_t* a_panel = static_cast(bundle.get( - Im2colBundelIndex::BUNDLE_PACKA_INDEX)) + - group_id * packA_group_size + a_panel_offset; - matmul_param.A_ptr = - const_cast(param.filter(group_id)) + - oc_cur_index * matmul_param.K; - matmul_param.M = output_block_oc_size; - matmul_algo->pack_A(matmul_param, a_panel, 0_z, 0_z); - }; - //! 
conv kernel - template static void kerns( WorkspaceBundle bundle, WorkspaceBundle bundle_thread, const ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, fallback::MatrixMulImpl::AlgoBase* matmul_algo, + StrategyParam strategyparam, fallback::ConvBiasImpl::NCBKernIndex ncb_index, - size_t ohw_tile_size, size_t oc_tile_size) { - auto is_xcorr = !param.filter_meta.should_flip; - UNPACK_CONV_F32_NCB_KERN_SIZES(param); - MEGDNN_MARK_USED_VAR(N); - auto IH2 = IH + 2 * PH; - auto IW2 = IW + 2 * PW; - size_t group_id = ncb_index.ndrange_id[0]; - size_t batch_id = ncb_index.ndrange_id[1]; - size_t OHW = OH * OW; + size_t ohw_tile_size, StrategyBase* im2colstrategy) { + size_t OC = param.filter_meta.ocpg; size_t output_block_size = std::min( - ohw_tile_size, OHW - ncb_index.ndrange_id[2] * ohw_tile_size); + ohw_tile_size, + strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size); size_t output_block_oc_size = std::min( - oc_tile_size, OC - ncb_index.ndrange_id[3] * oc_tile_size); - - //! misc flags - bool special_1x1 = (FH == 1 && FW == 1 && SH == 1 && SW == 1 && - PH == 0 && PW == 0); - bool is_dst_8bit = - (param.src_type.enumv() == DTypeEnum::QuantizedS8 && - param.dst_type.enumv() == DTypeEnum::QuantizedS8) || - (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && - param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); - bool is_ohw_size_bigger = (ohw_tile_size >= OHW); - bool skip_copy_dst = is_ohw_size_bigger && !is_dst_8bit; - - //! 
misc index - size_t ohw_cur_index = ncb_index.ndrange_id[2] * ohw_tile_size; - size_t oc_cur_index = ncb_index.ndrange_id[3] * oc_tile_size; - size_t oc_end_index = oc_cur_index + output_block_oc_size; + strategyparam.oc_tile_size, + OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); bundle.set(param.workspace_ptr); - bundle_thread.set(PtrGetter::get_bundle_offset_byte_ptr( - bundle, Im2colBundelIndex::BUNDLE_THREAD_INDEX, - bundle_thread.total_size_in_bytes() * ncb_index.thread_id)); + bundle_thread.set( + static_cast( + bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + + bundle_thread.total_size_in_bytes() * ncb_index.thread_id); fallback::MatrixMulImpl::KernParam matmul_param; static_cast(matmul_param) = matmul_kernsize_param; - matmul_param.workspace_ptr = bundle_thread.get( - Im2colBundelIndex::THREAD_BUNDLE_COMPUTE_INDEX); - - //! 1.Copy bias if need - COPY_BIAS(); - //! 2.Im2col - IM2COL(); - - //! 3.packb and matmul compute - PREPAR_MATMUL_DATA(); - MATMUL_COMPUTE(); + strategyparam.batch_id = ncb_index.ndrange_id[0]; + strategyparam.group_id = ncb_index.ndrange_id[1]; + strategyparam.oc_cur_index = + ncb_index.ndrange_id[3] * + strategyparam.oc_tile_size; + strategyparam.oc_end_index = strategyparam.oc_cur_index + + output_block_oc_size; + strategyparam.ohw_cur_index = + ncb_index.ndrange_id[2] * ohw_tile_size; + strategyparam.output_block_oc_size = output_block_oc_size; + strategyparam.output_block_size = output_block_size; + + //! 1.Im2col + im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param, + matmul_param, matmul_algo); + + //! 2.packb and matmul compute + im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread, + matmul_param, matmul_algo, ncb_index); + + //! 
3.postprocess and copy dst if need + im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread); + } + WorkspaceBundle get_thread_bundle( + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, + MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, + size_t oc_tile_size) { + size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], + FW = param.filter_meta.spatial[1]; + + size_t im2col = 0, packb = 0, matmul_dst = 0, bias_temp = 0; + bool only_packA = matmul_algo->packmode() == Pack_Mode::ONLY_PACKA; + megdnn_assert(only_packA, "onlysupport onlypackA mode"); + size_t im2col_dst_size = + IC * FH * FW * ohw_tile_size * sizeof(param.src_type); + size_t matmul_dst_size = + oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + //! matmul_dst and im2col_dst use the same memory + WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param); + packb = wb.get_size(1); + im2col = im2col_dst_size; + matmul_dst = matmul_dst_size; + if (param.bias_mode == megdnn::BiasMode::BIAS) { + bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + } - //! 4.postprocess and copy dst if need - POSTPROCESS_AND_COPYDST(); -#undef PREPAR_MATMUL_DATA -#undef MATMUL_COMPUTE + return {nullptr, {packb, im2col, matmul_dst, bias_temp}}; } }; -#define PREPAR_MATMUL_DATA() \ - bias_ctype* matmul_dst = nullptr; \ - const src_ctype* filter = \ - param.filter(group_id) + oc_cur_index * IC * FH * FW; \ - matmul_dst = PtrGetter::get_matmul_dst_ptr( \ - param, bundle_thread, \ - Im2colBundelIndex::THREAD_BUNDLE_MATMUL_DST_INDEX, oc_cur_index, \ - OHW, is_dst_8bit, is_ohw_size_bigger, batch_id, group_id); - -#define MATMUL_COMPUTE() \ - matmul_param.M = output_block_oc_size; \ - matmul_param.N = output_block_size; \ - matmul_param.LDB = special_1x1 ? OH * OW : output_block_size; \ - matmul_param.LDC = output_block_size; \ - matmul_param.A_ptr = filter; \ - matmul_param.B_ptr = im2col_dst ? 
im2col_dst : no_padding_src; \ - matmul_param.C_ptr = matmul_dst; \ - auto matmul_kern_t = matmul_algo->get_kern(matmul_param); \ - matmul_kern_t(matmul_param); - template <> class Im2colKerns { public: //! conv kernel - template static void kerns( WorkspaceBundle bundle, WorkspaceBundle bundle_thread, const ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, fallback::MatrixMulImpl::AlgoBase* matmul_algo, + StrategyParam strategyparam, fallback::ConvBiasImpl::NCBKernIndex ncb_index, - size_t ohw_tile_size, size_t oc_tile_size) { - auto is_xcorr = !param.filter_meta.should_flip; - UNPACK_CONV_F32_NCB_KERN_SIZES(param); - MEGDNN_MARK_USED_VAR(N); - auto IH2 = IH + 2 * PH; - auto IW2 = IW + 2 * PW; - size_t group_id = ncb_index.ndrange_id[0]; - size_t batch_id = ncb_index.ndrange_id[1]; - size_t OHW = OH * OW; + size_t ohw_tile_size, StrategyBase* im2colstrategy) { + size_t OC = param.filter_meta.ocpg; size_t output_block_size = std::min( - ohw_tile_size, OHW - ncb_index.ndrange_id[2] * ohw_tile_size); + ohw_tile_size, + strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size); size_t output_block_oc_size = std::min( - oc_tile_size, OC - ncb_index.ndrange_id[3] * oc_tile_size); - //! misc flags - bool special_1x1 = (FH == 1 && FW == 1 && SH == 1 && SW == 1 && - PH == 0 && PW == 0); - bool is_dst_8bit = - (param.src_type.enumv() == DTypeEnum::QuantizedS8 && - param.dst_type.enumv() == DTypeEnum::QuantizedS8) || - (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && - param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); - bool is_ohw_size_bigger = (ohw_tile_size >= OHW); - bool skip_copy_dst = is_ohw_size_bigger && !is_dst_8bit; - - //! 
misc index - size_t ohw_cur_index = ncb_index.ndrange_id[2] * ohw_tile_size; - size_t oc_cur_index = ncb_index.ndrange_id[3] * oc_tile_size; - size_t oc_end_index = oc_cur_index + output_block_oc_size; + strategyparam.oc_tile_size, + OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); + + strategyparam.batch_id = ncb_index.ndrange_id[0]; + strategyparam.group_id = ncb_index.ndrange_id[1]; + strategyparam.oc_cur_index = + ncb_index.ndrange_id[3] * + strategyparam.oc_tile_size; + strategyparam.oc_end_index = strategyparam.oc_cur_index + + output_block_oc_size; + strategyparam.ohw_cur_index = + ncb_index.ndrange_id[2] * ohw_tile_size; + strategyparam.output_block_oc_size = output_block_oc_size; + strategyparam.output_block_size = output_block_size; bundle.set(param.workspace_ptr); - bundle_thread.set(PtrGetter::get_bundle_offset_byte_ptr( - bundle, Im2colBundelIndex::BUNDLE_THREAD_INDEX, - bundle_thread.total_size_in_bytes() * ncb_index.thread_id)); + bundle_thread.set( + static_cast( + bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + + bundle_thread.total_size_in_bytes() * ncb_index.thread_id); fallback::MatrixMulImpl::KernParam matmul_param; static_cast(matmul_param) = matmul_kernsize_param; - matmul_param.workspace_ptr = bundle_thread.get( - Im2colBundelIndex::THREAD_BUNDLE_COMPUTE_INDEX); - //! 1.Copy bias if need - COPY_BIAS(); + //! 1.Im2col + im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param, + matmul_param, matmul_algo); - //! 2.Im2col - IM2COL(); + //! 2.packb and matmul compute + im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread, + matmul_param, matmul_algo, ncb_index); - //! 3.packb and matmul compute - PREPAR_MATMUL_DATA(); - MATMUL_COMPUTE(); + //! 
3.postprocess and copy dst if need + im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread); + } + WorkspaceBundle get_thread_bundle( + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, + MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, + size_t oc_tile_size) { + size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], + FW = param.filter_meta.spatial[1]; + size_t ohw = param.osz[0] * param.osz[1]; - //! 4.postprocess and copy dst if need - POSTPROCESS_AND_COPYDST(); + size_t im2col = 0, matmul_dst = 0, bias_temp = 0, matmul_compute = 0; + bool no_pack = matmul_algo->packmode() == Pack_Mode::NO_PACK; + megdnn_assert(no_pack, "only support no pack"); + bool is_dst_8bit = + (param.src_type.enumv() == DTypeEnum::QuantizedS8 && + param.dst_type.enumv() == DTypeEnum::QuantizedS8) || + (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && + param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); + size_t im2col_dst_size = + IC * FH * FW * ohw_tile_size * sizeof(param.src_type); + size_t matmul_dst_size = + oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + im2col = im2col_dst_size; + if (is_dst_8bit) { + matmul_dst = matmul_dst_size; + } else { + matmul_dst = ohw_tile_size >= ohw ? 0 : matmul_dst_size; + } + matmul_compute = matmul_algo->get_workspace(im2col_kern_param); + if (param.bias_mode == megdnn::BiasMode::BIAS) { + bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + } -#undef PREPAR_MATMUL_DATA -#undef MATMUL_COMPUTE + return {nullptr, {im2col, matmul_dst, bias_temp, matmul_compute}}; } }; -#undef COPY_BIAS -#undef IM2COL -#undef POSTPROCESS_AND_COPYDST +#undef FILL_IM2COL_STRATEGY_PARAM + fallback::MatrixMulImpl::KernSizeParam ConvBiasImpl::AlgoIm2col ::get_matmul_kern_param(const NCBKernSizeParam& param, size_t ohw_tile_size, @@ -698,51 +428,27 @@ WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle( sizeof(param.src_type); //! 
for padding } packa_size = GROUP * packa_group_size; //! for packA size = GROUP * a_size - WorkspaceBundle ws = get_thread_bundle(param); - return {nullptr, - {padding, packa_size, ws.total_size_in_bytes() * nr_threads}}; -} - -WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_thread_bundle( - const NCBKernSizeParam& param) const { - size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], - FW = param.filter_meta.spatial[1]; - size_t ohw = param.osz[0] * param.osz[1]; - - size_t im2col = 0, packb = 0, matmul_dst = 0, bias_temp = 0, - matmul_compute = 0; + WorkspaceBundle ws = {nullptr, {}}; auto im2col_kern_param = get_matmul_kern_param(param, m_ohw_tile_size, m_oc_tile_size); - bool default_pack = m_matmul_algo->packmode() == Pack_Mode::DEFAULT; - bool only_packA = m_matmul_algo->packmode() == Pack_Mode::ONLY_PACKA; - bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 && - param.dst_type.enumv() == DTypeEnum::QuantizedS8) || - (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && - param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); - size_t im2col_dst_size = - IC * FH * FW * m_ohw_tile_size * sizeof(param.src_type); - size_t matmul_dst_size = - m_oc_tile_size * m_ohw_tile_size * sizeof(param.bias_type); - if (default_pack || only_packA) { - //! matmul_dst and im2col_dst use the same memory - WorkspaceBundle wb = m_matmul_algo->get_bundle(im2col_kern_param); - packb = wb.get_size(1); - im2col = only_packA ? im2col_dst_size - : std::max(im2col_dst_size, matmul_dst_size); - matmul_dst = only_packA ? 
matmul_dst_size : 0; + if (m_matmul_algo->packmode() == Pack_Mode::DEFAULT) { + Im2colKerns defaultkern; + ws = defaultkern.get_thread_bundle(param, im2col_kern_param, + m_matmul_algo, m_ohw_tile_size, + m_oc_tile_size); + } else if (m_matmul_algo->packmode() == Pack_Mode::ONLY_PACKA) { + Im2colKerns onlypackakern; + ws = onlypackakern.get_thread_bundle(param, im2col_kern_param, + m_matmul_algo, m_ohw_tile_size, + m_oc_tile_size); } else { - im2col = im2col_dst_size; - if (is_dst_8bit) { - matmul_dst = matmul_dst_size; - } else { - matmul_dst = m_ohw_tile_size >= ohw ? 0 : matmul_dst_size; - } - matmul_compute = m_matmul_algo->get_workspace(im2col_kern_param); - } - if (param.bias_mode == megdnn::BiasMode::BIAS) { - bias_temp = m_oc_tile_size * m_ohw_tile_size * sizeof(param.bias_type); + Im2colKerns nopackkern; + ws = nopackkern.get_thread_bundle(param, im2col_kern_param, + m_matmul_algo, m_ohw_tile_size, + m_oc_tile_size); } - return {nullptr, {packb, im2col, matmul_dst, bias_temp, matmul_compute}}; + return {nullptr, + {padding, packa_size, ws.total_size_in_bytes() * nr_threads}}; } size_t ConvBiasImpl::AlgoIm2col::get_workspace( @@ -755,200 +461,151 @@ size_t ConvBiasImpl::AlgoIm2col::get_workspace( } SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( - ConvBiasImpl*, const NCBKernSizeParam& param) const { + ConvBiasImpl* opr, const NCBKernSizeParam& param) const { MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 1) { - size_t ohw = param.osz[0] * param.osz[1]; + UNPACK_CONV_F32_NCB_KERN_SIZES(param); + MEGDNN_MARK_USED_VAR(SH); + MEGDNN_MARK_USED_VAR(SW); + MEGDNN_MARK_USED_VAR(IH); + MEGDNN_MARK_USED_VAR(IW); + MEGDNN_MARK_USED_VAR(FH); + MEGDNN_MARK_USED_VAR(FW); + size_t ohw = OH * OW; size_t ohw_parallel_times = div_ceil(ohw, m_ohw_tile_size); size_t GROUP = param.filter_meta.group; - size_t IC = param.filter_meta.icpg; - size_t OC = param.filter_meta.ocpg; - size_t PH = param.filter_meta.padding[0]; - size_t PW = param.filter_meta.padding[1]; WorkspaceBundle 
bundle = get_bundle(param); - WorkspaceBundle bundle_thread = get_thread_bundle(param); - - size_t oc_parallel_times = div_ceil(OC, m_oc_tile_size); + WorkspaceBundle bundle_thread = {nullptr, {}}; + size_t oc_parallel_times = div_ceil(OC, m_oc_tile_size); bool need_padding = (PH != 0 || PW != 0); - bool default_pack = m_matmul_algo->packmode() == Pack_Mode::DEFAULT; - bool no_pack = m_matmul_algo->packmode() == Pack_Mode::NO_PACK; - bool only_packA = m_matmul_algo->packmode() == Pack_Mode::ONLY_PACKA; + Pack_Mode packmode = m_matmul_algo->packmode(); + bool default_pack = packmode == Pack_Mode::DEFAULT; + bool no_pack = packmode == Pack_Mode::NO_PACK; + bool only_packA = packmode == Pack_Mode::ONLY_PACKA; size_t packa_parallel_times = 0; if (only_packA) { - packa_parallel_times = div_ceil(OC, m_oc_tile_size); + packa_parallel_times = div_ceil(OC, m_oc_tile_size); } else if (default_pack) { - packa_parallel_times = - div_ceil(OC, m_matmul_algo->get_inner_block_size().m); + packa_parallel_times = div_ceil( + OC, m_matmul_algo->get_inner_block_size().m); } auto matmul_param = get_matmul_kern_param( param, m_ohw_tile_size, only_packA ? 
m_oc_tile_size : OC); + if (m_matmul_algo->packmode() == Pack_Mode::DEFAULT) { + Im2colKerns defaultkern; + bundle_thread = defaultkern.get_thread_bundle( + param, matmul_param, m_matmul_algo, m_ohw_tile_size, + m_oc_tile_size); + } else if (m_matmul_algo->packmode() == Pack_Mode::ONLY_PACKA) { + Im2colKerns onlypackakern; + bundle_thread = onlypackakern.get_thread_bundle( + param, matmul_param, m_matmul_algo, m_ohw_tile_size, + m_oc_tile_size); + } else { + Im2colKerns nopackkern; + bundle_thread = nopackkern.get_thread_bundle( + param, matmul_param, m_matmul_algo, m_ohw_tile_size, + m_oc_tile_size); + } - SmallVector ret_kern; + StrategyParam strategyparam; + strategyparam.ohw = ohw; + strategyparam.is_dst_8bit = + (param.src_type.enumv() == DTypeEnum::QuantizedS8 && + param.dst_type.enumv() == DTypeEnum::QuantizedS8) || + (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && + param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); + strategyparam.is_ohw_size_bigger = (m_ohw_tile_size >= ohw); + strategyparam.skip_copy_dst = + strategyparam.is_ohw_size_bigger && !strategyparam.is_dst_8bit; + strategyparam.oc_tile_size = m_oc_tile_size; -#define RETURN_KERNS() \ - if (default_pack) { \ - ret_kern.push_back( \ - {kern_default_packA, {GROUP, 1_z, packa_parallel_times}}); \ - } \ - if (only_packA) { \ - ret_kern.push_back( \ - {kern_only_packA, {GROUP, 1_z, packa_parallel_times}}); \ - } \ - if (need_padding) { \ - ret_kern.push_back({kern_padding, {GROUP, param.n, IC}}); \ - } \ - if (default_pack) { \ - ret_kern.push_back( \ - {kern_compute_default, \ - {GROUP, param.n, ohw_parallel_times, oc_parallel_times}}); \ - } \ - if (no_pack) { \ - ret_kern.push_back( \ - {kern_compute_nopack, \ - {GROUP, param.n, ohw_parallel_times, oc_parallel_times}}); \ - } \ - if (only_packA) { \ - ret_kern.push_back( \ - {kern_compute_onlypackA, \ - {GROUP, param.n, ohw_parallel_times, oc_parallel_times}}); \ - } \ - return ret_kern; - -#define COMPUTE_KERN(_name, _pack_mode, _dt, 
_post_ctype, _postprocess_mode) \ - auto kern_compute_##_name = [bundle, bundle_thread, matmul_param, \ - matmul_algo = m_matmul_algo, \ - ohw_tile_size = m_ohw_tile_size, \ - oc_tile_size = m_oc_tile_size]( \ - const NCBKernParam& param, \ - const NCBKernIndex& ncb_index) { \ - Im2colKerns<_pack_mode>::kerns<_dt, _dt, _dt, _post_ctype, \ - _post_ctype, _postprocess_mode>( \ - bundle, bundle_thread, param, matmul_param, matmul_algo, \ - ncb_index, ohw_tile_size, oc_tile_size); \ - }; - -#define cb(_dt, _post_ctype, _postprocess_mode, _midout_tags) \ - do { \ - if (param.filter_type.enumv() == DTypeTrait<_dt>::enumv) { \ - MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 1, _midout_tags) { \ - auto kern_padding = [bundle](const NCBKernParam& param, \ - const NCBKernIndex& ncb_index) { \ - copy_padding_kern<_dt>(bundle, param, ncb_index); \ - }; \ - auto kern_default_packA = \ - [bundle, matmul_algo = m_matmul_algo, matmul_param]( \ - const NCBKernParam& param, \ - const NCBKernIndex& ncb_index) { \ - Im2colKerns::packA_kern<_dt>( \ - bundle, param, matmul_param, matmul_algo, \ - ncb_index); \ - }; \ - auto kern_only_packA = [bundle, matmul_algo = m_matmul_algo, \ - matmul_param]( \ - const NCBKernParam& param, \ - const NCBKernIndex& \ - ncb_index) { \ - Im2colKerns::packA_kern<_dt>( \ - bundle, param, matmul_param, matmul_algo, \ - ncb_index); \ - }; \ - COMPUTE_KERN(default, Pack_Mode::DEFAULT, _dt, _post_ctype, \ - _postprocess_mode); \ - COMPUTE_KERN(nopack, Pack_Mode::NO_PACK, _dt, _post_ctype, \ - _postprocess_mode); \ - COMPUTE_KERN(onlypackA, Pack_Mode::ONLY_PACKA, _dt, \ - _post_ctype, _postprocess_mode); \ - RETURN_KERNS(); \ - } \ - MIDOUT_END(); \ - return {}; \ - } \ - } while (0); - - cb(dt_float32, dt_float32, PostprocessMode::FLOAT, 0); -#if !MEGDNN_DISABLE_FLOAT16 - cb(dt_float16, dt_float16, PostprocessMode::NO_PROCESS, 2); -#endif -#undef cb -#undef COMPUTE_KERN - -#define COMPUTE_KERN(_name, _pack_mode, _src_ctype, _bias_ctype, _dst_ctype, \ - 
_i_bias_type, _i_dst_type, _postprocess_mode) \ - auto kern_compute_##_name = [bundle, bundle_thread, matmul_param, \ - matmul_algo = m_matmul_algo, \ - ohw_tile_size = m_ohw_tile_size, \ - oc_tile_size = m_oc_tile_size]( \ - const NCBKernParam& param, \ - const NCBKernIndex& ncb_index) { \ - Im2colKerns<_pack_mode>::kerns<_src_ctype, _bias_ctype, _dst_ctype, \ - DTypeTrait<_i_bias_type>::ctype, \ - DTypeTrait<_i_dst_type>::ctype, \ - _postprocess_mode>( \ - bundle, bundle_thread, param, matmul_param, matmul_algo, \ - ncb_index, ohw_tile_size, oc_tile_size); \ - }; - -#define cb(_i_src_type, _i_bias_type, _i_dst_type, _src_ctype, _bias_ctype, \ - _dst_ctype, _postprocess_mode, _midout_tags) \ - do { \ - if (param.filter_type.enumv() == param.src_type.enumv() && \ - param.src_type.enumv() == DTypeTrait<_i_src_type>::enumv && \ - param.dst_type.enumv() == DTypeTrait<_i_dst_type>::enumv) { \ - MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 1, _midout_tags) { \ - auto kern_padding = [bundle](const NCBKernParam& param, \ - const NCBKernIndex& ncb_index) { \ - copy_padding_kern<_src_ctype>(bundle, param, ncb_index); \ - }; \ - auto kern_default_packA = [bundle, \ - matmul_algo = m_matmul_algo, \ - matmul_param]( \ - const NCBKernParam& param, \ - const NCBKernIndex& \ - ncb_index) { \ - Im2colKerns::packA_kern<_src_ctype>( \ - bundle, param, matmul_param, matmul_algo, \ - ncb_index); \ - }; \ - auto kern_only_packA = \ - [bundle, matmul_algo = m_matmul_algo, matmul_param]( \ - const NCBKernParam& param, \ - const NCBKernIndex& ncb_index) { \ - Im2colKerns::packA_kern< \ - _src_ctype>(bundle, param, matmul_param, \ - matmul_algo, ncb_index); \ - }; \ - COMPUTE_KERN(default, Pack_Mode::DEFAULT, _src_ctype, \ - _bias_ctype, _dst_ctype, _i_bias_type, \ - _i_dst_type, _postprocess_mode); \ - COMPUTE_KERN(nopack, Pack_Mode::NO_PACK, _src_ctype, \ - _bias_ctype, _dst_ctype, _i_bias_type, \ - _i_dst_type, _postprocess_mode); \ - COMPUTE_KERN(onlypackA, Pack_Mode::ONLY_PACKA, 
_src_ctype, \ - _bias_ctype, _dst_ctype, _i_bias_type, \ - _i_dst_type, _postprocess_mode); \ - RETURN_KERNS(); \ - } \ - MIDOUT_END(); \ - return {}; \ - } \ - } while (0); - - cb(dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, dt_int32, - PostprocessMode::NO_PROCESS, 3); - - cb(dt_int8, dt_int16, dt_int16, dt_int8, dt_int16, dt_int16, - PostprocessMode::NO_PROCESS, 4); - - cb(dtype::QuantizedS8, dtype::QuantizedS32, dtype::QuantizedS32, - dt_int8, dt_int32, dt_int32, PostprocessMode::NO_PROCESS, 7); - - cb(dtype::QuantizedS8, dtype::QuantizedS32, dtype::QuantizedS8, dt_int8, - dt_int32, dt_int8, PostprocessMode::QUANTIZED, 8); -#undef COMPUTE_KERN -#undef RETURN_KERNS -#undef cb - megdnn_throw("unsupported data type on im2col matmul algo"); + SmallVector ret_kern; + MIDOUT_BEGIN( + megdnn_fallback_im2col, + midout_iv("ConvBiasImpl::AlgoIm2col::dispatch_kerns"_hash)) { + StrategyBase* im2colstrategy = Factory::get_im2col_strategy( + param, m_matmul_algo, opr->param().format); + auto kern_padding = [bundle, im2colstrategy]( + const NCBKernParam& param, + const NCBKernIndex& ncb_index) { + copy_padding_kern(bundle, param, ncb_index, im2colstrategy); + }; + + auto kern_packA = [bundle, matmul_algo = m_matmul_algo, + matmul_param, + im2colstrategy](const NCBKernParam& param, + const NCBKernIndex& ncb_index) { + packA_kern(bundle, param, matmul_param, matmul_algo, ncb_index, + im2colstrategy); + }; + if (default_pack) { + auto kern_compute_default = + [bundle, bundle_thread, matmul_param, + matmul_algo = m_matmul_algo, + ohw_tile_size = m_ohw_tile_size, + strategyparam = strategyparam, + im2colstrategy](const NCBKernParam& param, + const NCBKernIndex& ncb_index) { + Im2colKerns::kerns( + bundle, bundle_thread, param, matmul_param, + matmul_algo, strategyparam, ncb_index, + ohw_tile_size, im2colstrategy); + }; + ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); + + if (need_padding) { + ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); + } + 
ret_kern.push_back( + {kern_compute_default, + {N, GROUP, ohw_parallel_times, oc_parallel_times}}); + } else if (only_packA) { + auto kern_compute_onlypackA = + [bundle, bundle_thread, matmul_param, + matmul_algo = m_matmul_algo, + strategyparam = strategyparam, + ohw_tile_size = m_ohw_tile_size, + im2colstrategy](const NCBKernParam& param, + const NCBKernIndex& ncb_index) { + Im2colKerns::kerns( + bundle, bundle_thread, param, matmul_param, + matmul_algo, strategyparam, ncb_index, + ohw_tile_size, im2colstrategy); + }; + ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); + if (need_padding) { + ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); + } + ret_kern.push_back( + {kern_compute_onlypackA, + {N, GROUP, ohw_parallel_times, oc_parallel_times}}); + } else if (no_pack) { + auto kern_compute_nopack = + [bundle, bundle_thread, matmul_param, + matmul_algo = m_matmul_algo, + strategyparam = strategyparam, + ohw_tile_size = m_ohw_tile_size, + im2colstrategy](const NCBKernParam& param, + const NCBKernIndex& ncb_index) { + Im2colKerns::kerns( + bundle, bundle_thread, param, matmul_param, + matmul_algo, strategyparam, ncb_index, + ohw_tile_size, im2colstrategy); + }; + + if (need_padding) { + ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); + } + ret_kern.push_back( + {kern_compute_nopack, + {N, GROUP, ohw_parallel_times, oc_parallel_times}}); + } + return ret_kern; + } + MIDOUT_END(); + return {}; } MIDOUT_END(); return {}; @@ -977,8 +634,14 @@ bool ConvBiasImpl::AlgoIm2col::usable( bool matmulusable = m_matmul_algo->usable(matmul_param); return matmulusable && (opr->param().format == param::ConvBias::Format::NCHW) && - (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && - (param.filter_meta.spatial[0] <= 7)) && + ((param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && + (param.filter_meta.spatial[0] <= 7) && + (param.filter_meta.spatial[0] >= 2)) || + (param.filter_meta.spatial[0] != param.filter_meta.spatial[1] && 
+ (param.filter_meta.spatial[0] <= 7) && + (param.filter_meta.spatial[0] >= 1) && + (param.filter_meta.spatial[1] <= 7) && + (param.filter_meta.spatial[1] >= 1))) && (param.filter_meta.dilation[0] == param.filter_meta.dilation[1] && param.filter_meta.dilation[0] == 1) && diff --git a/dnn/src/fallback/conv_bias/im2col/algos.h b/dnn/src/fallback/conv_bias/im2col/algos.h index 80569e6a..5f65ddcb 100644 --- a/dnn/src/fallback/conv_bias/im2col/algos.h +++ b/dnn/src/fallback/conv_bias/im2col/algos.h @@ -67,8 +67,7 @@ public: } auto&& fm = param.filter_meta; auto OC = fm.ocpg, IC = fm.icpg; - return (fm.spatial[0] == fm.spatial[1] && fm.spatial[0] == 1) || - OC >= 32 || IC >= 32; + return OC >= 32 || IC >= 32; } private: diff --git a/dnn/src/fallback/conv_bias/im2col/factory.h b/dnn/src/fallback/conv_bias/im2col/factory.h new file mode 100644 index 00000000..2ab4300a --- /dev/null +++ b/dnn/src/fallback/conv_bias/im2col/factory.h @@ -0,0 +1,473 @@ +/** + * \file dnn/src/fallback/conv_bias/im2col/factory.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#pragma once +#include +#include "src/fallback/conv_bias/im2col/strategy_base.h" +#include "src/fallback/conv_bias/opr_impl.h" + +#include "midout.h" + +MIDOUT_DECL(megdnn_fallback_im2col_factory_make_strategy) + +namespace megdnn { +namespace fallback { +namespace im2col { + +enum class StrategyType : uint32_t { + FLOAT = 0, +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + FLOAT_FP16 = 1, +#else +#if !MEGDNN_DISABLE_FLOAT16 + FLOAT16_FLOAT16 = 2, +#endif +#endif + INT8x8x32 = 3, + INT8x8x16 = 4, +#if MEGDNN_AARCH64 || MEGDNN_ARMV7 + QUINT8x8x32 = 5, + QUINT8x8x32x8 = 6, +#endif + QINT8x8x32 = 7, + QINT8x8x32x8 = 8 +}; + +struct StrategyHashParam { + fallback::ConvBiasImpl::NCBKernSizeParam param; + param::ConvBias::Format format; + fallback::MatrixMulImpl::AlgoBase::PackMode packmode; + size_t block_m; + size_t block_n; + size_t block_k; +}; + +struct StrategyHashParamHash { + std::size_t operator()(const StrategyHashParam& sparam) const { + constexpr size_t base = 1; //! avoid hashkey is zero + std::size_t result = + static_cast(sparam.param.src_type.enumv()) + base; + result = result ^ + ((static_cast(sparam.param.dst_type.enumv()) + + base) + << 3); + result = result ^ + ((static_cast(sparam.param.filter_type.enumv()) + + base) + << 6); + result = result ^ + ((static_cast(sparam.param.bias_type.enumv()) + + base) + << 9); + result = result ^ + ((static_cast(sparam.format) + base) << 12); + result = result ^ + ((static_cast(sparam.packmode) + base) << 15); + result = result ^ + ((static_cast(sparam.block_m) + base) << 18); + result = result ^ + ((static_cast(sparam.block_n) + base) << 22); + result = result ^ + ((static_cast(sparam.block_k) + base) << 26); + return result; + }; +}; + +struct StrategyHashParamEqual { + std::size_t operator()(const StrategyHashParam& param1, + const StrategyHashParam& param2) const { + bool flags = true; + flags = param1.param.src_type == param2.param.src_type && flags; + flags = param1.param.filter_type == 
param2.param.filter_type && flags; + flags = param1.param.bias_type == param2.param.bias_type && flags; + flags = param1.param.dst_type == param2.param.dst_type && flags; + flags = param1.format == param2.format && flags; + flags = param1.packmode == param2.packmode && flags; + flags = param1.block_m == param2.block_m && flags; + flags = param1.block_n == param2.block_n && flags; + flags = param1.block_k == param2.block_k && flags; + return flags; + }; +}; + +class StrategyDelegationStorage { + std::mutex m_mtx; + std::unordered_map, + StrategyHashParamHash, StrategyHashParamEqual> + map_strategys; + +public: + ~StrategyDelegationStorage() = default; + + template + Strategy* get(param::ConvBias::Format format, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + StrategyType stype); +}; + +class Factory { +public: + static StrategyBase* get_im2col_strategy( + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + param::ConvBias::Format format) { + static StrategyDelegationStorage storage; + StrategyType strategytype = get_strategy_type(param); + return storage.get(format, matmul_algo, param, + strategytype); + } + + static StrategyType get_strategy_type( + const fallback::ConvBiasImpl::NCBKernSizeParam& param) { +#define cb1(_dt, _post_ctype, _strategytype) \ + if (param.filter_type.enumv() == DTypeTrait<_dt>::enumv) { \ + return _strategytype; \ + } + +#define cb2(_i_src_type, _i_bias_type, _i_dst_type, _src_ctype, _bias_ctype, \ + _dst_ctype, _strategytype) \ + if (param.filter_type.enumv() == param.src_type.enumv() && \ + param.src_type.enumv() == DTypeTrait<_i_src_type>::enumv && \ + param.dst_type.enumv() == DTypeTrait<_i_dst_type>::enumv) { \ + return _strategytype; \ + } + + cb1(dt_float32, dt_float32, StrategyType::FLOAT); +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + cb1(dt_float16, __fp16, StrategyType::FLOAT_FP16); +#else +#if 
!MEGDNN_DISABLE_FLOAT16 + cb1(dt_float16, dt_float16, StrategyType::FLOAT16_FLOAT16); +#endif +#endif + + cb2(dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, dt_int32, + StrategyType::INT8x8x32); + + cb2(dt_int8, dt_int16, dt_int16, dt_int8, dt_int16, dt_int16, + StrategyType::INT8x8x16); + +#if MEGDNN_AARCH64 || MEGDNN_ARMV7 + cb2(dtype::Quantized8Asymm, dtype::QuantizedS32, dtype::QuantizedS32, + dt_uint8, dt_int32, dt_int32, StrategyType::QUINT8x8x32); + + cb2(dtype::Quantized8Asymm, dtype::QuantizedS32, dtype::Quantized8Asymm, + dt_uint8, dt_int32, dt_uint8, StrategyType::QUINT8x8x32x8); +#endif + cb2(dtype::QuantizedS8, dtype::QuantizedS32, dtype::QuantizedS32, + dt_int8, dt_int32, dt_int32, StrategyType::QINT8x8x32); + + cb2(dtype::QuantizedS8, dtype::QuantizedS32, dtype::QuantizedS8, + dt_int8, dt_int32, dt_int8, StrategyType::QINT8x8x32x8); +#undef cb1 +#undef cb2 + megdnn_throw("not support datatype in im2col strategy\n"); + } + +#define cb1(_packmode, _dt, _post_ctype, _postprocess_mode, _midout_tag) \ + MIDOUT_BEGIN(megdnn_fallback_im2col_factory_make_strategy, \ + midout_iv(_midout_tag)) { \ + if (param.filter_type.enumv() == DTypeTrait<_dt>::enumv) { \ + return std::make_unique< \ + Strategy<_dt, _dt, _dt, _post_ctype, _post_ctype, \ + _postprocess_mode, PackMode::_packmode>>(); \ + } \ + } \ + MIDOUT_END(); \ + return {}; + +#define cb2(_packmode, _i_src_type, _i_bias_type, _i_dst_type, _src_ctype, \ + _bias_ctype, _dst_ctype, _postprocess_mode, _midout_tag) \ + MIDOUT_BEGIN(megdnn_fallback_im2col_factory_make_strategy, \ + midout_iv(_midout_tag)) { \ + if (param.filter_type.enumv() == param.src_type.enumv() && \ + param.src_type.enumv() == DTypeTrait<_i_src_type>::enumv && \ + param.dst_type.enumv() == DTypeTrait<_i_dst_type>::enumv) { \ + return std::make_unique< \ + Strategy<_src_ctype, _bias_ctype, _dst_ctype, \ + DTypeTrait<_i_bias_type>::ctype, \ + DTypeTrait<_i_dst_type>::ctype, \ + _postprocess_mode, PackMode::_packmode>>(); \ + } \ + } \ + 
MIDOUT_END(); \ + return {}; + + static std::unique_ptr make_default_strategy( + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + param::ConvBias::Format format, StrategyType strategytype) { + MEGDNN_MARK_USED_VAR(matmul_algo); + MEGDNN_MARK_USED_VAR(format); + switch (strategytype) { + case StrategyType::FLOAT: + cb1(DEFAULT, dt_float32, dt_float32, PostprocessMode::FLOAT, + "DefaultStrategyType::FLOAT"_hash); + break; +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case StrategyType::FLOAT_FP16: + cb1(DEFAULT, dt_float16, __fp16, PostprocessMode::FLOAT, + "DefaultStrategyType::FLOAT_FP16"_hash); + break; +#else +#if !MEGDNN_DISABLE_FLOAT16 + case StrategyType::FLOAT16_FLOAT16: + cb1(DEFAULT, dt_float16, dt_float16, + PostprocessMode::NO_PROCESS, + "DefaultStrategyType::FLOAT16_FLOAT16"_hash); + break; +#endif +#endif + case StrategyType::INT8x8x32: + cb2(DEFAULT, dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, + dt_int32, PostprocessMode::NO_PROCESS, + "DefaultStrategyType::INT8x8x32"_hash); + break; + + case StrategyType::INT8x8x16: + cb2(DEFAULT, dt_int8, dt_int16, dt_int16, dt_int8, dt_int16, + dt_int16, PostprocessMode::NO_PROCESS, + "DefaultStrategyType::INT8x8x16"_hash); + break; +#if MEGDNN_AARCH64 || MEGDNN_ARMV7 + case StrategyType::QUINT8x8x32: + cb2(DEFAULT, dtype::Quantized8Asymm, dtype::QuantizedS32, + dtype::QuantizedS32, dt_uint8, dt_int32, dt_int32, + PostprocessMode::NO_PROCESS, + "DefaultStrategyType::QUINT8x8x32"_hash); + break; + + case StrategyType::QUINT8x8x32x8: + cb2(DEFAULT, dtype::Quantized8Asymm, dtype::QuantizedS32, + dtype::Quantized8Asymm, dt_uint8, dt_int32, dt_uint8, + PostprocessMode::QUANTIZED, + "DefaultStrategyType::QUINT8x8x32x8"_hash); + break; +#endif + case StrategyType::QINT8x8x32: + cb2(DEFAULT, dtype::QuantizedS8, dtype::QuantizedS32, + dtype::QuantizedS32, dt_int8, dt_int32, dt_int32, + PostprocessMode::NO_PROCESS, + "DefaultStrategyType::QINT8x8x32"_hash); + break; 
+ + case StrategyType::QINT8x8x32x8: + cb2(DEFAULT, dtype::QuantizedS8, dtype::QuantizedS32, + dtype::QuantizedS8, dt_int8, dt_int32, dt_int8, + PostprocessMode::QUANTIZED, + "DefaultStrategyType::QINT8x8x32x8"_hash); + break; + } + megdnn_throw("error not support strategy type "); + } + + static std::unique_ptr make_nopack_strategy( + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + param::ConvBias::Format format, StrategyType strategytype) { + MEGDNN_MARK_USED_VAR(matmul_algo); + MEGDNN_MARK_USED_VAR(format); + switch (strategytype) { + case StrategyType::FLOAT: + cb1(NO_PACK, dt_float32, dt_float32, PostprocessMode::FLOAT, + "NoPackStrategyType::FLOAT"_hash); + break; +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case StrategyType::FLOAT_FP16: + cb1(NO_PACK, dt_float16, __fp16, PostprocessMode::FLOAT, + "NoPackStrategyType::FLOAT_FP16"_hash); + break; +#else +#if !MEGDNN_DISABLE_FLOAT16 + case StrategyType::FLOAT16_FLOAT16: + cb1(NO_PACK, dt_float16, dt_float16, PostprocessMode::NO_PROCESS, + "NoPackStrategyType::FLOAT16_FLOAT16"_hash); + break; +#endif +#endif + case StrategyType::INT8x8x32: + cb2(NO_PACK, dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, + dt_int32, PostprocessMode::NO_PROCESS, + "NoPackStrategyType::INT8x8x32"_hash); + break; + + case StrategyType::INT8x8x16: + cb2(NO_PACK, dt_int8, dt_int16, dt_int16, dt_int8, dt_int16, + dt_int16, PostprocessMode::NO_PROCESS, + "NoPackStrategyType::INT8x8x16"_hash); + break; + +#if MEGDNN_AARCH64 || MEGDNN_ARMV7 + case StrategyType::QUINT8x8x32: + cb2(NO_PACK, dtype::Quantized8Asymm, dtype::QuantizedS32, + dtype::QuantizedS32, dt_uint8, dt_int32, dt_int32, + PostprocessMode::NO_PROCESS, + "NoPackStrategyType::QUINT8x8x32"_hash); + break; + + case StrategyType::QUINT8x8x32x8: + cb2(NO_PACK, dtype::Quantized8Asymm, dtype::QuantizedS32, + dtype::Quantized8Asymm, dt_uint8, dt_int32, dt_uint8, + PostprocessMode::QUANTIZED, + 
"NoPackStrategyType::QUINT8x8x32x8"_hash); + break; +#endif + case StrategyType::QINT8x8x32: + cb2(NO_PACK, dtype::QuantizedS8, dtype::QuantizedS32, + dtype::QuantizedS32, dt_int8, dt_int32, dt_int32, + PostprocessMode::NO_PROCESS, + "NoPackStrategyType::QINT8x8x32"_hash); + break; + + case StrategyType::QINT8x8x32x8: + cb2(NO_PACK, dtype::QuantizedS8, dtype::QuantizedS32, + dtype::QuantizedS8, dt_int8, dt_int32, dt_int8, + PostprocessMode::QUANTIZED, + "NoPackStrategyType::QINT8x8x32x8"_hash); + break; + } + megdnn_throw("error not support strategy type "); + } + + static std::unique_ptr make_onlypacka_strategy( + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + param::ConvBias::Format format, StrategyType strategytype) { + MEGDNN_MARK_USED_VAR(matmul_algo); + MEGDNN_MARK_USED_VAR(format); + switch (strategytype) { + case StrategyType::FLOAT: + cb1(ONLY_PACKA, dt_float32, dt_float32, PostprocessMode::FLOAT, + "OnlyPackaStrategyType::FLOAT"_hash); + break; +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case StrategyType::FLOAT_FP16: + cb1(ONLY_PACKA, dt_float16, __fp16, PostprocessMode::FLOAT, + "OnlyPackaStrategyType::FLOAT_FP16"_hash); + break; +#else +#if !MEGDNN_DISABLE_FLOAT16 + case StrategyType::FLOAT16_FLOAT16: + cb1(ONLY_PACKA, dt_float16, dt_float16, + PostprocessMode::NO_PROCESS, + "OnlyPackaStrategyType::FLOAT16_FLOAT16"_hash); + break; +#endif +#endif + case StrategyType::INT8x8x32: + cb2(ONLY_PACKA, dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, + dt_int32, PostprocessMode::NO_PROCESS, + "OnlyPackaStrategyType::INT8x8x32"_hash); + break; + + case StrategyType::INT8x8x16: + cb2(ONLY_PACKA, dt_int8, dt_int16, dt_int16, dt_int8, dt_int16, + dt_int16, PostprocessMode::NO_PROCESS, + "OnlyPackaStrategyType::INT8x8x16"_hash); + break; + +#if MEGDNN_AARCH64 || MEGDNN_ARMV7 + case StrategyType::QUINT8x8x32: + cb2(ONLY_PACKA, dtype::Quantized8Asymm, dtype::QuantizedS32, + dtype::QuantizedS32, dt_uint8, 
dt_int32, dt_int32, + PostprocessMode::NO_PROCESS, + "OnlyPackaStrategyType::QUINT8x8x32"_hash); + break; + + case StrategyType::QUINT8x8x32x8: + cb2(ONLY_PACKA, dtype::Quantized8Asymm, dtype::QuantizedS32, + dtype::Quantized8Asymm, dt_uint8, dt_int32, dt_uint8, + PostprocessMode::QUANTIZED, + "OnlyPackaStrategyType::QUINT8x8x32x8"_hash); + break; +#endif + case StrategyType::QINT8x8x32: + cb2(ONLY_PACKA, dtype::QuantizedS8, dtype::QuantizedS32, + dtype::QuantizedS32, dt_int8, dt_int32, dt_int32, + PostprocessMode::NO_PROCESS, + "OnlyPackaStrategyType::QINT8x8x32"_hash); + break; + + case StrategyType::QINT8x8x32x8: + cb2(ONLY_PACKA, dtype::QuantizedS8, dtype::QuantizedS32, + dtype::QuantizedS8, dt_int8, dt_int32, dt_int8, + PostprocessMode::QUANTIZED, + "OnlyPackaStrategyType::QINT8x8x32x8"_hash); + break; + } + megdnn_throw("error not support strategy type "); + } + +#undef cb1 +#undef cb2 + + static std::unique_ptr make_strategy( + param::ConvBias::Format format, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + fallback::MatrixMulImpl::AlgoBase::PackMode packmode, + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + StrategyType stype) { + switch (packmode) { + case MatrixMulImpl::AlgoBase::PackMode::DEFAULT: + return make_default_strategy(matmul_algo, param, format, stype); + break; + case MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA: + return make_onlypacka_strategy(matmul_algo, param, format, + stype); + break; + case MatrixMulImpl::AlgoBase::PackMode::NO_PACK: + return make_nopack_strategy(matmul_algo, param, format, stype); + break; + default: + megdnn_throw( + "not support packmode except default onlypackA " + "nopack"); + break; + } + megdnn_throw( + "factory make Strategy error please check your code"); + } +}; + +template +Strategy* StrategyDelegationStorage::get( + param::ConvBias::Format format, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + StrategyType stype) { + 
fallback::MatrixMulImpl::AlgoBase::PackMode packmode = + matmul_algo->packmode(); + //! nopack mode block_m block_n block_k is zero + size_t block_m = 0, block_n = 0, block_k = 0; + if (packmode == fallback::MatrixMulImpl::AlgoBase::PackMode::DEFAULT || + packmode == fallback::MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { + block_m = matmul_algo->get_inner_block_size().m; + block_n = matmul_algo->get_inner_block_size().n; + block_k = matmul_algo->get_inner_block_size().k; + } + StrategyHashParam sparam; + sparam.param = param; + sparam.format = format; + sparam.packmode = packmode; + sparam.block_m = block_m; + sparam.block_n = block_n; + sparam.block_k = block_k; + if (map_strategys.find(sparam) == map_strategys.end()) { + MEGDNN_LOCK_GUARD(m_mtx); + auto strategy = Factory::make_strategy(format, matmul_algo, packmode, + param, stype); + map_strategys[sparam] = std::move(strategy); + } + return static_cast(map_strategys[sparam].get()); +} +} // namespace im2col +} // namespace fallback +} // namespace megdnn diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_base.h b/dnn/src/fallback/conv_bias/im2col/strategy_base.h new file mode 100644 index 00000000..ed27f8dc --- /dev/null +++ b/dnn/src/fallback/conv_bias/im2col/strategy_base.h @@ -0,0 +1,259 @@ +/** + * \file dnn/src/fallback/conv_bias/im2col/strategy_base.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#pragma once +#include "src/fallback/conv_bias/opr_impl.h" +namespace megdnn { + +using PackMode = fallback::MatrixMulImpl::AlgoBase::PackMode; + +struct StrategyParam { + size_t batch_id; + size_t group_id; + size_t oc_tile_size; + size_t oc_cur_index; + size_t oc_end_index; + size_t ohw_cur_index; + size_t output_block_size; + size_t output_block_oc_size; + size_t ohw; + size_t block_m; + size_t block_n; + size_t block_k; + bool skip_copy_dst; + bool is_dst_8bit; + bool is_ohw_size_bigger; +}; + +class StrategyBase { +public: + StrategyBase() = default; + virtual ~StrategyBase() = default; + virtual void copy_padding_kern( + WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) = 0; + virtual void packA_kern( + WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernSizeParam matmulparam, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) = 0; + + virtual void exec_im2col( + WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + const StrategyParam& sparam, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernParam matmul_param, + fallback::MatrixMulImpl::AlgoBase* matmul_algo) = 0; + + virtual void exec_matmul( + const fallback::ConvBiasImpl::NCBKernParam& param, + const StrategyParam& sparam, WorkspaceBundle bundle, + WorkspaceBundle bundle_thread, + fallback::MatrixMulImpl::KernParam matmul_param, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) = 0; + + virtual void exec_postprocess( + const fallback::ConvBiasImpl::NCBKernParam& param, + const StrategyParam& sparam, WorkspaceBundle bundle_thread) = 0; +}; + +template +class Strategy; + +template +class Strategy : public StrategyBase { +public: + constexpr static size_t BUNDLE_PADDING_INDEX = 0; + constexpr static size_t 
BUNDLE_PACKA_INDEX = 1; + constexpr static size_t THREAD_BUNDLE_PACKB_INDEX = 0; + constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1; + constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2; + + Strategy(); + + void copy_padding_kern( + WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; + + void packA_kern( + WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernSizeParam matmulparam, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; + void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + const StrategyParam& sparam, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernParam matmul_param, + fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; + + void exec_matmul( + const fallback::ConvBiasImpl::NCBKernParam& param, + const StrategyParam& sparam, WorkspaceBundle bundle, + WorkspaceBundle bundle_thread, + fallback::MatrixMulImpl::KernParam matmul_param, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; + void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, + const StrategyParam& sparam, + WorkspaceBundle bundle_thread) override; + + void copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param, + const void* matmul_dst, const StrategyParam& sparam); + + void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, + WorkspaceBundle bundle_thread, const StrategyParam& sparam); + + void* get_bias_temp_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, + const WorkspaceBundle& bundle_thread); + void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, + const WorkspaceBundle& bundle_thread, + const StrategyParam& sparam); +}; + +template +class Strategy : public StrategyBase { +public: + 
constexpr static size_t BUNDLE_PADDING_INDEX = 0; + constexpr static size_t BUNDLE_PACKA_INDEX = 1; + constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 0; + constexpr static size_t THREAD_BUNDLE_MATMULDST_INDEX = 1; + constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2; + constexpr static size_t THREAD_BUNDLE_MATCOMP_INDEX = 3; + + Strategy(); + + void copy_padding_kern( + WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; + + void packA_kern( + WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernSizeParam matmulparam, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; + + void exec_matmul( + const fallback::ConvBiasImpl::NCBKernParam& param, + const StrategyParam& sparam, WorkspaceBundle bundle, + WorkspaceBundle bundle_thread, + fallback::MatrixMulImpl::KernParam matmul_param, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; + + void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, + const WorkspaceBundle& bundle_thread, + const StrategyParam& sparam); + + inline void* get_bias_temp_ptr( + const fallback::ConvBiasImpl::NCBKernParam& param, + const WorkspaceBundle& bundle_thread) { + bias_ctype* bias_tmp_ptr = + param.bias_mode == megdnn::BiasMode::BIAS + ? 
static_cast( + bundle_thread.get(THREAD_BUNDLE_BIAS_INDEX)) + : nullptr; + return bias_tmp_ptr; + } + + void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + const StrategyParam& sparam, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernParam matmul_param, + fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; + void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, + const StrategyParam& sparam, + WorkspaceBundle bundle_thread) override; + void copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param, + const void* matmul_dst, const StrategyParam& sparam); + + void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, + WorkspaceBundle bundle_thread, const StrategyParam& sparam); +}; + +template +class Strategy : public StrategyBase { +public: + constexpr static size_t BUNDLE_PADDING_INDEX = 0; + constexpr static size_t BUNDLE_PACKA_INDEX = 1; + constexpr static size_t THREAD_BUNDLE_PACKB_INDEX = 0; + constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1; + constexpr static size_t THREAD_BUNDLE_MATMULDST_INDEX = 2; + constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 3; + + Strategy(); + + void copy_padding_kern( + WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; + + void packA_kern( + WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernSizeParam matmulparam, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; + + void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + const StrategyParam& sparam, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernParam matmul_param, + fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; + + void exec_matmul( + const fallback::ConvBiasImpl::NCBKernParam& param, + const 
StrategyParam& sparam, WorkspaceBundle bundle,
+            WorkspaceBundle bundle_thread,
+            fallback::MatrixMulImpl::KernParam matmul_param,
+            fallback::MatrixMulImpl::AlgoBase* matmul_algo,
+            const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override;
+
+    void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param,
+                             const WorkspaceBundle& bundle_thread,
+                             const StrategyParam& sparam);
+    inline void* get_bias_temp_ptr(
+            const fallback::ConvBiasImpl::NCBKernParam& param,
+            const WorkspaceBundle& bundle_thread) {
+        bias_ctype* bias_tmp_ptr =
+                param.bias_mode == megdnn::BiasMode::BIAS
+                        ? static_cast(
+                                  bundle_thread.get(THREAD_BUNDLE_BIAS_INDEX))
+                        : nullptr;
+        return bias_tmp_ptr;
+    }
+    void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param,
+                          const StrategyParam& sparam,
+                          WorkspaceBundle bundle_thread) override;
+    void copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param,
+                  const void* matmul_dst, const StrategyParam& sparam);
+
+    void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param,
+                   WorkspaceBundle bundle_thread, const StrategyParam& sparam);
+};
+} // namespace megdnn
diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp
new file mode 100644
index 00000000..75b0503b
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp
@@ -0,0 +1,379 @@
+/**
+ * \file dnn/src/fallback/conv_bias/im2col/strategy_default.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */ +#include "megdnn/opr_param_defs.h" +#include "src/common/utils.h" +#include "src/fallback/conv_bias/im2col/strategy_base.h" +#include "src/fallback/convolution/img2col_helper.h" +#if MEGDNN_X86 +#include "src/x86/conv_bias/postprocess_helper.h" +#endif + +using namespace megdnn; +#if MEGDNN_X86 +using namespace x86; +#endif +namespace megdnn { + +template +Strategy::Strategy() + : StrategyBase() {} + +template +void Strategy:: + copy_padding_kern( + WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) { + UNPACK_CONV_F32_NCB_KERN_SIZES(param); + MEGDNN_MARK_USED_VAR(N); + MEGDNN_MARK_USED_VAR(OC); + MEGDNN_MARK_USED_VAR(OH); + MEGDNN_MARK_USED_VAR(OW); + MEGDNN_MARK_USED_VAR(FH); + MEGDNN_MARK_USED_VAR(FW); + MEGDNN_MARK_USED_VAR(SH); + MEGDNN_MARK_USED_VAR(SW); + + size_t IW2 = IW + 2 * PW; + size_t IH2 = IH + 2 * PH; + size_t batch_id = ncb_index.ndrange_id[0]; + size_t group_id = ncb_index.ndrange_id[1]; + size_t channel_id = ncb_index.ndrange_id[2]; + + size_t padding_group_size = IH2 * IW2 * IC; + size_t workspace_channel_offset = IH2 * IW2 * channel_id; + size_t workspace_group_offset = group_id * padding_group_size; + size_t workspace_batch_offset = + param.filter_meta.group * batch_id * padding_group_size; + bundle.set(param.workspace_ptr); + + src_ctype src_zp = static_cast(0); + if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) { + src_zp = param.src_type.param().zero_point; + } + src_ctype* src = const_cast( + param.src(batch_id, group_id, channel_id)); + src_ctype* src2; + src2 = static_cast(bundle.get(BUNDLE_PADDING_INDEX)) + + workspace_group_offset + workspace_batch_offset + + workspace_channel_offset; + src_ctype* src2_ptr = src2; + const src_ctype* src_ptr = src; + if (PH != 0) { + std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2); + src2_ptr += PH * IW2; + } + rep(ih, IH) { + if (PW != 0) + rep(pw, PW) * (src2_ptr++) = src_zp; + 
std::memcpy(src2_ptr, src_ptr, sizeof(src_ctype) * IW); + src2_ptr += IW; + src_ptr += IW; + if (PW != 0) + rep(pw, PW) * (src2_ptr++) = src_zp; + } + if (PH != 0) { + std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2); + src2_ptr += PH * IW2; + } +} + +template +void Strategy:: + packA_kern(WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernSizeParam matmulparam, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) { + bundle.set(param.workspace_ptr); + fallback::MatrixMulImpl::KernParam matmul_param; + size_t group_id = ncb_index.ndrange_id[0]; + static_cast(matmul_param) = + matmulparam; + size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0); + size_t packed_per_oc_block_size = + round_up(matmul_param.K, matmul_algo->get_inner_block_size().k) * + matmul_algo->get_inner_block_size().m * + matmul_algo->get_packA_type_size(); + size_t a_panel_offset = ncb_index.ndrange_id[1] * packed_per_oc_block_size; + int8_t* a_panel = static_cast(bundle.get(BUNDLE_PACKA_INDEX)) + + group_id * packA_group_size + a_panel_offset; + matmul_param.A_ptr = + const_cast(param.filter(group_id)); + matmul_algo->pack_A(matmul_param, a_panel, ncb_index.ndrange_id[1], + matmul_algo->get_inner_block_size().m); +} + +template +void Strategy:: + exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + const StrategyParam& sparam, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernParam matmul_param, + fallback::MatrixMulImpl::AlgoBase* matmul_algo + ) { + size_t m_sh = param.filter_meta.stride[0]; + size_t m_sw = param.filter_meta.stride[1]; + size_t m_oc = param.filter_meta.ocpg; + size_t m_oh = param.osz[0]; + size_t m_ow = param.osz[1]; + size_t m_ic = param.filter_meta.icpg; + size_t m_ih = param.isz[0] + param.filter_meta.padding[0] * 2; + size_t m_iw = param.isz[1] + param.filter_meta.padding[1] * 2; + 
size_t m_fh = param.filter_meta.spatial[0]; + size_t m_fw = param.filter_meta.spatial[1]; + size_t m_is_xcorr = !param.filter_meta.should_flip; + + size_t input_offset = + m_ih * m_iw * m_ic * + (sparam.group_id + param.filter_meta.group * sparam.batch_id) * + sizeof(src_ctype); + + src_ctype* src2 = reinterpret_cast( + reinterpret_cast(bundle.get(BUNDLE_PADDING_INDEX)) + + input_offset); + bool is_phpwzero = param.filter_meta.padding[0] == 0 && + param.filter_meta.padding[1] == 0; + if (is_phpwzero) { + src2 = const_cast( + param.src(sparam.batch_id, sparam.group_id)); + } + src_ctype* im2col_dst = static_cast( + bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX)); + if (m_sh == 1 && m_sw == 1) { + if (m_is_xcorr) { + img2col(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw, + m_fh, m_fw, sparam.ohw_cur_index, + sparam.output_block_size); + } else { + img2col(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw, + m_fh, m_fw, sparam.ohw_cur_index, + sparam.output_block_size); + } + } else { + if (m_is_xcorr) { + img2col_stride(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, + m_iw, m_fh, m_fw, m_sh, m_sw, + sparam.ohw_cur_index, + sparam.output_block_size); + } else { + img2col_stride(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, + m_ih, m_iw, m_fh, m_fw, m_sh, m_sw, + sparam.ohw_cur_index, + sparam.output_block_size); + } + } + matmul_param.M = sparam.output_block_oc_size; + matmul_param.N = sparam.output_block_size; + matmul_param.LDB = sparam.output_block_size; + matmul_param.LDC = sparam.output_block_size; + matmul_param.B_ptr = im2col_dst; + + src_ctype* b_panel = + reinterpret_cast(reinterpret_cast( + bundle_thread.get(THREAD_BUNDLE_PACKB_INDEX))); + matmul_algo->pack_B(matmul_param, b_panel, 0, matmul_param.N); +} + +template +void* Strategy:: + get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, + const WorkspaceBundle& bundle_thread, + const StrategyParam& sparam) { + if (sparam.is_dst_8bit || !sparam.is_ohw_size_bigger) { + return 
static_cast( + bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX)); + } else { + bias_ctype* dst = + param.dst(sparam.batch_id, sparam.group_id) + + sparam.oc_cur_index * sparam.ohw; + return static_cast(dst); + } +} + +template +void Strategy:: + exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, + const StrategyParam& sparam, WorkspaceBundle bundle, + WorkspaceBundle bundle_thread, + fallback::MatrixMulImpl::KernParam matmul_param, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) { + size_t packA_per_oc_block_size = + round_up(matmul_param.K, matmul_algo->get_inner_block_size().k) * + sparam.oc_tile_size * matmul_algo->get_packA_type_size(); + size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0); + size_t a_panel_offset = ncb_index.ndrange_id[1] * packA_group_size + + ncb_index.ndrange_id[3] * packA_per_oc_block_size; + + void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); + + src_ctype* a_panel = reinterpret_cast( + reinterpret_cast(bundle.get(BUNDLE_PACKA_INDEX)) + + a_panel_offset); + src_ctype* b_panel = + reinterpret_cast(reinterpret_cast( + bundle_thread.get(THREAD_BUNDLE_PACKB_INDEX))); + + matmul_param.M = sparam.output_block_oc_size; + matmul_param.N = sparam.output_block_size; + matmul_param.LDB = sparam.output_block_size; + matmul_param.LDC = sparam.output_block_size; + matmul_param.C_ptr = matmul_dst; + + auto matmul_kern_naked = matmul_algo->get_kern_naked(matmul_param); + matmul_kern_naked(matmul_param, a_panel, b_panel); +} + +template +void Strategy:: + exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, + const StrategyParam& sparam, + WorkspaceBundle bundle_thread) { + copy_bias(param, bundle_thread, sparam); + void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); + + const bias_ctype* bias_ptr = static_cast( + param.bias(sparam.batch_id, sparam.group_id)); + void* bias_temp_ptr = get_bias_temp_ptr(param, 
bundle_thread); + void* bias_preprocess_ptr = const_cast( + param.bias_mode == megdnn::BiasMode::BIAS + ? bias_temp_ptr + : static_cast(const_cast( + bias_ptr + sparam.oc_cur_index))); + + PostProcess::run( + matmul_dst, bias_preprocess_ptr, matmul_dst, param.bias_mode, + param.nonlineMode, param.bias_type, param.dst_type, 1_z, + sparam.output_block_oc_size, 1_z, sparam.output_block_size); + copy_dst(param, matmul_dst, sparam); +} + +template +void Strategy:: + copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param, + const void* matmul_dst, const StrategyParam& sparam) { + if (!sparam.skip_copy_dst) { + dst_ctype* dst_tmp_ptr = + reinterpret_cast(const_cast(matmul_dst)); + dst_ctype* dst = + param.dst(sparam.batch_id, sparam.group_id) + + sparam.oc_cur_index * sparam.ohw + sparam.ohw_cur_index; + for (size_t oc = 0; oc < sparam.output_block_oc_size; oc++) { + std::memcpy(dst, dst_tmp_ptr, + sizeof(dst_ctype) * sparam.output_block_size); + dst_tmp_ptr += sparam.output_block_size; + dst += sparam.ohw; + } + } +} + +template +void* Strategy:: + get_bias_temp_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, + const WorkspaceBundle& bundle_thread) { + bias_ctype* bias_tmp_ptr = + param.bias_mode == megdnn::BiasMode::BIAS + ? 
static_cast( + bundle_thread.get(THREAD_BUNDLE_BIAS_INDEX)) + : nullptr; + return bias_tmp_ptr; +} + +template +void Strategy:: + copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, + WorkspaceBundle bundle_thread, const StrategyParam& sparam) { + const bias_ctype* bias_ptr = static_cast( + param.bias(sparam.batch_id, sparam.group_id)); + bias_ctype* bias_temp_ptr = + static_cast(get_bias_temp_ptr(param, bundle_thread)); + if (param.bias_mode == megdnn::BiasMode::BIAS) { + bias_ctype* copy_dst = bias_temp_ptr; + const bias_ctype* copy_src = bias_ptr + + sparam.oc_cur_index * sparam.ohw + + sparam.ohw_cur_index; + for (size_t oc = sparam.oc_cur_index; oc < sparam.oc_end_index; oc++) { + std::memcpy(copy_dst, copy_src, + sizeof(bias_ctype) * sparam.output_block_size); + copy_dst += sparam.output_block_size; + copy_src += sparam.ohw; + } + } +} + +#define INSTANTIAL_CLASS(_src_ctype, _bias_ctype, _dst_ctype, _op_ctype, \ + _op_dtype, _postprocess_mode) \ + template class Strategy<_src_ctype, _bias_ctype, _dst_ctype, _op_ctype, \ + _op_dtype, _postprocess_mode, PackMode::DEFAULT>; + +INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32, + megdnn::PostprocessMode::FLOAT) + +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16, + megdnn::PostprocessMode::FLOAT) +#else +#if !MEGDNN_DISABLE_FLOAT16 +INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16, + megdnn::PostprocessMode::NO_PROCESS) +#endif +#endif + +#if MEGDNN_AARCH64 || MEGDNN_ARMV7 +//! 
x86 do not have uint8 matmul so only armv7 armv8 support uint8
+INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_uint8, dt_qint32, dt_quint8,
+                 megdnn::PostprocessMode::QUANTIZED)
+INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_int32, dt_qint32, dt_qint32,
+                 megdnn::PostprocessMode::NO_PROCESS)
+#endif
+
+INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int8, dt_qint32, dt_qint8,
+                 megdnn::PostprocessMode::QUANTIZED)
+INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_int32, dt_int32,
+                 megdnn::PostprocessMode::NO_PROCESS)
+INSTANTIAL_CLASS(dt_int8, dt_int16, dt_int16, dt_int16, dt_int16,
+                 megdnn::PostprocessMode::NO_PROCESS)
+INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_qint32, dt_qint32,
+                 megdnn::PostprocessMode::NO_PROCESS)
+
+#undef INSTANTIAL_CLASS
+} // namespace megdnn
diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp
new file mode 100644
index 00000000..86879313
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp
@@ -0,0 +1,343 @@
+/**
+ * \file dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */ + +#include "megdnn/opr_param_defs.h" +#include "src/common/utils.h" +#include "src/fallback/conv_bias/im2col/strategy_base.h" +#include "src/fallback/convolution/img2col_helper.h" +#if MEGDNN_X86 +#include "src/x86/conv_bias/postprocess_helper.h" +#endif + +using namespace megdnn; +#if MEGDNN_X86 +using namespace x86; +#endif +namespace megdnn { +template +Strategy::Strategy() + : StrategyBase() {} + +template +void Strategy:: + copy_padding_kern( + WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) { + UNPACK_CONV_F32_NCB_KERN_SIZES(param); + MEGDNN_MARK_USED_VAR(N); + MEGDNN_MARK_USED_VAR(OC); + MEGDNN_MARK_USED_VAR(OH); + MEGDNN_MARK_USED_VAR(OW); + MEGDNN_MARK_USED_VAR(FH); + MEGDNN_MARK_USED_VAR(FW); + MEGDNN_MARK_USED_VAR(SH); + MEGDNN_MARK_USED_VAR(SW); + + size_t IW2 = IW + 2 * PW; + size_t IH2 = IH + 2 * PH; + size_t batch_id = ncb_index.ndrange_id[0]; + size_t group_id = ncb_index.ndrange_id[1]; + size_t channel_id = ncb_index.ndrange_id[2]; + + size_t padding_group_size = IH2 * IW2 * IC; + size_t workspace_channel_offset = IH2 * IW2 * channel_id; + size_t workspace_group_offset = group_id * padding_group_size; + size_t workspace_batch_offset = + param.filter_meta.group * batch_id * padding_group_size; + bundle.set(param.workspace_ptr); + + src_ctype src_zp = static_cast(0); + if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) { + src_zp = param.src_type.param().zero_point; + } + src_ctype* src = const_cast( + param.src(batch_id, group_id, channel_id)); + src_ctype* src2; + src2 = static_cast(bundle.get(BUNDLE_PADDING_INDEX)) + + workspace_group_offset + workspace_batch_offset + + workspace_channel_offset; + src_ctype* src2_ptr = src2; + const src_ctype* src_ptr = src; + if (PH != 0) { + std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2); + src2_ptr += PH * IW2; + } + rep(ih, IH) { + if (PW != 0) + rep(pw, PW) * (src2_ptr++) = src_zp; + 
std::memcpy(src2_ptr, src_ptr, sizeof(src_ctype) * IW); + src2_ptr += IW; + src_ptr += IW; + if (PW != 0) + rep(pw, PW) * (src2_ptr++) = src_zp; + } + if (PH != 0) { + std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2); + src2_ptr += PH * IW2; + } +} + +template +void Strategy:: + packA_kern(WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernSizeParam matmulparam, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) { + MEGDNN_MARK_USED_VAR(bundle); + MEGDNN_MARK_USED_VAR(param); + MEGDNN_MARK_USED_VAR(matmulparam); + MEGDNN_MARK_USED_VAR(matmul_algo); + MEGDNN_MARK_USED_VAR(ncb_index); + megdnn_throw( + "nopack mode should not call packA_kern please check your code"); +} + +template +void* Strategy:: + get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, + const WorkspaceBundle& bundle_thread, + const StrategyParam& sparam) { + if (sparam.is_dst_8bit || !sparam.is_ohw_size_bigger) { + return static_cast( + bundle_thread.get(THREAD_BUNDLE_MATMULDST_INDEX)); + } else { + bias_ctype* dst = + param.dst(sparam.batch_id, sparam.group_id) + + sparam.oc_cur_index * sparam.ohw; + return static_cast(dst); + } +} + +template +void Strategy:: + exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, + const StrategyParam& sparam, WorkspaceBundle bundle, + WorkspaceBundle bundle_thread, + fallback::MatrixMulImpl::KernParam matmul_param, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) { + MEGDNN_MARK_USED_VAR(bundle); + MEGDNN_MARK_USED_VAR(ncb_index); + matmul_param.workspace_ptr = bundle_thread.get(THREAD_BUNDLE_MATCOMP_INDEX); + void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); + + src_ctype* im2col_dst = static_cast( + bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX)); + const void* filter = param.filter(sparam.group_id) + + sparam.oc_cur_index * 
param.filter_meta.icpg * + param.filter_meta.spatial[0] * + param.filter_meta.spatial[1]; + matmul_param.M = sparam.output_block_oc_size; + matmul_param.N = sparam.output_block_size; + matmul_param.LDB = sparam.output_block_size; + matmul_param.LDC = sparam.output_block_size; + matmul_param.A_ptr = filter; + matmul_param.B_ptr = im2col_dst; + matmul_param.C_ptr = matmul_dst; + auto matmul_kern = matmul_algo->get_kern(matmul_param); + matmul_kern(matmul_param); +} + +template +void Strategy:: + exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + const StrategyParam& sparam, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernParam matmul_param, + fallback::MatrixMulImpl::AlgoBase* matmul_algo + ) { + MEGDNN_MARK_USED_VAR(matmul_param); + MEGDNN_MARK_USED_VAR(matmul_algo); + size_t m_sh = param.filter_meta.stride[0]; + size_t m_sw = param.filter_meta.stride[1]; + size_t m_oc = param.filter_meta.ocpg; + size_t m_oh = param.osz[0]; + size_t m_ow = param.osz[1]; + size_t m_ic = param.filter_meta.icpg; + size_t m_ih = param.isz[0] + param.filter_meta.padding[0] * 2; + size_t m_iw = param.isz[1] + param.filter_meta.padding[1] * 2; + size_t m_fh = param.filter_meta.spatial[0]; + size_t m_fw = param.filter_meta.spatial[1]; + size_t m_is_xcorr = !param.filter_meta.should_flip; + + size_t input_offset = + m_ih * m_iw * m_ic * + (sparam.group_id + param.filter_meta.group * sparam.batch_id) * + sizeof(src_ctype); + + src_ctype* src2 = reinterpret_cast( + reinterpret_cast(bundle.get(BUNDLE_PADDING_INDEX)) + + input_offset); + + bool is_phpwzero = param.filter_meta.padding[0] == 0 && + param.filter_meta.padding[1] == 0; + if (is_phpwzero) { + src2 = const_cast( + param.src(sparam.batch_id, sparam.group_id)); + } + src_ctype* im2col_dst = static_cast( + bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX)); + if (m_sh == 1 && m_sw == 1) { + if (m_is_xcorr) { + img2col(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw, + m_fh, m_fw, 
sparam.ohw_cur_index, + sparam.output_block_size); + } else { + img2col(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw, + m_fh, m_fw, sparam.ohw_cur_index, + sparam.output_block_size); + } + } else { + if (m_is_xcorr) { + img2col_stride(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, + m_iw, m_fh, m_fw, m_sh, m_sw, + sparam.ohw_cur_index, + sparam.output_block_size); + } else { + img2col_stride(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, + m_ih, m_iw, m_fh, m_fw, m_sh, m_sw, + sparam.ohw_cur_index, + sparam.output_block_size); + } + } +} + +template +void Strategy:: + exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, + const StrategyParam& sparam, + WorkspaceBundle bundle_thread) { + copy_bias(param, bundle_thread, sparam); + void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); + + const bias_ctype* bias_ptr = static_cast( + param.bias(sparam.batch_id, sparam.group_id)); + bias_ctype* bias_temp_ptr = + static_cast(get_bias_temp_ptr(param, bundle_thread)); + PostProcess::run( + matmul_dst, + const_cast( + param.bias_mode == megdnn::BiasMode::BIAS + ? 
bias_temp_ptr + : static_cast(const_cast( + bias_ptr + sparam.oc_cur_index))), + matmul_dst, param.bias_mode, param.nonlineMode, param.bias_type, + param.dst_type, 1_z, sparam.output_block_oc_size, 1_z, + sparam.output_block_size); + copy_dst(param, matmul_dst, sparam); +} + +template +void Strategy:: + copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param, + const void* matmul_dst, const StrategyParam& sparam) { + if (!sparam.skip_copy_dst) { + dst_ctype* dst_tmp_ptr = + reinterpret_cast(const_cast(matmul_dst)); + dst_ctype* dst = + param.dst(sparam.batch_id, sparam.group_id) + + sparam.oc_cur_index * sparam.ohw + sparam.ohw_cur_index; + for (size_t oc = 0; oc < sparam.output_block_oc_size; oc++) { + std::memcpy(dst, dst_tmp_ptr, + sizeof(dst_ctype) * sparam.output_block_size); + dst_tmp_ptr += sparam.output_block_size; + dst += sparam.ohw; + } + } +} + +template +void Strategy:: + copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, + WorkspaceBundle bundle_thread, const StrategyParam& sparam) { + const bias_ctype* bias_ptr = static_cast( + param.bias(sparam.batch_id, sparam.group_id)); + bias_ctype* bias_temp_ptr = + static_cast(get_bias_temp_ptr(param, bundle_thread)); + if (param.bias_mode == megdnn::BiasMode::BIAS) { + bias_ctype* copy_dst = bias_temp_ptr; + const bias_ctype* copy_src = bias_ptr + + sparam.oc_cur_index * sparam.ohw + + sparam.ohw_cur_index; + for (size_t oc = sparam.oc_cur_index; oc < sparam.oc_end_index; oc++) { + std::memcpy(copy_dst, copy_src, + sizeof(bias_ctype) * sparam.output_block_size); + copy_dst += sparam.output_block_size; + copy_src += sparam.ohw; + } + } +} + +#define INSTANTIAL_CLASS(_src_ctype, _bias_ctype, _dst_ctype, _op_ctype, \ + _op_dtype, _postprocess_mode) \ + template class Strategy<_src_ctype, _bias_ctype, _dst_ctype, _op_ctype, \ + _op_dtype, _postprocess_mode, PackMode::NO_PACK>; + +INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32, + megdnn::PostprocessMode::FLOAT) + +#if 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16,
+                 megdnn::PostprocessMode::FLOAT)
+#else
+#if !MEGDNN_DISABLE_FLOAT16
+INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16,
+                 megdnn::PostprocessMode::NO_PROCESS)
+#endif
+#endif
+
+#if MEGDNN_AARCH64 || MEGDNN_ARMV7
+//! x86 do not have uint8 matmul so only armv7 armv8 support uint8
+INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_uint8, dt_qint32, dt_quint8,
+                 megdnn::PostprocessMode::QUANTIZED)
+INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_int32, dt_qint32, dt_qint32,
+                 megdnn::PostprocessMode::NO_PROCESS)
+#endif
+
+INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int8, dt_qint32, dt_qint8,
+                 megdnn::PostprocessMode::QUANTIZED)
+INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_int32, dt_int32,
+                 megdnn::PostprocessMode::NO_PROCESS)
+INSTANTIAL_CLASS(dt_int8, dt_int16, dt_int16, dt_int16, dt_int16,
+                 megdnn::PostprocessMode::NO_PROCESS)
+INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_qint32, dt_qint32,
+                 megdnn::PostprocessMode::NO_PROCESS)
+
+} // namespace megdnn
diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp
new file mode 100644
index 00000000..f60fe86f
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp
@@ -0,0 +1,349 @@
+/**
+ * \file dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */ + +#include "megdnn/opr_param_defs.h" +#include "src/fallback/conv_bias/im2col/strategy_base.h" +#include "src/fallback/convolution/img2col_helper.h" +#if MEGDNN_X86 +#include "src/x86/conv_bias/postprocess_helper.h" +#endif + +using namespace megdnn; +#if MEGDNN_X86 +using namespace x86; +#endif +namespace megdnn { +template +Strategy::Strategy() + : StrategyBase() {} + +template +void Strategy:: + copy_padding_kern( + WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) { + UNPACK_CONV_F32_NCB_KERN_SIZES(param); + MEGDNN_MARK_USED_VAR(N); + MEGDNN_MARK_USED_VAR(OC); + MEGDNN_MARK_USED_VAR(OH); + MEGDNN_MARK_USED_VAR(OW); + MEGDNN_MARK_USED_VAR(FH); + MEGDNN_MARK_USED_VAR(FW); + MEGDNN_MARK_USED_VAR(SH); + MEGDNN_MARK_USED_VAR(SW); + + size_t IW2 = IW + 2 * PW; + size_t IH2 = IH + 2 * PH; + size_t batch_id = ncb_index.ndrange_id[0]; + size_t group_id = ncb_index.ndrange_id[1]; + size_t channel_id = ncb_index.ndrange_id[2]; + + size_t padding_group_size = IH2 * IW2 * IC; + size_t workspace_channel_offset = IH2 * IW2 * channel_id; + size_t workspace_group_offset = group_id * padding_group_size; + size_t workspace_batch_offset = + param.filter_meta.group * batch_id * padding_group_size; + bundle.set(param.workspace_ptr); + + src_ctype src_zp = static_cast(0); + if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) { + src_zp = param.src_type.param().zero_point; + } + src_ctype* src = const_cast( + param.src(batch_id, group_id, channel_id)); + src_ctype* src2; + src2 = static_cast(bundle.get(BUNDLE_PADDING_INDEX)) + + workspace_group_offset + workspace_batch_offset + + workspace_channel_offset; + src_ctype* src2_ptr = src2; + const src_ctype* src_ptr = src; + if (PH != 0) { + std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2); + src2_ptr += PH * IW2; + } + rep(ih, IH) { + if (PW != 0) + rep(pw, PW) * (src2_ptr++) = src_zp; + std::memcpy(src2_ptr, src_ptr, 
sizeof(src_ctype) * IW); + src2_ptr += IW; + src_ptr += IW; + if (PW != 0) + rep(pw, PW) * (src2_ptr++) = src_zp; + } + if (PH != 0) { + std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2); + src2_ptr += PH * IW2; + } +} + +template +void Strategy:: + packA_kern(WorkspaceBundle bundle, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernSizeParam matmulparam, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) { + bundle.set(param.workspace_ptr); + fallback::MatrixMulImpl::KernParam matmul_param; + static_cast(matmul_param) = + matmulparam; + size_t OC = param.filter_meta.ocpg; + size_t oc_tile_size = matmul_param.M; + size_t group_id = ncb_index.ndrange_id[0]; + size_t output_block_oc_size = + std::min(oc_tile_size, OC - ncb_index.ndrange_id[1] * oc_tile_size); + size_t oc_cur_index = ncb_index.ndrange_id[1] * oc_tile_size; + size_t packA_group_size = + bundle.get_size(BUNDLE_PACKA_INDEX) / param.filter_meta.group; + size_t a_panel_offset = ncb_index.ndrange_id[1] * + matmul_algo->get_bundle(matmul_param).get_size(0); + int8_t* a_panel = static_cast(bundle.get(BUNDLE_PACKA_INDEX)) + + group_id * packA_group_size + a_panel_offset; + matmul_param.A_ptr = + const_cast(param.filter(group_id)) + + oc_cur_index * matmul_param.K; + matmul_param.M = output_block_oc_size; + matmul_algo->pack_A(matmul_param, a_panel, 0_z, 0_z); +} + +template +void* Strategy:: + get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, + const WorkspaceBundle& bundle_thread, + const StrategyParam& sparam) { + if (sparam.is_dst_8bit || !sparam.is_ohw_size_bigger) { + return static_cast( + bundle_thread.get(THREAD_BUNDLE_MATMULDST_INDEX)); + } else { + bias_ctype* dst = + param.dst(sparam.batch_id, sparam.group_id) + + sparam.oc_cur_index * sparam.ohw; + return static_cast(dst); + } +} + +template +void Strategy:: + exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, + const 
StrategyParam& sparam, WorkspaceBundle bundle, + WorkspaceBundle bundle_thread, + fallback::MatrixMulImpl::KernParam matmul_param, + fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) { + size_t packA_group_size = + bundle.get_size(BUNDLE_PACKA_INDEX) / param.filter_meta.group; + size_t a_panel_offset = ncb_index.ndrange_id[3] * + matmul_algo->get_bundle(matmul_param).get_size(0); + a_panel_offset = sparam.group_id * packA_group_size + a_panel_offset; + + void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); + + src_ctype* a_panel = reinterpret_cast( + reinterpret_cast(bundle.get(BUNDLE_PACKA_INDEX)) + + a_panel_offset); + src_ctype* b_panel = nullptr; + + src_ctype* im2col_dst = static_cast( + bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX)); + + matmul_param.M = sparam.output_block_oc_size; + matmul_param.N = sparam.output_block_size; + matmul_param.LDB = sparam.output_block_size; + matmul_param.LDC = sparam.output_block_size; + matmul_param.B_ptr = im2col_dst; + matmul_param.C_ptr = matmul_dst; + + auto matmul_kern = matmul_algo->get_kern_naked(matmul_param); + matmul_kern(matmul_param, a_panel, b_panel); +} + +template +void Strategy:: + exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, + const StrategyParam& sparam, + const fallback::ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernParam matmul_param, + fallback::MatrixMulImpl::AlgoBase* matmul_algo + ) { + MEGDNN_MARK_USED_VAR(matmul_param); + MEGDNN_MARK_USED_VAR(matmul_algo); + size_t m_sh = param.filter_meta.stride[0]; + size_t m_sw = param.filter_meta.stride[1]; + size_t m_oc = param.filter_meta.ocpg; + size_t m_oh = param.osz[0]; + size_t m_ow = param.osz[1]; + size_t m_ic = param.filter_meta.icpg; + size_t m_ih = param.isz[0] + param.filter_meta.padding[0] * 2; + size_t m_iw = param.isz[1] + param.filter_meta.padding[1] * 2; + size_t m_fh = param.filter_meta.spatial[0]; + size_t m_fw = 
param.filter_meta.spatial[1]; + size_t m_is_xcorr = !param.filter_meta.should_flip; + + size_t input_offset = + m_ih * m_iw * m_ic * + (sparam.group_id + param.filter_meta.group * sparam.batch_id) * + sizeof(src_ctype); + + src_ctype* src2 = reinterpret_cast( + reinterpret_cast(bundle.get(BUNDLE_PADDING_INDEX)) + + input_offset); + bool is_phpwzero = param.filter_meta.padding[0] == 0 && + param.filter_meta.padding[1] == 0; + if (is_phpwzero) { + src2 = const_cast( + param.src(sparam.batch_id, sparam.group_id)); + } + src_ctype* im2col_dst = static_cast( + bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX)); + if (m_sh == 1 && m_sw == 1) { + if (m_is_xcorr) { + img2col(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw, + m_fh, m_fw, sparam.ohw_cur_index, + sparam.output_block_size); + } else { + img2col(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw, + m_fh, m_fw, sparam.ohw_cur_index, + sparam.output_block_size); + } + } else { + if (m_is_xcorr) { + img2col_stride(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, + m_iw, m_fh, m_fw, m_sh, m_sw, + sparam.ohw_cur_index, + sparam.output_block_size); + } else { + img2col_stride(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, + m_ih, m_iw, m_fh, m_fw, m_sh, m_sw, + sparam.ohw_cur_index, + sparam.output_block_size); + } + } +} + +template +void Strategy:: + exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, + const StrategyParam& sparam, + WorkspaceBundle bundle_thread) { + void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); + + const bias_ctype* bias_ptr = static_cast( + param.bias(sparam.batch_id, sparam.group_id)); + bias_ctype* bias_temp_ptr = + static_cast(get_bias_temp_ptr(param, bundle_thread)); + + if (param.bias_mode == megdnn::BiasMode::BIAS) { + bias_ctype* copy_dst = bias_temp_ptr; + const bias_ctype* copy_src = bias_ptr + + sparam.oc_cur_index * sparam.ohw + + sparam.ohw_cur_index; + for (size_t oc = sparam.oc_cur_index; oc < sparam.oc_end_index; oc++) { + std::memcpy(copy_dst, 
copy_src, + sizeof(bias_ctype) * sparam.output_block_size); + copy_dst += sparam.output_block_size; + copy_src += sparam.ohw; + } + } + + PostProcess::run( + matmul_dst, + const_cast( + param.bias_mode == megdnn::BiasMode::BIAS + ? bias_temp_ptr + : static_cast(const_cast( + bias_ptr + sparam.oc_cur_index))), + matmul_dst, param.bias_mode, param.nonlineMode, param.bias_type, + param.dst_type, 1_z, sparam.output_block_oc_size, 1_z, + sparam.output_block_size); + copy_dst(param, matmul_dst, sparam); +} + +template +void Strategy:: + copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param, + const void* matmul_dst, const StrategyParam& sparam) { + if (!sparam.skip_copy_dst) { + dst_ctype* dst_tmp_ptr = + reinterpret_cast(const_cast(matmul_dst)); + dst_ctype* dst = + param.dst(sparam.batch_id, sparam.group_id) + + sparam.oc_cur_index * sparam.ohw + sparam.ohw_cur_index; + for (size_t oc = 0; oc < sparam.output_block_oc_size; oc++) { + std::memcpy(dst, dst_tmp_ptr, + sizeof(dst_ctype) * sparam.output_block_size); + dst_tmp_ptr += sparam.output_block_size; + dst += sparam.ohw; + } + } +} + +#define INSTANTIAL_CLASS(_src_ctype, _bias_ctype, _dst_ctype, _op_ctype, \ + _op_dtype, _postprocess_mode) \ + template class Strategy<_src_ctype, _bias_ctype, _dst_ctype, \ + _op_ctype, _op_dtype, _postprocess_mode,PackMode::ONLY_PACKA>; + +INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32, + megdnn::PostprocessMode::FLOAT) + +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16, + megdnn::PostprocessMode::FLOAT) +#else +#if !MEGDNN_DISABLE_FLOAT16 +INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16, + megdnn::PostprocessMode::NO_PROCESS) +#endif +#endif + +#if MEGDNN_AARCH64 || MEGDNN_ARMV7 +//! 
x86 do not have uint8 matmul so only armv7 armv8 support uint8 +INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_uint8, dt_qint32, dt_quint8, + megdnn::PostprocessMode::QUANTIZED) +INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_int32, dt_qint32, dt_qint32, + megdnn::PostprocessMode::NO_PROCESS) +#endif + +INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int8, dt_qint32, dt_qint8, + megdnn::PostprocessMode::QUANTIZED) +INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_int32, dt_int32, + megdnn::PostprocessMode::NO_PROCESS) +INSTANTIAL_CLASS(dt_int8, dt_int16, dt_int16, dt_int16, dt_int16, + megdnn::PostprocessMode::NO_PROCESS) +INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_qint32, dt_qint32, + megdnn::PostprocessMode::NO_PROCESS) + +#undef INSTANTIAL_CLASS +} // namespace megdnn diff --git a/dnn/src/fallback/convolution/img2col_helper.h b/dnn/src/fallback/convolution/img2col_helper.h index b337b703..bc9e5546 100644 --- a/dnn/src/fallback/convolution/img2col_helper.h +++ b/dnn/src/fallback/convolution/img2col_helper.h @@ -8,7 +8,6 @@ * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -#include #include "src/common/utils.h" namespace { @@ -42,7 +41,8 @@ void img2col_stride(const dtype* __restrict src, dtype* __restrict dst, } } -//! 
add for im2col matmul multithread +//!add for im2col matmul multithread + template void img2col_stride(const dtype* __restrict src, dtype* __restrict dst, const int OC, const int OH, const int OW, const int IC, diff --git a/dnn/src/x86/elemwise_helper/kimpl/op_unary_base.h b/dnn/src/x86/elemwise_helper/kimpl/op_unary_base.h index 897fb0c2..e0df6b5a 100644 --- a/dnn/src/x86/elemwise_helper/kimpl/op_unary_base.h +++ b/dnn/src/x86/elemwise_helper/kimpl/op_unary_base.h @@ -323,6 +323,7 @@ struct UnaryOpBase init(src_scale, dst_scale); } }; + template <> struct UnaryOpBase : OpBase { @@ -330,20 +331,24 @@ struct UnaryOpBase using src_ctype = dt_qint32; using dst_ctype = dt_quint8; float scale, scale_src, scale_dst; - void init(float src_scale, float dst_scale) { + uint8_t dzp; + void init(float src_scale, float dst_scale, uint8_t dst_zp) { scale_src = src_scale; - scale_dst = 1.f / dst_scale; + scale_dst = 1.0f / dst_scale; + dzp = dst_zp; scale = src_scale / dst_scale; } UnaryOpBase(DType src_dtype, DType dst_dtype) { float src_scale = src_dtype.param().scale; - float dst_scale = dst_dtype.param().scale; - init(src_scale, dst_scale); + float dst_scale = dst_dtype.param().scale; + uint8_t dst_zp = dst_dtype.param().zero_point; + init(src_scale, dst_scale, dst_zp); } - UnaryOpBase(float src_scale, float dst_scale) { - init(src_scale, dst_scale); + UnaryOpBase(float src_scale, float dst_scale, uint8_t dst_zp) { + init(src_scale, dst_scale, dst_zp); } }; + #define OP_BASE(_simd_type, _simd_target, _simd_data_type, _func_prefix) \ template <> \ struct UnaryOpBase<_simd_type, dt_float32, dt_qint8> \ @@ -828,7 +833,6 @@ template struct UnaryQuantizationOp : UnaryOpBase { using UnaryOpBase::UnaryOpBase; - constexpr static size_t SIMD_WIDTH = 8; Op op; void operator()(const dt_qint32& src, dt_quint8* dst) const { diff --git a/dnn/src/x86/matrix_mul/algos.cpp b/dnn/src/x86/matrix_mul/algos.cpp index 26b70004..50f830cb 100644 --- a/dnn/src/x86/matrix_mul/algos.cpp +++ 
b/dnn/src/x86/matrix_mul/algos.cpp @@ -195,10 +195,10 @@ MatrixMulImpl::kern_t MatrixMulImpl::AlgoInt8x8x32Vnni::get_kern( return int8x8x32_kern_vnni; } -MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_PACKA(AlgoInt8x8x32Vnni, - megdnn_x86_matmul_kern, 5, - x86::matmul::gemm_int8_vnni_12x32x4, - dt_int8, dt_int32, dt_uint8); +MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(AlgoInt8x8x32Vnni, + megdnn_x86_matmul_kern, 5, + x86::matmul::gemm_int8_vnni_12x32x4, + dt_int8, dt_int32, dt_uint8); #endif /* ===================== Int8 mkldnn algo ===================== */ @@ -364,7 +364,9 @@ size_t MatrixMulImpl::AlgoInt8x8x32AVX2M4N16K2::get_workspace( m, n, k, trans_a, trans_b, strategy, cacheline) .get_workspace_size(); } - +MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL( + AlgoInt8x8x32AVX2M4N16K2, megdnn_x86_matmul_kern, 8, + x86::matmul::gemm_avx2_s8s8s32_4x16x2, dt_int8, dt_int32, dt_int16); MatrixMulImpl::kern_t MatrixMulImpl::AlgoInt8x8x32AVX2M2N4K16::get_kern( const KernSizeParam&) const { @@ -437,6 +439,10 @@ size_t MatrixMulImpl::AlgoInt8x8x32SSEM4N8K2::get_workspace( .get_workspace_size(); } +MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL( + AlgoInt8x8x32SSEM4N8K2, megdnn_x86_matmul_kern, 9, + x86::matmul::gemm_sse_s8s8s32_4x8x2, dt_int8, dt_int32, dt_int16); + /*************************AlgoF32MK8_8x8********************/ MatrixMulImpl::kern_t MatrixMulImpl::AlgoF32MK8_8x8::get_kern( const KernSizeParam&) const { diff --git a/dnn/src/x86/matrix_mul/algos.h b/dnn/src/x86/matrix_mul/algos.h index 88bf8023..f388983e 100644 --- a/dnn/src/x86/matrix_mul/algos.h +++ b/dnn/src/x86/matrix_mul/algos.h @@ -68,7 +68,7 @@ public: size_t get_workspace(const KernSizeParam&) const override; kern_t get_kern(const KernSizeParam&) const override; void* type() const override { return sm_x86_algo_type; } - PackMode packmode() const override { return PackMode::NO_PACK; } + MEGDNN_REG_GEMM_FUNC_FOR_IM2COL(); }; class MatrixMulImpl::AlgoInt8x8x32SSEM4N8K2 : public AlgoBase { @@ -79,7 +79,7 @@ public: 
size_t get_workspace(const KernSizeParam&) const override; kern_t get_kern(const KernSizeParam&) const override; void* type() const override { return sm_x86_algo_type; } - PackMode packmode() const override { return PackMode::NO_PACK; } + MEGDNN_REG_GEMM_FUNC_FOR_IM2COL(); }; class MatrixMulImpl::AlgoF32MK8_8x8 : public AlgoBase { diff --git a/dnn/test/x86/conv_bias.cpp b/dnn/test/x86/conv_bias.cpp index 8cdad38a..deb2a843 100644 --- a/dnn/test/x86/conv_bias.cpp +++ b/dnn/test/x86/conv_bias.cpp @@ -741,7 +741,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) { TensorShape{oc, ic, kernel, kernel}, TensorShape{}); }; - for (size_t kernel : {1, 2, 3, 4, 5, 6, 7}) + for (size_t kernel : {2, 3, 4, 5, 6, 7}) for (size_t ic : {1, 4, 8, 16}) for (size_t oc : {1, 4, 8}) for (size_t p : {0, 2}) @@ -751,7 +751,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) { run(oc, ic, size, size, kernel, p, nonline_mode); } //! test OC block - run(2046, 1, 8, 8, 1, 0, NonlineMode::IDENTITY); + run(2046, 1, 8, 8, 2, 0, NonlineMode::IDENTITY); Checker checker(handle()); UniformIntRNG rng{-50, 50}; @@ -826,7 +826,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32) { (w + 2 * p - kernel) / param.stride_w + 1}); }; - for (size_t kernel : {1, 2, 3, 4, 5, 6, 7}) + for (size_t kernel : {2, 3, 4, 5, 6, 7}) for (size_t ic : {1, 4, 8, 16}) for (size_t oc : {1, 4, 8, 16, 300}) for (size_t p : {0, 2}) @@ -895,7 +895,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) { (w + 2 * param.pad_w - kernel) / 1 + 1}); }; - for (size_t kernel : {1, 2, 3, 4, 5, 6, 7}) + for (size_t kernel : {2, 3, 4, 5, 6, 7}) for (size_t ic : {1, 4, 8, 16}) for (size_t oc : {1, 4, 8, 16}) for (size_t p : {0, 1}) @@ -945,7 +945,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) { TensorShape{1, oc, 1, 1}); }; - for (size_t kernel : {1, 2, 3, 4, 5, 6, 7}) + for (size_t kernel : {2, 3, 4, 5, 6, 7}) for (size_t ic : {1, 4, 8, 16}) for (size_t oc : {1, 4, 8}) for 
(size_t p : {0, 2}) @@ -2183,7 +2183,7 @@ TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_INT8X8X32) { std::vector data_type = {dtype::Int8(), dtype::Int8(), dtype::Int32(), dtype::Int32()}; - std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2"; + std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2:192"; // std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16"; // printf("Benchmark IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2 algo\n"); benchmark_impl(param, shapes_and_computation, algo_name, RUNS,