GitOrigin-RevId: 61c54ad258
tags/v1.0.0-rc1
| @@ -31,35 +31,10 @@ using namespace im2col; | |||||
| * *Through which the needed ptr can be conveniently obtained | * *Through which the needed ptr can be conveniently obtained | ||||
| */ | */ | ||||
| struct Im2colBundelIndex { | struct Im2colBundelIndex { | ||||
| static constexpr size_t BUNDLE_PADDING_INDEX = 0_z; | |||||
| static constexpr size_t BUNDLE_PACKA_INDEX = 1_z; | |||||
| static constexpr size_t BUNDLE_THREAD_INDEX = 2_z; | static constexpr size_t BUNDLE_THREAD_INDEX = 2_z; | ||||
| }; | }; | ||||
| using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode; | using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode; | ||||
| //! Process one input channel copy padding | |||||
| static void copy_padding_kern(WorkspaceBundle& bundle, | |||||
| const ConvBiasImpl::NCBKernParam& param, | |||||
| const ConvBiasImpl::NCBKernIndex& ncb_index, | |||||
| StrategyBase* im2colstrategy, size_t pack_oc_size) { | |||||
| im2colstrategy->copy_padding_kern(bundle, param, ncb_index, pack_oc_size); | |||||
| } | |||||
| //! packA_kern | |||||
| static void packA_kern( | |||||
| WorkspaceBundle& bundle, | |||||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||||
| fallback::MatrixMulImpl::KernSizeParam matmulparam, | |||||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | |||||
| StrategyBase* im2colstrategy, | |||||
| const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, | |||||
| size_t pack_oc_size) { | |||||
| im2colstrategy->packA_kern(bundle, param, matmulparam, matmul_algo, | |||||
| ncb_index, matmul_desc, pack_oc_size); | |||||
| } | |||||
| /*! | /*! | ||||
| * *\brief Im2colKerns collects all the im2col kerns in it | * *\brief Im2colKerns collects all the im2col kerns in it | ||||
| */ | */ | ||||
| @@ -124,8 +99,8 @@ public: | |||||
| WorkspaceBundle get_thread_bundle( | WorkspaceBundle get_thread_bundle( | ||||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param, | const fallback::ConvBiasImpl::NCBKernSizeParam& param, | ||||
| fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, | |||||
| MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, | |||||
| const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, | |||||
| const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, | |||||
| size_t oc_tile_size) { | size_t oc_tile_size) { | ||||
| size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], | size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], | ||||
| FW = param.filter_meta.spatial[1]; | FW = param.filter_meta.spatial[1]; | ||||
| @@ -205,8 +180,8 @@ public: | |||||
| } | } | ||||
| WorkspaceBundle get_thread_bundle( | WorkspaceBundle get_thread_bundle( | ||||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param, | const fallback::ConvBiasImpl::NCBKernSizeParam& param, | ||||
| fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, | |||||
| MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, | |||||
| const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, | |||||
| const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, | |||||
| size_t oc_tile_size) { | size_t oc_tile_size) { | ||||
| size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], | size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], | ||||
| FW = param.filter_meta.spatial[1]; | FW = param.filter_meta.spatial[1]; | ||||
| @@ -288,8 +263,8 @@ public: | |||||
| } | } | ||||
| WorkspaceBundle get_thread_bundle( | WorkspaceBundle get_thread_bundle( | ||||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param, | const fallback::ConvBiasImpl::NCBKernSizeParam& param, | ||||
| fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, | |||||
| MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, | |||||
| const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, | |||||
| const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, | |||||
| size_t oc_tile_size) { | size_t oc_tile_size) { | ||||
| size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], | size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], | ||||
| FW = param.filter_meta.spatial[1]; | FW = param.filter_meta.spatial[1]; | ||||
| @@ -322,15 +297,16 @@ public: | |||||
| } | } | ||||
| }; | }; | ||||
| fallback::MatrixMulImpl::KernSizeParam | |||||
| ConvBiasImpl::AlgoIm2col ::get_matmul_kern_param(const NCBKernSizeParam& param, | |||||
| size_t ohw_tile_size, | |||||
| size_t oc_tile_size) const { | |||||
| namespace { | |||||
| static fallback::MatrixMulImpl::KernSizeParam get_matmul_kern_param( | |||||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||||
| size_t ohw_tile_size, size_t oc_tile_size) { | |||||
| auto format = param::MatrixMul::Format::DEFAULT; | auto format = param::MatrixMul::Format::DEFAULT; | ||||
| size_t pack_oc_size = pack_size(param.filter_meta.format); | size_t pack_oc_size = pack_size(param.filter_meta.format); | ||||
| if (param.filter_meta.format == param::ConvBias::Format::NCHW44) { | if (param.filter_meta.format == param::ConvBias::Format::NCHW44) { | ||||
| format = param::MatrixMul::Format::MK4; | format = param::MatrixMul::Format::MK4; | ||||
| } else if(param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT){ | |||||
| } else if (param.filter_meta.format == | |||||
| param::ConvBias::Format::NCHW44_DOT) { | |||||
| format = param::MatrixMul::Format::MK4_DOT; | format = param::MatrixMul::Format::MK4_DOT; | ||||
| } | } | ||||
| size_t M = oc_tile_size; | size_t M = oc_tile_size; | ||||
| @@ -358,10 +334,23 @@ ConvBiasImpl::AlgoIm2col ::get_matmul_kern_param(const NCBKernSizeParam& param, | |||||
| format}; | format}; | ||||
| } | } | ||||
| void ConvBiasImpl::AlgoIm2col::choice_ohw_oc_block( | |||||
| const NCBKernSizeParam& param, size_t& oc_tile_size, | |||||
| size_t& ohw_tile_size, size_t block_m, size_t block_n, | |||||
| fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) const { | |||||
| static void choice_ohw_oc_block( | |||||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||||
| size_t& oc_tile_size, size_t& ohw_tile_size, size_t block_m, | |||||
| size_t block_n, const size_t m_ohw_tile_size, | |||||
| fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) { | |||||
| //! calculate m_oc_tile_size in choice_ohw_oc_block() function, | |||||
| //! when ohw_tile_size < this value ohw_tile_size = ohw | |||||
| static constexpr size_t DEFAULT_OHW_MIN_TILE_SIZE = 32; | |||||
| //! when nr_threads > 1 and round(ohw,nr_threads)>nr_threads, | |||||
| //! oc_tile_size = DEFAULT_OC_TILE_SIZE | |||||
| static constexpr size_t DEFAULT_OC_TILE_SIZE = 512; | |||||
| //! when oc_tile_size > this value m_oc_tile_size = | |||||
| //! DEFAULT_OC_MAX_TILE_SIZE | |||||
| static constexpr size_t DEFAULT_OC_MAX_TILE_SIZE = 1024; | |||||
| //! when oc_tile_size < this value oc_tile_size = | |||||
| //! DEFAULT_OC_MIN_TILE_SIZE the purpose is aligning the calculation | |||||
| static constexpr size_t DEFAULT_OC_MIN_TILE_SIZE = 128; | |||||
| size_t nr_threads = param.nr_threads; | size_t nr_threads = param.nr_threads; | ||||
| size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
| size_t ohw = param.osz[0] * param.osz[1]; | size_t ohw = param.osz[0] * param.osz[1]; | ||||
| @@ -393,8 +382,74 @@ void ConvBiasImpl::AlgoIm2col::choice_ohw_oc_block( | |||||
| } | } | ||||
| } | } | ||||
| WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle( | |||||
| const NCBKernSizeParam& param) const { | |||||
| static size_t packA_group_size( | |||||
| const MatrixMulImpl::AlgoBase* matmul_algo, | |||||
| const fallback::MatrixMulImpl::KernSizeParam& matmul_param, | |||||
| const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, | |||||
| size_t packa_parallel_times) { | |||||
| if (matmul_desc.packmode == | |||||
| fallback::MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | |||||
| return matmul_algo->get_bundle(matmul_param).get_size(0); | |||||
| } else if (matmul_desc.packmode == | |||||
| fallback::MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | |||||
| return packa_parallel_times * | |||||
| matmul_algo->get_bundle(matmul_param).get_size(0); | |||||
| } | |||||
| megdnn_assert(matmul_desc.packmode == | |||||
| fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK); | |||||
| //! nopack mode return 0; | |||||
| return 0; | |||||
| } | |||||
| static WorkspaceBundle get_thread_bundle( | |||||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||||
| const MatrixMulImpl::AlgoBase* matmul_algo, | |||||
| const fallback::MatrixMulImpl::KernSizeParam& matmul_param, | |||||
| const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, | |||||
| size_t oc_tile_size, size_t ohw_tile_size) { | |||||
| if (matmul_desc.packmode == Pack_Mode::DEFAULT) { | |||||
| MIDOUT_BEGIN( | |||||
| megdnn_fallback_im2col, | |||||
| midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_dft"_hash)) { | |||||
| Im2colKerns<Pack_Mode::DEFAULT> defaultkern; | |||||
| return defaultkern.get_thread_bundle(param, matmul_param, | |||||
| matmul_algo, ohw_tile_size, | |||||
| oc_tile_size); | |||||
| } | |||||
| MIDOUT_END(); | |||||
| } else if (matmul_desc.packmode == | |||||
| fallback::MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | |||||
| MIDOUT_BEGIN( | |||||
| megdnn_fallback_im2col, | |||||
| midout_iv( | |||||
| "ConvBiasImpl::AlgoIm2col::get_bundle_onlypacka"_hash)) { | |||||
| Im2colKerns<Pack_Mode::ONLY_PACKA> onlypackakern; | |||||
| return onlypackakern.get_thread_bundle(param, matmul_param, | |||||
| matmul_algo, ohw_tile_size, | |||||
| oc_tile_size); | |||||
| } | |||||
| MIDOUT_END(); | |||||
| } else { | |||||
| megdnn_assert(matmul_desc.packmode == | |||||
| fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK); | |||||
| MIDOUT_BEGIN( | |||||
| megdnn_fallback_im2col, | |||||
| midout_iv( | |||||
| "ConvBiasImpl::AlgoIm2col::get_thread_bundle_nopack"_hash)) { | |||||
| Im2colKerns<Pack_Mode::NO_PACK> nopackkern; | |||||
| return nopackkern.get_thread_bundle(param, matmul_param, | |||||
| matmul_algo, ohw_tile_size, | |||||
| oc_tile_size); | |||||
| } | |||||
| MIDOUT_END(); | |||||
| } | |||||
| return {nullptr, {}}; | |||||
| } | |||||
| static WorkspaceBundle get_bundle( | |||||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||||
| MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_tile_size, | |||||
| size_t ohw_tile_size) { | |||||
| UNPACK_CONV_F32_NCB_KERN_SIZES(param); | UNPACK_CONV_F32_NCB_KERN_SIZES(param); | ||||
| MEGDNN_MARK_USED_VAR(OC); | MEGDNN_MARK_USED_VAR(OC); | ||||
| MEGDNN_MARK_USED_VAR(OH); | MEGDNN_MARK_USED_VAR(OH); | ||||
| @@ -410,23 +465,20 @@ WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle( | |||||
| size_t padding = 0, packa_size = 0, packa_group_size = 0; | size_t padding = 0, packa_size = 0, packa_group_size = 0; | ||||
| size_t nr_threads = param.nr_threads; | size_t nr_threads = param.nr_threads; | ||||
| size_t GROUP = param.filter_meta.group; | size_t GROUP = param.filter_meta.group; | ||||
| fallback::MatrixMulImpl::AlgoBase::MatmulDescription mdesc = | |||||
| m_matmul_algo->matmul_description(); | |||||
| bool need_pack = mdesc.packmode == Pack_Mode::DEFAULT; | |||||
| bool only_packA = mdesc.packmode == Pack_Mode::ONLY_PACKA; | |||||
| size_t oc_tile_size = 0, ohw_tile_size = 0; | |||||
| choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, | |||||
| mdesc.innerblocksize.m, mdesc.innerblocksize.n, | |||||
| mdesc.packmode); | |||||
| if (need_pack || only_packA) { | |||||
| auto im2col_kern_param = get_matmul_kern_param( | |||||
| param, ohw_tile_size, only_packA ? oc_tile_size : OC); | |||||
| size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size); | |||||
| WorkspaceBundle wb = m_matmul_algo->get_bundle(im2col_kern_param); | |||||
| packa_group_size = only_packA ? oc_parallel_times * wb.get_size(0) | |||||
| : wb.get_size(0); | |||||
| } else { //! not support pack,not need pack | |||||
| fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = | |||||
| matmul_algo->matmul_description(); | |||||
| bool default_pack = matmul_desc.packmode == Pack_Mode::DEFAULT; | |||||
| //! when packmode is default, OC should be used | |||||
| //! when packmode is onlypackA, oc_tile_size should be used | |||||
| auto im2col_kern_param = get_matmul_kern_param( | |||||
| param, ohw_tile_size, default_pack ? OC : oc_tile_size); | |||||
| if (is_enable_filter_preprocess(param)) { | |||||
| packa_group_size = 0; | packa_group_size = 0; | ||||
| } else { | |||||
| size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size); | |||||
| packa_group_size = packA_group_size(matmul_algo, im2col_kern_param, | |||||
| matmul_desc, oc_parallel_times); | |||||
| } | } | ||||
| if (no_need_pading) { | if (no_need_pading) { | ||||
| @@ -437,50 +489,27 @@ WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle( | |||||
| } | } | ||||
| packa_size = GROUP * packa_group_size; //! for packA size = GROUP * a_size | packa_size = GROUP * packa_group_size; //! for packA size = GROUP * a_size | ||||
| WorkspaceBundle ws = {nullptr, {}}; | |||||
| auto im2col_kern_param = | |||||
| get_matmul_kern_param(param, ohw_tile_size, oc_tile_size); | |||||
| if (m_matmul_algo->packmode() == Pack_Mode::DEFAULT) { | |||||
| MIDOUT_BEGIN( | |||||
| megdnn_fallback_im2col, | |||||
| midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_dft"_hash)) { | |||||
| Im2colKerns<Pack_Mode::DEFAULT> defaultkern; | |||||
| ws = defaultkern.get_thread_bundle(param, im2col_kern_param, | |||||
| m_matmul_algo, ohw_tile_size, | |||||
| oc_tile_size); | |||||
| } | |||||
| MIDOUT_END(); | |||||
| } else if (m_matmul_algo->packmode() == Pack_Mode::ONLY_PACKA) { | |||||
| MIDOUT_BEGIN( | |||||
| megdnn_fallback_im2col, | |||||
| midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_packa"_hash)) { | |||||
| Im2colKerns<Pack_Mode::ONLY_PACKA> onlypackakern; | |||||
| ws = onlypackakern.get_thread_bundle(param, im2col_kern_param, | |||||
| m_matmul_algo, ohw_tile_size, | |||||
| oc_tile_size); | |||||
| } | |||||
| MIDOUT_END(); | |||||
| } else { | |||||
| MIDOUT_BEGIN( | |||||
| megdnn_fallback_im2col, | |||||
| midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_other"_hash)) { | |||||
| Im2colKerns<Pack_Mode::NO_PACK> nopackkern; | |||||
| ws = nopackkern.get_thread_bundle(param, im2col_kern_param, | |||||
| m_matmul_algo, ohw_tile_size, | |||||
| oc_tile_size); | |||||
| } | |||||
| MIDOUT_END(); | |||||
| } | |||||
| WorkspaceBundle ws = | |||||
| get_thread_bundle(param, matmul_algo, im2col_kern_param, | |||||
| matmul_desc, oc_tile_size, ohw_tile_size); | |||||
| return {nullptr, | return {nullptr, | ||||
| {padding, packa_size, ws.total_size_in_bytes() * nr_threads}}; | {padding, packa_size, ws.total_size_in_bytes() * nr_threads}}; | ||||
| } | } | ||||
| } // namespace | |||||
| size_t ConvBiasImpl::AlgoIm2col::get_workspace( | size_t ConvBiasImpl::AlgoIm2col::get_workspace( | ||||
| const NCBKernSizeParam& p) const { | const NCBKernSizeParam& p) const { | ||||
| MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 0) { | MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 0) { | ||||
| return get_bundle(p).total_size_in_bytes(); | |||||
| fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = | |||||
| m_matmul_algo->matmul_description(); | |||||
| size_t oc_tile_size = 0, ohw_tile_size = 0; | |||||
| choice_ohw_oc_block(p, oc_tile_size, ohw_tile_size, | |||||
| matmul_desc.innerblocksize.m, matmul_desc.innerblocksize.n, | |||||
| m_ohw_tile_size, matmul_desc.packmode); | |||||
| return get_bundle(p, m_matmul_algo, oc_tile_size, ohw_tile_size) | |||||
| .total_size_in_bytes(); | |||||
| } | } | ||||
| MIDOUT_END(); | MIDOUT_END(); | ||||
| return 0; | return 0; | ||||
| @@ -499,22 +528,21 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns( | |||||
| size_t oc_tile_size = 0, ohw_tile_size = 0; | size_t oc_tile_size = 0, ohw_tile_size = 0; | ||||
| size_t ohw = OH * OW; | size_t ohw = OH * OW; | ||||
| size_t GROUP = param.filter_meta.group; | size_t GROUP = param.filter_meta.group; | ||||
| WorkspaceBundle bundle = get_bundle(param); | |||||
| WorkspaceBundle bundle_thread = {nullptr, {}}; | |||||
| bool need_padding = (PH != 0 || PW != 0); | bool need_padding = (PH != 0 || PW != 0); | ||||
| fallback::MatrixMulImpl::AlgoBase::MatmulDescription mdesc = | |||||
| fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = | |||||
| m_matmul_algo->matmul_description(); | m_matmul_algo->matmul_description(); | ||||
| Pack_Mode packmode = mdesc.packmode; | |||||
| bool default_pack = packmode == Pack_Mode::DEFAULT; | |||||
| bool no_pack = packmode == Pack_Mode::NO_PACK; | |||||
| bool only_packA = packmode == Pack_Mode::ONLY_PACKA; | |||||
| bool default_pack = matmul_desc.packmode == Pack_Mode::DEFAULT; | |||||
| bool no_pack = matmul_desc.packmode == Pack_Mode::NO_PACK; | |||||
| bool only_packA = matmul_desc.packmode == Pack_Mode::ONLY_PACKA; | |||||
| bool enable_filter_preprocess = is_enable_filter_preprocess(param); | |||||
| choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, | choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, | ||||
| mdesc.innerblocksize.m, mdesc.innerblocksize.n, | |||||
| mdesc.packmode); | |||||
| matmul_desc.innerblocksize.m, | |||||
| matmul_desc.innerblocksize.n, m_ohw_tile_size, | |||||
| matmul_desc.packmode); | |||||
| WorkspaceBundle bundle = get_bundle(param,m_matmul_algo,oc_tile_size,ohw_tile_size); | |||||
| size_t ohw_parallel_times = div_ceil(ohw, ohw_tile_size); | size_t ohw_parallel_times = div_ceil(ohw, ohw_tile_size); | ||||
| size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size); | size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size); | ||||
| size_t packa_parallel_times = 0; | size_t packa_parallel_times = 0; | ||||
| @@ -523,28 +551,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns( | |||||
| if (only_packA) { | if (only_packA) { | ||||
| packa_parallel_times = div_ceil<size_t>(OC, oc_tile_size); | packa_parallel_times = div_ceil<size_t>(OC, oc_tile_size); | ||||
| } else if (default_pack) { | } else if (default_pack) { | ||||
| packa_parallel_times = div_ceil<size_t>(OC, mdesc.innerblocksize.m); | |||||
| packa_parallel_times = | |||||
| div_ceil<size_t>(OC, matmul_desc.innerblocksize.m); | |||||
| } | } | ||||
| auto matmul_param = get_matmul_kern_param( | auto matmul_param = get_matmul_kern_param( | ||||
| param, ohw_tile_size, only_packA ? oc_tile_size : OC); | |||||
| if (mdesc.packmode == Pack_Mode::DEFAULT) { | |||||
| Im2colKerns<Pack_Mode::DEFAULT> defaultkern; | |||||
| bundle_thread = defaultkern.get_thread_bundle( | |||||
| param, matmul_param, m_matmul_algo, ohw_tile_size, | |||||
| oc_tile_size); | |||||
| } else if (mdesc.packmode == Pack_Mode::ONLY_PACKA) { | |||||
| Im2colKerns<Pack_Mode::ONLY_PACKA> onlypackakern; | |||||
| bundle_thread = onlypackakern.get_thread_bundle( | |||||
| param, matmul_param, m_matmul_algo, ohw_tile_size, | |||||
| oc_tile_size); | |||||
| } else { | |||||
| Im2colKerns<Pack_Mode::NO_PACK> nopackkern; | |||||
| bundle_thread = nopackkern.get_thread_bundle( | |||||
| param, matmul_param, m_matmul_algo, ohw_tile_size, | |||||
| oc_tile_size); | |||||
| } | |||||
| param, ohw_tile_size, default_pack ? OC : oc_tile_size); | |||||
| WorkspaceBundle bundle_thread = | |||||
| get_thread_bundle(param, m_matmul_algo, matmul_param, | |||||
| matmul_desc, oc_tile_size, ohw_tile_size); | |||||
| StrategyParam strategyparam; | StrategyParam strategyparam; | ||||
| strategyparam.ohw = ohw; | strategyparam.ohw = ohw; | ||||
| strategyparam.is_dst_8bit = | strategyparam.is_dst_8bit = | ||||
| @@ -557,6 +573,9 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns( | |||||
| strategyparam.is_ohw_size_bigger && !strategyparam.is_dst_8bit; | strategyparam.is_ohw_size_bigger && !strategyparam.is_dst_8bit; | ||||
| strategyparam.oc_tile_size = oc_tile_size; | strategyparam.oc_tile_size = oc_tile_size; | ||||
| strategyparam.pack_oc_size = pack_oc_size; | strategyparam.pack_oc_size = pack_oc_size; | ||||
| strategyparam.enable_filter_preprocess = enable_filter_preprocess; | |||||
| strategyparam.packA_group_size = packA_group_size( | |||||
| m_matmul_algo, matmul_param, matmul_desc, packa_parallel_times); | |||||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | SmallVector<ConvBiasImpl::NCBKern> ret_kern; | ||||
| MIDOUT_BEGIN( | MIDOUT_BEGIN( | ||||
| @@ -569,88 +588,126 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns( | |||||
| const NCBKernParam& param, | const NCBKernParam& param, | ||||
| const NCBKernIndex& ncb_index) mutable { | const NCBKernIndex& ncb_index) mutable { | ||||
| bundle.set(param.workspace_ptr); | bundle.set(param.workspace_ptr); | ||||
| copy_padding_kern(bundle, param, ncb_index, im2colstrategy, | |||||
| pack_oc_size); | |||||
| im2colstrategy->copy_padding_kern(bundle, param, ncb_index, | |||||
| pack_oc_size); | |||||
| }; | }; | ||||
| auto kern_packA = [bundle, matmul_algo = m_matmul_algo, | auto kern_packA = [bundle, matmul_algo = m_matmul_algo, | ||||
| matmul_param, im2colstrategy, | matmul_param, im2colstrategy, | ||||
| pack_oc_size = pack_oc_size, mdesc = mdesc]( | |||||
| strategyparam = strategyparam, | |||||
| matmul_desc = matmul_desc]( | |||||
| const NCBKernParam& param, | const NCBKernParam& param, | ||||
| const NCBKernIndex& ncb_index) mutable { | const NCBKernIndex& ncb_index) mutable { | ||||
| bundle.set(param.workspace_ptr); | bundle.set(param.workspace_ptr); | ||||
| packA_kern(bundle, param, matmul_param, matmul_algo, ncb_index, | |||||
| im2colstrategy, mdesc, pack_oc_size); | |||||
| im2colstrategy->packA_kern(bundle, param, matmul_param, | |||||
| matmul_algo, ncb_index, matmul_desc, | |||||
| strategyparam); | |||||
| }; | }; | ||||
| if (default_pack) { | if (default_pack) { | ||||
| auto kern_compute_default = | |||||
| [bundle, bundle_thread, matmul_param, | |||||
| matmul_algo = m_matmul_algo, | |||||
| ohw_tile_size = ohw_tile_size, | |||||
| strategyparam = strategyparam, matmul_desc = mdesc, | |||||
| im2colstrategy]( | |||||
| const NCBKernParam& param, | |||||
| const NCBKernIndex& ncb_index) mutable { | |||||
| bundle.set(param.workspace_ptr); | |||||
| Im2colKerns<Pack_Mode::DEFAULT>::kerns( | |||||
| bundle, bundle_thread, param, matmul_param, | |||||
| matmul_algo, matmul_desc, strategyparam, | |||||
| ncb_index, ohw_tile_size, im2colstrategy); | |||||
| }; | |||||
| ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); | |||||
| if (need_padding) { | |||||
| ret_kern.push_back({kern_padding, | |||||
| {param.n, GROUP, IC / pack_oc_size}}); | |||||
| MIDOUT_BEGIN( | |||||
| megdnn_fallback_im2col, | |||||
| midout_iv( | |||||
| "ConvBiasImpl::AlgoIm2col::dispatch_kerns_default_pack"_hash)) { | |||||
| auto kern_compute_default = | |||||
| [bundle, bundle_thread, matmul_param, | |||||
| matmul_algo = m_matmul_algo, | |||||
| ohw_tile_size = ohw_tile_size, | |||||
| strategyparam = strategyparam, | |||||
| matmul_desc = matmul_desc, im2colstrategy]( | |||||
| const NCBKernParam& param, | |||||
| const NCBKernIndex& ncb_index) mutable { | |||||
| bundle.set(param.workspace_ptr); | |||||
| Im2colKerns<Pack_Mode::DEFAULT>::kerns( | |||||
| bundle, bundle_thread, param, | |||||
| matmul_param, matmul_algo, matmul_desc, | |||||
| strategyparam, ncb_index, ohw_tile_size, | |||||
| im2colstrategy); | |||||
| }; | |||||
| if (!enable_filter_preprocess) { | |||||
| ret_kern.push_back( | |||||
| {kern_packA, {GROUP, packa_parallel_times}}); | |||||
| } | |||||
| if (need_padding) { | |||||
| ret_kern.push_back( | |||||
| {kern_padding, | |||||
| {param.n, GROUP, IC / pack_oc_size}}); | |||||
| } | |||||
| ret_kern.push_back({kern_compute_default, | |||||
| {N, GROUP, ohw_parallel_times, | |||||
| oc_parallel_times}}); | |||||
| return ret_kern; | |||||
| } | } | ||||
| ret_kern.push_back( | |||||
| {kern_compute_default, | |||||
| {N, GROUP, ohw_parallel_times, oc_parallel_times}}); | |||||
| MIDOUT_END(); | |||||
| return {}; | |||||
| } else if (only_packA) { | } else if (only_packA) { | ||||
| auto kern_compute_onlypackA = | |||||
| [bundle, bundle_thread, matmul_param, | |||||
| matmul_algo = m_matmul_algo, | |||||
| strategyparam = strategyparam, | |||||
| ohw_tile_size = ohw_tile_size, matmul_desc = mdesc, | |||||
| im2colstrategy]( | |||||
| const NCBKernParam& param, | |||||
| const NCBKernIndex& ncb_index) mutable { | |||||
| bundle.set(param.workspace_ptr); | |||||
| Im2colKerns<Pack_Mode::ONLY_PACKA>::kerns( | |||||
| bundle, bundle_thread, param, matmul_param, | |||||
| matmul_algo, matmul_desc, strategyparam, | |||||
| ncb_index, ohw_tile_size, im2colstrategy); | |||||
| }; | |||||
| ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); | |||||
| if (need_padding) { | |||||
| ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); | |||||
| MIDOUT_BEGIN( | |||||
| megdnn_fallback_im2col, | |||||
| midout_iv( | |||||
| "ConvBiasImpl::AlgoIm2col::dispatch_kerns_onlypacka"_hash)) { | |||||
| auto kern_compute_onlypackA = | |||||
| [bundle, bundle_thread, matmul_param, | |||||
| matmul_algo = m_matmul_algo, | |||||
| strategyparam = strategyparam, | |||||
| ohw_tile_size = ohw_tile_size, | |||||
| matmul_desc = matmul_desc, im2colstrategy]( | |||||
| const NCBKernParam& param, | |||||
| const NCBKernIndex& ncb_index) mutable { | |||||
| bundle.set(param.workspace_ptr); | |||||
| Im2colKerns<Pack_Mode::ONLY_PACKA>::kerns( | |||||
| bundle, bundle_thread, param, | |||||
| matmul_param, matmul_algo, matmul_desc, | |||||
| strategyparam, ncb_index, ohw_tile_size, | |||||
| im2colstrategy); | |||||
| }; | |||||
| if (!enable_filter_preprocess) { | |||||
| ret_kern.push_back( | |||||
| {kern_packA, {GROUP, packa_parallel_times}}); | |||||
| } | |||||
| if (need_padding) { | |||||
| ret_kern.push_back( | |||||
| {kern_padding, {param.n, GROUP, IC}}); | |||||
| } | |||||
| ret_kern.push_back({kern_compute_onlypackA, | |||||
| {N, GROUP, ohw_parallel_times, | |||||
| oc_parallel_times}}); | |||||
| return ret_kern; | |||||
| } | } | ||||
| ret_kern.push_back( | |||||
| {kern_compute_onlypackA, | |||||
| {N, GROUP, ohw_parallel_times, oc_parallel_times}}); | |||||
| MIDOUT_END(); | |||||
| return {}; | |||||
| } else if (no_pack) { | } else if (no_pack) { | ||||
| auto kern_compute_nopack = | |||||
| [bundle, bundle_thread, matmul_param, | |||||
| matmul_algo = m_matmul_algo, | |||||
| strategyparam = strategyparam, | |||||
| ohw_tile_size = ohw_tile_size, matmul_desc = mdesc, | |||||
| im2colstrategy]( | |||||
| const NCBKernParam& param, | |||||
| const NCBKernIndex& ncb_index) mutable { | |||||
| bundle.set(param.workspace_ptr); | |||||
| Im2colKerns<Pack_Mode::NO_PACK>::kerns( | |||||
| bundle, bundle_thread, param, matmul_param, | |||||
| matmul_algo, matmul_desc, strategyparam, | |||||
| ncb_index, ohw_tile_size, im2colstrategy); | |||||
| }; | |||||
| if (need_padding) { | |||||
| ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); | |||||
| MIDOUT_BEGIN( | |||||
| megdnn_fallback_im2col, | |||||
| midout_iv( | |||||
| "ConvBiasImpl::AlgoIm2col::dispatch_kerns_no_pack"_hash)) { | |||||
| auto kern_compute_nopack = | |||||
| [bundle, bundle_thread, matmul_param, | |||||
| matmul_algo = m_matmul_algo, | |||||
| strategyparam = strategyparam, | |||||
| ohw_tile_size = ohw_tile_size, | |||||
| matmul_desc = matmul_desc, im2colstrategy]( | |||||
| const NCBKernParam& param, | |||||
| const NCBKernIndex& ncb_index) mutable { | |||||
| bundle.set(param.workspace_ptr); | |||||
| Im2colKerns<Pack_Mode::NO_PACK>::kerns( | |||||
| bundle, bundle_thread, param, | |||||
| matmul_param, matmul_algo, matmul_desc, | |||||
| strategyparam, ncb_index, ohw_tile_size, | |||||
| im2colstrategy); | |||||
| }; | |||||
| if (need_padding) { | |||||
| ret_kern.push_back( | |||||
| {kern_padding, {param.n, GROUP, IC}}); | |||||
| } | |||||
| ret_kern.push_back({kern_compute_nopack, | |||||
| {N, GROUP, ohw_parallel_times, | |||||
| oc_parallel_times}}); | |||||
| return ret_kern; | |||||
| } | } | ||||
| ret_kern.push_back( | |||||
| {kern_compute_nopack, | |||||
| {N, GROUP, ohw_parallel_times, oc_parallel_times}}); | |||||
| MIDOUT_END(); | |||||
| return {}; | |||||
| } | } | ||||
| return ret_kern; | |||||
| return {}; | |||||
| } | } | ||||
| MIDOUT_END(); | MIDOUT_END(); | ||||
| return {}; | return {}; | ||||
| @@ -694,12 +751,19 @@ bool ConvBiasImpl::AlgoIm2col::usable( | |||||
| return false; | return false; | ||||
| } | } | ||||
| } | } | ||||
| fallback::MatrixMulImpl::AlgoBase::MatmulDescription mdesc = | |||||
| fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = | |||||
| m_matmul_algo->matmul_description(); | m_matmul_algo->matmul_description(); | ||||
| //! weight preprocess is only supported when matmul's packmode is packa or default | |||||
| if (is_enable_filter_preprocess(param) && | |||||
| (matmul_desc.packmode == | |||||
| fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK)) { | |||||
| return false; | |||||
| } | |||||
| if (format == param::ConvBias::Format::NCHW44 || | if (format == param::ConvBias::Format::NCHW44 || | ||||
| format == param::ConvBias::Format::NCHW44_DOT) { | format == param::ConvBias::Format::NCHW44_DOT) { | ||||
| //! current NCHW44 im2col only supports DEFAULT mode matmul | //! current NCHW44 im2col only supports DEFAULT mode matmul | ||||
| if (mdesc.packmode != Pack_Mode::DEFAULT) { | |||||
| if (matmul_desc.packmode != Pack_Mode::DEFAULT) { | |||||
| return false; | return false; | ||||
| //! nchw44 hybrid mode and channel wise are not supported | //! nchw44 hybrid mode and channel wise are not supported | ||||
| } else if (param.filter_meta.icpg < 4_z || | } else if (param.filter_meta.icpg < 4_z || | ||||
| @@ -711,8 +775,9 @@ bool ConvBiasImpl::AlgoIm2col::usable( | |||||
| size_t oc_tile_size = 0, ohw_tile_size = 0; | size_t oc_tile_size = 0, ohw_tile_size = 0; | ||||
| choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, | choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, | ||||
| mdesc.innerblocksize.m, mdesc.innerblocksize.n, | |||||
| m_matmul_algo->packmode()); | |||||
| matmul_desc.innerblocksize.m, | |||||
| matmul_desc.innerblocksize.n, m_ohw_tile_size, | |||||
| matmul_desc.packmode); | |||||
| fallback::MatrixMulImpl::KernSizeParam matmul_param = | fallback::MatrixMulImpl::KernSizeParam matmul_param = | ||||
| get_matmul_kern_param(param, ohw_tile_size, oc_tile_size); | get_matmul_kern_param(param, ohw_tile_size, oc_tile_size); | ||||
| bool matmulusable = m_matmul_algo->usable(matmul_param); | bool matmulusable = m_matmul_algo->usable(matmul_param); | ||||
| @@ -731,4 +796,104 @@ bool ConvBiasImpl::AlgoIm2col::usable( | |||||
| return false; | return false; | ||||
| } | } | ||||
| //! Describe the buffer that holds the pre-packed filter (matrix A). | |||||
| //! Yields no layout for NO_PACK matmul algos, otherwise a single Int8 | |||||
| //! layout of shape {group, packA bytes per group}. | |||||
| SmallVector<TensorLayout> | |||||
| ConvBiasImpl::AlgoIm2col::deduce_preprocessed_filter_layout( | |||||
|         const NCBKernSizeParam& param) const { | |||||
|     MIDOUT_BEGIN( | |||||
|             megdnn_fallback_im2col, | |||||
|             midout_iv( | |||||
|                     "ConvBiasImpl::AlgoIm2col::deduce_preprocessed_filter_layout"_hash)) { | |||||
|         fallback::MatrixMulImpl::AlgoBase::MatmulDescription desc = | |||||
|                 m_matmul_algo->matmul_description(); | |||||
|         //! only the default_pack and only_packa modes have a packed filter | |||||
|         if (desc.packmode == Pack_Mode::NO_PACK) { | |||||
|             return {}; | |||||
|         } | |||||
|         size_t oc_tile_size = 0, ohw_tile_size = 0; | |||||
|         choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, | |||||
|                             desc.innerblocksize.m, desc.innerblocksize.n, | |||||
|                             m_ohw_tile_size, desc.packmode); | |||||
|         size_t ocpg = param.filter_meta.ocpg; | |||||
|         //! DEFAULT mode feeds the whole OC to matmul and splits packing by | |||||
|         //! innerblocksize.m; ONLY_PACKA uses the oc tile for both | |||||
|         size_t matmul_oc = oc_tile_size; | |||||
|         size_t pack_step = oc_tile_size; | |||||
|         if (desc.packmode == Pack_Mode::DEFAULT) { | |||||
|             matmul_oc = ocpg; | |||||
|             pack_step = desc.innerblocksize.m; | |||||
|         } | |||||
|         auto matmul_param = | |||||
|                 get_matmul_kern_param(param, ohw_tile_size, matmul_oc); | |||||
|         size_t pack_times = div_ceil<size_t>(ocpg, pack_step); | |||||
|         size_t bytes_per_group = packA_group_size(m_matmul_algo, matmul_param, | |||||
|                                                   desc, pack_times); | |||||
|         size_t nr_group = param.filter_meta.group; | |||||
|         SmallVector<TensorLayout> layouts; | |||||
|         layouts.push_back({{nr_group, bytes_per_group}, dtype::Int8()}); | |||||
|         return layouts; | |||||
|     } | |||||
|     MIDOUT_END(); | |||||
|     return {}; | |||||
| } | |||||
| //! Build the kernel that packs the filter (matrix A) ahead of execution. | |||||
| //! Returns an empty list for matmul algos that are neither DEFAULT nor | |||||
| //! ONLY_PACKA; otherwise a single kernel dispatched over | |||||
| //! {GROUP, packa_parallel_times}. | |||||
| SmallVector<ConvBiasImpl::NCBKern> | |||||
| ConvBiasImpl::AlgoIm2col::dispatch_preprocess_kerns( | |||||
|         const NCBKernSizeParam& param) const { | |||||
|     MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 3) { | |||||
|         fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = | |||||
|                 m_matmul_algo->matmul_description(); | |||||
|         const Pack_Mode packmode = matmul_desc.packmode; | |||||
|         const bool default_pack = packmode == Pack_Mode::DEFAULT; | |||||
|         const bool only_packA = packmode == Pack_Mode::ONLY_PACKA; | |||||
|         //! if nopack, return an empty kernel list before doing any tiling | |||||
|         //! work so that OprWeightPreprocessProxy can run with nopack mode | |||||
|         if (!default_pack && !only_packA) { | |||||
|             return {}; | |||||
|         } | |||||
|         const size_t OC = param.filter_meta.ocpg; | |||||
|         size_t GROUP = param.filter_meta.group; | |||||
|         size_t oc_tile_size = 0, ohw_tile_size = 0; | |||||
|         choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, | |||||
|                             matmul_desc.innerblocksize.m, | |||||
|                             matmul_desc.innerblocksize.n, m_ohw_tile_size, | |||||
|                             matmul_desc.packmode); | |||||
|         WorkspaceBundle bundle = | |||||
|                 get_bundle(param, m_matmul_algo, oc_tile_size, ohw_tile_size); | |||||
|         //! number of OC chunks packed independently: OC is split by | |||||
|         //! innerblocksize.m in DEFAULT mode and by oc_tile_size in | |||||
|         //! ONLY_PACKA mode | |||||
|         size_t packa_parallel_times = div_ceil<size_t>( | |||||
|                 OC, default_pack ? matmul_desc.innerblocksize.m : oc_tile_size); | |||||
|         auto matmul_param = get_matmul_kern_param( | |||||
|                 param, ohw_tile_size, default_pack ? OC : oc_tile_size); | |||||
|         //! value-initialize: only two fields are set below and the whole | |||||
|         //! struct is copied into the kernel closure | |||||
|         StrategyParam strategyparam{}; | |||||
|         strategyparam.enable_filter_preprocess = | |||||
|                 is_enable_filter_preprocess(param); | |||||
|         strategyparam.packA_group_size = packA_group_size( | |||||
|                 m_matmul_algo, matmul_param, matmul_desc, packa_parallel_times); | |||||
|         StrategyBase* im2colstrategy = | |||||
|                 Factory::get_im2col_strategy(param, m_matmul_algo); | |||||
|         //! mutable: the captured bundle copy is rebound to the workspace | |||||
|         //! pointer supplied at execution time | |||||
|         auto kern_packA = [bundle, matmul_algo = m_matmul_algo, matmul_param, | |||||
|                            im2colstrategy, strategyparam, matmul_desc]( | |||||
|                                   const NCBKernParam& kern_param, | |||||
|                                   const NCBKernIndex& ncb_index) mutable { | |||||
|             bundle.set(kern_param.workspace_ptr); | |||||
|             im2colstrategy->packA_kern(bundle, kern_param, matmul_param, | |||||
|                                        matmul_algo, ncb_index, matmul_desc, | |||||
|                                        strategyparam); | |||||
|         }; | |||||
|         SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||||
|         ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); | |||||
|         return ret_kern; | |||||
|     } | |||||
|     MIDOUT_END(); | |||||
|     return {}; | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen | ||||
| @@ -22,27 +22,6 @@ namespace megdnn { | |||||
| namespace fallback { | namespace fallback { | ||||
| class ConvBiasImpl::AlgoIm2col final : public AlgoBase { | class ConvBiasImpl::AlgoIm2col final : public AlgoBase { | ||||
| //! calculate m_oc_tile_size in choice_ohw_oc_block() function, | |||||
| //! when m_oc_tile_size < this value m_oc_tile_size = ohw | |||||
| static constexpr size_t DEFAULT_OHW_MIN_TILE_SIZE = 32; | |||||
| //! when nr_threads > 1 and round(ohw,nr_threads)>nr_threads, | |||||
| //! m_oc_tile_size = DEFAULT_OC_TILE_SIZE | |||||
| static constexpr size_t DEFAULT_OC_TILE_SIZE = 512; | |||||
| //! when m_oc_tile_size > this value m_oc_tile_size = | |||||
| //! DEFAULT_OC_MAX_TILE_SIZE | |||||
| static constexpr size_t DEFAULT_OC_MAX_TILE_SIZE = 1024; | |||||
| //! when m_oc_tile_size < this value m_oc_tile_size = | |||||
| //! DEFAULT_OC_MIN_TILE_SIZE the purpose is aligning the calculation | |||||
| static constexpr size_t DEFAULT_OC_MIN_TILE_SIZE = 128; | |||||
| fallback::MatrixMulImpl::KernSizeParam get_matmul_kern_param( | |||||
| const NCBKernSizeParam& param, size_t ohw_tile_size, | |||||
| size_t oc_tile_size) const; | |||||
| WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; | |||||
| void choice_ohw_oc_block( | |||||
| const NCBKernSizeParam& param, size_t& oc_tile_size, | |||||
| size_t& ohw_tile_size, size_t block_m, size_t block_n, | |||||
| fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) const; | |||||
| public: | public: | ||||
| AlgoIm2col(MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size) | AlgoIm2col(MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size) | ||||
| : m_matmul_algo(matmul_algo), | : m_matmul_algo(matmul_algo), | ||||
| @@ -59,10 +38,16 @@ public: | |||||
| bool usable(const NCBKernSizeParam& param, | bool usable(const NCBKernSizeParam& param, | ||||
| AlgoSelectionStrategy algo_selection_strategy) const override; | AlgoSelectionStrategy algo_selection_strategy) const override; | ||||
| size_t get_workspace(const NCBKernSizeParam& param) const override; | size_t get_workspace(const NCBKernSizeParam& param) const override; | ||||
| SmallVector<NCBKern> dispatch_kerns( | |||||
| SmallVector<NCBKern> dispatch_kerns(const NCBKernSizeParam& param) const override; | |||||
| SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||||
| const NCBKernSizeParam& param) const override; | |||||
| size_t get_preprocess_workspace( | |||||
| const NCBKernSizeParam& /*param*/) const override { | |||||
| return 0; | |||||
| } | |||||
| SmallVector<NCBKern> dispatch_preprocess_kerns( | |||||
| const NCBKernSizeParam& param) const override; | const NCBKernSizeParam& param) const override; | ||||
| bool is_preferred( | |||||
| const NCBKernSizeParam& param) const override { | |||||
| bool is_preferred(const NCBKernSizeParam& param) const override { | |||||
| if (param.src_type.category() == DTypeCategory::QUANTIZED) { | if (param.src_type.category() == DTypeCategory::QUANTIZED) { | ||||
| static CpuOprDelegationStorage<1> storage; | static CpuOprDelegationStorage<1> storage; | ||||
| auto conv_bias_opr = storage.get<ConvBias, 0>(); | auto conv_bias_opr = storage.get<ConvBias, 0>(); | ||||
| @@ -40,9 +40,11 @@ struct StrategyParam { | |||||
| size_t block_n; | size_t block_n; | ||||
| size_t block_k; | size_t block_k; | ||||
| size_t pack_oc_size; | size_t pack_oc_size; | ||||
| size_t packA_group_size; | |||||
| bool skip_copy_dst; | bool skip_copy_dst; | ||||
| bool is_dst_8bit; | bool is_dst_8bit; | ||||
| bool is_ohw_size_bigger; | bool is_ohw_size_bigger; | ||||
| bool enable_filter_preprocess; | |||||
| }; | }; | ||||
| class StrategyBase { | class StrategyBase { | ||||
| @@ -62,7 +64,7 @@ public: | |||||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
| const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | ||||
| matmul_desec, | matmul_desec, | ||||
| size_t pack_size) = 0; | |||||
| const StrategyParam& sparam) = 0; | |||||
| virtual void exec_im2col( | virtual void exec_im2col( | ||||
| const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | ||||
| @@ -296,7 +298,7 @@ public: | |||||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
| const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | ||||
| matmul_desc, | matmul_desc, | ||||
| size_t pack_size) override; | |||||
| const StrategyParam& sparam) override; | |||||
| virtual void exec_im2col( | virtual void exec_im2col( | ||||
| const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | ||||
| const StrategyParam& sparam, | const StrategyParam& sparam, | ||||
| @@ -375,7 +377,7 @@ public: | |||||
| const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
| const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, | const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, | ||||
| size_t pack_size) override; | |||||
| const StrategyParam& sparam) override; | |||||
| void exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, | void exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, | ||||
| const StrategyParam& sparam, const WorkspaceBundle& bundle, | const StrategyParam& sparam, const WorkspaceBundle& bundle, | ||||
| @@ -431,7 +433,7 @@ public: | |||||
| const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
| const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, | const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, | ||||
| size_t pack_size) override; | |||||
| const StrategyParam& sparam) override; | |||||
| void exec_im2col( | void exec_im2col( | ||||
| const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, | ||||
| @@ -25,19 +25,23 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
| const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | ||||
| matmul_desc, | matmul_desc, | ||||
| size_t) { | |||||
| const StrategyParam& sparam) { | |||||
| fallback::MatrixMulImpl::KernParam matmul_param; | fallback::MatrixMulImpl::KernParam matmul_param; | ||||
| size_t group_id = ncb_index.ndrange_id[0]; | size_t group_id = ncb_index.ndrange_id[0]; | ||||
| static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) = | static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) = | ||||
| matmulparam; | matmulparam; | ||||
| size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0); | |||||
| size_t packed_per_oc_block_size = | size_t packed_per_oc_block_size = | ||||
| round_up(matmul_param.K, matmul_desc.innerblocksize.k) * | round_up(matmul_param.K, matmul_desc.innerblocksize.k) * | ||||
| matmul_desc.innerblocksize.m * matmul_desc.packa_type_size; | matmul_desc.innerblocksize.m * matmul_desc.packa_type_size; | ||||
| size_t a_panel_offset = ncb_index.ndrange_id[1] * packed_per_oc_block_size; | size_t a_panel_offset = ncb_index.ndrange_id[1] * packed_per_oc_block_size; | ||||
| int8_t* a_panel = static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)) + | |||||
| group_id * packA_group_size + a_panel_offset; | |||||
| int8_t* tmp_ptr = | |||||
| sparam.enable_filter_preprocess | |||||
| ? static_cast<int8_t*>( | |||||
| param.preprocessed_filter->tensors[0].raw_ptr) | |||||
| : static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)); | |||||
| int8_t* a_panel = | |||||
| tmp_ptr + group_id * sparam.packA_group_size + a_panel_offset; | |||||
| matmul_param.A_ptr = | matmul_param.A_ptr = | ||||
| const_cast<src_ctype*>(param.filter<src_ctype>(group_id)); | const_cast<src_ctype*>(param.filter<src_ctype>(group_id)); | ||||
| matmul_algo->pack_A(matmul_param, a_panel, ncb_index.ndrange_id[1], | matmul_algo->pack_A(matmul_param, a_panel, ncb_index.ndrange_id[1], | ||||
| @@ -149,15 +153,20 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||||
| size_t packA_per_oc_block_size = | size_t packA_per_oc_block_size = | ||||
| round_up(matmul_param.K, matmul_desc.innerblocksize.k) * | round_up(matmul_param.K, matmul_desc.innerblocksize.k) * | ||||
| sparam.oc_tile_size * matmul_desc.packa_type_size; | sparam.oc_tile_size * matmul_desc.packa_type_size; | ||||
| size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0); | |||||
| size_t packA_group_size = sparam.packA_group_size; | |||||
| size_t a_panel_offset = ncb_index.ndrange_id[1] * packA_group_size + | size_t a_panel_offset = ncb_index.ndrange_id[1] * packA_group_size + | ||||
| ncb_index.ndrange_id[3] * packA_per_oc_block_size; | ncb_index.ndrange_id[3] * packA_per_oc_block_size; | ||||
| void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); | void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); | ||||
| src_ctype* a_panel = reinterpret_cast<src_ctype*>( | |||||
| reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PACKA_INDEX)) + | |||||
| a_panel_offset); | |||||
| int8_t* tmp_ptr = | |||||
| sparam.enable_filter_preprocess | |||||
| ? static_cast<int8_t*>( | |||||
| param.preprocessed_filter->tensors[0].raw_ptr) | |||||
| : static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)); | |||||
| src_ctype* a_panel = | |||||
| reinterpret_cast<src_ctype*>(tmp_ptr + a_panel_offset); | |||||
| src_ctype* b_panel = | src_ctype* b_panel = | ||||
| reinterpret_cast<src_ctype*>(reinterpret_cast<uintptr_t>( | reinterpret_cast<src_ctype*>(reinterpret_cast<uintptr_t>( | ||||
| bundle_thread.get(THREAD_BUNDLE_PACKB_INDEX))); | bundle_thread.get(THREAD_BUNDLE_PACKB_INDEX))); | ||||
| @@ -26,7 +26,7 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
| const fallback::MatrixMulImpl::AlgoBase:: | const fallback::MatrixMulImpl::AlgoBase:: | ||||
| MatmulDescription& /*matmul_dsec*/, | MatmulDescription& /*matmul_dsec*/, | ||||
| size_t) { | |||||
| const StrategyParam&) { | |||||
| MEGDNN_MARK_USED_VAR(bundle); | MEGDNN_MARK_USED_VAR(bundle); | ||||
| MEGDNN_MARK_USED_VAR(param); | MEGDNN_MARK_USED_VAR(param); | ||||
| MEGDNN_MARK_USED_VAR(matmulparam); | MEGDNN_MARK_USED_VAR(matmulparam); | ||||
| @@ -26,7 +26,7 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
| const fallback::MatrixMulImpl::AlgoBase:: | const fallback::MatrixMulImpl::AlgoBase:: | ||||
| MatmulDescription& /*matmul_desc*/, | MatmulDescription& /*matmul_desc*/, | ||||
| size_t) { | |||||
| const StrategyParam& sparam) { | |||||
| fallback::MatrixMulImpl::KernParam matmul_param; | fallback::MatrixMulImpl::KernParam matmul_param; | ||||
| static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) = | static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) = | ||||
| matmulparam; | matmulparam; | ||||
| @@ -36,12 +36,17 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||||
| size_t output_block_oc_size = | size_t output_block_oc_size = | ||||
| std::min(oc_tile_size, OC - ncb_index.ndrange_id[1] * oc_tile_size); | std::min(oc_tile_size, OC - ncb_index.ndrange_id[1] * oc_tile_size); | ||||
| size_t oc_cur_index = ncb_index.ndrange_id[1] * oc_tile_size; | size_t oc_cur_index = ncb_index.ndrange_id[1] * oc_tile_size; | ||||
| size_t packA_group_size = | |||||
| bundle.get_size(BUNDLE_PACKA_INDEX) / param.filter_meta.group; | |||||
| size_t a_panel_offset = ncb_index.ndrange_id[1] * | size_t a_panel_offset = ncb_index.ndrange_id[1] * | ||||
| matmul_algo->get_bundle(matmul_param).get_size(0); | matmul_algo->get_bundle(matmul_param).get_size(0); | ||||
| int8_t* a_panel = static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)) + | |||||
| group_id * packA_group_size + a_panel_offset; | |||||
| int8_t* tmp_ptr = | |||||
| sparam.enable_filter_preprocess | |||||
| ? static_cast<int8_t*>( | |||||
| param.preprocessed_filter->tensors[0].raw_ptr) | |||||
| : static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)); | |||||
| int8_t* a_panel = tmp_ptr + | |||||
| group_id * sparam.packA_group_size + a_panel_offset; | |||||
| matmul_param.A_ptr = | matmul_param.A_ptr = | ||||
| const_cast<src_ctype*>(param.filter<src_ctype>(group_id)) + | const_cast<src_ctype*>(param.filter<src_ctype>(group_id)) + | ||||
| oc_cur_index * matmul_param.K; | oc_cur_index * matmul_param.K; | ||||
| @@ -60,20 +65,22 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||||
| fallback::MatrixMulImpl::KernParam matmul_param, | fallback::MatrixMulImpl::KernParam matmul_param, | ||||
| const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | const fallback::MatrixMulImpl::AlgoBase* matmul_algo, | ||||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, | ||||
| const fallback::MatrixMulImpl::AlgoBase:: | |||||
| MatmulDescription& /*matmul_desc*/ | |||||
| ) { | |||||
| size_t packA_group_size = | |||||
| bundle.get_size(BUNDLE_PACKA_INDEX) / param.filter_meta.group; | |||||
| const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& | |||||
| /*matmul_desc*/) { | |||||
| size_t a_panel_offset = ncb_index.ndrange_id[3] * | size_t a_panel_offset = ncb_index.ndrange_id[3] * | ||||
| matmul_algo->get_bundle(matmul_param).get_size(0); | matmul_algo->get_bundle(matmul_param).get_size(0); | ||||
| a_panel_offset = sparam.group_id * packA_group_size + a_panel_offset; | |||||
| a_panel_offset = | |||||
| sparam.group_id * sparam.packA_group_size + a_panel_offset; | |||||
| void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); | void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); | ||||
| src_ctype* a_panel = reinterpret_cast<src_ctype*>( | |||||
| reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PACKA_INDEX)) + | |||||
| a_panel_offset); | |||||
| int8_t* tmp_ptr = | |||||
| sparam.enable_filter_preprocess | |||||
| ? static_cast<int8_t*>( | |||||
| param.preprocessed_filter->tensors[0].raw_ptr) | |||||
| : static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)); | |||||
| src_ctype* a_panel = reinterpret_cast<src_ctype*>(tmp_ptr + a_panel_offset); | |||||
| src_ctype* b_panel = nullptr; | src_ctype* b_panel = nullptr; | ||||
| src_ctype* im2col_dst = static_cast<src_ctype*>( | src_ctype* im2col_dst = static_cast<src_ctype*>( | ||||
| @@ -154,7 +154,8 @@ void ConvBiasImpl::exec_preprocess(const TensorLayout& src_layout, | |||||
| bias{nullptr, bias_layout}; | bias{nullptr, bias_layout}; | ||||
| auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace, | auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace, | ||||
| preprocessed_filter); | preprocessed_filter); | ||||
| ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | |||||
| //! do not pass a workspace_size limit here, otherwise no matching algo can be found | |||||
| ConvBiasImpl::Algorithm* algo = get_algorithm(fparam); | |||||
| if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo, | if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo, | ||||
| fparam) <= workspace.size) { | fparam) <= workspace.size) { | ||||
| exec_preprocess_with_ncb_kern(fparam, algo); | exec_preprocess_with_ncb_kern(fparam, algo); | ||||
| @@ -299,6 +299,11 @@ private: | |||||
| const PreprocessedFilter* preprocessed_filter); | const PreprocessedFilter* preprocessed_filter); | ||||
| }; | }; | ||||
| //! True when \p param carries a preprocessed filter holding at least one | |||||
| //! tensor. | |||||
| inline bool is_enable_filter_preprocess( | |||||
|         const ConvBiasImpl::NCBKernSizeParam& param) { | |||||
|     if (!param.preprocessed_filter) { | |||||
|         return false; | |||||
|     } | |||||
|     return param.preprocessed_filter->tensors.size() >= 1; | |||||
| } | |||||
| } // namespace fallback | } // namespace fallback | ||||
| } // namespace megdnn | } // namespace megdnn | ||||
| @@ -109,7 +109,9 @@ void ConvolutionImpl::exec_preprocess(const TensorLayout& src_layout, | |||||
| TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout}; | TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout}; | ||||
| auto fparam = make_ncb_kern_param(src, filter, dst, preprocessed_filter, | auto fparam = make_ncb_kern_param(src, filter, dst, preprocessed_filter, | ||||
| workspace); | workspace); | ||||
| ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | |||||
| //! do not pass a workspace_size limit here, otherwise no matching algo can be found | |||||
| ConvolutionImpl::Algorithm* algo = get_algorithm(fparam); | |||||
| if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo, | if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo, | ||||
| fparam) <= workspace.size) { | fparam) <= workspace.size) { | ||||
| exec_preprocess_with_ncb_kern(fparam, algo); | exec_preprocess_with_ncb_kern(fparam, algo); | ||||
| @@ -1837,6 +1837,21 @@ void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle, | |||||
| {arg.src, arg.filter, arg.bias, {}, {}}); | {arg.src, arg.filter, arg.bias, {}, {}}); | ||||
| } | } | ||||
| } | } | ||||
| //! Runs check_conv_bias_preprocess with Float32 dtypes against every | |||||
| //! im2col matmul algorithm listed for the target architecture. | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE2_PREPROCESS) { | |||||
| #define cb_preprocess(algo_name) \ | |||||
|     check_conv_bias_preprocess( \ | |||||
|             get_conv_bias_args({1, 2, 3, 4, 5, 6, 7}, 2, false, false, \ | |||||
|                                false), \ | |||||
|             handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \ | |||||
|             dtype::Float32(), dtype::Float32(), algo_name); | |||||
| #if MEGDNN_AARCH64 | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH64_F32K8X12X1") | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH64_F32K4X16X1") | |||||
| #elif MEGDNN_ARMV7 | |||||
|     cb_preprocess("IM2COLMATMUL:ARMV7_F32") | |||||
| #endif | |||||
| #undef cb_preprocess | |||||
| } | |||||
| // clang-format off | // clang-format off | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE2) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE2) { | ||||
| #define cb(name) \ | #define cb(name) \ | ||||
| @@ -1851,6 +1866,22 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE2) { | |||||
| cb("IM2COLMATMUL:ARMV7_F32") | cb("IM2COLMATMUL:ARMV7_F32") | ||||
| #endif | #endif | ||||
| #undef cb | #undef cb | ||||
| } | |||||
| //! Runs check_conv_bias_preprocess with Float32 dtypes against every | |||||
| //! im2col matmul algorithm listed for the target architecture. | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE1_PREPROCESS) { | |||||
| #define cb_preprocess(algo_name) \ | |||||
|     check_conv_bias_preprocess( \ | |||||
|             get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, \ | |||||
|                                false), \ | |||||
|             handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \ | |||||
|             dtype::Float32(), dtype::Float32(), algo_name); | |||||
| #if MEGDNN_AARCH64 | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH64_F32K8X12X1") | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH64_F32K4X16X1") | |||||
| #elif MEGDNN_ARMV7 | |||||
|     cb_preprocess("IM2COLMATMUL:ARMV7_F32") | |||||
| #endif | |||||
| #undef cb_preprocess | |||||
| } | } | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE1) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE1) { | ||||
| @@ -1899,6 +1930,37 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM) { | |||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| //! Runs check_conv_bias_preprocess with QuantizedS8 src/filter/dst and | |||||
| //! QuantizedS32 bias for two argument sets per listed algorithm. | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
|        CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_PREPROCESS) { | |||||
|     UniformIntRNG rng{-50, 50}; | |||||
|     //! read by the macro at expansion time; ARMV7 overrides it below | |||||
|     float epsilon = 0.001; | |||||
| #define cb_preprocess(algo_name) \ | |||||
|     check_conv_bias_preprocess( \ | |||||
|             get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, false, \ | |||||
|                                true, true), \ | |||||
|             handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ | |||||
|             dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ | |||||
|             dtype::QuantizedS8(60.25f), algo_name); \ | |||||
|     check_conv_bias_preprocess( \ | |||||
|             get_conv_bias_args({1}, 2, false, false, false, true, true), \ | |||||
|             handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ | |||||
|             dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ | |||||
|             dtype::QuantizedS8(60.25f), algo_name); | |||||
| #if MEGDNN_AARCH64 | |||||
| #if __ARM_FEATURE_DOTPROD | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH64_INT8X8X32_K8X12X4_DOTPROD"); | |||||
| #else | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH64_INT8X8X32_K8X8X8"); | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16"); | |||||
| #endif | |||||
| #elif MEGDNN_ARMV7 | |||||
|     epsilon = 1; | |||||
|     cb_preprocess("IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8"); | |||||
| #endif | |||||
| #undef cb_preprocess | |||||
| } | |||||
| #if __ARM_FEATURE_DOTPROD | #if __ARM_FEATURE_DOTPROD | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT) { | ||||
| @@ -1924,6 +1986,29 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT) { | |||||
| #endif | #endif | ||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| //! Runs the filter-preprocess checker with QuantizedS8 src/filter/dst and | |||||
| //! QuantizedS32 bias on the MK4 dot-product im2col algorithms. Both | |||||
| //! argument sets go through check_conv_bias_preprocess so that the second | |||||
| //! one also exercises the preprocess path, matching the sibling | |||||
| //! *_MK4_DOT_PREPROCESS tests. | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
|        CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT_PREPROCESS) { | |||||
|     UniformIntRNG rng{-50, 50}; | |||||
|     float epsilon = 0.001; | |||||
| #define cb_preprocess(algo_name) \ | |||||
|     check_conv_bias_preprocess( \ | |||||
|             get_nchw44_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, \ | |||||
|                                       false, false, true), \ | |||||
|             handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ | |||||
|             dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ | |||||
|             dtype::QuantizedS8(60.25f), algo_name); \ | |||||
|     check_conv_bias_preprocess( \ | |||||
|             get_nchw44_conv_bias_args({1}, 2, false, true, true, false, \ | |||||
|                                       true), \ | |||||
|             handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ | |||||
|             dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ | |||||
|             dtype::QuantizedS8(60.25f), algo_name); | |||||
| #if MEGDNN_AARCH64 | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); | |||||
| #elif MEGDNN_ARMV7 | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH32_INT8_MK4_8X4X4_DOTPROD:96"); | |||||
| #endif | |||||
| #undef cb_preprocess | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT_S2_FUSE) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT_S2_FUSE) { | ||||
| UniformIntRNG rng{-50, 50}; | UniformIntRNG rng{-50, 50}; | ||||
| @@ -1968,6 +2053,31 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_S8x8x32_MK4_DOT) { | |||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| //! Runs check_conv_bias_preprocess with QuantizedS8 src/filter, | |||||
| //! QuantizedS32 bias and a default-constructed dst dtype on the MK4 | |||||
| //! dot-product im2col algorithms. | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
|        CONV_BIAS_IM2COLMATMUL_S8x8x32_MK4_DOT_PREPROCESS) { | |||||
|     UniformIntRNG rng{-50, 50}; | |||||
|     float epsilon = 0.001; | |||||
| #define cb_preprocess(algo_name) \ | |||||
|     check_conv_bias_preprocess( \ | |||||
|             get_nchw44_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, \ | |||||
|                                       true, false, true, false, false, \ | |||||
|                                       true), \ | |||||
|             handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ | |||||
|             dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), {}, \ | |||||
|             algo_name); \ | |||||
|     check_conv_bias_preprocess( \ | |||||
|             get_nchw44_conv_bias_args({1}, 2, false, true, true, false, \ | |||||
|                                       true, false, false, true), \ | |||||
|             handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ | |||||
|             dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), {}, \ | |||||
|             algo_name); | |||||
| #if MEGDNN_AARCH64 | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); | |||||
| #elif MEGDNN_ARMV7 | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH32_INT8_MK4_8X4X4_DOTPROD:96"); | |||||
| #endif | |||||
| #undef cb_preprocess | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32_MK4_DOT) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32_MK4_DOT) { | ||||
| UniformIntRNG rng{-50, 50}; | UniformIntRNG rng{-50, 50}; | ||||
| @@ -1992,6 +2102,30 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32_MK4_DOT) { | |||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| //! Runs check_conv_bias_preprocess with Int8 src/filter, Int32 bias and a | |||||
| //! default-constructed dst dtype on the MK4 dot-product im2col algorithms. | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
|        CONV_BIAS_IM2COLMATMUL_INT8x8x32_MK4_DOT_PREPROCESS) { | |||||
|     UniformIntRNG rng{-50, 50}; | |||||
|     float epsilon = 0.001; | |||||
| #define cb_preprocess(algo_name) \ | |||||
|     check_conv_bias_preprocess( \ | |||||
|             get_nchw44_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, \ | |||||
|                                       true, false, true, false, false, \ | |||||
|                                       true), \ | |||||
|             handle(), &rng, epsilon, dtype::Int8(), dtype::Int8(), \ | |||||
|             dtype::Int32(), {}, algo_name); \ | |||||
|     check_conv_bias_preprocess( \ | |||||
|             get_nchw44_conv_bias_args({1}, 2, false, true, true, false, \ | |||||
|                                       true, false, false, true), \ | |||||
|             handle(), &rng, epsilon, dtype::Int8(), dtype::Int8(), \ | |||||
|             dtype::Int32(), {}, algo_name); | |||||
| #if MEGDNN_AARCH64 | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); | |||||
| #elif MEGDNN_ARMV7 | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH32_INT8_MK4_8X4X4_DOTPROD:96"); | |||||
| #endif | |||||
| #undef cb_preprocess | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_CONV1x1_QUANTIZEDSYM_MK4_DOT) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_CONV1x1_QUANTIZEDSYM_MK4_DOT) { | ||||
| UniformIntRNG rng{-50, 50}; | UniformIntRNG rng{-50, 50}; | ||||
| @@ -2055,6 +2189,41 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDASYM) { | |||||
| #endif | #endif | ||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| //! Runs check_conv_bias_preprocess with Quantized8Asymm src/filter/dst and | |||||
| //! QuantizedS32 bias for two argument sets per listed algorithm. | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
|        CONV_BIAS_IM2COLMATMUL_QUANTIZEDASYM_FILTERPREPROCESS) { | |||||
|     NormalRNG rng(128.f); | |||||
|     //! read by the macro at expansion time; ARMV7 overrides it below | |||||
|     float epsilon = 0.001; | |||||
| #define cb_preprocess(algo_name) \ | |||||
|     check_conv_bias_preprocess( \ | |||||
|             get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, false, \ | |||||
|                                true, true), \ | |||||
|             handle(), &rng, epsilon, \ | |||||
|             dtype::Quantized8Asymm(1.2f, (uint8_t)125), \ | |||||
|             dtype::Quantized8Asymm(1.3f, (uint8_t)129), \ | |||||
|             dtype::QuantizedS32(1.2 * 1.3), \ | |||||
|             dtype::Quantized8Asymm(50.3f, (uint8_t)120), algo_name); \ | |||||
|     check_conv_bias_preprocess( \ | |||||
|             get_conv_bias_args({1}, 2, false, false, false, true, true), \ | |||||
|             handle(), &rng, epsilon, \ | |||||
|             dtype::Quantized8Asymm(1.2f, (uint8_t)125), \ | |||||
|             dtype::Quantized8Asymm(1.3f, (uint8_t)129), \ | |||||
|             dtype::QuantizedS32(1.2 * 1.3), \ | |||||
|             dtype::Quantized8Asymm(50.3f, (uint8_t)120), algo_name); | |||||
| #if MEGDNN_AARCH64 | |||||
| #if __ARM_FEATURE_DOTPROD | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH64_QUINT8_K8X8X4_DOTPROD"); | |||||
| #else | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH64_QUINT8_K8X8X8"); | |||||
| #endif | |||||
| #elif MEGDNN_ARMV7 | |||||
|     epsilon = 1; | |||||
|     cb_preprocess("IM2COLMATMUL:ARMV7_QUINT8_K4X8X8"); | |||||
| #endif | |||||
| #undef cb_preprocess | |||||
| } | |||||
| #endif | #endif | ||||
| #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | ||||
| @@ -2088,6 +2257,39 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUINT8x8x32) { | |||||
| #endif | #endif | ||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| //! Runs check_conv_bias_preprocess with Quantized8Asymm src/filter, | |||||
| //! QuantizedS32 bias and a default-constructed dst dtype for two argument | |||||
| //! sets per listed algorithm. | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
|        CONV_BIAS_IM2COLMATMUL_QUINT8x8x32_FILTERPREPROCESS) { | |||||
|     UniformIntRNG rng{-50, 50}; | |||||
|     float epsilon = 0.001; | |||||
| #define cb_preprocess(algo_name) \ | |||||
|     check_conv_bias_preprocess( \ | |||||
|             get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true), \ | |||||
|             handle(), &rng, epsilon, \ | |||||
|             dtype::Quantized8Asymm(1.2f, (uint8_t)125), \ | |||||
|             dtype::Quantized8Asymm(1.3f, (uint8_t)129), \ | |||||
|             dtype::QuantizedS32(1.2 * 1.3), {}, algo_name); \ | |||||
|     check_conv_bias_preprocess( \ | |||||
|             get_conv_bias_args({1}, 2, false, true, true), handle(), &rng, \ | |||||
|             epsilon, dtype::Quantized8Asymm(1.2f, (uint8_t)125), \ | |||||
|             dtype::Quantized8Asymm(1.3f, (uint8_t)129), \ | |||||
|             dtype::QuantizedS32(1.2 * 1.3), {}, algo_name); | |||||
| #if MEGDNN_AARCH64 | |||||
| #if __ARM_FEATURE_DOTPROD | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH64_QUINT8_K8X8X4_DOTPROD"); | |||||
| #else | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH64_QUINT8_K8X8X8"); | |||||
| #endif | |||||
| #elif MEGDNN_ARMV7 | |||||
| #if __ARM_FEATURE_DOTPROD | |||||
|     cb_preprocess("IM2COLMATMUL:AARCH32_QUINT8_K4X8X4"); | |||||
| #endif | |||||
|     cb_preprocess("IM2COLMATMUL:ARMV7_QUINT8_K4X8X8"); | |||||
| #endif | |||||
| #undef cb_preprocess | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16) { | ||||
| UniformIntRNG rng{-50, 50}; | UniformIntRNG rng{-50, 50}; | ||||
| float epsilon = 0.001; | float epsilon = 0.001; | ||||
| @@ -2127,6 +2329,51 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16) { | |||||
| #undef cb | #undef cb | ||||
| #undef cb_nchw44 | #undef cb_nchw44 | ||||
| } | } | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16_FILTERPREPROCESS) { | |||||
| UniformIntRNG rng{-50, 50}; | |||||
| float epsilon = 0.001; | |||||
| #define cb(name) \ | |||||
| check_conv_bias_preprocess( \ | |||||
| get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true), \ | |||||
| handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{}, \ | |||||
| dtype::Int16{}, dtype::Int16{}, name); \ | |||||
| check_conv_bias_preprocess(get_conv_bias_args({1}, 2, false, true, true), \ | |||||
| handle(), &rng, epsilon, dtype::Int8{}, \ | |||||
| dtype::Int8{}, dtype::Int16{}, dtype::Int16{}, \ | |||||
| name); | |||||
| #if MEGDNN_AARCH64 | |||||
| cb("IM2COLMATMUL:AARCH64_INT8X8X16_K8X8X8"); | |||||
| cb("IM2COLMATMUL:AARCH64_INT8X8X16_K4X4X16"); | |||||
| #elif MEGDNN_ARMV7 | |||||
| cb("IM2COLMATMUL:ARMV7_INT8X8X16_K4X8X8"); | |||||
| cb("IM2COLMATMUL:ARMV7_INT8X8X16_K4X2X16"); | |||||
| #endif | |||||
| #undef cb | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16_NOPACK_FILTERPREPROCESS) { | |||||
| UniformIntRNG rng{-50, 50}; | |||||
| float epsilon = 0.001; | |||||
| #define cb(name) \ | |||||
| check_conv_bias_preprocess( \ | |||||
| get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true), \ | |||||
| handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{}, \ | |||||
| dtype::Int16{}, dtype::Int16{}, name); \ | |||||
| check_conv_bias_preprocess(get_conv_bias_args({1}, 2, false, true, true), \ | |||||
| handle(), &rng, epsilon, dtype::Int8{}, \ | |||||
| dtype::Int8{}, dtype::Int16{}, dtype::Int16{}, \ | |||||
| name); | |||||
| #if MEGDNN_AARCH64 | |||||
| cb("IM2COLMATMUL:ARM_COMMON_INT8X8X16"); | |||||
| #elif MEGDNN_ARMV7 | |||||
| cb("IM2COLMATMUL:ARM_COMMON_INT8X8X16"); | |||||
| #endif | |||||
| #undef cb | |||||
| } | |||||
| #endif | #endif | ||||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | ||||
| @@ -2147,6 +2394,31 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP16) { | |||||
| dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, \ | dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, \ | ||||
| name); | name); | ||||
| #if MEGDNN_AARCH64 | |||||
| cb("IM2COLMATMUL:AARCH64_F16_K8X24X1"); | |||||
| #elif MEGDNN_ARMV7 | |||||
| cb("IM2COLMATMUL:AARCH32_F16_K4X16X1"); | |||||
| #endif | |||||
| #undef cb | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP16_FILTERPREPROCESS) { | |||||
| using namespace conv_bias; | |||||
| param::ConvBias cur_param; | |||||
| std::vector<conv_bias::TestArg> args = | |||||
| get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, false); | |||||
| std::vector<conv_bias::TestArg> args1 = | |||||
| get_conv_bias_args({1}, 2, false, false, false); | |||||
| args.insert(args.begin(), args1.begin(), args1.end()); | |||||
| NormalRNG rng(1); | |||||
| #define cb(name) \ | |||||
| check_conv_bias_preprocess(args, handle(), &rng, 0.03, dtype::Float16{}, \ | |||||
| dtype::Float16{}, dtype::Float16{}, \ | |||||
| dtype::Float16{}, name); | |||||
| #if MEGDNN_AARCH64 | #if MEGDNN_AARCH64 | ||||
| cb("IM2COLMATMUL:AARCH64_F16_K8X24X1"); | cb("IM2COLMATMUL:AARCH64_F16_K8X24X1"); | ||||
| #elif MEGDNN_ARMV7 | #elif MEGDNN_ARMV7 | ||||
| @@ -2185,6 +2457,36 @@ void checker_conv_bias_mul_int8x8x32(std::vector<conv_bias::TestArg> args, | |||||
| } | } | ||||
| } | } | ||||
| void checker_conv_bias_int8x8x32_preprocess(std::vector<conv_bias::TestArg> args, | |||||
| Handle* handle, const char* algo_name) { | |||||
| using namespace conv_bias; | |||||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||||
| handle); | |||||
| checker.set_before_exec_callback( | |||||
| conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); | |||||
| checker.set_dtype(0, dtype::Int8()); | |||||
| checker.set_dtype(1, dtype::Int8()); | |||||
| checker.set_dtype(2, dtype::Int32()); | |||||
| checker.set_dtype(4, dtype::Int32()); | |||||
| for (auto&& arg : args) { | |||||
| checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); | |||||
| } | |||||
| UniformIntRNG rng{-50, 50}; | |||||
| for (auto&& arg : args) { | |||||
| checker.set_dtype(0, dtype::QuantizedS8(2.5f)) | |||||
| .set_dtype(1, dtype::QuantizedS8(2.5f)) | |||||
| .set_dtype(2, dtype::QuantizedS32(6.25f)) | |||||
| .set_dtype(4, {}) | |||||
| .set_rng(0, &rng) | |||||
| .set_rng(1, &rng) | |||||
| .set_rng(2, &rng) | |||||
| .set_param(arg.param) | |||||
| .execs({arg.src, arg.filter, {}, {}, {}}); | |||||
| } | |||||
| } | |||||
| #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | ||||
| #if !__ARM_FEATURE_DOTPROD | #if !__ARM_FEATURE_DOTPROD | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S2) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S2) { | ||||
| @@ -2201,6 +2503,20 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S2) { | |||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S2_PREPROCESS) { | |||||
| using namespace conv_bias; | |||||
| std::vector<conv_bias::TestArg> args = | |||||
| get_nchw44_conv_bias_args({2, 5, 7}, 2, false, true, true); | |||||
| #define cb(name) checker_conv_bias_int8x8x32_preprocess(args, handle(), name); | |||||
| #if MEGDNN_AARCH64 | |||||
| cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); | |||||
| #else | |||||
| cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96"); | |||||
| #endif | |||||
| #undef cb | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S1) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S1) { | ||||
| using namespace conv_bias; | using namespace conv_bias; | ||||
| std::vector<conv_bias::TestArg> args = | std::vector<conv_bias::TestArg> args = | ||||
| @@ -2216,6 +2532,21 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S1) { | |||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S1_PREPROCESS) { | |||||
| using namespace conv_bias; | |||||
| std::vector<conv_bias::TestArg> args = | |||||
| get_nchw44_conv_bias_args({3, 4, 6}, 1, false, true, true); | |||||
| #define cb(name) checker_conv_bias_int8x8x32_preprocess(args, handle(), name); | |||||
| #if MEGDNN_AARCH64 | |||||
| cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); | |||||
| #else | |||||
| cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96"); | |||||
| #endif | |||||
| #undef cb | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, | TEST_F(ARM_COMMON_MULTI_THREADS, | ||||
| CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S2) { | CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S2) { | ||||
| UniformIntRNG rng{-50, 50}; | UniformIntRNG rng{-50, 50}; | ||||
| @@ -2234,6 +2565,25 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
| CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S2_PREPROCESS) { | |||||
| UniformIntRNG rng{-50, 50}; | |||||
| #define cb(name) \ | |||||
| check_conv_bias_preprocess( \ | |||||
| get_nchw44_conv_bias_args({3, 4, 6}, 2), handle(), &rng, epsilon, \ | |||||
| dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), \ | |||||
| dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), name); | |||||
| float epsilon = 0.001; | |||||
| #if MEGDNN_AARCH64 | |||||
| cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); | |||||
| #else | |||||
| cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96"); | |||||
| #endif | |||||
| #undef cb | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, | TEST_F(ARM_COMMON_MULTI_THREADS, | ||||
| CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S1) { | CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S1) { | ||||
| UniformIntRNG rng{-50, 50}; | UniformIntRNG rng{-50, 50}; | ||||
| @@ -2252,6 +2602,24 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
| CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S1_PREPROCESS) { | |||||
| UniformIntRNG rng{-50, 50}; | |||||
| #define cb(name) \ | |||||
| check_conv_bias_preprocess( \ | |||||
| get_nchw44_conv_bias_args({2, 5, 7}, 1), handle(), &rng, epsilon, \ | |||||
| dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), \ | |||||
| dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), name); | |||||
| float epsilon = 0.001; | |||||
| #if MEGDNN_AARCH64 | |||||
| cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); | |||||
| #else | |||||
| cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96"); | |||||
| #endif | |||||
| #undef cb | |||||
| } | |||||
| #if MEGDNN_AARCH64 | #if MEGDNN_AARCH64 | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, | TEST_F(ARM_COMMON_MULTI_THREADS, | ||||
| CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_FUSE) { | CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_FUSE) { | ||||
| @@ -2266,6 +2634,21 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
| cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); | cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); | ||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
| CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_FUSE_PREPROCESS) { | |||||
| UniformIntRNG rng{-50, 50}; | |||||
| #define cb(name) \ | |||||
| check_conv_bias_preprocess( \ | |||||
| get_nchw44_conv_bias_args({3}, 1), handle(), &rng, epsilon, \ | |||||
| dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), \ | |||||
| dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), name); | |||||
| float epsilon = 0.001; | |||||
| cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); | |||||
| #undef cb | |||||
| } | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -2287,6 +2670,23 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
| cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); | cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); | ||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||||
| CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44DOT_FUSE_PREPROCESS) { | |||||
| UniformIntRNG rng{-50, 50}; | |||||
| #define cb(name) \ | |||||
| check_conv_bias_preprocess( \ | |||||
| get_nchw44_conv_bias_args({3}, 1, false, false, false, false, \ | |||||
| true, false, false, false), \ | |||||
| handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ | |||||
| dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ | |||||
| dtype::QuantizedS8(60.25f), name); | |||||
| float epsilon = 0.001; | |||||
| cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); | |||||
| #undef cb | |||||
| } | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -2320,6 +2720,36 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) { | |||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32_FILTER_PREPROCESS) { | |||||
| using namespace conv_bias; | |||||
| std::vector<conv_bias::TestArg> args = | |||||
| get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true); | |||||
| std::vector<conv_bias::TestArg> args1 = | |||||
| get_conv_bias_args({1}, 2, false, true, true); | |||||
| args.insert(args.begin(), args1.begin(), args1.end()); | |||||
| #define cb(name) checker_conv_bias_int8x8x32_preprocess(args, handle(), name); | |||||
| #if MEGDNN_AARCH64 | |||||
| #if __ARM_FEATURE_DOTPROD | |||||
| cb("IM2COLMATMUL:AARCH64_INT8X8X32_K8X12X4_DOTPROD"); | |||||
| #else | |||||
| cb("IM2COLMATMUL:AARCH64_INT8X8X32_K8X8X8"); | |||||
| cb("IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16"); | |||||
| #endif | |||||
| #elif MEGDNN_ARMV7 | |||||
| #if __ARM_FEATURE_DOTPROD | |||||
| cb("IM2COLMATMUL:AARCH32_INT8_K6X8X4"); | |||||
| #endif | |||||
| cb("IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8"); | |||||
| #endif | |||||
| #if MEGDNN_ARMV7 | |||||
| cb("IM2COLMATMUL:ARMV7_INT8X8X32_K4X2X16"); | |||||
| #endif | |||||
| #undef cb | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32) { | ||||
| using namespace conv_bias; | using namespace conv_bias; | ||||
| std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args( | std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args( | ||||
| @@ -2331,25 +2761,62 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32) { | |||||
| #endif | #endif | ||||
| } | } | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32_PREPROCESS) { | |||||
| using namespace conv_bias; | |||||
| std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args( | |||||
| {2, 4, 7}, 1, false, false, false, false, false, true,true); | |||||
| #define cb(name) \ | |||||
| check_conv_bias_preprocess(args, handle(), nullptr, 0.001, \ | |||||
| dtype::Float32(), dtype::Float32(), \ | |||||
| dtype::Float32(), dtype::Float32(), name); | |||||
| #if MEGDNN_AARCH64 | |||||
| cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); | |||||
| #elif MEGDNN_ARMV7 | |||||
| cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); | |||||
| #endif | |||||
| #undef cb | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32) { | ||||
| using namespace conv_bias; | using namespace conv_bias; | ||||
| std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args( | std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args( | ||||
| {3, 5, 6}, 2, false, false, false, false, false, true, true); | {3, 5, 6}, 2, false, false, false, false, false, true, true); | ||||
| #define cb(name) check_conv_bias(args, handle(), name); | |||||
| #if MEGDNN_AARCH64 | #if MEGDNN_AARCH64 | ||||
| check_conv_bias(args, handle(), "IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); | |||||
| cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); | |||||
| #elif MEGDNN_ARMV7 | #elif MEGDNN_ARMV7 | ||||
| check_conv_bias(args, handle(), "IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); | |||||
| cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); | |||||
| #endif | #endif | ||||
| #undef cb | |||||
| } | } | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32_FUSE_PREPROCESS) { | |||||
| using namespace conv_bias; | |||||
| std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args( | |||||
| {3}, 2, false, false, false, false, false, true, true, false); | |||||
| #define cb(name) \ | |||||
| check_conv_bias_preprocess(args, handle(), nullptr, 0.001, \ | |||||
| dtype::Float32(), dtype::Float32(), \ | |||||
| dtype::Float32(), dtype::Float32(), name); | |||||
| #if MEGDNN_AARCH64 | |||||
| cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); | |||||
| #elif MEGDNN_ARMV7 | |||||
| cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); | |||||
| #endif | |||||
| #undef cb | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32_FUSE) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32_FUSE) { | ||||
| using namespace conv_bias; | using namespace conv_bias; | ||||
| std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args( | std::vector<conv_bias::TestArg> args = get_nchw44_conv_bias_args( | ||||
| {3}, 2, false, false, false, false, false, true, true, false); | {3}, 2, false, false, false, false, false, true, true, false); | ||||
| #define cb(name) check_conv_bias(args, handle(), name); | |||||
| #if MEGDNN_AARCH64 | #if MEGDNN_AARCH64 | ||||
| check_conv_bias(args, handle(), "IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); | |||||
| cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); | |||||
| #elif MEGDNN_ARMV7 | #elif MEGDNN_ARMV7 | ||||
| check_conv_bias(args, handle(), "IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); | |||||
| cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); | |||||
| #endif | #endif | ||||
| #undef cb | |||||
| } | } | ||||
| /***************************** Conv1x1 Algo Test ***********************/ | /***************************** Conv1x1 Algo Test ***********************/ | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_F32) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_F32) { | ||||
| @@ -1118,6 +1118,30 @@ void checker_conv_bias_int8x8x16(std::vector<conv_bias::TestArg> args, | |||||
| } | } | ||||
| } | } | ||||
| void check_conv_bias_preprocess(std::vector<conv_bias::TestArg> args, | |||||
| Handle* handle, RNG* rng, float epsilon, | |||||
| DType type0, DType type1, DType type2, | |||||
| DType type3, const char* algo_name) { | |||||
| using namespace conv_bias; | |||||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||||
| handle); | |||||
| checker.set_dtype(0, type0); | |||||
| checker.set_dtype(1, type1); | |||||
| checker.set_dtype(2, type2); | |||||
| checker.set_dtype(4, type3); | |||||
| checker.set_epsilon(epsilon); | |||||
| if (NULL != rng) { | |||||
| checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng); | |||||
| } | |||||
| checker.set_before_exec_callback( | |||||
| conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); | |||||
| for (auto&& arg : args) { | |||||
| checker.set_param(arg.param).execs( | |||||
| {arg.src, arg.filter, arg.bias, {}, {}}); | |||||
| } | |||||
| } | |||||
| void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, | void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, | ||||
| param::ConvBias param, Handle* handle, | param::ConvBias param, Handle* handle, | ||||
| @@ -58,7 +58,10 @@ std::vector<TestArg> get_int8_chwn4_tensorcore_args(size_t kernel_size); | |||||
| std::vector<TestArg> get_int8_nchw44_args(size_t kernel_size, size_t pack_size, | std::vector<TestArg> get_int8_nchw44_args(size_t kernel_size, size_t pack_size, | ||||
| bool compute_float32 = false, | bool compute_float32 = false, | ||||
| bool group_mode = false); | bool group_mode = false); | ||||
| void check_conv_bias_preprocess(std::vector<conv_bias::TestArg> args, | |||||
| Handle* handle, RNG* rng, float epsilon, | |||||
| DType type0, DType type1, DType type2, | |||||
| DType type3, const char* algo_name); | |||||
| template <typename Opr> | template <typename Opr> | ||||
| using ConvBiasAlgoChecker = AlgoChecker<Opr>; | using ConvBiasAlgoChecker = AlgoChecker<Opr>; | ||||
| @@ -752,7 +752,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE2) { | |||||
| } | } | ||||
| } | } | ||||
| TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X) { | |||||
| TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32) { | |||||
| using namespace conv_bias; | using namespace conv_bias; | ||||
| std::vector<TestArg> args; | std::vector<TestArg> args; | ||||
| @@ -842,6 +842,98 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X) { | |||||
| #undef cb2 | #undef cb2 | ||||
| } | } | ||||
| TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32_FILTER_PREPROCESS) { | |||||
| using namespace conv_bias; | |||||
| std::vector<TestArg> args; | |||||
| auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, | |||||
| size_t p, NonlineMode nonline_mode) { | |||||
| if (w + 2 * p < kernel || h + 2 * p < kernel) | |||||
| return; | |||||
| param::ConvBias param; | |||||
| param.stride_h = 1; | |||||
| param.stride_w = 1; | |||||
| param.pad_h = p; | |||||
| param.pad_w = p; | |||||
| param.nonlineMode = nonline_mode; | |||||
| //! no bias | |||||
| args.emplace_back(param, TensorShape{1, ic, h, w}, | |||||
| TensorShape{oc, ic, kernel, kernel}, TensorShape{}); | |||||
| }; | |||||
| for (size_t kernel : {2, 3, 4, 5, 6, 7}) | |||||
| for (size_t ic : {1, 4, 8, 16}) | |||||
| for (size_t oc : {1, 4, 8}) | |||||
| for (size_t p : {0, 2}) | |||||
| for (size_t size : {20, 21, 24}) | |||||
| for (NonlineMode nonline_mode : | |||||
| {NonlineMode::IDENTITY}) { | |||||
| run(oc, ic, size, size, kernel, p, nonline_mode); | |||||
| } | |||||
| //! test OC block | |||||
| run(2046, 1, 8, 8, 2, 0, NonlineMode::IDENTITY); | |||||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||||
| handle()); | |||||
| UniformIntRNG rng{-50, 50}; | |||||
| #define cb(algo_name) \ | |||||
| checker.set_before_exec_callback( \ | |||||
| conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \ | |||||
| checker.set_dtype(0, dtype::Int8()); \ | |||||
| checker.set_dtype(1, dtype::Int8()); \ | |||||
| checker.set_dtype(2, dtype::Int32()); \ | |||||
| checker.set_dtype(4, dtype::Int32()); \ | |||||
| for (auto&& arg : args) { \ | |||||
| checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \ | |||||
| } \ | |||||
| for (auto&& arg : args) { \ | |||||
| checker.set_dtype(0, dtype::QuantizedS8(2.5f)) \ | |||||
| .set_dtype(1, dtype::QuantizedS8(2.5f)) \ | |||||
| .set_dtype(2, dtype::QuantizedS32(6.25f)) \ | |||||
| .set_dtype(4, {}) \ | |||||
| .set_rng(0, &rng) \ | |||||
| .set_rng(1, &rng) \ | |||||
| .set_rng(2, &rng) \ | |||||
| .set_param(arg.param) \ | |||||
| .execs({arg.src, arg.filter, {}, {}, {}}); \ | |||||
| } | |||||
| #define cb2(algo_name) \ | |||||
| checker.set_before_exec_callback( \ | |||||
| conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \ | |||||
| checker.set_dtype(0, dtype::Int8()); \ | |||||
| checker.set_dtype(1, dtype::Int8()); \ | |||||
| checker.set_dtype(2, dtype::Int16()); \ | |||||
| checker.set_dtype(4, dtype::Int16()); \ | |||||
| for (auto&& arg : args) { \ | |||||
| checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \ | |||||
| } | |||||
| #if MEGDNN_X86_WITH_MKL_DNN | |||||
| if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) { | |||||
| cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN"); | |||||
| } | |||||
| #endif | |||||
| #if MEGDNN_X86_WITH_VNNI | |||||
| if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) { | |||||
| cb("IM2COLMATMUL:X86_INT8X8X32_VNNI"); | |||||
| } | |||||
| #endif | |||||
| if (megdnn::x86::is_supported(x86::SIMDType::AVX2)) { | |||||
| cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16"); | |||||
| cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2"); | |||||
| cb2("IM2COLMATMUL:X86_INT8X8X16_AVX2"); | |||||
| } | |||||
| if (::megdnn::x86::is_supported(::megdnn::x86::SIMDType::SSE4_2)) { | |||||
| cb("IM2COLMATMUL:X86_INT8X8X32_SSE_4X8X2"); | |||||
| cb2("IM2COLMATMUL:X86_INT8X8X16_SSE"); | |||||
| } | |||||
| #undef cb | |||||
| #undef cb2 | |||||
| } | |||||
| TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32) { | TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32) { | ||||
| using namespace conv_bias; | using namespace conv_bias; | ||||
| std::vector<TestArg> args; | std::vector<TestArg> args; | ||||
| @@ -950,6 +1042,61 @@ TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32) { | |||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32_NOPACK_PREPROCESS) { | |||||
| using namespace conv_bias; | |||||
| std::vector<TestArg> args; | |||||
| auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, | |||||
| size_t p, NonlineMode nonline_mode) { | |||||
| if (w + 2 * p < kernel || h + 2 * p < kernel) | |||||
| return; | |||||
| param::ConvBias param; | |||||
| param.stride_h = 1; | |||||
| param.stride_w = 1; | |||||
| param.pad_h = p; | |||||
| param.pad_w = p; | |||||
| param.nonlineMode = nonline_mode; | |||||
| //! no bias | |||||
| args.emplace_back(param, TensorShape{1, ic, h, w}, | |||||
| TensorShape{oc, ic, kernel, kernel}, TensorShape{}); | |||||
| args.emplace_back(param, TensorShape{1, ic, h, w}, | |||||
| TensorShape{oc, ic, kernel, kernel}, | |||||
| TensorShape{1, oc, 1, 1}); | |||||
| args.emplace_back( | |||||
| param, TensorShape{1, ic, h, w}, | |||||
| TensorShape{oc, ic, kernel, kernel}, | |||||
| TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1, | |||||
| (w + 2 * p - kernel) / param.stride_w + 1}); | |||||
| }; | |||||
| for (size_t kernel : {2, 3, 4, 5, 6, 7}) | |||||
| for (size_t ic : {1, 4, 8, 16}) | |||||
| for (size_t oc : {1, 4, 8, 16, 300}) | |||||
| for (size_t p : {0, 2}) | |||||
| for (size_t size : {8, 24}) | |||||
| for (NonlineMode nonline_mode : | |||||
| {NonlineMode::IDENTITY, NonlineMode::RELU}) { | |||||
| run(oc, ic, size, size, kernel, p, nonline_mode); | |||||
| } | |||||
| run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY); | |||||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||||
| handle()); | |||||
| #define cb(algo_name) \ | |||||
| checker.set_before_exec_callback( \ | |||||
| conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \ | |||||
| for (auto&& arg : args) { \ | |||||
| checker.set_param(arg.param).execs( \ | |||||
| {arg.src, arg.filter, arg.bias, {}, {}}); \ | |||||
| } | |||||
| cb("IM2COLMATMUL:X86_F32_BLAS"); | |||||
| #undef cb | |||||
| } | |||||
| #endif | #endif | ||||
| @@ -1020,6 +1167,73 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) { | |||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA_FILTER_PREPROCESS) { | |||||
| using namespace conv_bias; | |||||
| std::vector<TestArg> args; | |||||
| auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, | |||||
| size_t p, NonlineMode nonline_mode) { | |||||
| if (w + 2 * p < kernel || h + 2 * p < kernel) | |||||
| return; | |||||
| param::ConvBias param; | |||||
| param.stride_h = 1; | |||||
| param.stride_w = 1; | |||||
| param.pad_h = p; | |||||
| param.pad_w = p; | |||||
| param.nonlineMode = nonline_mode; | |||||
| //! no bias | |||||
| args.emplace_back(param, TensorShape{1, ic, h, w}, | |||||
| TensorShape{oc, ic, kernel, kernel}, TensorShape{}); | |||||
| args.emplace_back(param, TensorShape{1, ic, h, w}, | |||||
| TensorShape{oc, ic, kernel, kernel}, | |||||
| TensorShape{1, oc, 1, 1}); | |||||
| args.emplace_back( | |||||
| param, TensorShape{1, ic, h, w}, | |||||
| TensorShape{oc, ic, kernel, kernel}, | |||||
| TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1, | |||||
| (w + 2 * p - kernel) / param.stride_w + 1}); | |||||
| param.sparse = param::ConvBias::Sparse::GROUP; | |||||
| args.emplace_back(param, TensorShape{1, 2 * ic, h, w}, | |||||
| TensorShape{2, oc, ic, kernel, kernel}, | |||||
| TensorShape{}); | |||||
| args.emplace_back(param, TensorShape{1, 2 * ic, h, w}, | |||||
| TensorShape{2, oc, ic, kernel, kernel}, | |||||
| TensorShape{1, oc * 2, 1, 1}); | |||||
| args.emplace_back( | |||||
| param, TensorShape{1, 2 * ic, h, w}, | |||||
| TensorShape{2, oc, ic, kernel, kernel}, | |||||
| TensorShape{1, 2 * oc, (h + 2 * param.pad_h - kernel) / 1 + 1, | |||||
| (w + 2 * param.pad_w - kernel) / 1 + 1}); | |||||
| }; | |||||
| for (size_t kernel : {2, 3, 4, 5, 6, 7}) | |||||
| for (size_t ic : {1, 4, 8, 16}) | |||||
| for (size_t oc : {1, 4, 8, 16}) | |||||
| for (size_t p : {0, 1}) | |||||
| for (size_t size : {8, 24}) | |||||
| for (NonlineMode nonline_mode : | |||||
| {NonlineMode::IDENTITY, NonlineMode::RELU}) { | |||||
| run(oc, ic, size, size, kernel, p, nonline_mode); | |||||
| } | |||||
| run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY); | |||||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||||
| handle()); | |||||
| #define cb(algo_name) \ | |||||
| checker.set_before_exec_callback( \ | |||||
| conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \ | |||||
| for (auto&& arg : args) { \ | |||||
| checker.set_param(arg.param).execs( \ | |||||
| {arg.src, arg.filter, arg.bias, {}, {}}); \ | |||||
| } | |||||
| cb("IM2COLMATMUL:X86_F32_MKL_PACKA:192"); | |||||
| #undef cb | |||||
| } | |||||
| /**************************** Conv1x1 PackA *************************/ | /**************************** Conv1x1 PackA *************************/ | ||||
| namespace { | namespace { | ||||
| void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle, | void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle, | ||||
| @@ -1169,6 +1383,77 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) { | |||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8_FILTER_PREPROCESS) { | |||||
| using namespace conv_bias; | |||||
| std::vector<TestArg> args; | |||||
| auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, | |||||
| size_t p, NonlineMode nonline_mode) { | |||||
| if (w + 2 * p < kernel || h + 2 * p < kernel) | |||||
| return; | |||||
| param::ConvBias param; | |||||
| param.stride_h = 1; | |||||
| param.stride_w = 1; | |||||
| param.pad_h = p; | |||||
| param.pad_w = p; | |||||
| param.nonlineMode = nonline_mode; | |||||
| //! no bias | |||||
| args.emplace_back(param, TensorShape{1, ic, h, w}, | |||||
| TensorShape{oc, ic, kernel, kernel}, TensorShape{}); | |||||
| //! bias channel | |||||
| args.emplace_back(param, TensorShape{2, ic, h, w}, | |||||
| TensorShape{oc, ic, kernel, kernel}, | |||||
| TensorShape{1, oc, 1, 1}); | |||||
| }; | |||||
| for (size_t kernel : {2, 3, 4, 5, 6, 7}) | |||||
| for (size_t ic : {1, 4, 8, 16}) | |||||
| for (size_t oc : {1, 4, 8}) | |||||
| for (size_t p : {0, 2}) | |||||
| for (size_t size : {20, 21, 24}) | |||||
| for (NonlineMode nonline_mode : | |||||
| {NonlineMode::IDENTITY, NonlineMode::RELU, | |||||
| NonlineMode::H_SWISH}) { | |||||
| run(oc, ic, size, size, kernel, p, nonline_mode); | |||||
| } | |||||
| run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY); | |||||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||||
| handle()); | |||||
| #define cb(algo_name) \ | |||||
| checker.set_before_exec_callback( \ | |||||
| conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \ | |||||
| UniformIntRNG rng{-50, 50}; \ | |||||
| for (auto&& arg : args) { \ | |||||
| checker.set_dtype(0, dtype::QuantizedS8(2.5f)) \ | |||||
| .set_dtype(1, dtype::QuantizedS8(2.5f)) \ | |||||
| .set_dtype(2, dtype::QuantizedS32(6.25f)) \ | |||||
| .set_dtype(4, dtype::QuantizedS8(60.25)) \ | |||||
| .set_rng(0, &rng) \ | |||||
| .set_rng(1, &rng) \ | |||||
| .set_rng(2, &rng) \ | |||||
| .set_param(arg.param) \ | |||||
| .execs({arg.src, arg.filter, {}, {}, {}}); \ | |||||
| } | |||||
| #if MEGDNN_X86_WITH_MKL_DNN | |||||
| if (x86::is_supported(x86::SIMDType::VNNI)) { | |||||
| cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN"); | |||||
| } | |||||
| #endif | |||||
| #if MEGDNN_X86_WITH_VNNI | |||||
| if (x86::is_supported(x86::SIMDType::VNNI)) { | |||||
| cb("IM2COLMATMUL:X86_INT8X8X32_VNNI"); | |||||
| } | |||||
| #endif | |||||
| if (x86::is_supported(x86::SIMDType::AVX2)) { | |||||
| cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16"); | |||||
| } | |||||
| #undef cb | |||||
| } | |||||
| TEST_F(X86, CONV_BIAS_MATMUL) { | TEST_F(X86, CONV_BIAS_MATMUL) { | ||||
| using namespace conv_bias; | using namespace conv_bias; | ||||
| std::vector<TestArg> args; | std::vector<TestArg> args; | ||||