GitOrigin-RevId: 90aa75d51e
tags/v1.0.0-rc1
| @@ -612,7 +612,7 @@ bool PoolingImpl::AlgoFilter3ModexStridexNCHW44::usable( | |||||
| (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | ||||
| FH == 3 && FW == 3 && SW == SH && (SH == 1 || SW == 2); | FH == 3 && FW == 3 && SW == SH && (SH == 1 || SW == 2); | ||||
| //! Int8 does not support average, because its round mode is different from | //! Int8 does not support average, because its round mode is different from | ||||
| //! quint8 | |||||
| //! qint8 | |||||
| avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | ||||
| param.mode == Mode::AVERAGE); | param.mode == Mode::AVERAGE); | ||||
| return avaible; | return avaible; | ||||
| @@ -705,7 +705,7 @@ bool PoolingImpl::AlgoFilter2ModexStridexNCHW44::usable( | |||||
| (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | ||||
| FH == 2 && FW == 2 && SH == SW && (SW == 1 || SW == 2); | FH == 2 && FW == 2 && SH == SW && (SW == 1 || SW == 2); | ||||
| //! Int8 does not support average, because its round mode is different from | //! Int8 does not support average, because its round mode is different from | ||||
| //! quint8 | |||||
| //! qint8 | |||||
| avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | ||||
| param.mode == Mode::AVERAGE); | param.mode == Mode::AVERAGE); | ||||
| return avaible; | return avaible; | ||||
| @@ -799,7 +799,7 @@ bool PoolingImpl::AlgoFilter4ModexStridexNCHW44::usable( | |||||
| FH == 4 && FW == 4 && SH == SW && (SW == 1 || SW == 2); | FH == 4 && FW == 4 && SH == SW && (SW == 1 || SW == 2); | ||||
| //! Int8 does not support average, because its round mode is different from | //! Int8 does not support average, because its round mode is different from | ||||
| //! quint8 | |||||
| //! qint8 | |||||
| avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | ||||
| param.mode == Mode::AVERAGE); | param.mode == Mode::AVERAGE); | ||||
| return avaible; | return avaible; | ||||
| @@ -892,7 +892,7 @@ bool PoolingImpl::AlgoFilter5ModexStridexNCHW44::usable( | |||||
| (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | ||||
| FH == 5 && FW == 5 && SH == SW && (SW == 1 || SW == 2); | FH == 5 && FW == 5 && SH == SW && (SW == 1 || SW == 2); | ||||
| //! Int8 does not support average, because its round mode is different from | //! Int8 does not support average, because its round mode is different from | ||||
| //! quint8 | |||||
| //! qint8 | |||||
| avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | ||||
| param.mode == Mode::AVERAGE); | param.mode == Mode::AVERAGE); | ||||
| return avaible; | return avaible; | ||||
| @@ -47,7 +47,7 @@ size_t ConvBiasImpl::AlgoConv1x1::get_oc_tile_size_heuristic( | |||||
| return round_up<size_t>(oc_block_size_one_thread, 24); | return round_up<size_t>(oc_block_size_one_thread, 24); | ||||
| } | } | ||||
| size_t ConvBiasImpl::AlgoConv1x1::get_workspace( | |||||
| WorkspaceBundle ConvBiasImpl::AlgoConv1x1::get_bundle_according_packmode( | |||||
| const NCBKernSizeParam& param) const { | const NCBKernSizeParam& param) const { | ||||
| size_t OH = param.osz[0]; | size_t OH = param.osz[0]; | ||||
| size_t OW = param.osz[1]; | size_t OW = param.osz[1]; | ||||
| @@ -58,168 +58,195 @@ size_t ConvBiasImpl::AlgoConv1x1::get_workspace( | |||||
| auto pack_mode = m_matmul_algo->packmode(); | auto pack_mode = m_matmul_algo->packmode(); | ||||
| if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | ||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 0) { | |||||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher; | |||||
| return dispatcher | |||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
| midout_iv("get_bundle_default"_hash)) { | |||||
| return Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT>() | |||||
| .get_bundle(param, matmul_param, m_matmul_algo, | .get_bundle(param, matmul_param, m_matmul_algo, | ||||
| compt_oc_block_size) | |||||
| .total_size_in_bytes(); | |||||
| compt_oc_block_size); | |||||
| } | } | ||||
| MIDOUT_END(); | MIDOUT_END(); | ||||
| } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | ||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 1) { | |||||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> | |||||
| dispatcher; | |||||
| return dispatcher | |||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
| midout_iv("get_bundle_only_packa"_hash)) { | |||||
| return Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>() | |||||
| .get_bundle(param, matmul_param, m_matmul_algo, | .get_bundle(param, matmul_param, m_matmul_algo, | ||||
| compt_oc_block_size) | |||||
| .total_size_in_bytes(); | |||||
| compt_oc_block_size); | |||||
| } | } | ||||
| MIDOUT_END(); | MIDOUT_END(); | ||||
| } else { | } else { | ||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 2) { | |||||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> dispatcher; | |||||
| return dispatcher | |||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
| midout_iv("get_bundle_no_pack"_hash)) { | |||||
| return Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK>() | |||||
| .get_bundle(param, matmul_param, m_matmul_algo, | .get_bundle(param, matmul_param, m_matmul_algo, | ||||
| compt_oc_block_size) | |||||
| .total_size_in_bytes(); | |||||
| compt_oc_block_size); | |||||
| } | } | ||||
| MIDOUT_END(); | MIDOUT_END(); | ||||
| } | } | ||||
| return 0; | |||||
| return {nullptr, {}}; | |||||
| } | } | ||||
| SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns( | |||||
| size_t ConvBiasImpl::AlgoConv1x1::get_workspace( | |||||
| const NCBKernSizeParam& param) const { | const NCBKernSizeParam& param) const { | ||||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||||
| return get_bundle_according_packmode(param).total_size_in_bytes(); | |||||
| } | |||||
| SmallVector<ConvBiasImpl::NCBKern> | |||||
| ConvBiasImpl::AlgoConv1x1::get_kerns_according_packmode( | |||||
| const NCBKernSizeParam& param, bool weight_preprocess) const { | |||||
| size_t OH = param.osz[0]; | size_t OH = param.osz[0]; | ||||
| size_t OW = param.osz[1]; | size_t OW = param.osz[1]; | ||||
| size_t OC = param.filter_meta.ocpg; | |||||
| size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); | size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); | ||||
| size_t GROUP = param.filter_meta.group; | |||||
| size_t BATCH = param.n; | |||||
| size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size); | |||||
| auto pack_mode = m_matmul_algo->packmode(); | |||||
| Conv1x1StrategyBase* conv1x1_strategy = | |||||
| Conv1x1Factory::make_conv1x1_strategy(param, pack_mode, | |||||
| param.filter_meta.format); | |||||
| auto matmul_param = | auto matmul_param = | ||||
| utils::get_matmul_kern_param(param, OH * OW, compt_oc_block_size); | utils::get_matmul_kern_param(param, OH * OW, compt_oc_block_size); | ||||
| WorkspaceBundle whole_bundle = {nullptr, {}}; | |||||
| WorkspaceBundle thread_bundle = {nullptr, {}}; | |||||
| WorkspaceBundle matmul_bundle = {nullptr, {}}; | |||||
| auto pack_mode = m_matmul_algo->packmode(); | |||||
| WorkspaceBundle whole_bundle = get_bundle_according_packmode(param); | |||||
| //! NO_PACK does not implement get_bundle | |||||
| WorkspaceBundle matmul_bundle ={nullptr,{}}; | |||||
| if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) { | |||||
| matmul_bundle = {nullptr, | |||||
| {0, 0, m_matmul_algo->get_workspace(matmul_param)}}; | |||||
| } else { | |||||
| matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||||
| } | |||||
| WorkspaceBundle thread_bundle = utils::get_thread_bundle( | |||||
| param, matmul_bundle.get_size(2), compt_oc_block_size); | |||||
| if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | ||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 0) { | |||||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher; | |||||
| whole_bundle = dispatcher.get_bundle( | |||||
| param, matmul_param, m_matmul_algo, compt_oc_block_size); | |||||
| matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
| midout_iv("get_kern_default"_hash)) { | |||||
| if (!weight_preprocess) { | |||||
| return Conv1x1Kerns< | |||||
| MatrixMulImpl::AlgoBase::PackMode::DEFAULT>() | |||||
| .get_kern(param, whole_bundle, matmul_bundle, | |||||
| thread_bundle, conv1x1_strategy, | |||||
| m_matmul_algo, compt_oc_block_size); | |||||
| } else { | |||||
| return Conv1x1Kerns< | |||||
| MatrixMulImpl::AlgoBase::PackMode::DEFAULT>() | |||||
| .get_kern_preprocess(param, whole_bundle, matmul_bundle, | |||||
| conv1x1_strategy, m_matmul_algo, | |||||
| compt_oc_block_size); | |||||
| } | |||||
| } | } | ||||
| MIDOUT_END(); | MIDOUT_END(); | ||||
| } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | ||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 1) { | |||||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> | |||||
| dispatcher; | |||||
| whole_bundle = dispatcher.get_bundle( | |||||
| param, matmul_param, m_matmul_algo, compt_oc_block_size); | |||||
| matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
| midout_iv("get_kern_only_packa"_hash)) { | |||||
| if (!weight_preprocess) { | |||||
| return Conv1x1Kerns< | |||||
| MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>() | |||||
| .get_kern(param, whole_bundle, matmul_bundle, | |||||
| thread_bundle, conv1x1_strategy, | |||||
| m_matmul_algo, compt_oc_block_size); | |||||
| } else { | |||||
| return Conv1x1Kerns< | |||||
| MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>() | |||||
| .get_kern_preprocess(param, whole_bundle, matmul_bundle, | |||||
| conv1x1_strategy, m_matmul_algo, | |||||
| compt_oc_block_size); | |||||
| } | |||||
| } | } | ||||
| MIDOUT_END(); | MIDOUT_END(); | ||||
| } else { | } else { | ||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 2) { | |||||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> dispatcher; | |||||
| whole_bundle = dispatcher.get_bundle( | |||||
| param, matmul_param, m_matmul_algo, compt_oc_block_size); | |||||
| matmul_bundle = { | |||||
| nullptr, | |||||
| {0, 0, m_matmul_algo->get_workspace(matmul_param)}}; | |||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
| midout_iv("get_kern_no_pack"_hash)) { | |||||
| if (!weight_preprocess) { | |||||
| return Conv1x1Kerns< | |||||
| MatrixMulImpl::AlgoBase::PackMode::NO_PACK>() | |||||
| .get_kern(param, whole_bundle, matmul_bundle, | |||||
| thread_bundle, conv1x1_strategy, | |||||
| m_matmul_algo, compt_oc_block_size); | |||||
| } else { | |||||
| return Conv1x1Kerns< | |||||
| MatrixMulImpl::AlgoBase::PackMode::NO_PACK>() | |||||
| .get_kern_preprocess(param, whole_bundle, matmul_bundle, | |||||
| conv1x1_strategy, m_matmul_algo, | |||||
| compt_oc_block_size); | |||||
| } | |||||
| } | } | ||||
| MIDOUT_END(); | MIDOUT_END(); | ||||
| } | } | ||||
| } | |||||
| //! get thread bundle | |||||
| thread_bundle = utils::get_thread_bundle(param, matmul_bundle.get_size(2), | |||||
| compt_oc_block_size); | |||||
| Conv1x1StrategyBase* conv1x1_strategy = | |||||
| Conv1x1Factory::make_conv1x1_strategy(param, pack_mode, | |||||
| param.filter_meta.format); | |||||
| SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns( | |||||
| const NCBKernSizeParam& param) const { | |||||
| return get_kerns_according_packmode(param, false); | |||||
| } | |||||
| auto kern_packA = [this, whole_bundle, matmul_bundle, param, | |||||
| compt_oc_block_size, conv1x1_strategy]( | |||||
| const NCBKernParam& ncb_param, | |||||
| const NCBKernIndex& ncb_index) mutable { | |||||
| conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||||
| compt_oc_block_size, this->m_matmul_algo, param, | |||||
| ncb_param, std::move(ncb_index)); | |||||
| }; | |||||
| auto kern_packB = [this, whole_bundle, matmul_bundle, param, | |||||
| conv1x1_strategy]( | |||||
| const NCBKernParam& ncb_param, | |||||
| const NCBKernIndex& ncb_index) mutable { | |||||
| conv1x1_strategy->packB(whole_bundle, matmul_bundle, | |||||
| this->m_matmul_algo, param, ncb_param, | |||||
| std::move(ncb_index)); | |||||
| }; | |||||
| auto kern_compt = [this, whole_bundle, matmul_bundle, thread_bundle, param, | |||||
| compt_oc_block_size, conv1x1_strategy]( | |||||
| const NCBKernParam& ncb_param, | |||||
| const NCBKernIndex& ncb_index) mutable { | |||||
| conv1x1_strategy->exec(whole_bundle, matmul_bundle, thread_bundle, | |||||
| compt_oc_block_size, this->m_matmul_algo, param, | |||||
| ncb_param, std::move(ncb_index)); | |||||
| }; | |||||
| SmallVector<TensorLayout> | |||||
| ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout( | |||||
| const NCBKernSizeParam& param) const { | |||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
| midout_iv("deduce_preprocessed_filter_layout"_hash)) { | |||||
| WorkspaceBundle wb = get_bundle_according_packmode(param); | |||||
| if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT || | |||||
| pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | |||||
| //! if enable filter preprocess kern_packA should not dispatch | |||||
| if (!is_enable_filter_preprocess(param)) { | |||||
| ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||||
| } | |||||
| if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | |||||
| ret_kern.push_back({kern_packB, {1}}); | |||||
| } | |||||
| size_t GROUP = param.filter_meta.group; | |||||
| SmallVector<TensorLayout> preprocessed_layouts; | |||||
| preprocessed_layouts.push_back( | |||||
| {{GROUP, wb.get_size(0)}, dtype::Int8()}); | |||||
| return preprocessed_layouts; | |||||
| } | } | ||||
| ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); | |||||
| return ret_kern; | |||||
| MIDOUT_END(); | |||||
| return {}; | |||||
| } | |||||
| SmallVector<ConvBiasImpl::NCBKern> | |||||
| ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns( | |||||
| const NCBKernSizeParam& param) const { | |||||
| return get_kerns_according_packmode(param, true); | |||||
| } | } | ||||
| bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param, | bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param, | ||||
| AlgoSelectionStrategy) const { | AlgoSelectionStrategy) const { | ||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 2) { | MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 2) { | ||||
| if (param.filter_meta.format != param::ConvBias::Format::NCHW && | |||||
| param.filter_meta.format != param::ConvBias::Format::NCHW44 && | |||||
| param.filter_meta.format != param::ConvBias::Format::NCHW44_DOT) | |||||
| return false; | |||||
| size_t FH = param.filter_meta.spatial[0], | size_t FH = param.filter_meta.spatial[0], | ||||
| FW = param.filter_meta.spatial[1]; | FW = param.filter_meta.spatial[1]; | ||||
| size_t PH = param.filter_meta.padding[0], | size_t PH = param.filter_meta.padding[0], | ||||
| PW = param.filter_meta.padding[1]; | PW = param.filter_meta.padding[1]; | ||||
| size_t SH = param.filter_meta.stride[0], | size_t SH = param.filter_meta.stride[0], | ||||
| SW = param.filter_meta.stride[1]; | SW = param.filter_meta.stride[1]; | ||||
| if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) | |||||
| auto format = param.filter_meta.format; | |||||
| size_t OH = param.osz[0]; | |||||
| size_t OW = param.osz[1]; | |||||
| #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||||
| if (format != param::ConvBias::Format::NCHW && | |||||
| format != param::ConvBias::Format::NCHW44 && | |||||
| format != param::ConvBias::Format::NCHW44_DOT) { | |||||
| return false; | return false; | ||||
| if (param.src_type.enumv() != param.filter_type.enumv()) { | |||||
| } | |||||
| //! hybrid mode is not supported | |||||
| if (param.filter_meta.format == param::ConvBias::Format::NCHW44 || | |||||
| param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT) { | |||||
| if (param.filter_meta.icpg < 4_z || param.filter_meta.icpg == 1 || | |||||
| param.filter_meta.ocpg == 1) { | |||||
| return false; | |||||
| } | |||||
| } | |||||
| #else | |||||
| if (format != param::ConvBias::Format::NCHW) { | |||||
| return false; | return false; | ||||
| } | } | ||||
| //! only matmul's packmode is packa or default support weight preprocess | |||||
| if (is_enable_filter_preprocess(param) && | |||||
| (m_matmul_algo->packmode() == | |||||
| fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK)) { | |||||
| #endif | |||||
| //! param | |||||
| if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) { | |||||
| return false; | return false; | ||||
| } | } | ||||
| if (param.src_type.enumv() != DTypeEnum::Int8 && | |||||
| param.src_type.enumv() != DTypeEnum::QuantizedS8 && | |||||
| param.src_type.enumv() != DTypeEnum::Quantized8Asymm && | |||||
| //! data type | |||||
| if (param.src_type.enumv() != param.filter_type.enumv() || | |||||
| (param.src_type.enumv() != DTypeEnum::Int8 && | |||||
| param.src_type.enumv() != DTypeEnum::QuantizedS8 && | |||||
| param.src_type.enumv() != DTypeEnum::Quantized8Asymm && | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | #if !MEGDNN_DISABLE_FLOAT16 | ||||
| param.src_type.enumv() != DTypeEnum::Float16 && | |||||
| param.src_type.enumv() != DTypeEnum::Float16 && | |||||
| #endif | #endif | ||||
| param.src_type.enumv() != DTypeEnum::Float32) { | |||||
| param.src_type.enumv() != DTypeEnum::Float32)) { | |||||
| return false; | return false; | ||||
| } | } | ||||
| //! make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode | //! make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode | ||||
| //! is identity; otherwise return false, meaning that 8x8x32 and 8x8x16 | //! is identity; otherwise return false, meaning that 8x8x32 and 8x8x16 | ||||
| //! do not support PostProcess | //! do not support PostProcess | ||||
| @@ -231,27 +258,13 @@ bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param, | |||||
| return false; | return false; | ||||
| } | } | ||||
| } | } | ||||
| if (param.filter_meta.format == param::ConvBias::Format::NCHW44 || | |||||
| param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT) { | |||||
| if (param.filter_meta.icpg < 4_z || param.filter_meta.icpg == 1 || | |||||
| param.filter_meta.ocpg == 1) { | |||||
| return false; | |||||
| } | |||||
| } | |||||
| size_t OH = param.osz[0]; | |||||
| size_t OW = param.osz[1]; | |||||
| MatrixMulImpl::KernSizeParam matmul_param = | MatrixMulImpl::KernSizeParam matmul_param = | ||||
| utils::get_matmul_kern_param(param, OH * OW, | utils::get_matmul_kern_param(param, OH * OW, | ||||
| get_oc_tile_size_heuristic(param)); | get_oc_tile_size_heuristic(param)); | ||||
| bool matmul_usable = m_matmul_algo->usable(matmul_param); | bool matmul_usable = m_matmul_algo->usable(matmul_param); | ||||
| auto pack_mode = m_matmul_algo->packmode(); | auto pack_mode = m_matmul_algo->packmode(); | ||||
| bool strategy_usable = Conv1x1Factory::can_make_conv1x1_strategy( | bool strategy_usable = Conv1x1Factory::can_make_conv1x1_strategy( | ||||
| param, pack_mode, param.filter_meta.format); | param, pack_mode, param.filter_meta.format); | ||||
| return matmul_usable && strategy_usable && | return matmul_usable && strategy_usable && | ||||
| (param.filter_meta.dilation[0] == | (param.filter_meta.dilation[0] == | ||||
| param.filter_meta.dilation[1] && | param.filter_meta.dilation[1] && | ||||
| @@ -262,121 +275,6 @@ bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param, | |||||
| return false; | return false; | ||||
| } | } | ||||
| SmallVector<TensorLayout> | |||||
| ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout( | |||||
| const NCBKernSizeParam& param) const { | |||||
| MIDOUT_BEGIN( | |||||
| megdnn_fallback_conv1x1, | |||||
| midout_iv( | |||||
| "ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout"_hash)) { | |||||
| fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = | |||||
| m_matmul_algo->matmul_description(); | |||||
| bool default_pack = matmul_desc.packmode == | |||||
| MatrixMulImpl::AlgoBase::PackMode::DEFAULT; | |||||
| bool only_packA = matmul_desc.packmode == | |||||
| MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA; | |||||
| //! only support default_pack and only_packa mode | |||||
| if (matmul_desc.packmode == | |||||
| MatrixMulImpl::AlgoBase::PackMode::NO_PACK) { | |||||
| return {}; | |||||
| } | |||||
| size_t OH = param.osz[0]; | |||||
| size_t OW = param.osz[1]; | |||||
| size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); | |||||
| auto matmul_param = utils::get_matmul_kern_param(param, OH * OW, | |||||
| compt_oc_block_size); | |||||
| WorkspaceBundle wb(nullptr, {}); | |||||
| if (default_pack) { | |||||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher; | |||||
| wb = dispatcher.get_bundle(param, matmul_param, m_matmul_algo, | |||||
| compt_oc_block_size); | |||||
| } else if (only_packA) { | |||||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> | |||||
| dispatcher; | |||||
| wb = dispatcher.get_bundle(param, matmul_param, m_matmul_algo, | |||||
| compt_oc_block_size); | |||||
| } | |||||
| size_t GROUP = param.filter_meta.group; | |||||
| SmallVector<TensorLayout> preprocessed_layouts; | |||||
| preprocessed_layouts.push_back( | |||||
| {{GROUP, wb.get_size(0)}, dtype::Int8()}); | |||||
| return preprocessed_layouts; | |||||
| } | |||||
| MIDOUT_END(); | |||||
| return {}; | |||||
| } | |||||
| SmallVector<ConvBiasImpl::NCBKern> | |||||
| ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns( | |||||
| const NCBKernSizeParam& param) const { | |||||
| MIDOUT_BEGIN( | |||||
| megdnn_fallback_conv1x1, | |||||
| midout_iv( | |||||
| "ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns"_hash)) { | |||||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||||
| size_t OH = param.osz[0]; | |||||
| size_t OW = param.osz[1]; | |||||
| size_t OC = param.filter_meta.ocpg; | |||||
| size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); | |||||
| size_t GROUP = param.filter_meta.group; | |||||
| size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size); | |||||
| auto matmul_param = utils::get_matmul_kern_param(param, OH * OW, | |||||
| compt_oc_block_size); | |||||
| WorkspaceBundle whole_bundle = {nullptr, {}}; | |||||
| WorkspaceBundle matmul_bundle = {nullptr, {}}; | |||||
| auto pack_mode = m_matmul_algo->packmode(); | |||||
| if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | |||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||||
| midout_iv("get_defaul_matmul_packmode_bundle"_hash)) { | |||||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> | |||||
| dispatcher; | |||||
| whole_bundle = dispatcher.get_bundle(param, matmul_param, | |||||
| m_matmul_algo, | |||||
| compt_oc_block_size); | |||||
| matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||||
| } | |||||
| MIDOUT_END(); | |||||
| } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | |||||
| MIDOUT_BEGIN( | |||||
| megdnn_fallback_conv1x1, | |||||
| midout_iv("get_onlypacka_matmul_packmode_bundle"_hash)) { | |||||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> | |||||
| dispatcher; | |||||
| whole_bundle = dispatcher.get_bundle(param, matmul_param, | |||||
| m_matmul_algo, | |||||
| compt_oc_block_size); | |||||
| matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||||
| } | |||||
| MIDOUT_END(); | |||||
| } else { | |||||
| //! if nopack return null so that OprWeightPreprocessProxy can run | |||||
| //! with nopack mode | |||||
| return {}; | |||||
| } | |||||
| Conv1x1StrategyBase* conv1x1_strategy = | |||||
| Conv1x1Factory::make_conv1x1_strategy(param, pack_mode, | |||||
| param.filter_meta.format); | |||||
| auto kern_packA = [this, whole_bundle, matmul_bundle, param, | |||||
| compt_oc_block_size, conv1x1_strategy]( | |||||
| const NCBKernParam& ncb_param, | |||||
| const NCBKernIndex& ncb_index) mutable { | |||||
| conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||||
| compt_oc_block_size, this->m_matmul_algo, | |||||
| param, ncb_param, std::move(ncb_index)); | |||||
| }; | |||||
| ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||||
| return ret_kern; | |||||
| } | |||||
| MIDOUT_END(); | |||||
| return {}; | |||||
| } | |||||
| bool ConvBiasImpl::AlgoConv1x1::is_preferred( | bool ConvBiasImpl::AlgoConv1x1::is_preferred( | ||||
| const NCBKernSizeParam& param) const { | const NCBKernSizeParam& param) const { | ||||
| @@ -20,6 +20,11 @@ namespace megdnn { | |||||
| namespace fallback { | namespace fallback { | ||||
| class ConvBiasImpl::AlgoConv1x1 final : public AlgoBase { | class ConvBiasImpl::AlgoConv1x1 final : public AlgoBase { | ||||
| WorkspaceBundle get_bundle_according_packmode( | |||||
| const NCBKernSizeParam& param) const; | |||||
| SmallVector<NCBKern> get_kerns_according_packmode( | |||||
| const NCBKernSizeParam& param, bool weight_preprocess) const; | |||||
| public: | public: | ||||
| AlgoConv1x1(MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) | AlgoConv1x1(MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) | ||||
| : m_matmul_algo(matmul_algo), m_oc_block_size(oc_block_size) {} | : m_matmul_algo(matmul_algo), m_oc_block_size(oc_block_size) {} | ||||
| @@ -41,7 +46,7 @@ public: | |||||
| const NCBKernSizeParam& param) const override; | const NCBKernSizeParam& param) const override; | ||||
| bool is_preferred(const NCBKernSizeParam&) const override; | bool is_preferred(const NCBKernSizeParam&) const override; | ||||
| SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | ||||
| const NCBKernSizeParam& param) const override; | const NCBKernSizeParam& param) const override; | ||||
| size_t get_preprocess_workspace( | size_t get_preprocess_workspace( | ||||
| @@ -360,23 +360,23 @@ ConvBiasImpl::AlgoConv1x1Gemv::dispatch_kerns( | |||||
| dt_uint8, PostprocessMode::QUANTIZED, | dt_uint8, PostprocessMode::QUANTIZED, | ||||
| "NCHW::GEMV::QUINT8x8x32_QUINT8"_hash); | "NCHW::GEMV::QUINT8x8x32_QUINT8"_hash); | ||||
| break; | break; | ||||
| //!no support nchw44 8x8x16 | |||||
| case param::ConvBias::Format::NCHW44: | case param::ConvBias::Format::NCHW44: | ||||
| cb1(param::ConvBias::Format::NCHW44, dt_float32, dt_float32, | cb1(param::ConvBias::Format::NCHW44, dt_float32, dt_float32, | ||||
| PostprocessMode::FLOAT, "NCHW44::GEMV::FLOAT"_hash); | PostprocessMode::FLOAT, "NCHW44::GEMV::FLOAT"_hash); | ||||
| cb2(param::ConvBias::Format::NCHW44, dt_int8, dt_int32, dt_int32, | |||||
| dt_int8, dt_int32, dt_int32, PostprocessMode::NO_PROCESS, | |||||
| cb3(param::ConvBias::Format::NCHW44, dt_int8, dt_int32, dt_int32, | |||||
| dt_int8, dt_int32, dt_int32, PostprocessMode::ADD_BIAS, | |||||
| "NCHW44::GEMV::INT8x8x32_INT32"_hash); | "NCHW44::GEMV::INT8x8x32_INT32"_hash); | ||||
| cb2(param::ConvBias::Format::NCHW44, dtype::QuantizedS8, | |||||
| cb3(param::ConvBias::Format::NCHW44, dtype::QuantizedS8, | |||||
| dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32, | dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32, | ||||
| dt_int32, PostprocessMode::NO_PROCESS, | |||||
| dt_int32, PostprocessMode::ADD_BIAS, | |||||
| "NCHW44::GEMV::QINT8x8x32_QINT32"_hash); | "NCHW44::GEMV::QINT8x8x32_QINT32"_hash); | ||||
| cb2(param::ConvBias::Format::NCHW44, dtype::QuantizedS8, | cb2(param::ConvBias::Format::NCHW44, dtype::QuantizedS8, | ||||
| dtype::QuantizedS32, dtype::QuantizedS8, dt_int8, dt_int32, | dtype::QuantizedS32, dtype::QuantizedS8, dt_int8, dt_int32, | ||||
| dt_int8, PostprocessMode::QUANTIZED, | dt_int8, PostprocessMode::QUANTIZED, | ||||
| "NCHW44::GEMV::QINT8x8x32_QINT8"_hash); | "NCHW44::GEMV::QINT8x8x32_QINT8"_hash); | ||||
| break; | break; | ||||
| //!no support nchw44-dot 8x8x16 | |||||
| case param::ConvBias::Format::NCHW44_DOT: | case param::ConvBias::Format::NCHW44_DOT: | ||||
| cb3(param::ConvBias::Format::NCHW44_DOT, dt_int8, dt_int32, | cb3(param::ConvBias::Format::NCHW44_DOT, dt_int8, dt_int32, | ||||
| dt_int32, dt_int8, dt_int32, dt_int32, | dt_int32, dt_int8, dt_int32, dt_int32, | ||||
| @@ -420,81 +420,74 @@ bool ConvBiasImpl::AlgoConv1x1Gemv::usable(const NCBKernSizeParam& param, | |||||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1_gemv, | MIDOUT_BEGIN(megdnn_fallback_conv1x1_gemv, | ||||
| midout_iv("AlgoConv1x1Gemv::usable"_hash)) { | midout_iv("AlgoConv1x1Gemv::usable"_hash)) { | ||||
| auto format = param.filter_meta.format; | auto format = param.filter_meta.format; | ||||
| #if MEGDNN_X86 | |||||
| if (format != param::ConvBias::Format::NCHW) | |||||
| return false; | |||||
| #elif MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||||
| if (format != param::ConvBias::Format::NCHW && | |||||
| format != param::ConvBias::Format::NCHW44 && | |||||
| format != param::ConvBias::Format::NCHW44_DOT) | |||||
| return false; | |||||
| #endif | |||||
| //! whether 1x1 | |||||
| size_t FH = param.filter_meta.spatial[0], | size_t FH = param.filter_meta.spatial[0], | ||||
| FW = param.filter_meta.spatial[1]; | FW = param.filter_meta.spatial[1]; | ||||
| size_t PH = param.filter_meta.padding[0], | size_t PH = param.filter_meta.padding[0], | ||||
| PW = param.filter_meta.padding[1]; | PW = param.filter_meta.padding[1]; | ||||
| size_t SH = param.filter_meta.stride[0], | size_t SH = param.filter_meta.stride[0], | ||||
| SW = param.filter_meta.stride[1]; | SW = param.filter_meta.stride[1]; | ||||
| if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) { | |||||
| return false; | |||||
| } | |||||
| //! whether gemv | |||||
| size_t OH = param.osz[0]; | size_t OH = param.osz[0]; | ||||
| size_t OW = param.osz[1]; | size_t OW = param.osz[1]; | ||||
| if (OH * OW != 1) { | |||||
| //! whether gemv and 1x1 | |||||
| if (OH * OW != 1 || FH != 1 || FW != 1 || PH || PW || SH != 1 || | |||||
| SW != 1) { | |||||
| return false; | return false; | ||||
| } | } | ||||
| //! even no naive support in gemv | |||||
| if ((param.src_type.enumv() == param.filter_type.enumv() && | |||||
| param.src_type.enumv() == DTypeEnum::Int16) && | |||||
| param.dst_type.enumv() == DTypeEnum::Int32) { | |||||
| #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||||
| if (format != param::ConvBias::Format::NCHW && | |||||
| format != param::ConvBias::Format::NCHW44 && | |||||
| format != param::ConvBias::Format::NCHW44_DOT) { | |||||
| return false; | return false; | ||||
| } | } | ||||
| //! make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode | |||||
| //! is identity otherwise return false mean that 8x8x32 and 8x8x16 | |||||
| //! not support PostProcess | |||||
| if (param.dst_type.enumv() == DTypeEnum::Int16 || | |||||
| param.dst_type.enumv() == DTypeEnum::Int32 || | |||||
| param.dst_type.enumv() == DTypeEnum::QuantizedS32) { | |||||
| if (param.nonlineMode != megdnn::NonlineMode::IDENTITY) { | |||||
| return false; | |||||
| } | |||||
| } | |||||
| //! supports a few dtypes | |||||
| if (param.src_type.enumv() != param.filter_type.enumv()) { | |||||
| #else | |||||
| if (format != param::ConvBias::Format::NCHW) { | |||||
| return false; | return false; | ||||
| } | } | ||||
| if (param.src_type.enumv() != DTypeEnum::Int8 && | |||||
| param.src_type.enumv() != DTypeEnum::QuantizedS8 && | |||||
| param.src_type.enumv() != DTypeEnum::Quantized8Asymm && | |||||
| #endif | |||||
| //! supports a few dtypes | |||||
| if (param.src_type.enumv() != param.filter_type.enumv() || | |||||
| (param.src_type.enumv() != DTypeEnum::Int8 && | |||||
| param.src_type.enumv() != DTypeEnum::QuantizedS8 && | |||||
| param.src_type.enumv() != DTypeEnum::Quantized8Asymm && | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | #if !MEGDNN_DISABLE_FLOAT16 | ||||
| param.src_type.enumv() != DTypeEnum::Float16 && | |||||
| param.src_type.enumv() != DTypeEnum::Float16 && | |||||
| #endif | #endif | ||||
| param.src_type.enumv() != DTypeEnum::Float32) { | |||||
| param.src_type.enumv() != DTypeEnum::Float32)) { | |||||
| return false; | return false; | ||||
| } | } | ||||
| #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||||
| if (format == param::ConvBias::Format::NCHW44) { | if (format == param::ConvBias::Format::NCHW44) { | ||||
| if (param.src_type.enumv() != DTypeEnum::Float32 && | if (param.src_type.enumv() != DTypeEnum::Float32 && | ||||
| param.src_type.enumv() != DTypeEnum::Int8 && | param.src_type.enumv() != DTypeEnum::Int8 && | ||||
| param.src_type.enumv() != DTypeEnum::QuantizedS8) { | param.src_type.enumv() != DTypeEnum::QuantizedS8) { | ||||
| return false; | return false; | ||||
| } | } | ||||
| //! 8x8x16 is not supported in nchw44 | |||||
| if (param.src_type.enumv() == DTypeEnum::Int8 && | |||||
| param.dst_type.enumv() == DTypeEnum::Int16) { | |||||
| return false; | |||||
| } | |||||
| } else if (format == param::ConvBias::Format::NCHW44_DOT) { | } else if (format == param::ConvBias::Format::NCHW44_DOT) { | ||||
| if (param.src_type.enumv() != DTypeEnum::Int8 && | |||||
| param.src_type.enumv() != DTypeEnum::QuantizedS8) { | |||||
| if ((param.src_type.enumv() != DTypeEnum::Int8 && | |||||
| param.src_type.enumv() != DTypeEnum::QuantizedS8) || | |||||
| param.dst_type.enumv() == DTypeEnum::Int16) { | |||||
| return false; | return false; | ||||
| } | } | ||||
| } | } | ||||
| #endif | |||||
| //! make sure that for 8x8x16 and 8x8x32 the nonlineMode is identity, | |||||
| //! otherwise return false | |||||
| if (param.dst_type.enumv() == DTypeEnum::Int16 || | |||||
| param.dst_type.enumv() == DTypeEnum::Int32 || | |||||
| param.dst_type.enumv() == DTypeEnum::QuantizedS32) { | |||||
| if (param.nonlineMode != megdnn::NonlineMode::IDENTITY) { | |||||
| return false; | |||||
| } | |||||
| } | |||||
| //! int16 x int16 -> int32 is not supported in gemv, not even by naive | |||||
| if ((param.src_type.enumv() == param.filter_type.enumv() && | |||||
| param.src_type.enumv() == DTypeEnum::Int16) && | |||||
| param.dst_type.enumv() == DTypeEnum::Int32) { | |||||
| return false; | |||||
| } | |||||
| return (param.filter_meta.dilation[0] == | return (param.filter_meta.dilation[0] == | ||||
| param.filter_meta.dilation[1] && | param.filter_meta.dilation[1] && | ||||
| param.filter_meta.dilation[0] == 1) && | param.filter_meta.dilation[0] == 1) && | ||||
| @@ -11,14 +11,19 @@ | |||||
| #pragma once | #pragma once | ||||
| #include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h" | |||||
| #include "src/fallback/conv_bias/conv1x1/conv1x1_utils.h" | #include "src/fallback/conv_bias/conv1x1/conv1x1_utils.h" | ||||
| #include "src/fallback/conv_bias/opr_impl.h" | |||||
| namespace megdnn { | namespace megdnn { | ||||
| namespace fallback { | namespace fallback { | ||||
| namespace conv1x1 { | namespace conv1x1 { | ||||
| template <MatrixMulImpl::AlgoBase::PackMode pack_mode> | template <MatrixMulImpl::AlgoBase::PackMode pack_mode> | ||||
| class Conv1x1Kerns { | |||||
| class Conv1x1Kerns; | |||||
| template <> | |||||
| class Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> { | |||||
| public: | public: | ||||
| //! get_bundle | //! get_bundle | ||||
| WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param, | WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param, | ||||
| @@ -28,13 +33,12 @@ public: | |||||
| size_t GROUP = param.filter_meta.group; | size_t GROUP = param.filter_meta.group; | ||||
| size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
| size_t BATCH = param.n; | size_t BATCH = param.n; | ||||
| //! bundle per thread | //! bundle per thread | ||||
| //! matmul_param records a matmul with M = oc_tile_size, K = IC, | //! matmul_param records a matmul with M = oc_tile_size, K = IC, | ||||
| //! N = OH * OW; this does not involve the packb bytes | //! N = OH * OW; this does not involve the packb bytes | ||||
| auto matmul_bundle = matmul_algo->get_bundle(matmul_param); | auto matmul_bundle = matmul_algo->get_bundle(matmul_param); | ||||
| auto thread_bundle = utils::get_thread_bundle(param, matmul_bundle.get_size(2), | |||||
| oc_tile_size); | |||||
| auto thread_bundle = utils::get_thread_bundle( | |||||
| param, matmul_bundle.get_size(2), oc_tile_size); | |||||
| //! size per thread | //! size per thread | ||||
| size_t all_threads_bytes = | size_t all_threads_bytes = | ||||
| thread_bundle.total_size_in_bytes() * param.nr_threads; | thread_bundle.total_size_in_bytes() * param.nr_threads; | ||||
| @@ -46,11 +50,6 @@ public: | |||||
| is_enable_filter_preprocess(param) | is_enable_filter_preprocess(param) | ||||
| ? 0 | ? 0 | ||||
| : packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP; | : packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP; | ||||
| if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) | |||||
| return WorkspaceBundle{nullptr, | |||||
| {all_packa_bytes, 0, all_threads_bytes}}; | |||||
| //! packb size = N * GROUP * packb_size_per_group | //! packb size = N * GROUP * packb_size_per_group | ||||
| size_t packb_bytes_per_group = matmul_bundle.get_size(1); | size_t packb_bytes_per_group = matmul_bundle.get_size(1); | ||||
| size_t all_packb_bytes = packb_bytes_per_group * GROUP * BATCH; | size_t all_packb_bytes = packb_bytes_per_group * GROUP * BATCH; | ||||
| @@ -58,6 +57,165 @@ public: | |||||
| return WorkspaceBundle{ | return WorkspaceBundle{ | ||||
| nullptr, {all_packa_bytes, all_packb_bytes, all_threads_bytes}}; | nullptr, {all_packa_bytes, all_packb_bytes, all_threads_bytes}}; | ||||
| } | } | ||||
//! Build the kernel list for this pack mode: an optional packA kernel, a
//! packB kernel and a compute kernel, pushed into the result in that order.
//! \param param convolution size parameters; copied into every closure
//! \param whole_bundle, matmul_bundle, thread_bundle workspace bundles; they
//!        are taken by reference here but captured by copy in the closures
//! \param conv1x1_strategy strategy whose packA/packB/exec do the real work;
//!        only the pointer is captured
//!        NOTE(review): presumably it must outlive the returned kernels -
//!        confirm against the caller
//! \param matmul_algo matmul algorithm forwarded to the strategy (pointer
//!        captured as well)
//! \param oc_block_size output channels per block; OC is split into
//!        div_ceil(OC, oc_block_size) blocks per group
//! \return the kernels, each paired with a size list
//!        NOTE(review): the size list is presumably the ND-range the kernel
//!        is dispatched over - confirm against ConvBiasImpl::NCBKern
| SmallVector<ConvBiasImpl::NCBKern> get_kern( | |||||
| const ConvBiasImpl::NCBKernSizeParam& param, | |||||
| WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, | |||||
| WorkspaceBundle& thread_bundle, | |||||
| Conv1x1StrategyBase* conv1x1_strategy, | |||||
| const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { | |||||
//! packA closure: forwards to the strategy's packA. ncb_index is a const
//! reference, so std::move yields a const rvalue and nothing is moved from.
| auto kern_packA = | |||||
| [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size, | |||||
| conv1x1_strategy]( | |||||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||||
| conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||||
| oc_block_size, matmul_algo, param, | |||||
| ncb_param, std::move(ncb_index)); | |||||
| }; | |||||
//! packB closure: forwards to the strategy's packB (no oc_block_size needed)
| auto kern_packB = | |||||
| [whole_bundle, matmul_bundle, param, matmul_algo, | |||||
| conv1x1_strategy]( | |||||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||||
| conv1x1_strategy->packB(whole_bundle, matmul_bundle, | |||||
| matmul_algo, param, ncb_param, | |||||
| std::move(ncb_index)); | |||||
| }; | |||||
//! compute closure: forwards to the strategy's exec; the only closure that
//! also carries the per-thread bundle
| auto kern_compt = | |||||
| [whole_bundle, matmul_bundle, thread_bundle, matmul_algo, param, | |||||
| oc_block_size, conv1x1_strategy]( | |||||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||||
| conv1x1_strategy->exec(whole_bundle, matmul_bundle, | |||||
| thread_bundle, oc_block_size, | |||||
| matmul_algo, param, ncb_param, | |||||
| std::move(ncb_index)); | |||||
| }; | |||||
| size_t GROUP = param.filter_meta.group; | |||||
| size_t BATCH = param.n; | |||||
| size_t OC = param.filter_meta.ocpg; | |||||
| size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); | |||||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||||
//! packA is only scheduled here when filter preprocessing is disabled
//! NOTE(review): presumably the preprocess path packs the filter instead
| if (!is_enable_filter_preprocess(param)) { | |||||
| ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||||
| } | |||||
| ret_kern.push_back({kern_packB, {BATCH}}); | |||||
| ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); | |||||
| return ret_kern; | |||||
| } | |||||
//! Build the kernel list used for filter preprocessing: a single packA
//! kernel registered with the sizes {GROUP, div_ceil(OC, oc_block_size)}.
//! \param param convolution size parameters; copied into the closure
//! \param whole_bundle, matmul_bundle workspace bundles; captured by copy
//! \param conv1x1_strategy strategy whose packA is invoked (pointer captured)
//! \param matmul_algo matmul algorithm forwarded to packA (pointer captured)
//! \param oc_block_size output channels per block
//! \return a list holding only the packA kernel
| SmallVector<ConvBiasImpl::NCBKern> get_kern_preprocess( | |||||
| const ConvBiasImpl::NCBKernSizeParam& param, | |||||
| WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, | |||||
| Conv1x1StrategyBase* conv1x1_strategy, | |||||
| const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { | |||||
//! ncb_index is a const reference, so std::move yields a const rvalue and
//! nothing is moved from
| auto kern_packA = | |||||
| [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size, | |||||
| conv1x1_strategy]( | |||||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||||
| conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||||
| oc_block_size, matmul_algo, param, | |||||
| ncb_param, std::move(ncb_index)); | |||||
| }; | |||||
| size_t GROUP = param.filter_meta.group; | |||||
| size_t OC = param.filter_meta.ocpg; | |||||
| size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); | |||||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||||
| ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||||
| return ret_kern; | |||||
| } | |||||
| }; | |||||
//! Conv1x1Kerns specialization for PackMode::ONLY_PACKA: the workspace
//! reserves no packB bytes and get_kern() emits no packB kernel.
| template<> | |||||
| class Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> { | |||||
| public: | |||||
| //! get_bundle: returns the sizes {packA bytes, 0, bytes of all threads} | |||||
| WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param, | |||||
| const MatrixMulImpl::KernSizeParam& matmul_param, | |||||
| const MatrixMulImpl::AlgoBase* matmul_algo, | |||||
| size_t oc_tile_size) { | |||||
| size_t GROUP = param.filter_meta.group; | |||||
| size_t OC = param.filter_meta.ocpg; | |||||
| //! bundle per thread | |||||
| //! matmul_param records a matmul with M = oc_tile_size, K = IC, | |||||
| //! N = OH * OW; this does not involve the packb bytes | |||||
| auto matmul_bundle = matmul_algo->get_bundle(matmul_param); | |||||
| auto thread_bundle = utils::get_thread_bundle( | |||||
| param, matmul_bundle.get_size(2), oc_tile_size); | |||||
| //! total size of all threads = size per thread * nr_threads | |||||
| size_t all_threads_bytes = | |||||
| thread_bundle.total_size_in_bytes() * param.nr_threads; | |||||
| //! packa size = GROUP * packa_size_each_group, or 0 when filter preprocess is enabled | |||||
| size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0); | |||||
| size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size); | |||||
| size_t all_packa_bytes = | |||||
| is_enable_filter_preprocess(param) | |||||
| ? 0 | |||||
| : packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP; | |||||
| return WorkspaceBundle{nullptr, | |||||
| {all_packa_bytes, 0, all_threads_bytes}}; | |||||
| } | |||||
//! get_kern: returns an optional packA kernel followed by the compute
//! kernel. The bundles and param are captured by copy; conv1x1_strategy and
//! matmul_algo are captured as pointers.
//! NOTE(review): the pointees presumably must outlive the returned kernels
| SmallVector<ConvBiasImpl::NCBKern> get_kern( | |||||
| const ConvBiasImpl::NCBKernSizeParam& param, | |||||
| WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, | |||||
| WorkspaceBundle& thread_bundle, | |||||
| Conv1x1StrategyBase* conv1x1_strategy, | |||||
| const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { | |||||
//! ncb_index is a const reference, so std::move yields a const rvalue and
//! nothing is moved from (same in the closures below)
| auto kern_packA = | |||||
| [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size, | |||||
| conv1x1_strategy]( | |||||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||||
| conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||||
| oc_block_size, matmul_algo, param, | |||||
| ncb_param, std::move(ncb_index)); | |||||
| }; | |||||
| auto kern_compt = | |||||
| [whole_bundle, matmul_bundle, thread_bundle, matmul_algo, param, | |||||
| oc_block_size, conv1x1_strategy]( | |||||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||||
| conv1x1_strategy->exec(whole_bundle, matmul_bundle, | |||||
| thread_bundle, oc_block_size, | |||||
| matmul_algo, param, ncb_param, | |||||
| std::move(ncb_index)); | |||||
| }; | |||||
| size_t GROUP = param.filter_meta.group; | |||||
| size_t BATCH = param.n; | |||||
| size_t OC = param.filter_meta.ocpg; | |||||
| size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); | |||||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||||
//! packA is only scheduled here when filter preprocessing is disabled
| if (!is_enable_filter_preprocess(param)) { | |||||
| ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||||
| } | |||||
| ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); | |||||
| return ret_kern; | |||||
| } | |||||
//! get_kern_preprocess: returns a single packA kernel registered with the
//! sizes {GROUP, div_ceil(OC, oc_block_size)}
| SmallVector<ConvBiasImpl::NCBKern> get_kern_preprocess( | |||||
| const ConvBiasImpl::NCBKernSizeParam& param, | |||||
| WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, | |||||
| Conv1x1StrategyBase* conv1x1_strategy, | |||||
| const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { | |||||
| auto kern_packA = | |||||
| [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size, | |||||
| conv1x1_strategy]( | |||||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||||
| conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||||
| oc_block_size, matmul_algo, param, | |||||
| ncb_param, std::move(ncb_index)); | |||||
| }; | |||||
| size_t GROUP = param.filter_meta.group; | |||||
| size_t OC = param.filter_meta.ocpg; | |||||
| size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); | |||||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||||
| ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||||
| return ret_kern; | |||||
| } | |||||
| }; | }; | ||||
| template<> | template<> | ||||
| @@ -69,14 +227,47 @@ public: | |||||
| const MatrixMulImpl::AlgoBase* matmul_algo, | const MatrixMulImpl::AlgoBase* matmul_algo, | ||||
| size_t oc_tile_size) { | size_t oc_tile_size) { | ||||
| size_t matmul_size = matmul_algo->get_workspace(matmul_param); | size_t matmul_size = matmul_algo->get_workspace(matmul_param); | ||||
| auto thread_bundle = utils::get_thread_bundle(param, matmul_size, oc_tile_size); | |||||
| auto thread_bundle = | |||||
| utils::get_thread_bundle(param, matmul_size, oc_tile_size); | |||||
| //! size per thread | //! size per thread | ||||
| size_t all_threads_bytes = | size_t all_threads_bytes = | ||||
| thread_bundle.total_size_in_bytes() * param.nr_threads; | thread_bundle.total_size_in_bytes() * param.nr_threads; | ||||
| return WorkspaceBundle{nullptr, {0, 0, all_threads_bytes}}; | return WorkspaceBundle{nullptr, {0, 0, all_threads_bytes}}; | ||||
| } | } | ||||
//! Build the kernel list for this specialization: only a compute kernel is
//! emitted; no packA or packB kernel is created here.
//! NOTE(review): the enclosing class header is not visible in this hunk;
//! presumably this is the NO_PACK specialization - confirm
//! \param param convolution size parameters; copied into the closure
//! \param whole_bundle, matmul_bundle, thread_bundle workspace bundles;
//!        captured by copy
//! \param conv1x1_strategy strategy whose exec is invoked (pointer captured)
//! \param matmul_algo matmul algorithm forwarded to exec (pointer captured)
//! \param oc_block_size output channels per block
//! \return a list holding the compute kernel, registered with the sizes
//!        {BATCH, GROUP, div_ceil(OC, oc_block_size)}
| SmallVector<ConvBiasImpl::NCBKern> get_kern( | |||||
| const ConvBiasImpl::NCBKernSizeParam& param, | |||||
| WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, | |||||
| WorkspaceBundle& thread_bundle, | |||||
| Conv1x1StrategyBase* conv1x1_strategy, | |||||
| const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { | |||||
//! ncb_index is a const reference, so std::move yields a const rvalue and
//! nothing is moved from
| auto kern_compt = | |||||
| [whole_bundle, matmul_bundle, thread_bundle, matmul_algo, param, | |||||
| oc_block_size, conv1x1_strategy]( | |||||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||||
| conv1x1_strategy->exec(whole_bundle, matmul_bundle, | |||||
| thread_bundle, oc_block_size, | |||||
| matmul_algo, param, ncb_param, | |||||
| std::move(ncb_index)); | |||||
| }; | |||||
| size_t GROUP = param.filter_meta.group; | |||||
| size_t BATCH = param.n; | |||||
| size_t OC = param.filter_meta.ocpg; | |||||
| size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); | |||||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||||
| ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); | |||||
| return ret_kern; | |||||
| } | |||||
//! Preprocess kernels for this specialization: always an empty list. All
//! parameters are unnamed and ignored; the signature mirrors the other
//! specializations' get_kern_preprocess.
//! NOTE(review): presumably because this pack mode never packs the filter -
//! confirm
| SmallVector<ConvBiasImpl::NCBKern> get_kern_preprocess( | |||||
| const ConvBiasImpl::NCBKernSizeParam&, WorkspaceBundle&, | |||||
| WorkspaceBundle&, Conv1x1StrategyBase*, | |||||
| const MatrixMulImpl::AlgoBase*, size_t) { | |||||
| return {}; | |||||
| } | |||||
| }; | }; | ||||
| } // namespace conv1x1 | } // namespace conv1x1 | ||||
| } // namespace fallback | } // namespace fallback | ||||
| } // namespace megdnn | } // namespace megdnn | ||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -59,7 +59,8 @@ public: | |||||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | template <typename src_ctype, typename bias_ctype, typename dst_ctype, | ||||
| typename op_ctype, typename op_dtype, | typename op_ctype, typename op_dtype, | ||||
| megdnn::PostprocessMode postprocess_mode, MatrixMulImpl::AlgoBase::PackMode pack_mode> | |||||
| megdnn::PostprocessMode postprocess_mode, | |||||
| MatrixMulImpl::AlgoBase::PackMode pack_mode> | |||||
| class Conv1x1Strategy : public Conv1x1StrategyBase { | class Conv1x1Strategy : public Conv1x1StrategyBase { | ||||
| public: | public: | ||||
| explicit Conv1x1Strategy(size_t pack_size = 1) : m_pack_size(pack_size) {} | explicit Conv1x1Strategy(size_t pack_size = 1) : m_pack_size(pack_size) {} | ||||
| @@ -136,32 +137,30 @@ public: | |||||
| size_t packb_bytes_per_group = matmul_bundle.get_size(1); | size_t packb_bytes_per_group = matmul_bundle.get_size(1); | ||||
| size_t GROUP = param.filter_meta.group; | size_t GROUP = param.filter_meta.group; | ||||
| size_t BATCH = param.n; | |||||
| size_t SH = param.filter_meta.stride[0]; | size_t SH = param.filter_meta.stride[0]; | ||||
| size_t SW = param.filter_meta.stride[1]; | size_t SW = param.filter_meta.stride[1]; | ||||
| size_t OH = param.osz[0]; | size_t OH = param.osz[0]; | ||||
| size_t OW = param.osz[1]; | size_t OW = param.osz[1]; | ||||
| size_t OC = param.filter_meta.ocpg; | size_t OC = param.filter_meta.ocpg; | ||||
| size_t batch = ncb_index.ndrange_id[0]; | |||||
| MatrixMulImpl::KernParam matmul_kern_param; | MatrixMulImpl::KernParam matmul_kern_param; | ||||
| static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) = | static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) = | ||||
| utils::get_matmul_kern_param(param, OH * OW, OC); | utils::get_matmul_kern_param(param, OH * OW, OC); | ||||
| rep(batch, BATCH) { | |||||
| rep(g, GROUP) { | |||||
| if (SH == 2 && SW == 2) | |||||
| megdnn_throw("no support for stride = 2"); | |||||
| size_t bytes_offset_of_b_panel = | |||||
| batch * packb_bytes_per_group * GROUP + | |||||
| g * packb_bytes_per_group; | |||||
| src_ctype* b_panel = reinterpret_cast<src_ctype*>( | |||||
| reinterpret_cast<int8_t*>(whole_bundle.get(1)) + | |||||
| bytes_offset_of_b_panel); | |||||
| matmul_kern_param.B_ptr = const_cast<src_ctype*>( | |||||
| ncb_param.src<src_ctype>(batch, g)); | |||||
| matmul_algo->pack_B(matmul_kern_param, b_panel, 0, OH * OW); | |||||
| } | |||||
| rep(g, GROUP) { | |||||
| if (SH == 2 && SW == 2) | |||||
| megdnn_throw("no support for stride = 2"); | |||||
| size_t bytes_offset_of_b_panel = | |||||
| batch * packb_bytes_per_group * GROUP + | |||||
| g * packb_bytes_per_group; | |||||
| src_ctype* b_panel = reinterpret_cast<src_ctype*>( | |||||
| reinterpret_cast<int8_t*>(whole_bundle.get(1)) + | |||||
| bytes_offset_of_b_panel); | |||||
| matmul_kern_param.B_ptr = const_cast<src_ctype*>( | |||||
| ncb_param.src<src_ctype>(batch, g)); | |||||
| matmul_algo->pack_B(matmul_kern_param, b_panel, 0, OH * OW); | |||||
| } | } | ||||
| } else { | } else { | ||||
| megdnn_log_error("OnlyPackA mode and NoPack mode has no packB kernel"); | megdnn_log_error("OnlyPackA mode and NoPack mode has no packB kernel"); | ||||