GitOrigin-RevId: 90aa75d51e
tags/v1.0.0-rc1
| @@ -612,7 +612,7 @@ bool PoolingImpl::AlgoFilter3ModexStridexNCHW44::usable( | |||
| (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | |||
| FH == 3 && FW == 3 && SW == SH && (SH == 1 || SW == 2); | |||
| //! Int8 does not support average, because its rounding mode is different from | |||
| //! quint8 | |||
| //! qint8 | |||
| avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | |||
| param.mode == Mode::AVERAGE); | |||
| return avaible; | |||
| @@ -705,7 +705,7 @@ bool PoolingImpl::AlgoFilter2ModexStridexNCHW44::usable( | |||
| (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | |||
| FH == 2 && FW == 2 && SH == SW && (SW == 1 || SW == 2); | |||
| //! Int8 does not support average, because its rounding mode is different from | |||
| //! quint8 | |||
| //! qint8 | |||
| avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | |||
| param.mode == Mode::AVERAGE); | |||
| return avaible; | |||
| @@ -799,7 +799,7 @@ bool PoolingImpl::AlgoFilter4ModexStridexNCHW44::usable( | |||
| FH == 4 && FW == 4 && SH == SW && (SW == 1 || SW == 2); | |||
| //! Int8 does not support average, because its rounding mode is different from | |||
| //! quint8 | |||
| //! qint8 | |||
| avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | |||
| param.mode == Mode::AVERAGE); | |||
| return avaible; | |||
| @@ -892,7 +892,7 @@ bool PoolingImpl::AlgoFilter5ModexStridexNCHW44::usable( | |||
| (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | |||
| FH == 5 && FW == 5 && SH == SW && (SW == 1 || SW == 2); | |||
| //! Int8 does not support average, because its rounding mode is different from | |||
| //! quint8 | |||
| //! qint8 | |||
| avaible &= !(param.src_type.enumv() == DTypeEnum::Int8 && | |||
| param.mode == Mode::AVERAGE); | |||
| return avaible; | |||
| @@ -47,7 +47,7 @@ size_t ConvBiasImpl::AlgoConv1x1::get_oc_tile_size_heuristic( | |||
| return round_up<size_t>(oc_block_size_one_thread, 24); | |||
| } | |||
| size_t ConvBiasImpl::AlgoConv1x1::get_workspace( | |||
| WorkspaceBundle ConvBiasImpl::AlgoConv1x1::get_bundle_according_packmode( | |||
| const NCBKernSizeParam& param) const { | |||
| size_t OH = param.osz[0]; | |||
| size_t OW = param.osz[1]; | |||
| @@ -58,168 +58,195 @@ size_t ConvBiasImpl::AlgoConv1x1::get_workspace( | |||
| auto pack_mode = m_matmul_algo->packmode(); | |||
| if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 0) { | |||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher; | |||
| return dispatcher | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||
| midout_iv("get_bundle_default"_hash)) { | |||
| return Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT>() | |||
| .get_bundle(param, matmul_param, m_matmul_algo, | |||
| compt_oc_block_size) | |||
| .total_size_in_bytes(); | |||
| compt_oc_block_size); | |||
| } | |||
| MIDOUT_END(); | |||
| } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 1) { | |||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> | |||
| dispatcher; | |||
| return dispatcher | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||
| midout_iv("get_bundle_only_packa"_hash)) { | |||
| return Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>() | |||
| .get_bundle(param, matmul_param, m_matmul_algo, | |||
| compt_oc_block_size) | |||
| .total_size_in_bytes(); | |||
| compt_oc_block_size); | |||
| } | |||
| MIDOUT_END(); | |||
| } else { | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 2) { | |||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> dispatcher; | |||
| return dispatcher | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||
| midout_iv("get_bundle_no_pack"_hash)) { | |||
| return Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK>() | |||
| .get_bundle(param, matmul_param, m_matmul_algo, | |||
| compt_oc_block_size) | |||
| .total_size_in_bytes(); | |||
| compt_oc_block_size); | |||
| } | |||
| MIDOUT_END(); | |||
| } | |||
| return 0; | |||
| return {nullptr, {}}; | |||
| } | |||
| SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns( | |||
| size_t ConvBiasImpl::AlgoConv1x1::get_workspace( | |||
| const NCBKernSizeParam& param) const { | |||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||
| return get_bundle_according_packmode(param).total_size_in_bytes(); | |||
| } | |||
| SmallVector<ConvBiasImpl::NCBKern> | |||
| ConvBiasImpl::AlgoConv1x1::get_kerns_according_packmode( | |||
| const NCBKernSizeParam& param, bool weight_preprocess) const { | |||
| size_t OH = param.osz[0]; | |||
| size_t OW = param.osz[1]; | |||
| size_t OC = param.filter_meta.ocpg; | |||
| size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); | |||
| size_t GROUP = param.filter_meta.group; | |||
| size_t BATCH = param.n; | |||
| size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size); | |||
| auto pack_mode = m_matmul_algo->packmode(); | |||
| Conv1x1StrategyBase* conv1x1_strategy = | |||
| Conv1x1Factory::make_conv1x1_strategy(param, pack_mode, | |||
| param.filter_meta.format); | |||
| auto matmul_param = | |||
| utils::get_matmul_kern_param(param, OH * OW, compt_oc_block_size); | |||
| WorkspaceBundle whole_bundle = {nullptr, {}}; | |||
| WorkspaceBundle thread_bundle = {nullptr, {}}; | |||
| WorkspaceBundle matmul_bundle = {nullptr, {}}; | |||
| auto pack_mode = m_matmul_algo->packmode(); | |||
| WorkspaceBundle whole_bundle = get_bundle_according_packmode(param); | |||
| //! NO_PACK does not implement get_bundle | |||
| WorkspaceBundle matmul_bundle ={nullptr,{}}; | |||
| if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) { | |||
| matmul_bundle = {nullptr, | |||
| {0, 0, m_matmul_algo->get_workspace(matmul_param)}}; | |||
| } else { | |||
| matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||
| } | |||
| WorkspaceBundle thread_bundle = utils::get_thread_bundle( | |||
| param, matmul_bundle.get_size(2), compt_oc_block_size); | |||
| if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 0) { | |||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher; | |||
| whole_bundle = dispatcher.get_bundle( | |||
| param, matmul_param, m_matmul_algo, compt_oc_block_size); | |||
| matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||
| midout_iv("get_kern_default"_hash)) { | |||
| if (!weight_preprocess) { | |||
| return Conv1x1Kerns< | |||
| MatrixMulImpl::AlgoBase::PackMode::DEFAULT>() | |||
| .get_kern(param, whole_bundle, matmul_bundle, | |||
| thread_bundle, conv1x1_strategy, | |||
| m_matmul_algo, compt_oc_block_size); | |||
| } else { | |||
| return Conv1x1Kerns< | |||
| MatrixMulImpl::AlgoBase::PackMode::DEFAULT>() | |||
| .get_kern_preprocess(param, whole_bundle, matmul_bundle, | |||
| conv1x1_strategy, m_matmul_algo, | |||
| compt_oc_block_size); | |||
| } | |||
| } | |||
| MIDOUT_END(); | |||
| } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 1) { | |||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> | |||
| dispatcher; | |||
| whole_bundle = dispatcher.get_bundle( | |||
| param, matmul_param, m_matmul_algo, compt_oc_block_size); | |||
| matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||
| midout_iv("get_kern_only_packa"_hash)) { | |||
| if (!weight_preprocess) { | |||
| return Conv1x1Kerns< | |||
| MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>() | |||
| .get_kern(param, whole_bundle, matmul_bundle, | |||
| thread_bundle, conv1x1_strategy, | |||
| m_matmul_algo, compt_oc_block_size); | |||
| } else { | |||
| return Conv1x1Kerns< | |||
| MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>() | |||
| .get_kern_preprocess(param, whole_bundle, matmul_bundle, | |||
| conv1x1_strategy, m_matmul_algo, | |||
| compt_oc_block_size); | |||
| } | |||
| } | |||
| MIDOUT_END(); | |||
| } else { | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 2) { | |||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> dispatcher; | |||
| whole_bundle = dispatcher.get_bundle( | |||
| param, matmul_param, m_matmul_algo, compt_oc_block_size); | |||
| matmul_bundle = { | |||
| nullptr, | |||
| {0, 0, m_matmul_algo->get_workspace(matmul_param)}}; | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||
| midout_iv("get_kern_no_pack"_hash)) { | |||
| if (!weight_preprocess) { | |||
| return Conv1x1Kerns< | |||
| MatrixMulImpl::AlgoBase::PackMode::NO_PACK>() | |||
| .get_kern(param, whole_bundle, matmul_bundle, | |||
| thread_bundle, conv1x1_strategy, | |||
| m_matmul_algo, compt_oc_block_size); | |||
| } else { | |||
| return Conv1x1Kerns< | |||
| MatrixMulImpl::AlgoBase::PackMode::NO_PACK>() | |||
| .get_kern_preprocess(param, whole_bundle, matmul_bundle, | |||
| conv1x1_strategy, m_matmul_algo, | |||
| compt_oc_block_size); | |||
| } | |||
| } | |||
| MIDOUT_END(); | |||
| } | |||
| } | |||
| //! get thread bundle | |||
| thread_bundle = utils::get_thread_bundle(param, matmul_bundle.get_size(2), | |||
| compt_oc_block_size); | |||
| Conv1x1StrategyBase* conv1x1_strategy = | |||
| Conv1x1Factory::make_conv1x1_strategy(param, pack_mode, | |||
| param.filter_meta.format); | |||
| SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns( | |||
| const NCBKernSizeParam& param) const { | |||
| return get_kerns_according_packmode(param, false); | |||
| } | |||
| auto kern_packA = [this, whole_bundle, matmul_bundle, param, | |||
| compt_oc_block_size, conv1x1_strategy]( | |||
| const NCBKernParam& ncb_param, | |||
| const NCBKernIndex& ncb_index) mutable { | |||
| conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||
| compt_oc_block_size, this->m_matmul_algo, param, | |||
| ncb_param, std::move(ncb_index)); | |||
| }; | |||
| auto kern_packB = [this, whole_bundle, matmul_bundle, param, | |||
| conv1x1_strategy]( | |||
| const NCBKernParam& ncb_param, | |||
| const NCBKernIndex& ncb_index) mutable { | |||
| conv1x1_strategy->packB(whole_bundle, matmul_bundle, | |||
| this->m_matmul_algo, param, ncb_param, | |||
| std::move(ncb_index)); | |||
| }; | |||
| auto kern_compt = [this, whole_bundle, matmul_bundle, thread_bundle, param, | |||
| compt_oc_block_size, conv1x1_strategy]( | |||
| const NCBKernParam& ncb_param, | |||
| const NCBKernIndex& ncb_index) mutable { | |||
| conv1x1_strategy->exec(whole_bundle, matmul_bundle, thread_bundle, | |||
| compt_oc_block_size, this->m_matmul_algo, param, | |||
| ncb_param, std::move(ncb_index)); | |||
| }; | |||
| SmallVector<TensorLayout> | |||
| ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout( | |||
| const NCBKernSizeParam& param) const { | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||
| midout_iv("deduce_preprocessed_filter_layout"_hash)) { | |||
| WorkspaceBundle wb = get_bundle_according_packmode(param); | |||
| if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT || | |||
| pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | |||
| //! if enable filter preprocess kern_packA should not dispatch | |||
| if (!is_enable_filter_preprocess(param)) { | |||
| ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||
| } | |||
| if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | |||
| ret_kern.push_back({kern_packB, {1}}); | |||
| } | |||
| size_t GROUP = param.filter_meta.group; | |||
| SmallVector<TensorLayout> preprocessed_layouts; | |||
| preprocessed_layouts.push_back( | |||
| {{GROUP, wb.get_size(0)}, dtype::Int8()}); | |||
| return preprocessed_layouts; | |||
| } | |||
| ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); | |||
| return ret_kern; | |||
| MIDOUT_END(); | |||
| return {}; | |||
| } | |||
| SmallVector<ConvBiasImpl::NCBKern> | |||
| ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns( | |||
| const NCBKernSizeParam& param) const { | |||
| return get_kerns_according_packmode(param, true); | |||
| } | |||
| bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param, | |||
| AlgoSelectionStrategy) const { | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 2) { | |||
| if (param.filter_meta.format != param::ConvBias::Format::NCHW && | |||
| param.filter_meta.format != param::ConvBias::Format::NCHW44 && | |||
| param.filter_meta.format != param::ConvBias::Format::NCHW44_DOT) | |||
| return false; | |||
| size_t FH = param.filter_meta.spatial[0], | |||
| FW = param.filter_meta.spatial[1]; | |||
| size_t PH = param.filter_meta.padding[0], | |||
| PW = param.filter_meta.padding[1]; | |||
| size_t SH = param.filter_meta.stride[0], | |||
| SW = param.filter_meta.stride[1]; | |||
| if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) | |||
| auto format = param.filter_meta.format; | |||
| size_t OH = param.osz[0]; | |||
| size_t OW = param.osz[1]; | |||
| #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||
| if (format != param::ConvBias::Format::NCHW && | |||
| format != param::ConvBias::Format::NCHW44 && | |||
| format != param::ConvBias::Format::NCHW44_DOT) { | |||
| return false; | |||
| if (param.src_type.enumv() != param.filter_type.enumv()) { | |||
| } | |||
| //! hybrid mode is not supported | |||
| if (param.filter_meta.format == param::ConvBias::Format::NCHW44 || | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT) { | |||
| if (param.filter_meta.icpg < 4_z || param.filter_meta.icpg == 1 || | |||
| param.filter_meta.ocpg == 1) { | |||
| return false; | |||
| } | |||
| } | |||
| #else | |||
| if (format != param::ConvBias::Format::NCHW) { | |||
| return false; | |||
| } | |||
| //! only matmul's packmode is packa or default support weight preprocess | |||
| if (is_enable_filter_preprocess(param) && | |||
| (m_matmul_algo->packmode() == | |||
| fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK)) { | |||
| #endif | |||
| //! param | |||
| if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) { | |||
| return false; | |||
| } | |||
| if (param.src_type.enumv() != DTypeEnum::Int8 && | |||
| param.src_type.enumv() != DTypeEnum::QuantizedS8 && | |||
| param.src_type.enumv() != DTypeEnum::Quantized8Asymm && | |||
| //! data type | |||
| if (param.src_type.enumv() != param.filter_type.enumv() || | |||
| (param.src_type.enumv() != DTypeEnum::Int8 && | |||
| param.src_type.enumv() != DTypeEnum::QuantizedS8 && | |||
| param.src_type.enumv() != DTypeEnum::Quantized8Asymm && | |||
| #if !MEGDNN_DISABLE_FLOAT16 | |||
| param.src_type.enumv() != DTypeEnum::Float16 && | |||
| param.src_type.enumv() != DTypeEnum::Float16 && | |||
| #endif | |||
| param.src_type.enumv() != DTypeEnum::Float32) { | |||
| param.src_type.enumv() != DTypeEnum::Float32)) { | |||
| return false; | |||
| } | |||
| //! make sure that for 8x8x16 and 8x8x32 the biasmode is nobias and the | |||
| //! nonlineMode is identity; otherwise return false, which means that | |||
| //! 8x8x32 and 8x8x16 do not support PostProcess | |||
| @@ -231,27 +258,13 @@ bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param, | |||
| return false; | |||
| } | |||
| } | |||
| if (param.filter_meta.format == param::ConvBias::Format::NCHW44 || | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT) { | |||
| if (param.filter_meta.icpg < 4_z || param.filter_meta.icpg == 1 || | |||
| param.filter_meta.ocpg == 1) { | |||
| return false; | |||
| } | |||
| } | |||
| size_t OH = param.osz[0]; | |||
| size_t OW = param.osz[1]; | |||
| MatrixMulImpl::KernSizeParam matmul_param = | |||
| utils::get_matmul_kern_param(param, OH * OW, | |||
| get_oc_tile_size_heuristic(param)); | |||
| bool matmul_usable = m_matmul_algo->usable(matmul_param); | |||
| auto pack_mode = m_matmul_algo->packmode(); | |||
| bool strategy_usable = Conv1x1Factory::can_make_conv1x1_strategy( | |||
| param, pack_mode, param.filter_meta.format); | |||
| return matmul_usable && strategy_usable && | |||
| (param.filter_meta.dilation[0] == | |||
| param.filter_meta.dilation[1] && | |||
| @@ -262,121 +275,6 @@ bool ConvBiasImpl::AlgoConv1x1::usable(const NCBKernSizeParam& param, | |||
| return false; | |||
| } | |||
| SmallVector<TensorLayout> | |||
| ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout( | |||
| const NCBKernSizeParam& param) const { | |||
| MIDOUT_BEGIN( | |||
| megdnn_fallback_conv1x1, | |||
| midout_iv( | |||
| "ConvBiasImpl::AlgoConv1x1::deduce_preprocessed_filter_layout"_hash)) { | |||
| fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = | |||
| m_matmul_algo->matmul_description(); | |||
| bool default_pack = matmul_desc.packmode == | |||
| MatrixMulImpl::AlgoBase::PackMode::DEFAULT; | |||
| bool only_packA = matmul_desc.packmode == | |||
| MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA; | |||
| //! only support default_pack and only_packa mode | |||
| if (matmul_desc.packmode == | |||
| MatrixMulImpl::AlgoBase::PackMode::NO_PACK) { | |||
| return {}; | |||
| } | |||
| size_t OH = param.osz[0]; | |||
| size_t OW = param.osz[1]; | |||
| size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); | |||
| auto matmul_param = utils::get_matmul_kern_param(param, OH * OW, | |||
| compt_oc_block_size); | |||
| WorkspaceBundle wb(nullptr, {}); | |||
| if (default_pack) { | |||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher; | |||
| wb = dispatcher.get_bundle(param, matmul_param, m_matmul_algo, | |||
| compt_oc_block_size); | |||
| } else if (only_packA) { | |||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> | |||
| dispatcher; | |||
| wb = dispatcher.get_bundle(param, matmul_param, m_matmul_algo, | |||
| compt_oc_block_size); | |||
| } | |||
| size_t GROUP = param.filter_meta.group; | |||
| SmallVector<TensorLayout> preprocessed_layouts; | |||
| preprocessed_layouts.push_back( | |||
| {{GROUP, wb.get_size(0)}, dtype::Int8()}); | |||
| return preprocessed_layouts; | |||
| } | |||
| MIDOUT_END(); | |||
| return {}; | |||
| } | |||
| SmallVector<ConvBiasImpl::NCBKern> | |||
| ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns( | |||
| const NCBKernSizeParam& param) const { | |||
| MIDOUT_BEGIN( | |||
| megdnn_fallback_conv1x1, | |||
| midout_iv( | |||
| "ConvBiasImpl::AlgoConv1x1::dispatch_preprocess_kerns"_hash)) { | |||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||
| size_t OH = param.osz[0]; | |||
| size_t OW = param.osz[1]; | |||
| size_t OC = param.filter_meta.ocpg; | |||
| size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); | |||
| size_t GROUP = param.filter_meta.group; | |||
| size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size); | |||
| auto matmul_param = utils::get_matmul_kern_param(param, OH * OW, | |||
| compt_oc_block_size); | |||
| WorkspaceBundle whole_bundle = {nullptr, {}}; | |||
| WorkspaceBundle matmul_bundle = {nullptr, {}}; | |||
| auto pack_mode = m_matmul_algo->packmode(); | |||
| if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1, | |||
| midout_iv("get_defaul_matmul_packmode_bundle"_hash)) { | |||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> | |||
| dispatcher; | |||
| whole_bundle = dispatcher.get_bundle(param, matmul_param, | |||
| m_matmul_algo, | |||
| compt_oc_block_size); | |||
| matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||
| } | |||
| MIDOUT_END(); | |||
| } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | |||
| MIDOUT_BEGIN( | |||
| megdnn_fallback_conv1x1, | |||
| midout_iv("get_onlypacka_matmul_packmode_bundle"_hash)) { | |||
| Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> | |||
| dispatcher; | |||
| whole_bundle = dispatcher.get_bundle(param, matmul_param, | |||
| m_matmul_algo, | |||
| compt_oc_block_size); | |||
| matmul_bundle = m_matmul_algo->get_bundle(matmul_param); | |||
| } | |||
| MIDOUT_END(); | |||
| } else { | |||
| //! if nopack return null so that OprWeightPreprocessProxy can run | |||
| //! with nopack mode | |||
| return {}; | |||
| } | |||
| Conv1x1StrategyBase* conv1x1_strategy = | |||
| Conv1x1Factory::make_conv1x1_strategy(param, pack_mode, | |||
| param.filter_meta.format); | |||
| auto kern_packA = [this, whole_bundle, matmul_bundle, param, | |||
| compt_oc_block_size, conv1x1_strategy]( | |||
| const NCBKernParam& ncb_param, | |||
| const NCBKernIndex& ncb_index) mutable { | |||
| conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||
| compt_oc_block_size, this->m_matmul_algo, | |||
| param, ncb_param, std::move(ncb_index)); | |||
| }; | |||
| ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||
| return ret_kern; | |||
| } | |||
| MIDOUT_END(); | |||
| return {}; | |||
| } | |||
| bool ConvBiasImpl::AlgoConv1x1::is_preferred( | |||
| const NCBKernSizeParam& param) const { | |||
| @@ -20,6 +20,11 @@ namespace megdnn { | |||
| namespace fallback { | |||
| class ConvBiasImpl::AlgoConv1x1 final : public AlgoBase { | |||
| WorkspaceBundle get_bundle_according_packmode( | |||
| const NCBKernSizeParam& param) const; | |||
| SmallVector<NCBKern> get_kerns_according_packmode( | |||
| const NCBKernSizeParam& param, bool weight_preprocess) const; | |||
| public: | |||
| AlgoConv1x1(MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) | |||
| : m_matmul_algo(matmul_algo), m_oc_block_size(oc_block_size) {} | |||
| @@ -41,7 +46,7 @@ public: | |||
| const NCBKernSizeParam& param) const override; | |||
| bool is_preferred(const NCBKernSizeParam&) const override; | |||
| SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
| const NCBKernSizeParam& param) const override; | |||
| size_t get_preprocess_workspace( | |||
| @@ -360,23 +360,23 @@ ConvBiasImpl::AlgoConv1x1Gemv::dispatch_kerns( | |||
| dt_uint8, PostprocessMode::QUANTIZED, | |||
| "NCHW::GEMV::QUINT8x8x32_QUINT8"_hash); | |||
| break; | |||
| //! nchw44 does not support 8x8x16 | |||
| case param::ConvBias::Format::NCHW44: | |||
| cb1(param::ConvBias::Format::NCHW44, dt_float32, dt_float32, | |||
| PostprocessMode::FLOAT, "NCHW44::GEMV::FLOAT"_hash); | |||
| cb2(param::ConvBias::Format::NCHW44, dt_int8, dt_int32, dt_int32, | |||
| dt_int8, dt_int32, dt_int32, PostprocessMode::NO_PROCESS, | |||
| cb3(param::ConvBias::Format::NCHW44, dt_int8, dt_int32, dt_int32, | |||
| dt_int8, dt_int32, dt_int32, PostprocessMode::ADD_BIAS, | |||
| "NCHW44::GEMV::INT8x8x32_INT32"_hash); | |||
| cb2(param::ConvBias::Format::NCHW44, dtype::QuantizedS8, | |||
| cb3(param::ConvBias::Format::NCHW44, dtype::QuantizedS8, | |||
| dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32, | |||
| dt_int32, PostprocessMode::NO_PROCESS, | |||
| dt_int32, PostprocessMode::ADD_BIAS, | |||
| "NCHW44::GEMV::QINT8x8x32_QINT32"_hash); | |||
| cb2(param::ConvBias::Format::NCHW44, dtype::QuantizedS8, | |||
| dtype::QuantizedS32, dtype::QuantizedS8, dt_int8, dt_int32, | |||
| dt_int8, PostprocessMode::QUANTIZED, | |||
| "NCHW44::GEMV::QINT8x8x32_QINT8"_hash); | |||
| break; | |||
| //! nchw44-dot does not support 8x8x16 | |||
| case param::ConvBias::Format::NCHW44_DOT: | |||
| cb3(param::ConvBias::Format::NCHW44_DOT, dt_int8, dt_int32, | |||
| dt_int32, dt_int8, dt_int32, dt_int32, | |||
| @@ -420,81 +420,74 @@ bool ConvBiasImpl::AlgoConv1x1Gemv::usable(const NCBKernSizeParam& param, | |||
| MIDOUT_BEGIN(megdnn_fallback_conv1x1_gemv, | |||
| midout_iv("AlgoConv1x1Gemv::usable"_hash)) { | |||
| auto format = param.filter_meta.format; | |||
| #if MEGDNN_X86 | |||
| if (format != param::ConvBias::Format::NCHW) | |||
| return false; | |||
| #elif MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||
| if (format != param::ConvBias::Format::NCHW && | |||
| format != param::ConvBias::Format::NCHW44 && | |||
| format != param::ConvBias::Format::NCHW44_DOT) | |||
| return false; | |||
| #endif | |||
| //! whether 1x1 | |||
| size_t FH = param.filter_meta.spatial[0], | |||
| FW = param.filter_meta.spatial[1]; | |||
| size_t PH = param.filter_meta.padding[0], | |||
| PW = param.filter_meta.padding[1]; | |||
| size_t SH = param.filter_meta.stride[0], | |||
| SW = param.filter_meta.stride[1]; | |||
| if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) { | |||
| return false; | |||
| } | |||
| //! whether gemv | |||
| size_t OH = param.osz[0]; | |||
| size_t OW = param.osz[1]; | |||
| if (OH * OW != 1) { | |||
| //! whether gemv and 1x1 | |||
| if (OH * OW != 1 || FH != 1 || FW != 1 || PH || PW || SH != 1 || | |||
| SW != 1) { | |||
| return false; | |||
| } | |||
| //! even no naive support in gemv | |||
| if ((param.src_type.enumv() == param.filter_type.enumv() && | |||
| param.src_type.enumv() == DTypeEnum::Int16) && | |||
| param.dst_type.enumv() == DTypeEnum::Int32) { | |||
| #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||
| if (format != param::ConvBias::Format::NCHW && | |||
| format != param::ConvBias::Format::NCHW44 && | |||
| format != param::ConvBias::Format::NCHW44_DOT) { | |||
| return false; | |||
| } | |||
| //! make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode | |||
| //! is identity otherwise return false mean that 8x8x32 and 8x8x16 | |||
| //! not support PostProcess | |||
| if (param.dst_type.enumv() == DTypeEnum::Int16 || | |||
| param.dst_type.enumv() == DTypeEnum::Int32 || | |||
| param.dst_type.enumv() == DTypeEnum::QuantizedS32) { | |||
| if (param.nonlineMode != megdnn::NonlineMode::IDENTITY) { | |||
| return false; | |||
| } | |||
| } | |||
| //! supports a few dtypes | |||
| if (param.src_type.enumv() != param.filter_type.enumv()) { | |||
| #else | |||
| if (format != param::ConvBias::Format::NCHW) { | |||
| return false; | |||
| } | |||
| if (param.src_type.enumv() != DTypeEnum::Int8 && | |||
| param.src_type.enumv() != DTypeEnum::QuantizedS8 && | |||
| param.src_type.enumv() != DTypeEnum::Quantized8Asymm && | |||
| #endif | |||
| //! supports a few dtypes | |||
| if (param.src_type.enumv() != param.filter_type.enumv() || | |||
| (param.src_type.enumv() != DTypeEnum::Int8 && | |||
| param.src_type.enumv() != DTypeEnum::QuantizedS8 && | |||
| param.src_type.enumv() != DTypeEnum::Quantized8Asymm && | |||
| #if !MEGDNN_DISABLE_FLOAT16 | |||
| param.src_type.enumv() != DTypeEnum::Float16 && | |||
| param.src_type.enumv() != DTypeEnum::Float16 && | |||
| #endif | |||
| param.src_type.enumv() != DTypeEnum::Float32) { | |||
| param.src_type.enumv() != DTypeEnum::Float32)) { | |||
| return false; | |||
| } | |||
| #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||
| if (format == param::ConvBias::Format::NCHW44) { | |||
| if (param.src_type.enumv() != DTypeEnum::Float32 && | |||
| param.src_type.enumv() != DTypeEnum::Int8 && | |||
| param.src_type.enumv() != DTypeEnum::QuantizedS8) { | |||
| return false; | |||
| } | |||
| //! 8x8x16 is not supported for nchw44 | |||
| if (param.src_type.enumv() == DTypeEnum::Int8 && | |||
| param.dst_type.enumv() == DTypeEnum::Int16) { | |||
| return false; | |||
| } | |||
| } else if (format == param::ConvBias::Format::NCHW44_DOT) { | |||
| if (param.src_type.enumv() != DTypeEnum::Int8 && | |||
| param.src_type.enumv() != DTypeEnum::QuantizedS8) { | |||
| if ((param.src_type.enumv() != DTypeEnum::Int8 && | |||
| param.src_type.enumv() != DTypeEnum::QuantizedS8) || | |||
| param.dst_type.enumv() == DTypeEnum::Int16) { | |||
| return false; | |||
| } | |||
| } | |||
| #endif | |||
| //! make sure that for 8x8x16 and 8x8x32 the nonlineMode is identity, | |||
| //! otherwise return false | |||
| if (param.dst_type.enumv() == DTypeEnum::Int16 || | |||
| param.dst_type.enumv() == DTypeEnum::Int32 || | |||
| param.dst_type.enumv() == DTypeEnum::QuantizedS32) { | |||
| if (param.nonlineMode != megdnn::NonlineMode::IDENTITY) { | |||
| return false; | |||
| } | |||
| } | |||
| //! not even naive support in gemv | |||
| if ((param.src_type.enumv() == param.filter_type.enumv() && | |||
| param.src_type.enumv() == DTypeEnum::Int16) && | |||
| param.dst_type.enumv() == DTypeEnum::Int32) { | |||
| return false; | |||
| } | |||
| return (param.filter_meta.dilation[0] == | |||
| param.filter_meta.dilation[1] && | |||
| param.filter_meta.dilation[0] == 1) && | |||
| @@ -11,14 +11,19 @@ | |||
| #pragma once | |||
| #include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h" | |||
| #include "src/fallback/conv_bias/conv1x1/conv1x1_utils.h" | |||
| #include "src/fallback/conv_bias/opr_impl.h" | |||
| namespace megdnn { | |||
| namespace fallback { | |||
| namespace conv1x1 { | |||
| template <MatrixMulImpl::AlgoBase::PackMode pack_mode> | |||
| class Conv1x1Kerns { | |||
| class Conv1x1Kerns; | |||
| template <> | |||
| class Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> { | |||
| public: | |||
| //! get_bundle | |||
| WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param, | |||
| @@ -28,13 +33,12 @@ public: | |||
| size_t GROUP = param.filter_meta.group; | |||
| size_t OC = param.filter_meta.ocpg; | |||
| size_t BATCH = param.n; | |||
| //! bundle per thread | |||
| //! matmul_param records a matmul with M = oc_tile_size, K = IC, N = OH | |||
| //! * OW; this does not concern the packb bytes | |||
| auto matmul_bundle = matmul_algo->get_bundle(matmul_param); | |||
| auto thread_bundle = utils::get_thread_bundle(param, matmul_bundle.get_size(2), | |||
| oc_tile_size); | |||
| auto thread_bundle = utils::get_thread_bundle( | |||
| param, matmul_bundle.get_size(2), oc_tile_size); | |||
| //! size per thread | |||
| size_t all_threads_bytes = | |||
| thread_bundle.total_size_in_bytes() * param.nr_threads; | |||
| @@ -46,11 +50,6 @@ public: | |||
| is_enable_filter_preprocess(param) | |||
| ? 0 | |||
| : packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP; | |||
| if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) | |||
| return WorkspaceBundle{nullptr, | |||
| {all_packa_bytes, 0, all_threads_bytes}}; | |||
| //! packb size = N * GROUP * packb_size_per_group | |||
| size_t packb_bytes_per_group = matmul_bundle.get_size(1); | |||
| size_t all_packb_bytes = packb_bytes_per_group * GROUP * BATCH; | |||
| @@ -58,6 +57,165 @@ public: | |||
| return WorkspaceBundle{ | |||
| nullptr, {all_packa_bytes, all_packb_bytes, all_threads_bytes}}; | |||
| } | |||
| SmallVector<ConvBiasImpl::NCBKern> get_kern( | |||
| const ConvBiasImpl::NCBKernSizeParam& param, | |||
| WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, | |||
| WorkspaceBundle& thread_bundle, | |||
| Conv1x1StrategyBase* conv1x1_strategy, | |||
| const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { | |||
| auto kern_packA = | |||
| [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size, | |||
| conv1x1_strategy]( | |||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||
| conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||
| oc_block_size, matmul_algo, param, | |||
| ncb_param, std::move(ncb_index)); | |||
| }; | |||
| auto kern_packB = | |||
| [whole_bundle, matmul_bundle, param, matmul_algo, | |||
| conv1x1_strategy]( | |||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||
| conv1x1_strategy->packB(whole_bundle, matmul_bundle, | |||
| matmul_algo, param, ncb_param, | |||
| std::move(ncb_index)); | |||
| }; | |||
| auto kern_compt = | |||
| [whole_bundle, matmul_bundle, thread_bundle, matmul_algo, param, | |||
| oc_block_size, conv1x1_strategy]( | |||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||
| conv1x1_strategy->exec(whole_bundle, matmul_bundle, | |||
| thread_bundle, oc_block_size, | |||
| matmul_algo, param, ncb_param, | |||
| std::move(ncb_index)); | |||
| }; | |||
| size_t GROUP = param.filter_meta.group; | |||
| size_t BATCH = param.n; | |||
| size_t OC = param.filter_meta.ocpg; | |||
| size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); | |||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||
| if (!is_enable_filter_preprocess(param)) { | |||
| ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||
| } | |||
| ret_kern.push_back({kern_packB, {BATCH}}); | |||
| ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); | |||
| return ret_kern; | |||
| } | |||
| SmallVector<ConvBiasImpl::NCBKern> get_kern_preprocess( | |||
| const ConvBiasImpl::NCBKernSizeParam& param, | |||
| WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, | |||
| Conv1x1StrategyBase* conv1x1_strategy, | |||
| const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { | |||
| auto kern_packA = | |||
| [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size, | |||
| conv1x1_strategy]( | |||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||
| conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||
| oc_block_size, matmul_algo, param, | |||
| ncb_param, std::move(ncb_index)); | |||
| }; | |||
| size_t GROUP = param.filter_meta.group; | |||
| size_t OC = param.filter_meta.ocpg; | |||
| size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); | |||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||
| ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||
| return ret_kern; | |||
| } | |||
| }; | |||
| template<> | |||
| class Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> { | |||
| public: | |||
| //! get_bundle | |||
| WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param, | |||
| const MatrixMulImpl::KernSizeParam& matmul_param, | |||
| const MatrixMulImpl::AlgoBase* matmul_algo, | |||
| size_t oc_tile_size) { | |||
| size_t GROUP = param.filter_meta.group; | |||
| size_t OC = param.filter_meta.ocpg; | |||
| //! bundle per thread | |||
| //! matmul_param records a matmul with M = oc_tile_size, K = IC, N = OH | |||
| //! * OW this does not bother packb bytes | |||
| auto matmul_bundle = matmul_algo->get_bundle(matmul_param); | |||
| auto thread_bundle = utils::get_thread_bundle( | |||
| param, matmul_bundle.get_size(2), oc_tile_size); | |||
| //! size per thread | |||
| size_t all_threads_bytes = | |||
| thread_bundle.total_size_in_bytes() * param.nr_threads; | |||
| //! packa size = GROUP * packa_size_each_group | |||
| size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0); | |||
| size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size); | |||
| size_t all_packa_bytes = | |||
| is_enable_filter_preprocess(param) | |||
| ? 0 | |||
| : packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP; | |||
| return WorkspaceBundle{nullptr, | |||
| {all_packa_bytes, 0, all_threads_bytes}}; | |||
| } | |||
| SmallVector<ConvBiasImpl::NCBKern> get_kern( | |||
| const ConvBiasImpl::NCBKernSizeParam& param, | |||
| WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, | |||
| WorkspaceBundle& thread_bundle, | |||
| Conv1x1StrategyBase* conv1x1_strategy, | |||
| const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { | |||
| auto kern_packA = | |||
| [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size, | |||
| conv1x1_strategy]( | |||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||
| conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||
| oc_block_size, matmul_algo, param, | |||
| ncb_param, std::move(ncb_index)); | |||
| }; | |||
| auto kern_compt = | |||
| [whole_bundle, matmul_bundle, thread_bundle, matmul_algo, param, | |||
| oc_block_size, conv1x1_strategy]( | |||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||
| conv1x1_strategy->exec(whole_bundle, matmul_bundle, | |||
| thread_bundle, oc_block_size, | |||
| matmul_algo, param, ncb_param, | |||
| std::move(ncb_index)); | |||
| }; | |||
| size_t GROUP = param.filter_meta.group; | |||
| size_t BATCH = param.n; | |||
| size_t OC = param.filter_meta.ocpg; | |||
| size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); | |||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||
| if (!is_enable_filter_preprocess(param)) { | |||
| ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||
| } | |||
| ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); | |||
| return ret_kern; | |||
| } | |||
| SmallVector<ConvBiasImpl::NCBKern> get_kern_preprocess( | |||
| const ConvBiasImpl::NCBKernSizeParam& param, | |||
| WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, | |||
| Conv1x1StrategyBase* conv1x1_strategy, | |||
| const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { | |||
| auto kern_packA = | |||
| [whole_bundle, matmul_bundle, param, matmul_algo, oc_block_size, | |||
| conv1x1_strategy]( | |||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||
| conv1x1_strategy->packA(whole_bundle, matmul_bundle, | |||
| oc_block_size, matmul_algo, param, | |||
| ncb_param, std::move(ncb_index)); | |||
| }; | |||
| size_t GROUP = param.filter_meta.group; | |||
| size_t OC = param.filter_meta.ocpg; | |||
| size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); | |||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||
| ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}}); | |||
| return ret_kern; | |||
| } | |||
| }; | |||
| template<> | |||
| @@ -69,14 +227,47 @@ public: | |||
| const MatrixMulImpl::AlgoBase* matmul_algo, | |||
| size_t oc_tile_size) { | |||
| size_t matmul_size = matmul_algo->get_workspace(matmul_param); | |||
| auto thread_bundle = utils::get_thread_bundle(param, matmul_size, oc_tile_size); | |||
| auto thread_bundle = | |||
| utils::get_thread_bundle(param, matmul_size, oc_tile_size); | |||
| //! size per thread | |||
| size_t all_threads_bytes = | |||
| thread_bundle.total_size_in_bytes() * param.nr_threads; | |||
| return WorkspaceBundle{nullptr, {0, 0, all_threads_bytes}}; | |||
| } | |||
| SmallVector<ConvBiasImpl::NCBKern> get_kern( | |||
| const ConvBiasImpl::NCBKernSizeParam& param, | |||
| WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle, | |||
| WorkspaceBundle& thread_bundle, | |||
| Conv1x1StrategyBase* conv1x1_strategy, | |||
| const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size) { | |||
| auto kern_compt = | |||
| [whole_bundle, matmul_bundle, thread_bundle, matmul_algo, param, | |||
| oc_block_size, conv1x1_strategy]( | |||
| const ConvBiasImpl::NCBKernParam& ncb_param, | |||
| const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { | |||
| conv1x1_strategy->exec(whole_bundle, matmul_bundle, | |||
| thread_bundle, oc_block_size, | |||
| matmul_algo, param, ncb_param, | |||
| std::move(ncb_index)); | |||
| }; | |||
| size_t GROUP = param.filter_meta.group; | |||
| size_t BATCH = param.n; | |||
| size_t OC = param.filter_meta.ocpg; | |||
| size_t oc_blocks_per_group = div_ceil(OC, oc_block_size); | |||
| SmallVector<ConvBiasImpl::NCBKern> ret_kern; | |||
| ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); | |||
| return ret_kern; | |||
| } | |||
| SmallVector<ConvBiasImpl::NCBKern> get_kern_preprocess( | |||
| const ConvBiasImpl::NCBKernSizeParam&, WorkspaceBundle&, | |||
| WorkspaceBundle&, Conv1x1StrategyBase*, | |||
| const MatrixMulImpl::AlgoBase*, size_t) { | |||
| return {}; | |||
| } | |||
| }; | |||
| } // namespace conv1x1 | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -59,7 +59,8 @@ public: | |||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode, MatrixMulImpl::AlgoBase::PackMode pack_mode> | |||
| megdnn::PostprocessMode postprocess_mode, | |||
| MatrixMulImpl::AlgoBase::PackMode pack_mode> | |||
| class Conv1x1Strategy : public Conv1x1StrategyBase { | |||
| public: | |||
| explicit Conv1x1Strategy(size_t pack_size = 1) : m_pack_size(pack_size) {} | |||
| @@ -136,32 +137,30 @@ public: | |||
| size_t packb_bytes_per_group = matmul_bundle.get_size(1); | |||
| size_t GROUP = param.filter_meta.group; | |||
| size_t BATCH = param.n; | |||
| size_t SH = param.filter_meta.stride[0]; | |||
| size_t SW = param.filter_meta.stride[1]; | |||
| size_t OH = param.osz[0]; | |||
| size_t OW = param.osz[1]; | |||
| size_t OC = param.filter_meta.ocpg; | |||
| size_t batch = ncb_index.ndrange_id[0]; | |||
| MatrixMulImpl::KernParam matmul_kern_param; | |||
| static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) = | |||
| utils::get_matmul_kern_param(param, OH * OW, OC); | |||
| rep(batch, BATCH) { | |||
| rep(g, GROUP) { | |||
| if (SH == 2 && SW == 2) | |||
| megdnn_throw("no support for stride = 2"); | |||
| size_t bytes_offset_of_b_panel = | |||
| batch * packb_bytes_per_group * GROUP + | |||
| g * packb_bytes_per_group; | |||
| src_ctype* b_panel = reinterpret_cast<src_ctype*>( | |||
| reinterpret_cast<int8_t*>(whole_bundle.get(1)) + | |||
| bytes_offset_of_b_panel); | |||
| matmul_kern_param.B_ptr = const_cast<src_ctype*>( | |||
| ncb_param.src<src_ctype>(batch, g)); | |||
| matmul_algo->pack_B(matmul_kern_param, b_panel, 0, OH * OW); | |||
| } | |||
| rep(g, GROUP) { | |||
| if (SH == 2 && SW == 2) | |||
| megdnn_throw("no support for stride = 2"); | |||
| size_t bytes_offset_of_b_panel = | |||
| batch * packb_bytes_per_group * GROUP + | |||
| g * packb_bytes_per_group; | |||
| src_ctype* b_panel = reinterpret_cast<src_ctype*>( | |||
| reinterpret_cast<int8_t*>(whole_bundle.get(1)) + | |||
| bytes_offset_of_b_panel); | |||
| matmul_kern_param.B_ptr = const_cast<src_ctype*>( | |||
| ncb_param.src<src_ctype>(batch, g)); | |||
| matmul_algo->pack_B(matmul_kern_param, b_panel, 0, OH * OW); | |||
| } | |||
| } else { | |||
| megdnn_log_error("OnlyPackA mode and NoPack mode has no packB kernel"); | |||