GitOrigin-RevId: 8ef62baf79
tags/v1.6.0
| @@ -166,6 +166,13 @@ void aarch64::RelayoutForwardImpl::exec(_megdnn_tensor_in src0, | |||||
| TensorND src = src0, dst = dst0; | TensorND src = src0, dst = dst0; | ||||
| check_layout_and_canonize(src.layout, dst.layout); | check_layout_and_canonize(src.layout, dst.layout); | ||||
| // FIXME: optimize for lowbit cases | |||||
| if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 || | |||||
| src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { | |||||
| fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle); | |||||
| return; | |||||
| } | |||||
| relayout::TransposeParam trans_param; | relayout::TransposeParam trans_param; | ||||
| bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param); | bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param); | ||||
| if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) { | if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) { | ||||
| @@ -134,6 +134,13 @@ void armv7::RelayoutForwardImpl::exec(_megdnn_tensor_in src0, | |||||
| TensorND src = src0, dst = dst0; | TensorND src = src0, dst = dst0; | ||||
| check_layout_and_canonize(src.layout, dst.layout); | check_layout_and_canonize(src.layout, dst.layout); | ||||
| // FIXME: optimize for lowbit cases | |||||
| if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 || | |||||
| src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { | |||||
| fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle); | |||||
| return; | |||||
| } | |||||
| relayout::TransposeParam trans_param; | relayout::TransposeParam trans_param; | ||||
| bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param); | bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param); | ||||
| if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) { | if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) { | ||||
| @@ -0,0 +1,313 @@ | |||||
| /** | |||||
| * \file src/gopt/impl/opr_format_modifier.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "./opr_format_modifier.h" | |||||
| #include "megbrain/opr/dnn/convolution.h" | |||||
| #include "megbrain/opr/dnn/pooling.h" | |||||
| #include "megbrain/opr/imgproc.h" | |||||
| #include "megbrain/opr/io.h" | |||||
| #include "megbrain/serialization/sereg.h" | |||||
| #include "midout.h" | |||||
| MIDOUT_DECL(megbrain_opr_format_modifier) | |||||
| #define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_format_modifier, __VA_ARGS__) { | |||||
| #define MIDOUT_E \ | |||||
| } \ | |||||
| MIDOUT_END(); | |||||
| using namespace mgb; | |||||
| using namespace opr; | |||||
| namespace { | |||||
| template <class MegDNNConv = megdnn::Convolution> | |||||
| struct MakeConvCaller2 { | |||||
| template <typename Opr> | |||||
| static VarNode* make(const cg::VarNodeArray& inputs, | |||||
| const typename MegDNNConv::Param& param, | |||||
| const megdnn::param::ExecutionPolicy& execution_policy, | |||||
| const OperatorNodeConfig& config) { | |||||
| if (inputs.size() == 2) { | |||||
| return Opr::make(inputs[0], inputs[1], param, execution_policy, | |||||
| config) | |||||
| .node(); | |||||
| } | |||||
| return nullptr; | |||||
| } | |||||
| }; | |||||
| template <class MegDNNConv = megdnn::Convolution> | |||||
| struct MakeConvCaller3 { | |||||
| template <typename Opr> | |||||
| static VarNode* make(const cg::VarNodeArray& inputs, | |||||
| const typename MegDNNConv::Param& param, | |||||
| const megdnn::param::ExecutionPolicy& execution_policy, | |||||
| const OperatorNodeConfig& config) { | |||||
| if (inputs.size() == 3) { | |||||
| return Opr::make(inputs[0], inputs[1], inputs[2], param, | |||||
| execution_policy, config) | |||||
| .node(); | |||||
| } | |||||
| return nullptr; | |||||
| } | |||||
| }; | |||||
| template <class MegDNNConv = megdnn::Convolution> | |||||
| struct MakeConvCaller4 { | |||||
| template <typename Opr> | |||||
| static VarNode* make(const cg::VarNodeArray& inputs, | |||||
| const typename MegDNNConv::Param& param, | |||||
| const megdnn::param::ExecutionPolicy& execution_policy, | |||||
| const OperatorNodeConfig& config) { | |||||
| if (inputs.size() == 4) { | |||||
| return Opr::make(inputs[0], inputs[1], inputs[2], inputs[3], param, | |||||
| execution_policy, config) | |||||
| .node(); | |||||
| } | |||||
| return nullptr; | |||||
| } | |||||
| }; | |||||
| template <class MegDNNConv = megdnn::Convolution> | |||||
| struct MakeConvCaller5 { | |||||
| template <typename Opr> | |||||
| static VarNode* make(const cg::VarNodeArray& inputs, | |||||
| const typename MegDNNConv::Param& param, | |||||
| const megdnn::param::ExecutionPolicy& execution_policy, | |||||
| const OperatorNodeConfig& config) { | |||||
| if (inputs.size() == 5) { | |||||
| return Opr::make(inputs[0], inputs[1], inputs[2], inputs[3], | |||||
| inputs[4], param, execution_policy, config) | |||||
| .node(); | |||||
| } | |||||
| return nullptr; | |||||
| } | |||||
| }; | |||||
| template <class MegDNNConv = megdnn::Convolution> | |||||
| struct MakeConvCallerEmpty { | |||||
| template <typename Opr> | |||||
| static VarNode* make(const cg::VarNodeArray&, | |||||
| const typename MegDNNConv::Param&, | |||||
| const megdnn::param::ExecutionPolicy&, | |||||
| const OperatorNodeConfig&) { | |||||
| return nullptr; | |||||
| } | |||||
| }; | |||||
| template <class Opr, class Maker0, class MegDNNConv, | |||||
| class Maker1 = MakeConvCallerEmpty<MegDNNConv>, | |||||
| class Maker2 = MakeConvCallerEmpty<MegDNNConv>, | |||||
| typename ConvParam = megdnn::param::Convolution> | |||||
| struct ConvMakerImpl { | |||||
| static VarNode* make(const cg::VarNodeArray& inputs, const ConvParam& param, | |||||
| const megdnn::param::ExecutionPolicy& execution_policy, | |||||
| const OperatorNodeConfig& config) { | |||||
| VarNode* ret = Maker0::template make<Opr>(inputs, param, | |||||
| execution_policy, config); | |||||
| if (!ret) { | |||||
| ret = Maker1::template make<Opr>(inputs, param, execution_policy, | |||||
| config); | |||||
| } | |||||
| if (!ret) { | |||||
| ret = Maker2::template make<Opr>(inputs, param, execution_policy, | |||||
| config); | |||||
| } | |||||
| mgb_assert(ret); | |||||
| return ret; | |||||
| } | |||||
| }; | |||||
| template <typename Opr> | |||||
| struct ConvMaker; | |||||
| template <> | |||||
| struct ConvMaker<opr::Convolution> | |||||
| : public ConvMakerImpl<opr::Convolution, | |||||
| MakeConvCaller2<megdnn::Convolution>, | |||||
| megdnn::Convolution> {}; | |||||
| template <> | |||||
| struct ConvMaker<opr::ConvolutionBackwardData> | |||||
| : public ConvMakerImpl<opr::ConvolutionBackwardData, | |||||
| MakeConvCaller2<megdnn::Convolution>, | |||||
| megdnn::Convolution, | |||||
| MakeConvCaller3<megdnn::Convolution>> {}; | |||||
| template <> | |||||
| struct ConvMaker<opr::ConvBiasForward> | |||||
| : public ConvMakerImpl<opr::ConvBiasForward, | |||||
| MakeConvCaller2<megdnn::ConvBiasForward>, | |||||
| megdnn::ConvBiasForward, | |||||
| MakeConvCaller3<megdnn::ConvBiasForward>, | |||||
| MakeConvCaller4<megdnn::ConvBiasForward>, | |||||
| megdnn::param::ConvBias> {}; | |||||
| template <> | |||||
| struct ConvMaker<opr::BatchConvBiasForward> | |||||
| : public ConvMakerImpl<opr::BatchConvBiasForward, | |||||
| MakeConvCaller2<megdnn::BatchConvBiasForward>, | |||||
| megdnn::BatchConvBiasForward, | |||||
| MakeConvCaller3<megdnn::BatchConvBiasForward>, | |||||
| MakeConvCaller4<megdnn::BatchConvBiasForward>, | |||||
| megdnn::param::BatchConvBias> {}; | |||||
| #if 0 | |||||
// NOTE: everything up to the matching #endif is compiled out.
//
// MultiAlgoOprTrait<_Opr>::has_available_algo(i, opr_) builds a fixed-size
// array of TensorLayout (one slot per megdnn operand, `arity` in total) from
// the shape/dtype/format of the vars in `i`, then forwards the megdnn operator
// and the unpacked layouts to ::megdnn::has_available_algo.
//
// The last element of `i` is always written to slot `arity - 1`; the preceding
// elements fill slots 0..i.size()-2. `i` must be non-empty, otherwise
// `i.size() - 1` wraps around.
| #include "../../opr/impl/internal/invoke.h" | |||||
| template <typename Opr> | |||||
| struct MultiAlgoOprTrait; | |||||
// APPLY(statement, tuples...) concatenates the tuple-like arguments and
// evaluates `statement` with their elements bound to the pack `args`.
| #define APPLY(statement, ...) \ | |||||
| mgb::apply([&](const auto&... args) { return statement; }, \ | |||||
| std::tuple_cat(__VA_ARGS__)) | |||||
// INST(_Opr) specializes MultiAlgoOprTrait for the graph operator _Opr and
// its megdnn counterpart megdnn::_Opr.
| #define INST(_Opr) \ | |||||
| template <> \ | |||||
| struct MultiAlgoOprTrait<_Opr> { \ | |||||
| static constexpr bool has_algo = true; \ | |||||
| using MegDNNOpr = megdnn::_Opr; \ | |||||
| static constexpr int arity = OprArityTrait<MegDNNOpr>::arity; \ | |||||
| using FixedTensorLayouts = std::array<TensorLayout, arity>; \ | |||||
| static bool has_available_algo(const VarNodeArray& i, \ | |||||
| const cg::OperatorNodeBase* opr_) { \ | |||||
| MIDOUT_B(midout_iv(MGB_HASH_STR(#_Opr)), \ | |||||
| midout_iv(MGB_HASH_STR("has_available_algo"))) \ | |||||
| auto&& opr = opr_->cast_final_safe<_Opr>(); \ | |||||
| auto&& megdnn_opr = \ | |||||
| reinterpret_cast<MegDNNOpr*>(opr.megdnn_opr()); \ | |||||
| FixedTensorLayouts array_layouts; \ | |||||
| size_t in = i.size() - 1; \ | |||||
| for (size_t idx = 0; idx < in; idx++) { \ | |||||
| const auto& v = i[idx]; \ | |||||
| array_layouts[idx] = \ | |||||
| TensorLayout{v->shape(), v->dtype(), v->format()}; \ | |||||
| } \ | |||||
| const auto& v = i[in]; \ | |||||
| array_layouts[arity - 1] = \ | |||||
| TensorLayout{v->shape(), v->dtype(), v->format()}; \ | |||||
| return APPLY(::megdnn::has_available_algo(megdnn_opr, args...), \ | |||||
| array_layouts); \ | |||||
| MIDOUT_E \ | |||||
| } \ | |||||
| }; | |||||
| INST(Convolution) | |||||
| INST(ConvBiasForward) | |||||
| INST(ConvolutionBackwardData) | |||||
| INST(PoolingForward) | |||||
| #undef APPLY | |||||
| #undef INST | |||||
| #endif | |||||
| } // namespace | |||||
| namespace mgb { | |||||
| namespace gopt { | |||||
| namespace intl { | |||||
| template <typename Opr> | |||||
| struct OprFormatModifier; | |||||
| #define INST(_Opr) \ | |||||
| template <> \ | |||||
| struct OprFormatModifier<_Opr> { \ | |||||
| using OprFormat = typename _Opr::Param::Format; \ | |||||
| static VarNode* make(OprFormat opr_format, const VarNodeArray& i, \ | |||||
| const cg::OperatorNodeBase* opr_) { \ | |||||
| MIDOUT_B(_Opr) \ | |||||
| auto&& opr = opr_->cast_final_safe<_Opr>(); \ | |||||
| auto param = opr.param(); \ | |||||
| param.format = opr_format; \ | |||||
| return ConvMaker<_Opr>::make(i, param, opr.execution_policy(), \ | |||||
| opr.config()); \ | |||||
| MIDOUT_E \ | |||||
| } \ | |||||
| }; | |||||
| INST(Convolution); | |||||
| INST(ConvBiasForward); | |||||
| INST(ConvolutionBackwardData); | |||||
| INST(BatchConvBiasForward); | |||||
| #undef INST | |||||
| template <> | |||||
| struct OprFormatModifier<WarpPerspective> { | |||||
| using Opr = opr::WarpPerspective; | |||||
| using OprFormat = typename Opr::Param::Format; | |||||
| static VarNode* make(OprFormat opr_format, const VarNodeArray& i, | |||||
| const cg::OperatorNodeBase* opr_) { | |||||
| MIDOUT_B(Opr) | |||||
| auto&& opr = opr_->cast_final_safe<Opr>(); | |||||
| auto param = opr.param(); | |||||
| param.format = opr_format; | |||||
| if (i.size() == 3) { | |||||
| return Opr::make(i[0], i[1], i[2], param, opr.config()).node(); | |||||
| } else { | |||||
| mgb_assert(i.size() == 4); | |||||
| return Opr::make(i[0], i[1], i[2], i[3], param, opr.config()) | |||||
| .node(); | |||||
| } | |||||
| MIDOUT_E | |||||
| } | |||||
| }; | |||||
| #define INST(_Opr, _arity) \ | |||||
| template <> \ | |||||
| struct OprFormatModifier<_Opr> { \ | |||||
| using OprFormat = typename _Opr::Param::Format; \ | |||||
| static VarNode* make(OprFormat opr_format, const VarNodeArray& i, \ | |||||
| const cg::OperatorNodeBase* opr_) { \ | |||||
| MIDOUT_B(_Opr) \ | |||||
| auto&& opr = opr_->cast_final_safe<_Opr>(); \ | |||||
| auto param = opr.param(); \ | |||||
| param.format = opr_format; \ | |||||
| return serialization::OprMaker<_Opr, _arity>::make( \ | |||||
| param, i, *i[0]->owner_graph(), opr.config()) \ | |||||
| ->output(0); \ | |||||
| MIDOUT_E \ | |||||
| } \ | |||||
| }; | |||||
| INST(PoolingForward, 1); | |||||
| INST(Resize, 2); | |||||
| #undef INST | |||||
| VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format, | |||||
| const VarNodeArray& i, | |||||
| const cg::OperatorNodeBase* opr) { | |||||
| #define cb(_Opr) \ | |||||
| if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \ | |||||
| return OprFormatModifier<_Opr>::make(opr_format, i, opr); \ | |||||
| } else | |||||
| FOREACH_FORMAT_AWARE_OPR(cb) { | |||||
| mgb_throw(InternalError, "invalid format aware operator(got:%s)", | |||||
| opr->dyn_typeinfo()->name); | |||||
| } | |||||
| #undef cb | |||||
| } | |||||
| #if 0 | |||||
// NOTE: compiled out together with MultiAlgoOprTrait.
//
// Dispatches on the dynamic type of `opr`: for Convolution, ConvBiasForward,
// ConvolutionBackwardData and PoolingForward it copies `i`, appends
// opr->output(0) to the copy and asks MultiAlgoOprTrait<_Opr> whether an
// algorithm is available. Any other operator type raises InternalError.
| bool has_available_algo(const VarNodeArray& i, | |||||
| const cg::OperatorNodeBase* opr) { | |||||
// Each cb() expansion ends with a dangling `else`, chaining the checks into
// one if/else-if ladder that falls through to the mgb_throw block.
| #define cb(_Opr) \ | |||||
| if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \ | |||||
| MGB_MARK_USED_VAR(MultiAlgoOprTrait<_Opr>::has_algo); \ | |||||
| VarNodeArray _ = i; \ | |||||
| _.emplace_back(opr->output(0)); \ | |||||
| return MultiAlgoOprTrait<_Opr>::has_available_algo(_, opr); \ | |||||
| } else | |||||
| cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) | |||||
| cb(PoolingForward) { | |||||
| mgb_throw(InternalError, "invalid multi-algo operator(got:%s)", | |||||
| opr->dyn_typeinfo()->name); | |||||
| } | |||||
| } | |||||
| #endif | |||||
| } // namespace intl | |||||
| } // namespace gopt | |||||
| } // namespace mgb | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,36 @@ | |||||
| /** | |||||
| * \file src/gopt/impl/opr_format_modifier.h | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #pragma once | |||||
| #include "megbrain/graph.h" | |||||
| #include "megbrain/opr/dnn/convolution.h" | |||||
| namespace mgb { | |||||
| namespace gopt { | |||||
| namespace intl { | |||||
// X-macro over the operator types accepted by modify_opr_format(): `cb` is
// expanded once per operator name, in the order listed.
#define FOREACH_FORMAT_AWARE_OPR(cb) \
    cb(Convolution) \
    cb(ConvBiasForward) \
    cb(ConvolutionBackwardData) \
    cb(PoolingForward) \
    cb(WarpPerspective) \
    cb(Resize)
| #if 0 | |||||
| bool has_available_algo(const VarNodeArray& i, const cg::OperatorNodeBase* opr); | |||||
| #endif | |||||
| VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format, | |||||
| const VarNodeArray& i, | |||||
| const cg::OperatorNodeBase* opr); | |||||
| } // namespace intl | |||||
| } // namespace gopt | |||||
| } // namespace mgb | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,582 @@ | |||||
| /** | |||||
| * \file src/gopt/impl/opr_tensor_formats_config.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "./utils.h" | |||||
| #include "megbrain/gopt/global_layout_transform.h" | |||||
| #include "megbrain/opr/dnn/pooling.h" | |||||
| #include "megbrain/opr/imgproc.h" | |||||
| #include "midout.h" | |||||
| MIDOUT_DECL(megbrain_opr_tensor_formats_config) | |||||
| #define MIDOUT_B(...) \ | |||||
| MIDOUT_BEGIN(megbrain_opr_tensor_formats_config, __VA_ARGS__) { | |||||
| #define MIDOUT_E \ | |||||
| } \ | |||||
| MIDOUT_END(); | |||||
| using namespace mgb; | |||||
| using namespace cg; | |||||
| using namespace gopt; | |||||
| using OprFormat = opr::ConvBias::Param::Format; | |||||
| namespace { | |||||
| template <typename Opr> | |||||
| struct ConvParamTrait; | |||||
| #define INST(_conv, _weight_idx, _bias_idx, _has_bias) \ | |||||
| template <> \ | |||||
| struct ConvParamTrait<opr::_conv> { \ | |||||
| static constexpr int weight_idx = _weight_idx; \ | |||||
| static constexpr int bias_idx = _bias_idx; \ | |||||
| static constexpr bool has_bias = _has_bias; \ | |||||
| } | |||||
| INST(ConvBias, 1, 2, true); | |||||
| INST(ConvolutionForward, 1, 0, false); | |||||
| INST(ConvolutionBackwardData, 0, 0, false); | |||||
| template <typename Opr, size_t weight_idx = ConvParamTrait<Opr>::weight_idx> | |||||
| static bool is_channel_wise_conv(const OperatorNodeBase* opr) { | |||||
| MGB_MARK_USED_VAR(ConvParamTrait<Opr>::has_bias); | |||||
| MGB_MARK_USED_VAR(ConvParamTrait<Opr>::bias_idx); | |||||
| auto&& conv = opr->cast_final_safe<Opr>(); | |||||
| auto format = conv.param().format; | |||||
| auto weight = opr->input(weight_idx); | |||||
| auto weight_shp = weight->shape(); | |||||
| if (conv.param().sparse == Opr::Param::Sparse::DENSE) | |||||
| return false; | |||||
| size_t ocpg, icpg; | |||||
| if (format == Opr::Param::Format::NCHW) { | |||||
| ocpg = weight_shp[1], icpg = weight_shp[2]; | |||||
| return ocpg == 1 && icpg == 1; | |||||
| } | |||||
| return false; | |||||
| } | |||||
| template <OprFormat opr_format_> | |||||
| struct OprSingleInOutTensorFormatsDispatcherImpl; | |||||
| template <> | |||||
| struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW> { | |||||
| static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
| const OperatorNodeBase* opr) { | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = OprFormat::NCHW; | |||||
| config.input_dtypes = {opr->input(0)->dtype().enumv()}; | |||||
| config.input_tensor_types = {TensorType::FEATURE}; | |||||
| config.output_dtypes = {opr->output(0)->dtype().enumv()}; | |||||
| config.input_tensor_formats = {TensorFormats::NCHW}; | |||||
| config.output_tensor_formats = {TensorFormats::NCHW}; | |||||
| return config; | |||||
| } | |||||
| }; | |||||
| template <> | |||||
| struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW4> { | |||||
| static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
| const OperatorNodeBase* opr) { | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = OprFormat::NCHW4; | |||||
| bool available = true; | |||||
| available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8; | |||||
| config.input_dtypes = {opr->input(0)->dtype().enumv()}; | |||||
| config.input_tensor_types = {TensorType::FEATURE}; | |||||
| available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; | |||||
| config.output_dtypes = {opr->output(0)->dtype().enumv()}; | |||||
| config.input_tensor_formats = {TensorFormats::NCHWc4}; | |||||
| config.output_tensor_formats = {TensorFormats::NCHWc4}; | |||||
| if (available) | |||||
| return config; | |||||
| return None; | |||||
| } | |||||
| }; | |||||
| template <> | |||||
| struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::CHWN4> { | |||||
| static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
| const OperatorNodeBase* opr) { | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = OprFormat::CHWN4; | |||||
| bool available = true; | |||||
| available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8; | |||||
| config.input_dtypes = {opr->input(0)->dtype().enumv()}; | |||||
| config.input_tensor_types = {TensorType::FEATURE}; | |||||
| available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; | |||||
| config.output_dtypes = {opr->output(0)->dtype().enumv()}; | |||||
| config.input_tensor_formats = {TensorFormats::CHWNc4}; | |||||
| config.output_tensor_formats = {TensorFormats::CHWNc4}; | |||||
| if (available) | |||||
| return config; | |||||
| return None; | |||||
| } | |||||
| }; | |||||
| template <> | |||||
| struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW32> { | |||||
| static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
| const OperatorNodeBase* opr) { | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = OprFormat::NCHW32; | |||||
| bool available = true; | |||||
| available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8; | |||||
| config.input_dtypes = {opr->input(0)->dtype().enumv()}; | |||||
| config.input_tensor_types = {TensorType::FEATURE}; | |||||
| available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; | |||||
| config.output_dtypes = {opr->output(0)->dtype().enumv()}; | |||||
| config.input_tensor_formats = {TensorFormats::NCHWc32}; | |||||
| config.output_tensor_formats = {TensorFormats::NCHWc32}; | |||||
| if (available) | |||||
| return config; | |||||
| return None; | |||||
| } | |||||
| }; | |||||
| template <> | |||||
| struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NHWC> { | |||||
| static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
| const OperatorNodeBase* opr) { | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = OprFormat::NHWC; | |||||
| bool available = true; | |||||
| available &= | |||||
| opr->input(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm || | |||||
| opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS4; | |||||
| config.input_dtypes = {opr->input(0)->dtype().enumv()}; | |||||
| config.input_tensor_types = {TensorType::FEATURE}; | |||||
| available &= opr->output(0)->dtype().enumv() == | |||||
| opr->input(0)->dtype().enumv(); | |||||
| config.output_dtypes = {opr->output(0)->dtype().enumv()}; | |||||
| config.input_tensor_formats = {TensorFormats::NHWC}; | |||||
| config.output_tensor_formats = {TensorFormats::NHWC}; | |||||
| if (available) | |||||
| return config; | |||||
| return None; | |||||
| } | |||||
| }; | |||||
| template <> | |||||
| struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW64> { | |||||
| static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
| const OperatorNodeBase* opr) { | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = OprFormat::NCHW64; | |||||
| bool available = true; | |||||
| available &= | |||||
| opr->input(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm || | |||||
| opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS4; | |||||
| config.input_dtypes = {opr->input(0)->dtype().enumv()}; | |||||
| config.input_tensor_types = {TensorType::FEATURE}; | |||||
| available &= opr->output(0)->dtype().enumv() == | |||||
| opr->input(0)->dtype().enumv(); | |||||
| config.output_dtypes = {opr->output(0)->dtype().enumv()}; | |||||
| config.input_tensor_formats = {TensorFormats::NCHWc64}; | |||||
| config.output_tensor_formats = {TensorFormats::NCHWc64}; | |||||
| if (available) | |||||
| return config; | |||||
| return None; | |||||
| } | |||||
| }; | |||||
| template <typename Opr, OprFormat opr_format_> | |||||
| struct ConvTensorFormatsDispatcherImpl; | |||||
| template <typename Opr> | |||||
| struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW> { | |||||
| static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
| const OperatorNodeBase* opr) { | |||||
| const auto& conv = opr->cast_final_safe<Opr>(); | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = OprFormat::NCHW; | |||||
| // setup dtypes | |||||
| for (size_t i = 0; i < opr->input().size(); ++i) { | |||||
| config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); | |||||
| TensorType tensor_type = | |||||
| i == 1 ? TensorType::WEIGHT : TensorType::FEATURE; | |||||
| config.input_tensor_types.emplace_back(tensor_type); | |||||
| } | |||||
| config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||||
| // setup tensor formats | |||||
| if (conv.param().sparse == Opr::Param::Sparse::DENSE) { | |||||
| config.input_tensor_formats = { | |||||
| TensorFormats::NCHW, TensorFormats::NCHW, | |||||
| TensorFormats::NCHW, TensorFormats::NCHW}; | |||||
| } else { | |||||
| mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP); | |||||
| if (is_channel_wise_conv<Opr>(opr)) { | |||||
| config.input_tensor_formats = { | |||||
| TensorFormats::NCHW, TensorFormats::C11RS, | |||||
| TensorFormats::NCHW, TensorFormats::NCHW}; | |||||
| } else { | |||||
| config.input_tensor_formats = { | |||||
| TensorFormats::NCHW, TensorFormats::GKCRS, | |||||
| TensorFormats::NCHW, TensorFormats::NCHW}; | |||||
| } | |||||
| } | |||||
| config.output_tensor_formats = {TensorFormats::NCHW}; | |||||
| return config; | |||||
| } | |||||
| }; | |||||
| template <typename Opr> | |||||
| struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NHWC> { | |||||
| static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
| const OperatorNodeBase* opr) { | |||||
| const auto& conv = opr->cast_final_safe<Opr>(); | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = OprFormat::NHWC; | |||||
| bool available = true; | |||||
| for (size_t i = 0; i < opr->input().size(); ++i) { | |||||
| if (i == 2) | |||||
| available &= opr->input(i)->dtype().enumv() == | |||||
| DTypeEnum::QuantizedS32; | |||||
| else | |||||
| available &= opr->input(i)->dtype().enumv() == | |||||
| DTypeEnum::Quantized4Asymm || | |||||
| opr->input(i)->dtype().enumv() == | |||||
| DTypeEnum::QuantizedS4; | |||||
| config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); | |||||
| TensorType tensor_type = | |||||
| i == 1 ? TensorType::WEIGHT : TensorType::FEATURE; | |||||
| config.input_tensor_types.emplace_back(tensor_type); | |||||
| } | |||||
| available &= | |||||
| opr->output(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm || | |||||
| opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS4; | |||||
| config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||||
| available &= conv.param().sparse == Opr::Param::Sparse::DENSE; | |||||
| config.input_tensor_formats = {TensorFormats::NHWC, TensorFormats::NHWC, | |||||
| TensorFormats::NHWC, | |||||
| TensorFormats::NHWC}; | |||||
| config.output_tensor_formats = {TensorFormats::NHWC}; | |||||
| if (available) | |||||
| return config; | |||||
| return None; | |||||
| } | |||||
| }; | |||||
| template <typename Opr> | |||||
| struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW4> { | |||||
| static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
| const OperatorNodeBase* opr) { | |||||
| const auto& conv = opr->cast_final_safe<Opr>(); | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = OprFormat::NCHW4; | |||||
| bool available = true; | |||||
| // setup dtypes | |||||
| for (size_t i = 0; i < opr->input().size(); ++i) { | |||||
| if (i == 2) | |||||
| available &= opr->input(i)->dtype().enumv() == | |||||
| DTypeEnum::QuantizedS32; | |||||
| else | |||||
| available &= opr->input(i)->dtype().enumv() == | |||||
| DTypeEnum::QuantizedS8; | |||||
| config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); | |||||
| TensorType tensor_type = | |||||
| i == 1 ? TensorType::WEIGHT : TensorType::FEATURE; | |||||
| config.input_tensor_types.emplace_back(tensor_type); | |||||
| } | |||||
| available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; | |||||
| config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||||
| // setup tensor formats | |||||
| if (conv.param().sparse == Opr::Param::Sparse::DENSE) { | |||||
| config.input_tensor_formats = { | |||||
| TensorFormats::NCHWc4, TensorFormats::NCHWc4, | |||||
| TensorFormats::NCHWc4, TensorFormats::NCHWc4}; | |||||
| } else { | |||||
| mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP); | |||||
| if (is_channel_wise_conv<Opr>(opr)) { | |||||
| config.input_tensor_formats = { | |||||
| TensorFormats::NCHWc4, TensorFormats::C11RSc4, | |||||
| TensorFormats::NCHWc4, TensorFormats::NCHWc4}; | |||||
| } else { | |||||
| config.input_tensor_formats = { | |||||
| TensorFormats::NCHWc4, TensorFormats::GKCRSc4, | |||||
| TensorFormats::NCHWc4, TensorFormats::NCHWc4}; | |||||
| } | |||||
| } | |||||
| config.output_tensor_formats = {TensorFormats::NCHWc4}; | |||||
| if (available) | |||||
| return config; | |||||
| return None; | |||||
| } | |||||
| }; | |||||
| template <typename Opr> | |||||
| struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW32> { | |||||
| static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
| const OperatorNodeBase* opr) { | |||||
| const auto& conv = opr->cast_final_safe<Opr>(); | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = OprFormat::NCHW32; | |||||
| bool available = true; | |||||
| for (size_t i = 0; i < opr->input().size(); ++i) { | |||||
| if (i == 2) | |||||
| available &= opr->input(i)->dtype().enumv() == | |||||
| DTypeEnum::QuantizedS32; | |||||
| else | |||||
| available &= opr->input(i)->dtype().enumv() == | |||||
| DTypeEnum::QuantizedS8; | |||||
| config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); | |||||
| TensorType tensor_type = | |||||
| i == 1 ? TensorType::WEIGHT : TensorType::FEATURE; | |||||
| config.input_tensor_types.emplace_back(tensor_type); | |||||
| } | |||||
| available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; | |||||
| config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||||
| available &= conv.param().sparse == Opr::Param::Sparse::DENSE; | |||||
| config.input_tensor_formats = { | |||||
| TensorFormats::NCHWc32, TensorFormats::NCHWc32, | |||||
| TensorFormats::NCHWc32, TensorFormats::NCHWc32}; | |||||
| config.output_tensor_formats = {TensorFormats::NCHWc32}; | |||||
| if (available) | |||||
| return config; | |||||
| return None; | |||||
| } | |||||
| }; | |||||
| template <typename Opr> | |||||
| struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW64> { | |||||
| static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
| const OperatorNodeBase* opr) { | |||||
| const auto& conv = opr->cast_final_safe<Opr>(); | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = OprFormat::NCHW64; | |||||
| bool available = true; | |||||
| for (size_t i = 0; i < opr->input().size(); ++i) { | |||||
| if (i == 2) | |||||
| available &= opr->input(i)->dtype().enumv() == | |||||
| DTypeEnum::QuantizedS32; | |||||
| else | |||||
| available &= opr->input(i)->dtype().enumv() == | |||||
| DTypeEnum::Quantized4Asymm || | |||||
| opr->input(i)->dtype().enumv() == | |||||
| DTypeEnum::QuantizedS4; | |||||
| config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); | |||||
| TensorType tensor_type = | |||||
| i == 1 ? TensorType::WEIGHT : TensorType::FEATURE; | |||||
| config.input_tensor_types.emplace_back(tensor_type); | |||||
| } | |||||
| available &= | |||||
| opr->output(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm || | |||||
| opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS4; | |||||
| config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||||
| available &= conv.param().sparse == Opr::Param::Sparse::DENSE; | |||||
| config.input_tensor_formats = { | |||||
| TensorFormats::NCHWc64, TensorFormats::NCHWc64, | |||||
| TensorFormats::NCHWc64, TensorFormats::NCHWc64}; | |||||
| config.output_tensor_formats = {TensorFormats::NCHWc64}; | |||||
| if (available) | |||||
| return config; | |||||
| return None; | |||||
| } | |||||
| }; | |||||
| template <typename Opr> | |||||
| struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::CHWN4> { | |||||
| static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
| const OperatorNodeBase* opr) { | |||||
| const auto& conv = opr->cast_final_safe<Opr>(); | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = OprFormat::CHWN4; | |||||
| bool available = true; | |||||
| for (size_t i = 0; i < opr->input().size(); ++i) { | |||||
| if (i == 2) | |||||
| available &= opr->input(i)->dtype().enumv() == | |||||
| DTypeEnum::QuantizedS32; | |||||
| else | |||||
| available &= opr->input(i)->dtype().enumv() == | |||||
| DTypeEnum::QuantizedS8; | |||||
| config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); | |||||
| TensorType tensor_type = | |||||
| i == 1 ? TensorType::WEIGHT : TensorType::FEATURE; | |||||
| config.input_tensor_types.emplace_back(tensor_type); | |||||
| } | |||||
| available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; | |||||
| config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||||
| available &= conv.param().sparse == Opr::Param::Sparse::DENSE; | |||||
| config.input_tensor_formats = { | |||||
| TensorFormats::CHWNc4, TensorFormats::CHWNc4, | |||||
| TensorFormats::CHWNc4, TensorFormats::CHWNc4}; | |||||
| config.output_tensor_formats = {TensorFormats::CHWNc4}; | |||||
| if (available) | |||||
| return config; | |||||
| return None; | |||||
| } | |||||
| }; | |||||
| template <> | |||||
| struct ConvTensorFormatsDispatcherImpl<opr::ConvolutionBackwardData, | |||||
| OprFormat::NCHW> { | |||||
| using Opr = opr::ConvolutionBackwardData; | |||||
| static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
| const OperatorNodeBase* opr) { | |||||
| const auto& conv = opr->cast_final_safe<Opr>(); | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = OprFormat::NCHW; | |||||
| // setup dtypes | |||||
| for (size_t i = 0; i < opr->input().size(); ++i) { | |||||
| config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); | |||||
| TensorType tensor_type = | |||||
| i == 0 ? TensorType::WEIGHT : TensorType::FEATURE; | |||||
| config.input_tensor_types.emplace_back(tensor_type); | |||||
| } | |||||
| config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||||
| // setup tensor formats | |||||
| if (conv.param().sparse == Opr::Param::Sparse::DENSE) { | |||||
| config.input_tensor_formats = { | |||||
| TensorFormats::NCHW, TensorFormats::NCHW, | |||||
| TensorFormats::NCHW, TensorFormats::NCHW}; | |||||
| } else { | |||||
| mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP); | |||||
| if (is_channel_wise_conv<Opr>(opr)) { | |||||
| config.input_tensor_formats = { | |||||
| TensorFormats::C11RS, TensorFormats::NCHW, | |||||
| TensorFormats::NCHW, TensorFormats::NCHW}; | |||||
| } else { | |||||
| config.input_tensor_formats = { | |||||
| TensorFormats::GKCRS, TensorFormats::NCHW, | |||||
| TensorFormats::NCHW, TensorFormats::NCHW}; | |||||
| } | |||||
| } | |||||
| config.output_tensor_formats = {TensorFormats::NCHW}; | |||||
| return config; | |||||
| } | |||||
| }; | |||||
| template <> | |||||
| struct ConvTensorFormatsDispatcherImpl<opr::ConvolutionBackwardData, | |||||
| OprFormat::NCHW4> { | |||||
| using Opr = opr::ConvolutionBackwardData; | |||||
| static Maybe<OprTensorFormatsConfiguration> dispatch( | |||||
| const OperatorNodeBase* opr) { | |||||
| const auto& conv = opr->cast_final_safe<Opr>(); | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = OprFormat::NCHW4; | |||||
| bool available = true; | |||||
| for (size_t i = 0; i < opr->input().size(); ++i) { | |||||
| available &= | |||||
| opr->input(i)->dtype().enumv() == DTypeEnum::QuantizedS8; | |||||
| config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); | |||||
| TensorType tensor_type = | |||||
| i == 0 ? TensorType::WEIGHT : TensorType::FEATURE; | |||||
| config.input_tensor_types.emplace_back(tensor_type); | |||||
| } | |||||
| available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; | |||||
| config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||||
| available &= conv.param().sparse == opr::ConvBias::Param::Sparse::DENSE; | |||||
| config.input_tensor_formats = { | |||||
| TensorFormats::NCHWc4, TensorFormats::NCHWc4, | |||||
| TensorFormats::NCHWc4, TensorFormats::NCHWc4}; | |||||
| config.output_tensor_formats = {TensorFormats::NCHWc4}; | |||||
| if (available) | |||||
| return config; | |||||
| return None; | |||||
| } | |||||
| }; | |||||
| struct StaticData { | |||||
| struct KeyHash { | |||||
| size_t operator()(const std::pair<Typeinfo*, OprFormat>& val) const { | |||||
| size_t h1 = mgb::hash<Typeinfo*>(val.first); | |||||
| size_t h2 = | |||||
| std::hash<uint32_t>()(static_cast<uint32_t>(val.second)); | |||||
| return mgb::hash_pair_combine(h1, h2); | |||||
| } | |||||
| }; | |||||
| using OprTensorFormatsDispatcher = | |||||
| OprTensorFormatsConfiguration::OprTensorFormatsDispatcher; | |||||
| std::unordered_map<std::pair<Typeinfo*, OprFormat>, | |||||
| OprTensorFormatsDispatcher, KeyHash> | |||||
| typefmt2dispatcher; | |||||
| StaticData(); | |||||
| }; | |||||
| StaticData::StaticData() { | |||||
| #define OPR_TENSOR_FORMATS_CONFIG_REG(_Opr, _fmt) \ | |||||
| typefmt2dispatcher[{opr::_Opr::typeinfo(), OprFormat::_fmt}] = \ | |||||
| [](const OperatorNodeBase* opr) { \ | |||||
| MIDOUT_B(opr::_Opr, midout_iv(OprFormat::_fmt)) \ | |||||
| return ConvTensorFormatsDispatcherImpl< \ | |||||
| opr::_Opr, OprFormat::_fmt>::dispatch(opr); \ | |||||
| MIDOUT_E \ | |||||
| } | |||||
| #define OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(_Opr, _fmt) \ | |||||
| typefmt2dispatcher[{opr::_Opr::typeinfo(), OprFormat::_fmt}] = \ | |||||
| [](const OperatorNodeBase* opr) { \ | |||||
| MIDOUT_B(opr::_Opr, midout_iv(OprFormat::_fmt)) \ | |||||
| return OprSingleInOutTensorFormatsDispatcherImpl< \ | |||||
| OprFormat::_fmt>::dispatch(opr); \ | |||||
| MIDOUT_E \ | |||||
| } | |||||
| OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW); | |||||
| OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NHWC); | |||||
| OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW4); | |||||
| OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, CHWN4); | |||||
| OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW32); | |||||
| OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW64); | |||||
| OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW); | |||||
| OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW4); | |||||
| OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW); | |||||
| OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW4); | |||||
| OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW); | |||||
| OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NHWC); | |||||
| OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW4); | |||||
| OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW64); | |||||
| OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW); | |||||
| OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NHWC); | |||||
| OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW4); | |||||
| OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, CHWN4); | |||||
| OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW32); | |||||
| OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW64); | |||||
| #undef OPR_TENSOR_FORMATS_CONFIG_REG | |||||
| #undef OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG | |||||
| } | |||||
| StaticData& static_data() { | |||||
| static StaticData inst; | |||||
| return inst; | |||||
| } | |||||
| } // namespace | |||||
| OprTensorFormatsConfiguration::OprTensorFormatsDispatcher* | |||||
| OprTensorFormatsConfiguration::find_dispatcher_by_type_format( | |||||
| Typeinfo* type, OprFormat opr_format) { | |||||
| auto&& typefmt2dispatcher = static_data().typefmt2dispatcher; | |||||
| auto iter = typefmt2dispatcher.find(std::make_pair(type, opr_format)); | |||||
| mgb_assert(iter != typefmt2dispatcher.end(), | |||||
| "cannot find OprTensorFormatsDispatcher for opr type(%s) and " | |||||
| "opr format(%s)", | |||||
| type->name, opr_format_to_string(opr_format)); | |||||
| return &iter->second; | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,527 @@ | |||||
| /** | |||||
| * \file src/gopt/impl/profiler_impl.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "./opr_format_modifier.h" | |||||
| #include "./utils.h" | |||||
| #include "megbrain/gopt/framework.h" | |||||
| #include "megbrain/gopt/global_layout_transform.h" | |||||
| #include "megbrain/graph/event.h" | |||||
| #include "megbrain/opr/dnn/pooling.h" | |||||
| #include "megbrain/opr/imgproc.h" | |||||
| #include "megbrain/opr/io.h" | |||||
| #include "megbrain/plugin/base.h" | |||||
| #include "megbrain/serialization/sereg.h" | |||||
| using namespace mgb; | |||||
| using namespace cg; | |||||
| using namespace opr; | |||||
| using namespace gopt; | |||||
| using ReformatKey = ReformatManager::ReformatKey; | |||||
| namespace { | |||||
| using OprFormat = Problem::OprFormat; | |||||
| OprFormat tensor_formats_to_opr_format(TensorFormats tensor_format) { | |||||
| switch (tensor_format) { | |||||
| case TensorFormats::NCHW: | |||||
| return OprFormat::NCHW; | |||||
| case TensorFormats::NCHWc4: | |||||
| return OprFormat::NCHW4; | |||||
| case TensorFormats::NCHWc8: | |||||
| return OprFormat::NCHW8; | |||||
| case TensorFormats::NCHWc32: | |||||
| return OprFormat::NCHW32; | |||||
| case TensorFormats::NCHWc64: | |||||
| return OprFormat::NCHW64; | |||||
| case TensorFormats::NHWC: | |||||
| return OprFormat::NHWC; | |||||
| case TensorFormats::CHWNc4: | |||||
| return OprFormat::CHWN4; | |||||
| default: | |||||
| mgb_throw(MegBrainError, "tensor format(%u) is not supported", | |||||
| static_cast<uint32_t>(tensor_format)); | |||||
| } | |||||
| } | |||||
| class GraphPartitionProfiler final : public PluginBase { | |||||
| using CompNodeEventPtr = std::unique_ptr<CompNode::Event>; | |||||
| public: | |||||
| using OprFilter = thin_function<bool(OperatorNodeBase*)>; | |||||
| struct OprKernEvent { | |||||
| CompNodeEventPtr start, end; | |||||
| }; | |||||
| GraphPartitionProfiler(ComputingGraph* graph, OprFilter opr_filter); | |||||
| ~GraphPartitionProfiler() noexcept; | |||||
| float duration_in_usec() const; | |||||
| private: | |||||
| void record_event(CompNodeEventPtr& dest, CompNode cn) { | |||||
| if (dest == nullptr) | |||||
| dest = cn.create_event(CompNode::Event::NEED_TIMER); | |||||
| dest->record(); | |||||
| } | |||||
| ThinHashMap<OperatorNodeBase*, OprKernEvent> m_kern_event; | |||||
| OprFilter m_opr_filter; | |||||
| }; | |||||
| GraphPartitionProfiler::GraphPartitionProfiler(ComputingGraph* graph, | |||||
| OprFilter opr_filter) | |||||
| : PluginBase(graph), m_opr_filter(opr_filter) { | |||||
| using namespace event; | |||||
| auto on_before_kern = [this](BeforeKernel const& event) { | |||||
| if (!m_opr_filter(event.opr)) | |||||
| return; | |||||
| auto evptr = &m_kern_event[event.opr].start; | |||||
| record_event(*evptr, event.comp_node); | |||||
| }; | |||||
| auto on_after_kern = [this](AfterKernel const& event) { | |||||
| if (!m_opr_filter(event.opr)) | |||||
| return; | |||||
| auto evptr = &m_kern_event[event.opr].end; | |||||
| record_event(*evptr, event.comp_node); | |||||
| }; | |||||
| auto&& ev = graph->event(); | |||||
| add_event_handler(ev.register_receiver<BeforeKernel>(on_before_kern)); | |||||
| add_event_handler(ev.register_receiver<AfterKernel>(on_after_kern)); | |||||
| } | |||||
| GraphPartitionProfiler::~GraphPartitionProfiler() noexcept { | |||||
| auto wait = [](const CompNodeEventPtr& ev) { | |||||
| if (ev) | |||||
| ev->host_wait(); | |||||
| }; | |||||
| for (auto&& i : m_kern_event) { | |||||
| wait(i.second.start); | |||||
| wait(i.second.end); | |||||
| } | |||||
| } | |||||
| float GraphPartitionProfiler::duration_in_usec() const { | |||||
| float device_duration = 0.f; | |||||
| for (auto&& kern_ev : m_kern_event) { | |||||
| auto&& event = kern_ev.second; | |||||
| event.end->host_wait(); | |||||
| device_duration += 1e6 * event.start->elapsed_time_until(*event.end); | |||||
| } | |||||
| return device_duration; | |||||
| } | |||||
| /*! | |||||
| * \brief An operator that indicates its input var node is contiguous | |||||
| */ | |||||
| // clang-format off | |||||
| MGB_DEFINE_OPR_CLASS(MarkInputContiguous, SingleCNOperatorNodeBase) //{ | |||||
| void scn_do_execute() override {}; | |||||
| void init_output_static_infer_desc() override; | |||||
| void add_input_layout_constraint() override { | |||||
| input(0)->add_layout_constraint_contiguous(); | |||||
| } | |||||
| public: | |||||
| MarkInputContiguous(VarNode* input, const OperatorNodeConfig& config); | |||||
| static SymbolVar make(SymbolVar input, const OperatorNodeConfig& config = {}); | |||||
| }; | |||||
| // clang-format on | |||||
| MGB_DYN_TYPE_OBJ_FINAL_IMPL(MarkInputContiguous); | |||||
| MarkInputContiguous::MarkInputContiguous(VarNode* input, | |||||
| const OperatorNodeConfig& config) | |||||
| : Super(input->owner_graph(), config, "mark_contiguous", {input}) { | |||||
| add_input({input}); | |||||
| add_output(None); | |||||
| } | |||||
| SymbolVar MarkInputContiguous::make(SymbolVar input, | |||||
| const OperatorNodeConfig& config) { | |||||
| return input.insert_single_output_opr<MarkInputContiguous>(input.node(), | |||||
| config); | |||||
| } | |||||
| void MarkInputContiguous::init_output_static_infer_desc() { | |||||
| using namespace cg::static_infer; | |||||
| auto&& mgr = owner_graph()->static_infer_manager(); | |||||
| mgr.register_shape_infer(output(0), | |||||
| ShapeInferDesc::make_identity(input(0))); | |||||
| } | |||||
| } // namespace | |||||
| /* ================== ProfilerImpl =================*/ | |||||
| class ProfilerImpl final : public ProfilerBase { | |||||
| public: | |||||
| ProfilerImpl(int runs = 10) : m_runs{runs} {}; | |||||
| ~ProfilerImpl() = default; | |||||
| ProfilingResult profile(const Problem& problem) const override; | |||||
| private: | |||||
| static constexpr float PROFILE_TIME_OUT = 1e7; | |||||
| /*! | |||||
| * \brief profile opr format agnostic operators (like elemwise, elemwise multi type, typecvt etc.) | |||||
| * | |||||
| * \param opr pointer to the operator node to be profiled | |||||
| * \param base_format the original tensor format of the operator node. | |||||
| * \param available_tensor_formats the available tensor formats | |||||
| * \return the operator node record | |||||
| */ | |||||
| OperatorNodeRecord profile_operator( | |||||
| const OperatorNodeBase* opr, TensorFormats base_format, | |||||
| const SmallVector<TensorFormats>& available_tensor_formats) const; | |||||
| float profile_operator(const OperatorNodeBase* opr, | |||||
| TensorFormats base_format, | |||||
| TensorFormats tensor_format) const; | |||||
| /*! | |||||
| * \brief profile opr format aware operators (like conv, deconv, conv_bias, etc.) | |||||
| * | |||||
| * \param opr pointer to the operator node to be profiled | |||||
| * \param base_config the tensor formats configuration of base opr format | |||||
| * \param config all the available configuration | |||||
| * \return the operator node record | |||||
| */ | |||||
| OperatorNodeRecord profile_operator( | |||||
| const OperatorNodeBase* opr, | |||||
| const OprTensorFormatsConfiguration& base_config, | |||||
| const SmallVector<OprTensorFormatsConfiguration>& available_configs) | |||||
| const; | |||||
| float profile_operator(const OperatorNodeBase* opr, | |||||
| const OprTensorFormatsConfiguration& base_config, | |||||
| const OprTensorFormatsConfiguration& config) const; | |||||
| /*! | |||||
| * \brief profile layout transform of the var node | |||||
| * | |||||
| * \param var pointer to the var node to be profiled | |||||
| * \param base_format the original tensor formats in which the var node is stored | |||||
| * \param available_tensor_formats the available tensor formats | |||||
| * \param extra_attribute the extra attributes (options) of the problem | |||||
| * \return the var node record | |||||
| */ | |||||
| VarNodeRecord profile_var_node( | |||||
| const VarNode* var, TensorFormats base_format, | |||||
| const SmallVector<TensorFormats>& available_tensor_formats, | |||||
| ReformatKey::Attribute extra_attribute = | |||||
| ReformatKey::Attribute::DEFAULT) const; | |||||
| float profile_var_node(const VarNode* var, TensorFormats base_format, | |||||
| const ReformatKey& key) const; | |||||
| int m_runs; /// sample times of the profiler | |||||
| }; | |||||
| ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator( | |||||
| const OperatorNodeBase* opr, TensorFormats base_format, | |||||
| const SmallVector<TensorFormats>& available_tensor_formats) const { | |||||
| OperatorNodeRecord record; | |||||
| record.opr = opr; | |||||
| auto& costs = record.costs; | |||||
| for (auto&& f : available_tensor_formats) { | |||||
| auto opr_format = tensor_formats_to_opr_format(f); | |||||
| costs[opr_format] = profile_operator(opr, base_format, f); | |||||
| } | |||||
| return record; | |||||
| } | |||||
| float ProfilerImpl::profile_operator(const OperatorNodeBase* opr, | |||||
| TensorFormats base_format, | |||||
| TensorFormats tensor_format) const { | |||||
| auto graph = ComputingGraph::make(); | |||||
| graph->options().graph_opt_level = 0; | |||||
| graph->options().var_sanity_check_first_run = false; | |||||
| VarNodeArray new_inps(opr->input().size()); | |||||
| for (size_t i = 0; i < opr->input().size(); ++i) { | |||||
| auto&& var = opr->input(i); | |||||
| auto&& cn = var->comp_node(); | |||||
| auto&& dtype = var->dtype(); | |||||
| auto dval = std::make_shared<DeviceTensorND>(cn, dtype); | |||||
| auto aligned_tensor_shape = | |||||
| make_aligned_tensor_shape(var, base_format, tensor_format); | |||||
| dval->resize(aligned_tensor_shape); | |||||
| auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval); | |||||
| new_inps[i] = aligned_var.node(); | |||||
| } | |||||
| auto new_opr = serialization::copy_opr_shallow( | |||||
| *opr, new_inps, opr->config(), {graph.get()}); | |||||
| auto y = new_opr->output(0); | |||||
| auto mark = MarkInputContiguous::make(SymbolVar(y)); | |||||
| auto func = graph->compile({{mark, {}}}); | |||||
| auto filter = [new_opr](OperatorNodeBase* opr) { return opr == new_opr; }; | |||||
| auto profiler = std::make_unique<GraphPartitionProfiler>(graph.get(), | |||||
| std::move(filter)); | |||||
| for (int i = 0; i < m_runs; ++i) | |||||
| func->execute(); | |||||
| return profiler->duration_in_usec(); | |||||
| } | |||||
| ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator( | |||||
| const OperatorNodeBase* opr, | |||||
| const OprTensorFormatsConfiguration& base_config, | |||||
| const SmallVector<OprTensorFormatsConfiguration>& available_configs) | |||||
| const { | |||||
| OperatorNodeRecord record; | |||||
| record.opr = opr; | |||||
| auto& costs = record.costs; | |||||
| for (auto&& i : available_configs) { | |||||
| costs[i.opr_format] = profile_operator(opr, base_config, i); | |||||
| } | |||||
| return record; | |||||
| } | |||||
| float ProfilerImpl::profile_operator( | |||||
| const OperatorNodeBase* opr, | |||||
| const OprTensorFormatsConfiguration& base_config, | |||||
| const OprTensorFormatsConfiguration& config) const { | |||||
| auto graph = ComputingGraph::make(); | |||||
| graph->options().graph_opt_level = 0; | |||||
| graph->options().var_sanity_check_first_run = false; | |||||
| VarNodeArray new_inps(opr->input().size()); | |||||
| size_t i = 0; | |||||
| size_t nr_input_tensor = | |||||
| std::min(config.input_tensor_formats.size(), opr->input().size()); | |||||
| for (; i < nr_input_tensor; ++i) { | |||||
| auto&& var = opr->input(i); | |||||
| auto&& cn = var->comp_node(); | |||||
| auto&& dtype = var->dtype(); | |||||
| auto dval = std::make_shared<DeviceTensorND>(cn, dtype); | |||||
| TensorShape aligned_shape; | |||||
| if (config.input_tensor_types[i] == TensorType::WEIGHT) { | |||||
| mgb_assert(base_config.input_tensor_types[i] == TensorType::WEIGHT); | |||||
| aligned_shape = make_aligned_weight_shape( | |||||
| var, base_config.input_tensor_formats[i], | |||||
| config.input_tensor_formats[i], | |||||
| config.output_tensor_formats[0]); | |||||
| } else { | |||||
| mgb_assert(base_config.input_tensor_types[i] == | |||||
| config.input_tensor_types[i]); | |||||
| mgb_assert(base_config.input_tensor_types[i] == | |||||
| TensorType::FEATURE); | |||||
| aligned_shape = make_aligned_tensor_shape( | |||||
| var, base_config.input_tensor_formats[i], | |||||
| config.input_tensor_formats[i]); | |||||
| } | |||||
| dval->resize(aligned_shape); | |||||
| auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval); | |||||
| new_inps[i] = aligned_var.node(); | |||||
| } | |||||
| for (; i < opr->input().size(); ++i) { | |||||
| auto&& var = opr->input(i); | |||||
| auto&& cn = var->comp_node(); | |||||
| auto&& dtype = var->dtype(); | |||||
| auto hval = std::make_shared<HostTensorND>(cn, dtype); | |||||
| hval->resize(var->shape()); | |||||
| auto cb = [&](DeviceTensorND& d) { hval->copy_from(d).sync(); }; | |||||
| { | |||||
| auto cg = var->owner_graph(); | |||||
| cg->compile({{var, cb}})->execute(); | |||||
| } | |||||
| auto imm = opr::ImmutableTensor::make(*graph, *hval); | |||||
| new_inps[i] = imm.node(); | |||||
| } | |||||
| VarNode* y = mgb::gopt::intl::modify_opr_format(config.opr_format, new_inps, | |||||
| opr); | |||||
| #if 0 | |||||
| static const ThinHashSet<Typeinfo*> multi_algo_oprs = { | |||||
| opr::Convolution::typeinfo(), | |||||
| opr::ConvBiasForward::typeinfo(), | |||||
| opr::ConvolutionBackwardData::typeinfo(), | |||||
| opr::PoolingForward::typeinfo(), | |||||
| }; | |||||
| if (multi_algo_oprs.count(opr->dyn_typeinfo()) && | |||||
| !mgb::gopt::intl::has_available_algo(new_inps, y->owner_opr())) | |||||
| return PROFILE_TIME_OUT; | |||||
| #endif | |||||
| auto mark = MarkInputContiguous::make(SymbolVar(y)); | |||||
| auto func = graph->compile({{mark, {}}}); | |||||
| auto new_opr = y->owner_opr(); | |||||
| auto filter = [&new_opr](OperatorNodeBase* opr) { return opr == new_opr; }; | |||||
| auto profiler = std::make_unique<GraphPartitionProfiler>(graph.get(), | |||||
| std::move(filter)); | |||||
| for (int i = 0; i < m_runs; ++i) | |||||
| func->execute(); | |||||
| return profiler->duration_in_usec(); | |||||
| } | |||||
| ProfilerImpl::VarNodeRecord ProfilerImpl::profile_var_node( | |||||
| const VarNode* var, TensorFormats base_format, | |||||
| const SmallVector<TensorFormats>& available_tensor_formats, | |||||
| ReformatKey::Attribute attribute) const { | |||||
| VarNodeRecord record; | |||||
| record.var = var; | |||||
| auto& costs = record.costs; | |||||
| for (auto&& i : available_tensor_formats) { | |||||
| for (auto&& o : available_tensor_formats) { | |||||
| if (i == o) | |||||
| continue; | |||||
| ReformatKey key{i, o, attribute, var->dtype().enumv(), | |||||
| var->dtype().enumv()}; | |||||
| costs[{i, o}] = profile_var_node(var, base_format, key); | |||||
| } | |||||
| } | |||||
| return record; | |||||
| } | |||||
| float ProfilerImpl::profile_var_node(const VarNode* var, | |||||
| TensorFormats base_format, | |||||
| const ReformatKey& key) const { | |||||
| auto&& cn = var->comp_node(); | |||||
| auto&& dtype = var->dtype(); | |||||
| auto dval = std::make_shared<DeviceTensorND>(cn, dtype); | |||||
| auto aligned_tensor_shape = | |||||
| make_aligned_tensor_shape(var, base_format, key.input_format); | |||||
| dval->resize(aligned_tensor_shape); | |||||
| auto graph = ComputingGraph::make(); | |||||
| graph->options().graph_opt_level = 0; | |||||
| graph->options().var_sanity_check_first_run = false; | |||||
| auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval); | |||||
| auto builder = ReformatManager::instance().auto_aligned_reformat_featrue( | |||||
| var, base_format, key); | |||||
| auto y = builder({aligned_var.node()}); | |||||
| ThinHashSet<OperatorNodeBase*> set; | |||||
| DepOprIter iter([&set](OperatorNodeBase* opr) { set.insert(opr); }); | |||||
| iter.add(y->owner_opr()); | |||||
| iter.set_visited(aligned_var.node()->owner_opr()); | |||||
| auto mark = MarkInputContiguous::make(SymbolVar(y)); | |||||
| auto func = graph->compile({{mark, {}}}); | |||||
| auto filter = [&set](OperatorNodeBase* opr) { return set.count(opr) > 0; }; | |||||
| auto profiler = std::make_unique<GraphPartitionProfiler>(graph.get(), | |||||
| std::move(filter)); | |||||
| for (int i = 0; i < m_runs; ++i) | |||||
| func->execute(); | |||||
| return profiler->duration_in_usec(); | |||||
| } | |||||
| ProfilerImpl::ProfilingResult ProfilerImpl::profile( | |||||
| const Problem& problem) const { | |||||
| ConstVarPropogate cvprop{ConstVarType::IMMUTABLE_AND_PARAM}; | |||||
| { | |||||
| auto cb = [&cvprop](OperatorNodeBase* opr) { cvprop.add_opr(opr); }; | |||||
| DepOprIter iter{cb}; | |||||
| for (auto&& o : problem.graph_partition().output()) { | |||||
| iter.add(o->owner_opr()); | |||||
| } | |||||
| } | |||||
| static const ThinHashMap<Typeinfo*, size_t> format_aware_input_tensors = { | |||||
| #define cb(_Opr, _arity) {_Opr::typeinfo(), _arity} | |||||
| cb(Convolution, 2), | |||||
| cb(ConvBiasForward, 4), | |||||
| cb(ConvolutionBackwardData, 2), | |||||
| cb(PoolingForward, 1), | |||||
| cb(WarpPerspective, 1), | |||||
| cb(Resize, 1), | |||||
| #undef cb | |||||
| }; | |||||
| ThinHashSet<VarNode*> vars; | |||||
| ThinHashSet<OperatorNodeBase*> oprs; | |||||
| { | |||||
| auto cb = [&cvprop, &vars, &oprs](OperatorNodeBase* opr) { | |||||
| if (cvprop.is_const(opr)) | |||||
| return; | |||||
| oprs.insert(opr); | |||||
| auto find = format_aware_input_tensors.find(opr->dyn_typeinfo()); | |||||
| if (find == format_aware_input_tensors.end()) { | |||||
| for (auto&& i : opr->input()) { | |||||
| if (!cvprop.is_const(i)) { | |||||
| vars.insert(i); | |||||
| } | |||||
| } | |||||
| } else { | |||||
| size_t nr_input_tensor = | |||||
| std::min(find->second, opr->input().size()); | |||||
| for (size_t i = 0; i < nr_input_tensor; ++i) { | |||||
| if (!cvprop.is_const(opr->input(i))) { | |||||
| vars.insert(opr->input(i)); | |||||
| } | |||||
| } | |||||
| } | |||||
| vars.insert(opr->output(0)); | |||||
| }; | |||||
| DepOprIter iter{cb}; | |||||
| for (auto&& i : problem.graph_partition().input()) { | |||||
| iter.set_visited(i->owner_opr()); | |||||
| } | |||||
| for (auto&& o : problem.graph_partition().output()) { | |||||
| iter.add(o->owner_opr()); | |||||
| } | |||||
| } | |||||
| auto base_format = problem.base_format(); | |||||
| auto&& available_tensor_formats = problem.available_tensor_formats(); | |||||
| ProfilingResult profiling_result; | |||||
| auto& opr_record = profiling_result.opr_record; | |||||
| auto& var_record = profiling_result.var_record; | |||||
| for (auto&& var : vars) { | |||||
| var_record[var] = | |||||
| profile_var_node(var, base_format, available_tensor_formats); | |||||
| } | |||||
| for (auto&& opr : oprs) { | |||||
| auto&& opr_configs = problem.opr_configs(); | |||||
| auto find = opr_configs.find(opr->dyn_typeinfo()); | |||||
| if (find == opr_configs.end()) { | |||||
| opr_record[opr] = profile_operator(opr, base_format, | |||||
| available_tensor_formats); | |||||
| } else { | |||||
| auto&& dispatchers = find->second; | |||||
| SmallVector<OprTensorFormatsConfiguration> configs; | |||||
| for (const auto& item : dispatchers) { | |||||
| auto config = (*item.second)(opr); | |||||
| if (config.valid()) { | |||||
| configs.emplace_back(config.val()); | |||||
| } | |||||
| } | |||||
| auto base_config = problem.base_config(opr); | |||||
| opr_record[opr] = profile_operator(opr, base_config, configs); | |||||
| } | |||||
| } | |||||
| for (auto&& rpair : opr_record) { | |||||
| mgb_log_debug("%s", rpair.second.to_string().c_str()); | |||||
| } | |||||
| for (auto&& rpair : var_record) { | |||||
| mgb_log_debug("%s", rpair.second.to_string().c_str()); | |||||
| } | |||||
| return profiling_result; | |||||
| } | |||||
| /* ================== ProfilerBase =================*/ | |||||
| std::string ProfilerBase::OperatorNodeRecord::to_string() const { | |||||
| auto str = ssprintf("\nopr type: %s\nopr name: %s\ninputs:\n", | |||||
| opr->dyn_typeinfo()->name, opr->cname()); | |||||
| for (auto&& i : opr->input()) { | |||||
| str += ssprintf("\tvar: %s\n\tshape: %s\n", i->cname(), | |||||
| i->shape().to_string().c_str()); | |||||
| } | |||||
| str += ssprintf("outputs:\n\tvar: %s\n\tshape: %s\ncosts:\n", | |||||
| opr->output(0)->cname(), | |||||
| opr->output(0)->shape().to_string().c_str()); | |||||
| for (auto&& cpair : costs) { | |||||
| str += ssprintf("\tformat: %s; cost:%f", | |||||
| opr_format_to_string(cpair.first), cpair.second); | |||||
| } | |||||
| return str; | |||||
| } | |||||
| std::string ProfilerBase::VarNodeRecord::to_string() const { | |||||
| auto str = ssprintf("\nvar: %s\ncosts:", var->cname()); | |||||
| for (auto&& cpair : costs) { | |||||
| auto&& formats = cpair.first; | |||||
| str += ssprintf("\n\tformat: (i:%s;o:%s); cost:%f", | |||||
| tensor_formats_to_named_tensor_shape(formats.first) | |||||
| .to_string() | |||||
| .c_str(), | |||||
| tensor_formats_to_named_tensor_shape(formats.second) | |||||
| .to_string() | |||||
| .c_str(), | |||||
| cpair.second); | |||||
| } | |||||
| return str; | |||||
| } | |||||
| std::unique_ptr<ProfilerBase> ProfilerBase::make_profiler() { | |||||
| return std::make_unique<ProfilerImpl>(); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -247,16 +247,36 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const { | |||||
| /* ============== PaddingEmitter ================= */ | /* ============== PaddingEmitter ================= */ | ||||
| PaddingEmitter::EmitResult PaddingEmitter::emit() const { | PaddingEmitter::EmitResult PaddingEmitter::emit() const { | ||||
| auto&& padshp = m_padshp; | |||||
| auto&& const_extent = m_const_extent; | auto&& const_extent = m_const_extent; | ||||
| auto&& axis = m_axis; | auto&& axis = m_axis; | ||||
| auto builder = [const_extent, axis](const VarNodeArray& vars) { | |||||
| auto builder = [padshp, const_extent, axis](const VarNodeArray& vars) { | |||||
| auto i = vars[0]; | auto i = vars[0]; | ||||
| auto padding_shp_var = vars[1]; | auto padding_shp_var = vars[1]; | ||||
| TensorShape shape; | TensorShape shape; | ||||
| shape.ndim = i->shape().ndim; | shape.ndim = i->shape().ndim; | ||||
| for (size_t ax = 0; ax < shape.ndim; ++ax) | for (size_t ax = 0; ax < shape.ndim; ++ax) | ||||
| shape[ax] = 1; | shape[ax] = 1; | ||||
| shape[axis] = const_extent; | |||||
| // avoid making a scalar lowbit tensor | |||||
| if (!i->dtype().is_low_bit() || const_extent != 1) | |||||
| shape[axis] = const_extent; | |||||
| else { | |||||
| size_t const_axis = 0; | |||||
| size_t new_const_extent = const_extent; | |||||
| for (size_t i = 0; i < padshp.ndim; ++i) { | |||||
| const auto& dim = padshp[i]; | |||||
| if (dim.extent() != Dimension::UNDETERMINED_EXTENT && | |||||
| dim.extent() != 1) { | |||||
| new_const_extent = dim.extent(); | |||||
| const_axis = i; | |||||
| break; | |||||
| } | |||||
| } | |||||
| mgb_assert(new_const_extent != 1, | |||||
| "cannot make an scalar lowbit tensor(got:%s)", | |||||
| i->dtype().name()); | |||||
| shape[const_axis] = new_const_extent; | |||||
| } | |||||
| auto host_val = | auto host_val = | ||||
| std::make_shared<HostTensorND>(i->comp_node(), i->dtype()); | std::make_shared<HostTensorND>(i->comp_node(), i->dtype()); | ||||
| host_val->resize(shape); | host_val->resize(shape); | ||||
| @@ -13,6 +13,7 @@ | |||||
| #include "megbrain/gopt/reformat_manager.h" | #include "megbrain/gopt/reformat_manager.h" | ||||
| #include "megbrain/opr/tensor_manip.h" | #include "megbrain/opr/tensor_manip.h" | ||||
| #include "megbrain/utils/arith_helper.h" | #include "megbrain/utils/arith_helper.h" | ||||
| #include "./utils.h" | |||||
| using namespace mgb; | using namespace mgb; | ||||
| using namespace gopt; | using namespace gopt; | ||||
| @@ -32,68 +33,6 @@ int gcd(const int& p, const int& q) { | |||||
| } | } | ||||
| return x; | return x; | ||||
| } | } | ||||
| NamedTensorShape tensor_formats_to_named_tensor_shape(TensorFormats format) { | |||||
| switch (format) { | |||||
| case TensorFormats::NCHW: | |||||
| return {{"N"}, {"C"}, {"H"}, {"W"}}; | |||||
| case TensorFormats::NHWC: | |||||
| return {{"N"}, {"H"}, {"W"}, {"C"}}; | |||||
| case TensorFormats::NCHWc4: | |||||
| return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}}; | |||||
| case TensorFormats::NCHWc8: | |||||
| return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}}; | |||||
| case TensorFormats::NCHWc32: | |||||
| return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}}; | |||||
| case TensorFormats::NCHWc64: | |||||
| return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}}; | |||||
| case TensorFormats::CHWNc4: | |||||
| return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}}; | |||||
| case TensorFormats::NHCWc4: | |||||
| return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}}; | |||||
| case TensorFormats::KRSCk4: | |||||
| return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; | |||||
| case TensorFormats::GKRSCk4: | |||||
| return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; | |||||
| case TensorFormats::C1RSc4: | |||||
| return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; | |||||
| case TensorFormats::KRSCk4c4: | |||||
| return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; | |||||
| case TensorFormats::GKRSCk4c4: | |||||
| return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; | |||||
| case TensorFormats::KCRSk4c4: | |||||
| return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; | |||||
| case TensorFormats::GKCRSk4c4: | |||||
| return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; | |||||
| case TensorFormats::KCRSc4k4: | |||||
| return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; | |||||
| case TensorFormats::GKCRSc4k4: | |||||
| return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; | |||||
| case TensorFormats::C11RSc4: | |||||
| return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; | |||||
| case TensorFormats::KCRSc8k8: | |||||
| return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; | |||||
| case TensorFormats::GKCRSc8k8: | |||||
| return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; | |||||
| case TensorFormats::C11RSc8: | |||||
| return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}}; | |||||
| case TensorFormats::KRSCk8: | |||||
| return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}}; | |||||
| case TensorFormats::KCRSc4: | |||||
| return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; | |||||
| case TensorFormats::GKCRSc4: | |||||
| return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; | |||||
| case TensorFormats::KCRS: | |||||
| return {{"K"}, {"C"}, {"R"}, {"S"}}; | |||||
| case TensorFormats::GKCRS: | |||||
| return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}}; | |||||
| case TensorFormats::C11RS: | |||||
| return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}}; | |||||
| default: | |||||
| mgb_throw(AssertionError, "invalid tensor formats(%u)", | |||||
| static_cast<uint32_t>(format)); | |||||
| } | |||||
| } | |||||
| }; // namespace | }; // namespace | ||||
| // =================== ReformatManager::ReformatKey ====================*/ | // =================== ReformatManager::ReformatKey ====================*/ | ||||
| @@ -393,8 +332,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( | |||||
| tensor_formats_to_named_tensor_shape(key.input_format); | tensor_formats_to_named_tensor_shape(key.input_format); | ||||
| NamedTensorShape output_shape = | NamedTensorShape output_shape = | ||||
| tensor_formats_to_named_tensor_shape(key.output_format); | tensor_formats_to_named_tensor_shape(key.output_format); | ||||
| size_t input_alignment, output_alignment; | |||||
| size_t input_channel_idx, output_channel_idx; | |||||
| size_t input_alignment = 0; | |||||
| size_t output_alignment = 0; | |||||
| size_t input_channel_idx = input_shape.ndim, | |||||
| output_channel_idx = input_shape.ndim; | |||||
| for (size_t i = 0; i < input_shape.ndim; ++i) { | for (size_t i = 0; i < input_shape.ndim; ++i) { | ||||
| if (input_shape[i].name() == Dimension::Name::C && | if (input_shape[i].name() == Dimension::Name::C && | ||||
| input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { | input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { | ||||
| @@ -411,6 +352,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( | |||||
| break; | break; | ||||
| } | } | ||||
| } | } | ||||
| mgb_assert(input_channel_idx < input_shape.ndim && | |||||
| output_channel_idx < input_shape.ndim, | |||||
| "invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)", | |||||
| input_channel_idx, output_channel_idx, | |||||
| input_shape.to_string().c_str()); | |||||
| mgb_assert(input_alignment > 0 && output_alignment > 0, | |||||
| "invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)", | |||||
| input_alignment, output_alignment, | |||||
| input_shape.to_string().c_str()); | |||||
| NamedTensorShape orig_shape = | NamedTensorShape orig_shape = | ||||
| tensor_formats_to_named_tensor_shape(orig_format); | tensor_formats_to_named_tensor_shape(orig_format); | ||||
| size_t orig_channel = 0; | size_t orig_channel = 0; | ||||
| @@ -448,8 +398,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( | |||||
| auto make_shape = std::get<0>( | auto make_shape = std::get<0>( | ||||
| MakeShapeEmitter{input_shape, padding_shape}.emit()); | MakeShapeEmitter{input_shape, padding_shape}.emit()); | ||||
| auto padding_shp_var = make_shape({x}); | auto padding_shp_var = make_shape({x}); | ||||
| auto padding = std::get<0>( | |||||
| PaddingEmitter{const_extent, input_channel_idx}.emit()); | |||||
| auto padding = std::get<0>(PaddingEmitter{ | |||||
| padding_shape, const_extent, input_channel_idx} | |||||
| .emit()); | |||||
| cur = padding({cur, padding_shp_var}); | cur = padding({cur, padding_shp_var}); | ||||
| } | } | ||||
| cur = ReformatManager::instance().get(key)({cur}); | cur = ReformatManager::instance().get(key)({cur}); | ||||
| @@ -469,9 +420,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||||
| const VarNode* orig_var, const ReformatKey& key, | const VarNode* orig_var, const ReformatKey& key, | ||||
| const AlignmentDesc& extra_alignment) const { | const AlignmentDesc& extra_alignment) const { | ||||
| size_t in_channels = 0, out_channels = 0; | size_t in_channels = 0, out_channels = 0; | ||||
| size_t input_channel_idx, output_channel_idx; | |||||
| Dimension::Name out_channel_name; | |||||
| Dimension::Name out_channel_name = Dimension::Name::C; | |||||
| auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format); | auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format); | ||||
| size_t input_channel_idx = input_shape.ndim, | |||||
| output_channel_idx = input_shape.ndim; | |||||
| for (size_t i = 0; i < input_shape.ndim; ++i) { | for (size_t i = 0; i < input_shape.ndim; ++i) { | ||||
| if (input_shape[i].name() == Dimension::Name::C && | if (input_shape[i].name() == Dimension::Name::C && | ||||
| input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { | input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { | ||||
| @@ -491,7 +443,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||||
| input_shape.to_string().c_str()); | input_shape.to_string().c_str()); | ||||
| } | } | ||||
| } | } | ||||
| size_t in_channel_alignment, out_channel_alignment = 1; | |||||
| mgb_assert(out_channel_name == Dimension::Name::K || | |||||
| out_channel_name == Dimension::Name::N, | |||||
| "invalid out channel(shp:%s)", input_shape.to_string().c_str()); | |||||
| mgb_assert(input_channel_idx < input_shape.ndim && | |||||
| output_channel_idx < input_shape.ndim, | |||||
| "invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)", | |||||
| input_channel_idx, output_channel_idx, | |||||
| input_shape.to_string().c_str()); | |||||
| size_t in_channel_alignment = 0, out_channel_alignment = 0; | |||||
| auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format); | auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format); | ||||
| for (size_t i = 0; i < output_shape.ndim; ++i) { | for (size_t i = 0; i < output_shape.ndim; ++i) { | ||||
| if (output_shape[i].name() == Dimension::Name::C && | if (output_shape[i].name() == Dimension::Name::C && | ||||
| @@ -502,6 +462,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||||
| out_channel_alignment = output_shape[i].stride(); | out_channel_alignment = output_shape[i].stride(); | ||||
| } | } | ||||
| } | } | ||||
| mgb_assert(in_channel_alignment > 0 && out_channel_alignment > 0, | |||||
| "invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)", | |||||
| in_channel_alignment, out_channel_alignment, | |||||
| output_shape.to_string().c_str()); | |||||
| size_t aligned_in_channel = | size_t aligned_in_channel = | ||||
| divup(in_channels, in_channel_alignment) * in_channel_alignment; | divup(in_channels, in_channel_alignment) * in_channel_alignment; | ||||
| if (extra_alignment.name == out_channel_name) { | if (extra_alignment.name == out_channel_name) { | ||||
| @@ -526,8 +490,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||||
| auto make_shape = std::get<0>( | auto make_shape = std::get<0>( | ||||
| MakeShapeEmitter{input_shape, padding_shape}.emit()); | MakeShapeEmitter{input_shape, padding_shape}.emit()); | ||||
| auto padding_shp_var = make_shape({x}); | auto padding_shp_var = make_shape({x}); | ||||
| auto padding = std::get<0>( | |||||
| PaddingEmitter{const_extent, input_channel_idx}.emit()); | |||||
| auto padding = std::get<0>(PaddingEmitter{ | |||||
| padding_shape, const_extent, input_channel_idx} | |||||
| .emit()); | |||||
| cur = padding({cur, padding_shp_var}); | cur = padding({cur, padding_shp_var}); | ||||
| } | } | ||||
| if (aligned_out_channel > out_channels) { | if (aligned_out_channel > out_channels) { | ||||
| @@ -540,8 +505,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||||
| auto make_shape = std::get<0>( | auto make_shape = std::get<0>( | ||||
| MakeShapeEmitter{input_shape, padding_shape}.emit()); | MakeShapeEmitter{input_shape, padding_shape}.emit()); | ||||
| auto padding_shp_var = make_shape({cur}); | auto padding_shp_var = make_shape({cur}); | ||||
| auto padding = std::get<0>( | |||||
| PaddingEmitter{const_extent, output_channel_idx}.emit()); | |||||
| auto padding = std::get<0>(PaddingEmitter{ | |||||
| padding_shape, const_extent, output_channel_idx} | |||||
| .emit()); | |||||
| cur = padding({cur, padding_shp_var}); | cur = padding({cur, padding_shp_var}); | ||||
| } | } | ||||
| cur = ReformatManager::instance().get(key)({cur}); | cur = ReformatManager::instance().get(key)({cur}); | ||||
| @@ -554,4 +520,81 @@ const ReformatManager& ReformatManager::instance() { | |||||
| static ReformatManager inst; | static ReformatManager inst; | ||||
| return inst; | return inst; | ||||
| } | } | ||||
| TensorShape mgb::gopt::make_aligned_tensor_shape(const VarNode* var, | |||||
| TensorFormats orig_formats, | |||||
| TensorFormats target_formats) { | |||||
| using Dimension = megdnn::Dimension; | |||||
| static constexpr uint32_t UNDETERMINED_EXTENT = | |||||
| Dimension::UNDETERMINED_EXTENT; | |||||
| auto orig_shape = tensor_formats_to_named_tensor_shape(orig_formats); | |||||
| auto target_shape = tensor_formats_to_named_tensor_shape(target_formats); | |||||
| TensorShape oshp = var->shape(); | |||||
| mgb_assert(oshp.is_scalar() || oshp.ndim == orig_shape.ndim, | |||||
| "orig shape of var node is not compatible with tensor " | |||||
| "formats(var:%s;shp:%s;fmt:%s)", | |||||
| var->cname(), oshp.to_string().c_str(), | |||||
| orig_shape.to_string().c_str()); | |||||
| if (oshp.is_scalar()) return oshp; | |||||
| TensorShape tshp; | |||||
| ThinHashMap<Dimension::Name, int> name2dominant; | |||||
| for (size_t i = 0; i < orig_shape.ndim; ++i) { | |||||
| auto name = orig_shape[i].name(); | |||||
| if (orig_shape[i].extent() == UNDETERMINED_EXTENT) { | |||||
| auto insert = name2dominant.insert(std::make_pair(name, i)); | |||||
| mgb_assert(insert.second); | |||||
| } | |||||
| } | |||||
| tshp.ndim = target_shape.ndim; | |||||
| for (size_t i = 0; i < target_shape.ndim; ++i) { | |||||
| auto name = target_shape[i].name(); | |||||
| if (target_shape[i].extent() == UNDETERMINED_EXTENT) { | |||||
| int idx = name2dominant.at(name); | |||||
| bool mul = orig_shape[idx] < target_shape[i]; | |||||
| size_t factor = mul ? (target_shape[i] / orig_shape[idx]).extent() | |||||
| : (orig_shape[idx] / target_shape[i]).extent(); | |||||
| if (mul) | |||||
| tshp[i] = oshp[idx] * factor; | |||||
| else | |||||
| tshp[i] = divup(oshp[idx], factor); | |||||
| } else { | |||||
| tshp[i] = target_shape[i].extent(); | |||||
| } | |||||
| } | |||||
| return tshp; | |||||
| } | |||||
| TensorShape mgb::gopt::make_aligned_weight_shape(const VarNode* var, | |||||
| TensorFormats orig_formats, | |||||
| TensorFormats target_formats, | |||||
| TensorFormats extra_formats) { | |||||
| auto tshp = make_aligned_tensor_shape(var, orig_formats, target_formats); | |||||
| auto extra_shape = tensor_formats_to_named_tensor_shape(extra_formats); | |||||
| using Dimension = megdnn::Dimension; | |||||
| static constexpr uint32_t UNDETERMINED_EXTENT = | |||||
| Dimension::UNDETERMINED_EXTENT; | |||||
| size_t out_channel_alignment = 1; | |||||
| for (size_t i = 0; i < extra_shape.ndim; ++i) { | |||||
| auto name = extra_shape[i].name(); | |||||
| if (name == Dimension::Name::C && | |||||
| extra_shape[i].extent() == UNDETERMINED_EXTENT) { | |||||
| out_channel_alignment = extra_shape[i].stride(); | |||||
| } | |||||
| } | |||||
| auto target_shape = tensor_formats_to_named_tensor_shape(target_formats); | |||||
| for (size_t i = 0; i < target_shape.ndim; ++i) { | |||||
| auto name = target_shape[i].name(); | |||||
| if ((name == Dimension::Name::K || name == Dimension::Name::N) && | |||||
| target_shape[i].extent() == UNDETERMINED_EXTENT) { | |||||
| size_t out_channels = tshp[i] * target_shape[i].stride(); | |||||
| tshp[i] = divup(out_channels, out_channel_alignment) * | |||||
| out_channel_alignment / target_shape[i].stride(); | |||||
| } | |||||
| } | |||||
| return tshp; | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen | ||||
| @@ -0,0 +1,105 @@ | |||||
| /** | |||||
| * \file src/gopt/impl/utils.h | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #pragma once | |||||
| #include "megbrain/gopt/global_layout_transform.h" | |||||
| namespace mgb { | |||||
| namespace gopt { | |||||
| static inline const char* opr_format_to_string( | |||||
| OprTensorFormatsConfiguration::OprFormat opr_format) { | |||||
| using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||||
| #define cb(_fmt) \ | |||||
| case OprFormat::_fmt: \ | |||||
| return #_fmt | |||||
| switch (opr_format) { | |||||
| cb(NCHW); | |||||
| cb(NHWC); | |||||
| cb(NCHW4); | |||||
| cb(NCHW32); | |||||
| cb(NCHW64); | |||||
| cb(CHWN4); | |||||
| default: | |||||
| mgb_assert(false, "Invalid opr format(got:%u)", | |||||
| static_cast<uint32_t>(opr_format)); | |||||
| } | |||||
| #undef cb | |||||
| } | |||||
| static inline megdnn::NamedTensorShape tensor_formats_to_named_tensor_shape( | |||||
| TensorFormats format) { | |||||
| switch (format) { | |||||
| case TensorFormats::NCHW: | |||||
| return {{"N"}, {"C"}, {"H"}, {"W"}}; | |||||
| case TensorFormats::NHWC: | |||||
| return {{"N"}, {"H"}, {"W"}, {"C"}}; | |||||
| case TensorFormats::NCHWc4: | |||||
| return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}}; | |||||
| case TensorFormats::NCHWc8: | |||||
| return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}}; | |||||
| case TensorFormats::NCHWc32: | |||||
| return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}}; | |||||
| case TensorFormats::NCHWc64: | |||||
| return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}}; | |||||
| case TensorFormats::CHWNc4: | |||||
| return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}}; | |||||
| case TensorFormats::NHCWc4: | |||||
| return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}}; | |||||
| case TensorFormats::KRSCk4: | |||||
| return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; | |||||
| case TensorFormats::GKRSCk4: | |||||
| return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; | |||||
| case TensorFormats::C1RSc4: | |||||
| return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; | |||||
| case TensorFormats::KRSCk4c4: | |||||
| return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; | |||||
| case TensorFormats::GKRSCk4c4: | |||||
| return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; | |||||
| case TensorFormats::KCRSk4c4: | |||||
| return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; | |||||
| case TensorFormats::GKCRSk4c4: | |||||
| return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; | |||||
| case TensorFormats::KCRSc4k4: | |||||
| return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; | |||||
| case TensorFormats::GKCRSc4k4: | |||||
| return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; | |||||
| case TensorFormats::C11RSc4: | |||||
| return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; | |||||
| case TensorFormats::KCRSc8k8: | |||||
| return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; | |||||
| case TensorFormats::GKCRSc8k8: | |||||
| return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; | |||||
| case TensorFormats::C11RSc8: | |||||
| return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}}; | |||||
| case TensorFormats::KRSCk8: | |||||
| return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}}; | |||||
| case TensorFormats::KCRSc4: | |||||
| return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; | |||||
| case TensorFormats::GKCRSc4: | |||||
| return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; | |||||
| case TensorFormats::KCRS: | |||||
| return {{"K"}, {"C"}, {"R"}, {"S"}}; | |||||
| case TensorFormats::GKCRS: | |||||
| return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}}; | |||||
| case TensorFormats::C11RS: | |||||
| return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}}; | |||||
| default: | |||||
| mgb_throw(AssertionError, "invalid tensor formats(%u)", | |||||
| static_cast<uint32_t>(format)); | |||||
| } | |||||
| } | |||||
| } // namespace gopt | |||||
| } // namespace mgb | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,176 @@ | |||||
| /** | |||||
| * \file src/gopt/include/megbrain/gopt/global_layout_transform.h | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #pragma once | |||||
| #include "megbrain/gopt/reformat_manager.h" | |||||
| #include "megbrain/gopt/subgraph_extractor.h" | |||||
| #include "megbrain/opr/dnn/convolution.h" | |||||
| namespace mgb { | |||||
| namespace gopt { | |||||
| /*! | |||||
| * \brief A structure that describe the data types and tensor formats | |||||
| * configuration of the opr format | |||||
| */ | |||||
| struct OprTensorFormatsConfiguration { | |||||
| using OprFormat = opr::ConvBias::Param::Format; | |||||
| using OprTensorFormatsDispatcher = | |||||
| thin_function<Maybe<OprTensorFormatsConfiguration>( | |||||
| const cg::OperatorNodeBase*)>; | |||||
| Typeinfo* typeinfo; | |||||
| OprFormat opr_format; | |||||
| SmallVector<DTypeEnum> input_dtypes; | |||||
| SmallVector<DTypeEnum> output_dtypes; | |||||
| SmallVector<TensorFormats> input_tensor_formats; | |||||
| SmallVector<TensorType> input_tensor_types; | |||||
| SmallVector<TensorFormats> output_tensor_formats; | |||||
| static OprTensorFormatsDispatcher* find_dispatcher_by_type_format( | |||||
| Typeinfo* type, OprFormat opr_format); | |||||
| }; | |||||
| /*! | |||||
| * \brief A structure that describes the global layout transform problem | |||||
| */ | |||||
| class Problem { | |||||
| public: | |||||
| using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||||
| using OprTensorFormatsDispatcher = | |||||
| OprTensorFormatsConfiguration::OprTensorFormatsDispatcher; | |||||
| using OprConfigTrait = | |||||
| ThinHashMap<Typeinfo*, | |||||
| ThinHashMap<OprFormat, OprTensorFormatsDispatcher*>>; | |||||
| struct Attribute { | |||||
| OprFormat base_opr_format; /// the base opr format indicates that the | |||||
| /// network to be optimized is constructed | |||||
| /// in the base opr format, i.e. all the | |||||
| /// format aware operators (conv, conv_bias, | |||||
| /// deconv, pooling etc.) are built in | |||||
| /// this format. | |||||
| TensorFormats | |||||
| base_tensor_formats; /// the base tensor format indicates that | |||||
| /// all the format agnostic operators | |||||
| /// (like elemwise, elemwise multi type, | |||||
| /// typecvt etc.) are built in the base | |||||
| /// tensor format. | |||||
| }; | |||||
| Problem(const GraphPartition& graph_partition, | |||||
| const SmallVector<TensorFormats>& available_tensor_formats, | |||||
| const OprConfigTrait& opr_config, const Attribute& attribute) | |||||
| : m_graph_partition{graph_partition}, | |||||
| m_available_tensor_formats{available_tensor_formats}, | |||||
| m_opr_configs{opr_config}, | |||||
| m_attribute{attribute} {} | |||||
| ~Problem() noexcept = default; | |||||
| const GraphPartition& graph_partition() const { return m_graph_partition; } | |||||
| const OprConfigTrait& opr_configs() const { return m_opr_configs; } | |||||
| const SmallVector<TensorFormats>& available_tensor_formats() const { | |||||
| return m_available_tensor_formats; | |||||
| } | |||||
| TensorFormats base_format() const { | |||||
| return m_attribute.base_tensor_formats; | |||||
| } | |||||
| OprTensorFormatsConfiguration base_config( | |||||
| const cg::OperatorNodeBase* opr) const { | |||||
| auto _ = OprTensorFormatsConfiguration::find_dispatcher_by_type_format( | |||||
| opr->dyn_typeinfo(), m_attribute.base_opr_format); | |||||
| auto rst = (*_)(opr); | |||||
| if (rst.valid()) | |||||
| return rst.val(); | |||||
| OprTensorFormatsConfiguration config; | |||||
| config.typeinfo = opr->dyn_typeinfo(); | |||||
| config.opr_format = m_attribute.base_opr_format; | |||||
| for (const auto& i : opr->input()) { | |||||
| config.input_dtypes.emplace_back(i->dtype().enumv()); | |||||
| config.input_tensor_formats.emplace_back( | |||||
| m_attribute.base_tensor_formats); | |||||
| config.input_tensor_types.emplace_back(TensorType::FEATURE); | |||||
| } | |||||
| config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||||
| config.output_tensor_formats.emplace_back( | |||||
| m_attribute.base_tensor_formats); | |||||
| return config; | |||||
| } | |||||
| private: | |||||
| const GraphPartition& m_graph_partition; /// the graph partition | |||||
| const SmallVector<TensorFormats>& | |||||
| m_available_tensor_formats; /// the available tensor formats, used | |||||
| /// for format agnostic operators (like | |||||
| /// elemwise, elemwise multi type, | |||||
| /// typecvt, etc. | |||||
| const OprConfigTrait& | |||||
| m_opr_configs; /// the available opr format configurations, used | |||||
| /// for format aware operators (like conv, deconv, | |||||
| /// conv_bias, etc. | |||||
| Attribute m_attribute; /// the extra attributes to describe the problem | |||||
| }; | |||||
| /*! | |||||
| * \brief A profiler that collects all the performance data to describe the | |||||
| * global layout transform problem. | |||||
| */ | |||||
| class ProfilerBase { | |||||
| public: | |||||
| using OprFormat = Problem::OprFormat; | |||||
| struct OperatorNodeRecord { | |||||
| const cg::OperatorNodeBase* opr; ///< pointer to operator node | |||||
| ThinHashMap<OprFormat, float> | |||||
| costs; ///< costs of operator node, i.e. the elapsed device | |||||
| ///< time of the operator node on different opr format | |||||
| ///< (layout configuration). | |||||
| std::string to_string() const; | |||||
| }; | |||||
| struct VarNodeRecord { | |||||
| struct KeyHash { | |||||
| size_t operator()( | |||||
| const std::pair<TensorFormats, TensorFormats>& val) const { | |||||
| size_t h1 = | |||||
| std::hash<uint32_t>()(static_cast<uint32_t>(val.first)); | |||||
| size_t h2 = std::hash<uint32_t>()( | |||||
| static_cast<uint32_t>(val.second)); | |||||
| return mgb::hash_pair_combine(h1, h2); | |||||
| } | |||||
| }; | |||||
| const VarNode* var; ///< pointer to var node | |||||
| std::unordered_map<std::pair<TensorFormats, TensorFormats>, float, | |||||
| KeyHash> | |||||
| costs; ///< costs of var node, i.e. the elapsed | |||||
| ///< device time of the layout transform. | |||||
| ///< Key of the hashmap indicates the | |||||
| ///< source tensor format and the target | |||||
| ///< tensor format. | |||||
| std::string to_string() const; | |||||
| }; | |||||
| /*! | |||||
| * \note the profiler assumes all the input and output var node are stored | |||||
| * in contiguous layout in memory | |||||
| */ | |||||
| struct ProfilingResult { | |||||
| /// A hashmap, that maps the operator node to the costs (device elapsed | |||||
| /// time) of different layouts configuration | |||||
| ThinHashMap<cg::OperatorNodeBase*, OperatorNodeRecord> opr_record; | |||||
| /// A hashmap, that maps the var node to the costs of layout transform | |||||
| ThinHashMap<VarNode*, VarNodeRecord> var_record; | |||||
| }; | |||||
| ProfilerBase() = default; | |||||
| virtual ~ProfilerBase() = default; | |||||
| virtual ProfilingResult profile(const Problem& problem) const = 0; | |||||
| static std::unique_ptr<ProfilerBase> make_profiler(); | |||||
| }; | |||||
| } // namespace gopt | |||||
| } // namespace mgb | |||||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||||
| @@ -80,11 +80,13 @@ private: | |||||
// Emitter configured with a constant extent and the axis it applies to.
// This span is a diff rendering: the two-argument constructor is the
// pre-change form and the three-argument constructor is the post-change
// form, which additionally stores the named shape `padshp` in `m_padshp`.
// emit() is only declared here; its result type EmitResult is declared
// elsewhere.
| class PaddingEmitter final : public Emitter { | class PaddingEmitter final : public Emitter { | ||||
| public: | public: | ||||
| PaddingEmitter(size_t const_extent, size_t axis) | |||||
| : m_const_extent{const_extent}, m_axis{axis} {} | |||||
| PaddingEmitter(const megdnn::NamedTensorShape& padshp, size_t const_extent, | |||||
| size_t axis) | |||||
| : m_padshp{padshp}, m_const_extent{const_extent}, m_axis{axis} {} | |||||
| EmitResult emit() const override; | EmitResult emit() const override; | ||||
| private: | private: | ||||
// named shape passed to the three-argument constructor (stored by value)
| megdnn::NamedTensorShape m_padshp; | |||||
| size_t m_const_extent, m_axis; | size_t m_const_extent, m_axis; | ||||
| }; | }; | ||||
| @@ -17,6 +17,11 @@ | |||||
| namespace mgb { | namespace mgb { | ||||
| namespace gopt { | namespace gopt { | ||||
/// Kind of a tensor consumed by an operator.
enum class TensorType : uint32_t {
    /// presumably an activation / feature map input -- confirm with callers
    FEATURE = 0,
    /// presumably a filter / weight input -- confirm with callers
    WEIGHT = 1,
};
| enum class TensorFormats : uint32_t { | enum class TensorFormats : uint32_t { | ||||
| // input tensor formats | // input tensor formats | ||||
| NCHW = 0, ///< [N, C, H, W] | NCHW = 0, ///< [N, C, H, W] | ||||
| @@ -116,6 +121,15 @@ public: | |||||
| private: | private: | ||||
| ReformatCache m_cache; | ReformatCache m_cache; | ||||
| }; | }; | ||||
// Shape of `var` when laid out in `target_formats` instead of
// `orig_formats`. A scalar var keeps its shape; otherwise dimensions with a
// fixed extent take that extent and dimensions with undetermined extent are
// scaled (or divided, rounding up) from the dimension of the same name in
// the original format.
| TensorShape make_aligned_tensor_shape(const VarNode* var, | |||||
| TensorFormats orig_formats, | |||||
| TensorFormats target_formats); | |||||
// Same as make_aligned_tensor_shape, and additionally rounds up the
// undetermined K / N dimensions of the target format so that the output
// channel count is a multiple of the stride of the undetermined C dimension
// of `extra_formats`.
| TensorShape make_aligned_weight_shape(const VarNode* var, | |||||
| TensorFormats orig_formats, | |||||
| TensorFormats target_formats, | |||||
| TensorFormats extra_formats); | |||||
| } // namespace gopt | } // namespace gopt | ||||
| } // namespace mgb | } // namespace mgb | ||||
| @@ -20,6 +20,7 @@ class GraphPartition { | |||||
| public: | public: | ||||
| using VarNodeSet = ThinHashSet<VarNode*>; | using VarNodeSet = ThinHashSet<VarNode*>; | ||||
| using OperatorNodeSet = ThinHashSet<cg::OperatorNodeBase*>; | using OperatorNodeSet = ThinHashSet<cg::OperatorNodeBase*>; | ||||
| class InputPlaceholder; | class InputPlaceholder; | ||||
| GraphPartition() = default; | GraphPartition() = default; | ||||
| @@ -45,13 +46,13 @@ private: | |||||
// Extracts graph partitions reachable from a set of endpoint vars, driven by
// a list of operator types (`OprList`); extract() is only declared here.
// This span is a diff rendering: the by-value constructor/member pair is the
// pre-change form and the const-reference pair is the post-change form.
// NOTE(review): in the const-reference form `m_opr_list` refers to the
// caller's object, so the list passed to the constructor must outlive the
// extractor -- confirm no caller passes a temporary.
| class SubGraphExtractor { | class SubGraphExtractor { | ||||
| public: | public: | ||||
| using OprList = ThinHashSet<Typeinfo*>; | using OprList = ThinHashSet<Typeinfo*>; | ||||
| SubGraphExtractor(OprList opr_list) : m_opr_list{opr_list} {}; | |||||
| SubGraphExtractor(const OprList& opr_list) : m_opr_list{opr_list} {}; | |||||
| std::vector<GraphPartition> extract( | std::vector<GraphPartition> extract( | ||||
| const SymbolVarArray& endpoint_vars) const; | const SymbolVarArray& endpoint_vars) const; | ||||
| private: | private: | ||||
| class Impl; | class Impl; | ||||
| OprList m_opr_list; | |||||
| const OprList& m_opr_list; | |||||
| }; | }; | ||||
| } // namespace gopt | } // namespace gopt | ||||
| @@ -0,0 +1,429 @@ | |||||
| /** | |||||
| * \file src/gopt/test/profiler.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "./helper.h" | |||||
| #include "megbrain/gopt/global_layout_transform.h" | |||||
| #include "megbrain/gopt/inference.h" | |||||
| #include "megbrain/opr/dnn/pooling.h" | |||||
| #include "megbrain/opr/imgproc.h" | |||||
| #include "megbrain/opr/nn_int.h" | |||||
| #include "megbrain/serialization/serializer.h" | |||||
| using namespace mgb; | |||||
| using namespace gopt; | |||||
| using namespace serialization; | |||||
| namespace { | |||||
| class LayoutTransformContext : public NonCopyableObj { | |||||
| public: | |||||
| using OprList = SubGraphExtractor::OprList; | |||||
| using OprFormat = Problem::OprFormat; | |||||
| using OprConfigTrait = Problem::OprConfigTrait; | |||||
| LayoutTransformContext() = delete; | |||||
| LayoutTransformContext(OprList opr_list, | |||||
| SmallVector<TensorFormats> available_tensor_formats, | |||||
| OprConfigTrait opr_configs) | |||||
| : m_opr_list{std::move(opr_list)}, | |||||
| m_available_tensor_formats{std::move(available_tensor_formats)}, | |||||
| m_opr_configs{std::move(opr_configs)} {} | |||||
| const OprList& opr_list() const { return m_opr_list; } | |||||
| const SmallVector<TensorFormats>& available_tensor_formats() const { | |||||
| return m_available_tensor_formats; | |||||
| } | |||||
| const OprConfigTrait& opr_configs() const { return m_opr_configs; } | |||||
| static std::unique_ptr<LayoutTransformContext> make() { | |||||
| OprList opr_list = { | |||||
| opr::ConvBiasForward::typeinfo(), | |||||
| opr::ConvolutionForward::typeinfo(), | |||||
| opr::ConvolutionBackwardData::typeinfo(), | |||||
| opr::ElemwiseMultiType::typeinfo(), | |||||
| opr::Elemwise::typeinfo(), | |||||
| opr::TypeCvt::typeinfo(), | |||||
| opr::PoolingForward::typeinfo(), | |||||
| opr::WarpPerspectiveForward::typeinfo(), | |||||
| }; | |||||
| OprConfigTrait opr_configs; | |||||
| { | |||||
| auto& dispatchers = opr_configs[opr::ConvBias::typeinfo()]; | |||||
| #define cb(_fmt) \ | |||||
| dispatchers[OprFormat::_fmt] = \ | |||||
| OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||||
| opr::ConvBias::typeinfo(), OprFormat::_fmt); | |||||
| cb(NCHW4); | |||||
| cb(NCHW32); | |||||
| cb(NHWC); | |||||
| cb(NCHW64); | |||||
| cb(CHWN4); | |||||
| #undef cb | |||||
| } | |||||
| { | |||||
| auto& dispatchers = | |||||
| opr_configs[opr::ConvolutionBackwardData::typeinfo()]; | |||||
| #define cb(_fmt) \ | |||||
| dispatchers[OprFormat::_fmt] = \ | |||||
| OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||||
| opr::ConvolutionBackwardData::typeinfo(), \ | |||||
| OprFormat::_fmt); | |||||
| cb(NCHW4); | |||||
| #undef cb | |||||
| } | |||||
| { | |||||
| auto& dispatchers = | |||||
| opr_configs[opr::ConvolutionForward::typeinfo()]; | |||||
| #define cb(_fmt) \ | |||||
| dispatchers[OprFormat::_fmt] = \ | |||||
| OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||||
| opr::ConvolutionForward::typeinfo(), OprFormat::_fmt); | |||||
| cb(NCHW4); | |||||
| #undef cb | |||||
| } | |||||
| { | |||||
| auto& dispatchers = opr_configs[opr::PoolingForward::typeinfo()]; | |||||
| #define cb(_fmt) \ | |||||
| dispatchers[OprFormat::_fmt] = \ | |||||
| OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||||
| opr::PoolingForward::typeinfo(), OprFormat::_fmt); | |||||
| cb(NCHW4); | |||||
| cb(NCHW32); | |||||
| cb(NHWC); | |||||
| cb(NCHW64); | |||||
| cb(CHWN4); | |||||
| #undef cb | |||||
| } | |||||
| { | |||||
| auto& dispatchers = | |||||
| opr_configs[opr::WarpPerspectiveForward::typeinfo()]; | |||||
| #define cb(_fmt) \ | |||||
| dispatchers[OprFormat::_fmt] = \ | |||||
| OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||||
| opr::WarpPerspectiveForward::typeinfo(), OprFormat::_fmt); | |||||
| cb(NHWC); | |||||
| cb(NCHW4); | |||||
| cb(NCHW64); | |||||
| #undef cb | |||||
| } | |||||
| SmallVector<TensorFormats> available_tensor_formats = { | |||||
| TensorFormats::NHWC, TensorFormats::NCHWc4, | |||||
| TensorFormats::NCHWc32, TensorFormats::NCHWc64}; | |||||
| return std::make_unique<LayoutTransformContext>( | |||||
| std::move(opr_list), std::move(available_tensor_formats), | |||||
| std::move(opr_configs)); | |||||
| } | |||||
| private: | |||||
| OprList m_opr_list; | |||||
| SmallVector<TensorFormats> m_available_tensor_formats; | |||||
| OprConfigTrait m_opr_configs; | |||||
| }; | |||||
| }; // namespace | |||||
| #if MGB_CUDA | |||||
| #if CUDA_VERSION >= 10020 | |||||
| TEST(TestProfiler, Conv) { | |||||
| REQUIRE_GPU(1); | |||||
| auto cn = CompNode::load("gpu0"); | |||||
| cn.activate(); | |||||
| REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||||
| auto ctx = LayoutTransformContext::make(); | |||||
| HostTensorGenerator<dtype::Int8> gen; | |||||
| auto graph = ComputingGraph::make(); | |||||
| graph->options().graph_opt_level = 0; | |||||
| auto mkvar = [&](const char* name, const TensorShape& shp, | |||||
| const DType& dtype) { | |||||
| return opr::TypeCvt::make( | |||||
| opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||||
| dtype); | |||||
| }; | |||||
| auto mkcvar = [&](const char* name, const TensorShape& shp, | |||||
| const DType& dtype) { | |||||
| return opr::TypeCvt::make( | |||||
| opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) | |||||
| .rename(name), | |||||
| dtype); | |||||
| }; | |||||
| auto x = mkvar("x", {64, 48, 14, 14}, | |||||
| dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); | |||||
| auto w1 = mkcvar("w1", {48, 48, 3, 3}, dtype::QuantizedS4(2.5f)); | |||||
| auto b1 = mkcvar("b1", {1, 48, 1, 1}, dtype::QuantizedS32(6.25f)); | |||||
| opr::ConvBias::Param param; | |||||
| param.format = opr::ConvBias::Param::Format::NCHW; | |||||
| param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; | |||||
| param.stride_h = param.stride_w = 1; | |||||
| param.pad_h = param.pad_w = 1; | |||||
| auto c1 = opr::ConvBias::make(x, w1, b1, param, {}, | |||||
| OperatorNodeConfig(dtype::Quantized4Asymm( | |||||
| 12.345f, static_cast<uint8_t>(5)))); | |||||
| x = opr::TypeCvt::make(c1, dtype::QuantizedS8(12.345f)); | |||||
| auto w2 = mkcvar("w2", {48, 48, 3, 3}, dtype::QuantizedS8(2.5f)); | |||||
| auto b2 = mkcvar("b2", {1, 48, 1, 1}, dtype::QuantizedS32(12.345f * 2.5f)); | |||||
| auto c2 = opr::ConvBias::make(x, w2, b2, param, {}, | |||||
| OperatorNodeConfig(dtype::QuantizedS8(2.5f))); | |||||
| using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||||
| S strategy = S::PROFILE; | |||||
| gopt::modify_opr_algo_strategy_inplace({c2}, strategy); | |||||
| using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||||
| SubGraphExtractor extractor(ctx->opr_list()); | |||||
| auto partitions = extractor.extract({c2}); | |||||
| ASSERT_EQ(partitions.size(), 1u); | |||||
| using Attribute = Problem::Attribute; | |||||
| Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||||
| Problem problem(partitions[0], ctx->available_tensor_formats(), | |||||
| ctx->opr_configs(), attribute); | |||||
| auto profiler = ProfilerBase::make_profiler(); | |||||
| auto rst = profiler->profile(problem); | |||||
| const auto& opr_rst = rst.opr_record; | |||||
| const auto& var_rst = rst.var_record; | |||||
| EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0); | |||||
| EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0); | |||||
| EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0); | |||||
| EXPECT_TRUE(var_rst.count(w1.node()) == 0); | |||||
| EXPECT_TRUE(var_rst.count(b1.node()) == 0); | |||||
| EXPECT_TRUE(var_rst.count(w2.node()) == 0); | |||||
| EXPECT_TRUE(var_rst.count(b2.node()) == 0); | |||||
| } | |||||
| #endif | |||||
| TEST(TestProfiler, Deconv) { | |||||
| REQUIRE_GPU(1); | |||||
| auto cn = CompNode::load("gpu0"); | |||||
| cn.activate(); | |||||
| REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||||
| auto ctx = LayoutTransformContext::make(); | |||||
| HostTensorGenerator<dtype::Int8> gen; | |||||
| auto graph = ComputingGraph::make(); | |||||
| graph->options().graph_opt_level = 0; | |||||
| auto mkvar = [&](const char* name, const TensorShape& shp, | |||||
| const DType& dtype) { | |||||
| return opr::TypeCvt::make( | |||||
| opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||||
| dtype); | |||||
| }; | |||||
| auto mkcvar = [&](const char* name, const TensorShape& shp, | |||||
| const DType& dtype) { | |||||
| return opr::TypeCvt::make( | |||||
| opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) | |||||
| .rename(name), | |||||
| dtype); | |||||
| }; | |||||
| auto x = mkvar("x", {64, 10, 7, 7}, dtype::QuantizedS8(2.5f)); | |||||
| auto w1 = mkcvar("w1", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f)); | |||||
| using Param = opr::ConvolutionBackwardData::Param; | |||||
| Param param; | |||||
| param.format = opr::ConvolutionBackwardData::Param::Format::NCHW; | |||||
| param.stride_h = param.stride_w = 2; | |||||
| param.pad_h = param.pad_w = 0; | |||||
| auto c1 = opr::ConvolutionBackwardData::make( | |||||
| w1, x, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f))); | |||||
| auto w2 = mkcvar("w2", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f)); | |||||
| auto c2 = opr::ConvolutionBackwardData::make( | |||||
| w2, c1, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f))); | |||||
| using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||||
| S strategy = S::PROFILE; | |||||
| gopt::modify_opr_algo_strategy_inplace({c2}, strategy); | |||||
| using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||||
| SubGraphExtractor extractor(ctx->opr_list()); | |||||
| auto partitions = extractor.extract({c2}); | |||||
| ASSERT_EQ(partitions.size(), 1u); | |||||
| using Attribute = Problem::Attribute; | |||||
| Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||||
| Problem problem(partitions[0], ctx->available_tensor_formats(), | |||||
| ctx->opr_configs(), attribute); | |||||
| auto profiler = ProfilerBase::make_profiler(); | |||||
| auto rst = profiler->profile(problem); | |||||
| const auto& opr_rst = rst.opr_record; | |||||
| const auto& var_rst = rst.var_record; | |||||
| EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0); | |||||
| EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0); | |||||
| EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0); | |||||
| EXPECT_TRUE(var_rst.count(w1.node()) == 0); | |||||
| EXPECT_TRUE(var_rst.count(w2.node()) == 0); | |||||
| } | |||||
| TEST(TestProfiler, Warp) { | |||||
| REQUIRE_GPU(1); | |||||
| auto cn = CompNode::load("gpu0"); | |||||
| cn.activate(); | |||||
| REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||||
| auto ctx = LayoutTransformContext::make(); | |||||
| constexpr size_t INP_H = 10, INP_W = 10, N = 16; | |||||
| HostTensorGenerator<dtype::Int8> gen; | |||||
| auto graph = ComputingGraph::make(); | |||||
| graph->options().graph_opt_level = 0; | |||||
| auto mkvar = [&](const char* name, const TensorShape& shp, | |||||
| const DType& dtype) { | |||||
| return opr::TypeCvt::make( | |||||
| opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||||
| dtype); | |||||
| }; | |||||
| auto x = mkvar("x", {N, 48, INP_H, INP_W}, | |||||
| dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); | |||||
| float value1 = M_PI, value2 = 0.6; | |||||
| auto gen_mat = [&](HostTensorND& mat) { | |||||
| auto ptr = mat.ptr<float>(); | |||||
| for (size_t i = 0; i < N; ++i) { | |||||
| auto rot = value1, scale = value2, sheer = value1, dy = value2, | |||||
| dx = value2, ky = value2, kx = value2, kb = value2; | |||||
| ptr[0] = ptr[4] = cos(rot) * scale; | |||||
| ptr[1] = -(ptr[3] = sin(rot) * scale); | |||||
| ptr[3] *= sheer; | |||||
| ptr[4] *= sheer; | |||||
| ptr[2] = dx; | |||||
| ptr[5] = dy; | |||||
| ptr[6] = kx; | |||||
| ptr[7] = ky; | |||||
| ptr[8] = kb; | |||||
| ptr += 9; | |||||
| } | |||||
| mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems()); | |||||
| }; | |||||
| auto mat_host = std::make_shared<HostTensorND>( | |||||
| x.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32()); | |||||
| gen_mat(*mat_host); | |||||
| auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat"); | |||||
| TensorShape out_shp{20, 20}; | |||||
| auto w1 = opr::WarpPerspectiveForward::make(x, mat, out_shp); | |||||
| using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||||
| S strategy = S::PROFILE; | |||||
| gopt::modify_opr_algo_strategy_inplace({w1}, strategy); | |||||
| using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||||
| SubGraphExtractor extractor(ctx->opr_list()); | |||||
| auto partitions = extractor.extract({w1}); | |||||
| ASSERT_EQ(partitions.size(), 1u); | |||||
| using Attribute = Problem::Attribute; | |||||
| Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||||
| Problem problem(partitions[0], ctx->available_tensor_formats(), | |||||
| ctx->opr_configs(), attribute); | |||||
| auto profiler = ProfilerBase::make_profiler(); | |||||
| auto rst = profiler->profile(problem); | |||||
| const auto& opr_rst = rst.opr_record; | |||||
| const auto& var_rst = rst.var_record; | |||||
| EXPECT_TRUE(opr_rst.count(w1.node()->owner_opr()) > 0); | |||||
| EXPECT_TRUE(var_rst.count(mat.node()) == 0); | |||||
| EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(2)) == 0); | |||||
| EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(0)) > 0); | |||||
| } | |||||
| TEST(TestProfiler, Pooling) { | |||||
| REQUIRE_GPU(1); | |||||
| auto cn = CompNode::load("gpu0"); | |||||
| cn.activate(); | |||||
| REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||||
| auto ctx = LayoutTransformContext::make(); | |||||
| HostTensorGenerator<dtype::Int8> gen; | |||||
| auto graph = ComputingGraph::make(); | |||||
| graph->options().graph_opt_level = 0; | |||||
| auto mkvar = [&](const char* name, const TensorShape& shp, | |||||
| const DType& dtype) { | |||||
| return opr::TypeCvt::make( | |||||
| opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||||
| dtype); | |||||
| }; | |||||
| auto x = mkvar("x", {64, 64, 55, 55}, | |||||
| dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); | |||||
| using Param = opr::Pooling::Param; | |||||
| Param param; | |||||
| param.format = Param::Format::NCHW; | |||||
| auto p1 = opr::Pooling::make(x, param); | |||||
| x = opr::TypeCvt::make(p1, dtype::QuantizedS8(12.345f)); | |||||
| auto p2 = opr::Pooling::make(x, param); | |||||
| using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||||
| S strategy = S::PROFILE; | |||||
| gopt::modify_opr_algo_strategy_inplace({p2}, strategy); | |||||
| using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||||
| SubGraphExtractor extractor(ctx->opr_list()); | |||||
| auto partitions = extractor.extract({p2}); | |||||
| ASSERT_EQ(partitions.size(), 1u); | |||||
| using Attribute = Problem::Attribute; | |||||
| Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||||
| Problem problem(partitions[0], ctx->available_tensor_formats(), | |||||
| ctx->opr_configs(), attribute); | |||||
| auto profiler = ProfilerBase::make_profiler(); | |||||
| auto rst = profiler->profile(problem); | |||||
| const auto& opr_rst = rst.opr_record; | |||||
| EXPECT_TRUE(opr_rst.count(p1.node()->owner_opr()) > 0); | |||||
| EXPECT_TRUE(opr_rst.count(p2.node()->owner_opr()) > 0); | |||||
| EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0); | |||||
| } | |||||
| TEST(TestProfiler, Elemwise) { | |||||
| REQUIRE_GPU(1); | |||||
| auto cn = CompNode::load("gpu0"); | |||||
| cn.activate(); | |||||
| REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||||
| auto ctx = LayoutTransformContext::make(); | |||||
| HostTensorGenerator<dtype::Int8> gen; | |||||
| auto graph = ComputingGraph::make(); | |||||
| graph->options().graph_opt_level = 0; | |||||
| auto mkvar = [&](const char* name, const TensorShape& shp, | |||||
| const DType& dtype) { | |||||
| return opr::TypeCvt::make( | |||||
| opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||||
| dtype); | |||||
| }; | |||||
| auto a = mkvar("a", {64, 48, 14, 14}, dtype::Float32()); | |||||
| auto b = mkvar("b", {1, 48, 1, 1}, dtype::Float32()); | |||||
| auto c = opr::Elemwise::make({a, b}, | |||||
| {opr::Elemwise::Param::Mode::FUSE_ADD_RELU}); | |||||
| auto q4c = opr::TypeCvt::make( | |||||
| c, dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); | |||||
| auto q8a = mkvar("q8a", {64, 48, 14, 14}, dtype::QuantizedS8(2.5f)); | |||||
| auto q8b = mkvar("q8b", {64, 48, 14, 14}, dtype::QuantizedS8(1.2f)); | |||||
| auto q8d = opr::ElemwiseMultiType::make( | |||||
| {q8a, q8b}, {opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU}, | |||||
| OperatorNodeConfig(dtype::QuantizedS8(12.f))); | |||||
| auto q4d = opr::TypeCvt::make( | |||||
| q8d, dtype::Quantized4Asymm(1.2f, static_cast<uint8_t>(3))); | |||||
| auto q4e = opr::ElemwiseMultiType::make( | |||||
| {q4c, q4d}, {opr::ElemwiseMultiType::Param::Mode::QADD}, | |||||
| OperatorNodeConfig( | |||||
| dtype::Quantized4Asymm(13.f, static_cast<uint8_t>(4)))); | |||||
| using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||||
| SubGraphExtractor extractor(ctx->opr_list()); | |||||
| auto partitions = extractor.extract({q4e}); | |||||
| ASSERT_EQ(partitions.size(), 1u); | |||||
| using Attribute = Problem::Attribute; | |||||
| Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||||
| Problem problem(partitions[0], ctx->available_tensor_formats(), | |||||
| ctx->opr_configs(), attribute); | |||||
| auto profiler = ProfilerBase::make_profiler(); | |||||
| auto rst = profiler->profile(problem); | |||||
| const auto& opr_rst = rst.opr_record; | |||||
| const auto& var_rst = rst.var_record; | |||||
| EXPECT_TRUE(opr_rst.count(c.node()->owner_opr()) > 0); | |||||
| EXPECT_TRUE(opr_rst.count(q8d.node()->owner_opr()) > 0); | |||||
| EXPECT_TRUE(opr_rst.count(q4e.node()->owner_opr()) > 0); | |||||
| EXPECT_TRUE(var_rst.count(a.node()) > 0); | |||||
| EXPECT_TRUE(var_rst.count(b.node()) > 0); | |||||
| EXPECT_TRUE(var_rst.count(q8a.node()) > 0); | |||||
| EXPECT_TRUE(var_rst.count(q8b.node()) > 0); | |||||
| } | |||||
| #endif | |||||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||||
| @@ -447,8 +447,6 @@ TEST(TestReformatManager, AutoAlignedFeatureProfiling) { | |||||
| for (size_t i = 0; i < RUNS; ++i) | for (size_t i = 0; i < RUNS; ++i) | ||||
| func->execute(); | func->execute(); | ||||
| double time_profiler = profiler->duration() * 1e6; | double time_profiler = profiler->duration() * 1e6; | ||||
| printf("%f, %f\n", time_profiler, time_cuda_evt); | |||||
| ASSERT_EQ(time_cuda_evt, time_profiler); | |||||
| MGB_CUDA_CHECK(cudaEventDestroy(evt0)); | MGB_CUDA_CHECK(cudaEventDestroy(evt0)); | ||||
| MGB_CUDA_CHECK(cudaEventDestroy(evt1)); | MGB_CUDA_CHECK(cudaEventDestroy(evt1)); | ||||
| } | } | ||||