GitOrigin-RevId: a12a7d399a
tags/v1.3.0
@@ -0,0 +1,172 @@
/**
 * \file dnn/src/cuda/convolution/forward/algos.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
| #include "src/cuda/convolution/forward/algos.h" | |||
| #include "src/cuda/conv_bias/opr_impl.h" | |||
| #include "src/cuda/conv_bias/algo.h" | |||
| #include "src/common/algo_base.h" | |||
| #include "src/common/algo_chooser.h" | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| namespace { | |||
| std::pair<TensorLayoutArray, ConvBiasForward::Param> sub_opr_config( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| const TensorLayout& dst, const ConvolutionForwardImpl* opr) { | |||
| auto conv_param = opr->param(); | |||
| DType bias_type; | |||
| if (src.dtype.enumv() == DTypeEnum::QuantizedS8) { | |||
| bias_type = dtype::QuantizedS32( | |||
| src.dtype.param<dtype::QuantizedS8>().scale * | |||
| filter.dtype.param<dtype::QuantizedS8>().scale); | |||
| } else if (src.dtype.enumv() == DTypeEnum::Quantized8Asymm) { | |||
| bias_type = dtype::QuantizedS32( | |||
| src.dtype.param<dtype::Quantized8Asymm>().scale * | |||
| filter.dtype.param<dtype::Quantized8Asymm>().scale); | |||
| } else if (src.dtype.enumv() == DTypeEnum::Uint8 || | |||
| src.dtype.enumv() == DTypeEnum::Int8) { | |||
| bias_type = dtype::Int32{}; | |||
| } else if (src.dtype.enumv() == DTypeEnum::Quantized4Asymm) { | |||
| bias_type = dtype::QuantizedS32( | |||
| src.dtype.param<dtype::Quantized4Asymm>().scale * | |||
| filter.dtype.param<dtype::Quantized4Asymm>().scale); | |||
| } else { | |||
| megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT); | |||
| bias_type = src.dtype; | |||
| } | |||
| std::pair<TensorLayoutArray, ConvBiasForward::Param> ret; | |||
| ret.second = {param::ConvBias::NonlineMode::IDENTITY, | |||
| conv_param.mode, | |||
| conv_param.sparse, | |||
| conv_param.format, | |||
| conv_param.pad_h, | |||
| conv_param.pad_w, | |||
| conv_param.stride_h, | |||
| conv_param.stride_w, | |||
| conv_param.dilate_h, | |||
| conv_param.dilate_w, | |||
| conv_param.compute_mode}; | |||
| ret.first.push_back(TensorLayout({}, bias_type)); | |||
| ret.first.push_back(TensorLayout({}, dst.dtype)); | |||
| return ret; | |||
| } | |||
| } // namespace | |||
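// A worked example of the dtype deduction above (a sketch; the shapes and
// scales are illustrative, not taken from this diff): for a QuantizedS8 src
// with scale 0.5f convolved with a QuantizedS8 filter with scale 0.25f,
// sub_opr_config() deduces a bias dtype of QuantizedS32 with scale
// 0.5f * 0.25f = 0.125f, matching the scale of the s32 accumulator:
//
//     TensorLayout src({1, 8, 32, 32}, dtype::QuantizedS8(0.5f));
//     TensorLayout filter({8, 8, 3, 3}, dtype::QuantizedS8(0.25f));
//     TensorLayout dst({1, 8, 32, 32}, dtype::QuantizedS8(1.f));
//     auto config = sub_opr_config(src, filter, dst, opr);
//     // config.first[0].dtype == dtype::QuantizedS32(0.125f)  (bias)
//     // config.first[1].dtype == dst.dtype                    (z)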
ConvolutionForwardImpl::AlgoPack::AlgoPack() {
    all_algos.push_back(&algo_default);

    for (auto&& algo : all_algos) {
        m_all_algos_map.emplace(algo->info().desc, algo);
    }
}

ConvolutionForwardImpl::AlgoPack ConvolutionForwardImpl::sm_algo_pack;

MEGDNN_DEF_GET_ALGO_FROM_DESC(ConvolutionForwardImpl)

ConvolutionForwardImpl::AlgoBase::SizeArgs::SizeArgs(ConvolutionForwardImpl* o,
                                                     const TensorLayout& src,
                                                     const TensorLayout& filter,
                                                     const TensorLayout& dst)
        : opr{o}, layout_src{&src}, layout_filter{&filter}, layout_dst{&dst} {}

ConvolutionForwardImpl::AlgoBase::ExecArgs::ExecArgs(
        ConvolutionForwardImpl* opr, _megdnn_tensor_in src,
        _megdnn_tensor_in filter, _megdnn_tensor_out dst,
        _megdnn_workspace workspace)
        : SizeArgs(opr, src.layout, filter.layout, dst.layout),
          tensor_src{src},
          tensor_filter{filter},
          tensor_dst{dst},
          workspace{workspace} {}

std::string ConvolutionForwardImpl::AlgoBase::SizeArgs::to_string() const {
    return megdnn_mangle(ssprintf("src=%s, filter=%s, dst=%s",
                                  layout_src->to_string().c_str(),
                                  layout_filter->to_string().c_str(),
                                  layout_dst->to_string().c_str()));
}
/* ===================== default algo ===================== */
std::vector<Algorithm::SearchItem>
ConvolutionForwardImpl::AlgoDefault::get_subopr_list(
        const TensorLayoutArray& layouts, const OperatorBase* opr) const {
    auto&& config =
            sub_opr_config(layouts[0], layouts[1], layouts[2],
                           static_cast<const ConvolutionForwardImpl*>(opr));

    TensorLayoutArray conv_bias_layouts = {layouts[0], layouts[1],
                                           config.first[0], config.first[1],
                                           layouts[2]};
    std::string param_str;
    Algorithm::serialize_write_pod(config.second, param_str);
    return {{Algorithm::OprType::CONVBIAS_FORWARD, param_str,
             conv_bias_layouts}};
}

bool ConvolutionForwardImpl::AlgoDefault::is_available(
        const SizeArgs& args) const {
    auto conv_bias_opr =
            args.opr->handle()->create_operator<ConvBiasForward>();
    auto&& config = sub_opr_config(
            *args.layout_src, *args.layout_filter, *args.layout_dst,
            args.opr);
    conv_bias_opr->param() = config.second;
    return get_algorithm(static_cast<ConvBiasForwardImpl*>(conv_bias_opr.get()),
                         *args.layout_src, *args.layout_filter, config.first[0],
                         config.first[1], *args.layout_dst);
}

size_t ConvolutionForwardImpl::AlgoDefault::get_workspace_in_bytes(
        const SizeArgs& args) const {
    auto conv_bias_opr = args.opr->handle()->create_operator<ConvBiasForward>();
    if (args.opr->execution_policy().algo.valid() &&
        !args.opr->execution_policy().sub_policy.empty()) {
        megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
        conv_bias_opr->execution_policy() =
                args.opr->execution_policy().sub_policy[0];
    }
    auto&& config = sub_opr_config(
            *args.layout_src, *args.layout_filter, *args.layout_dst,
            args.opr);
    conv_bias_opr->param() = config.second;
    return conv_bias_opr->get_workspace_in_bytes(
            *args.layout_src, *args.layout_filter, config.first[0],
            config.first[1], *args.layout_dst, nullptr);
}

void ConvolutionForwardImpl::AlgoDefault::exec(const ExecArgs& args) const {
    auto conv_bias_opr = args.opr->handle()->create_operator<ConvBiasForward>();
    if (args.opr->execution_policy().algo.valid()) {
        megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
        conv_bias_opr->execution_policy() =
                args.opr->execution_policy().sub_policy[0];
    }
    auto&& config = sub_opr_config(
            *args.layout_src, *args.layout_filter, *args.layout_dst,
            args.opr);
    conv_bias_opr->param() = config.second;
    conv_bias_opr->exec(args.tensor_src, args.tensor_filter,
                        {nullptr, config.first[0]}, {nullptr, config.first[1]},
                        args.tensor_dst, nullptr, args.workspace);
}

// vim: syntax=cpp.doxygen
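// Usage sketch for the DEFAULT algo (hypothetical variable names `handle`,
// `conv_param`, `src`, `filter`, `dst`, `ws_ptr`; the API calls themselves
// appear in this diff): callers keep using the plain convolution interface,
// and AlgoDefault transparently forwards to ConvBiasForward with IDENTITY
// nonlinearity and null bias/z tensors:
//
//     auto conv = handle->create_operator<ConvolutionForward>();
//     conv->param() = conv_param;
//     size_t ws_size = conv->get_workspace_in_bytes(
//             src.layout, filter.layout, dst.layout, nullptr);
//     conv->exec(src, filter, dst, nullptr, {ws_ptr, ws_size});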
@@ -0,0 +1,111 @@
/**
 * \file dnn/src/cuda/convolution/forward/algos.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#pragma once

#include "megdnn/oprs.h"
#include "src/common/algo_base.h"
#include "src/common/metahelper.h"
#include "src/common/utils.h"
#include "src/cuda/convolution/opr_impl.h"

#include <unordered_map>

namespace megdnn {
namespace cuda {

/*!
 * \brief base class for ConvolutionForward algos
 */
class ConvolutionForwardImpl::AlgoBase : public Algorithm {
protected:
    ~AlgoBase() = default;

public:
    enum class AlgoType : uint32_t {
        CUDA_DEFAULT,
    };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
    struct SizeArgs {
        ConvolutionForwardImpl* opr;
        const TensorLayout *layout_src, *layout_filter, *layout_dst;

        std::string to_string() const;
        SizeArgs(ConvolutionForwardImpl* opr, const TensorLayout& src,
                 const TensorLayout& filter, const TensorLayout& dst);
    };
    struct ExecArgs : public SizeArgs {
        TensorND tensor_src, tensor_filter, tensor_dst;
        Workspace workspace;

        ExecArgs(ConvolutionForwardImpl* opr, _megdnn_tensor_in src,
                 _megdnn_tensor_in filter, _megdnn_tensor_out dst,
                 _megdnn_workspace workspace);
    };

    virtual bool is_available(const SizeArgs& args) const = 0;
    virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
    virtual void exec(const ExecArgs&) const = 0;

    bool is_available_wk(const SizeArgs& args, size_t limit) const {
        return is_available(args) && get_workspace_in_bytes(args) <= limit;
    }

    bool is_available_reproducible(
            const SizeArgs& args, bool reproducible = true,
            size_t limit = std::numeric_limits<size_t>::max()) const {
        return (!reproducible || is_reproducible()) &&
               is_available_wk(args, limit);
    }

    AlgoBase& check_workspace(const SizeArgs& args,
                              const Workspace& workspace) {
        auto req = get_workspace_in_bytes(args);
        megdnn_assert(req <= workspace.size,
                      "convolution fwd algo %s: required workspace %zu bytes, "
                      "got %zu",
                      name(), req, workspace.size);
        return *this;
    }
};

class ConvolutionForwardImpl::AlgoDefault final : public AlgoBase {
public:
    AlgoDefault() = default;
    bool is_available(const SizeArgs&) const override;
    size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override;
    const char* name() const override { return "DEFAULT"; }
    void exec(const ExecArgs&) const override;
    bool is_reproducible() const override { return true; }
    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;
    MEGDNN_DECL_ALGO_TYPE(CUDA_DEFAULT)
};

class ConvolutionForwardImpl::AlgoPack : NonCopyableObj {
private:
    AlgoBase::Mapper m_all_algos_map;

public:
    AlgoPack();

    AlgoDefault algo_default;

    std::vector<AlgoBase*> all_algos;

    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
};

}  // namespace cuda
}  // namespace megdnn

// vim: syntax=cpp.doxygen
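// Adding another algorithm later would follow the same shape as DEFAULT (a
// sketch, not part of this diff -- `AlgoFoo` / `CUDA_FOO` / `algo_foo` are
// hypothetical names): declare the class here, extend the AlgoType enum, add
// a member to AlgoPack, and register it in AlgoPack::AlgoPack() in algos.cpp:
//
//     class ConvolutionForwardImpl::AlgoFoo final : public AlgoBase {
//     public:
//         // is_available / get_workspace_in_bytes / exec / name() ...
//         MEGDNN_DECL_ALGO_TYPE(CUDA_FOO)
//     };
//     // in AlgoPack::AlgoPack():  all_algos.push_back(&algo_foo);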
@@ -12,6 +12,7 @@
#include "src/cuda/convolution/opr_impl.h"
#include "megdnn/dtype.h"
#include "src/cuda/convolution/helper.h"
#include "src/cuda/convolution/forward/algos.h"
#include "src/cuda/convolution/backward_data/algo.h"
#include "src/cuda/convolution/backward_filter/algo.h"
#include "src/cuda/conv_bias/opr_impl.h"
@@ -28,108 +29,34 @@ using namespace convolution;
        TO_STRING(CUDNN_MINOR) "." TO_STRING(CUDNN_PATCHLEVEL)

/* ============== ConvolutionForwardImpl ============== */
ConvolutionForwardImpl::ConvBiasExtraData
ConvolutionForwardImpl::conv_bias_extra_data(const TensorLayout& src,
                                             const TensorLayout& filter,
                                             const TensorLayout& dst) {
    auto conv_param = param();
    DType bias_type;
    if (src.dtype.enumv() == DTypeEnum::QuantizedS8) {
        bias_type = dtype::QuantizedS32(
                src.dtype.param<dtype::QuantizedS8>().scale *
                filter.dtype.param<dtype::QuantizedS8>().scale);
    } else if (src.dtype.enumv() == DTypeEnum::Quantized8Asymm) {
        bias_type = dtype::QuantizedS32(
                src.dtype.param<dtype::Quantized8Asymm>().scale *
                filter.dtype.param<dtype::Quantized8Asymm>().scale);
    } else if (src.dtype.enumv() == DTypeEnum::Uint8 ||
               src.dtype.enumv() == DTypeEnum::Int8) {
        bias_type = dtype::Int32{};
    } else if (src.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
        bias_type = dtype::QuantizedS32(
                src.dtype.param<dtype::Quantized4Asymm>().scale *
                filter.dtype.param<dtype::Quantized4Asymm>().scale);
    } else {
        megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT);
        bias_type = src.dtype;
    }
    ConvBiasExtraData ret = {this->handle()->create_operator<ConvBiasForward>(),
                             TensorLayout(bias_type), TensorLayout(dst.dtype)};
    ret.convbias_opr->param() = {param::ConvBias::NonlineMode::IDENTITY,
                                 conv_param.mode,
                                 conv_param.sparse,
                                 conv_param.format,
                                 conv_param.pad_h,
                                 conv_param.pad_w,
                                 conv_param.stride_h,
                                 conv_param.stride_w,
                                 conv_param.dilate_h,
                                 conv_param.dilate_w,
                                 conv_param.compute_mode};
    ret.convbias_opr->execution_policy() = {this->execution_policy().algo, {}};
    return ret;
}

ConvolutionForwardImpl::Algorithm*
ConvolutionForwardImpl::get_algorithm_heuristic(const TensorLayout& src,
                                                const TensorLayout& filter,
                                                const TensorLayout& dst,
                                                size_t workspace_limit_in_bytes,
                                                bool reproducible) {
    auto extra_data = conv_bias_extra_data(src, filter, dst);
    return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
            ->get_algorithm_heuristic(src, filter, extra_data.bias_layout,
                                      extra_data.z_layout, dst,
                                      workspace_limit_in_bytes, reproducible);
}

ConvolutionForwardImpl::Algorithm*
ConvolutionForwardImpl::get_algorithm_from_desc(
        const ConvolutionForward::AlgorithmDesc& desc) {
    auto conv_param = param();
    auto convbias_opr = this->handle()->create_operator<ConvBiasForward>();
    convbias_opr->param() = {param::ConvBias::NonlineMode::IDENTITY,
                             conv_param.mode,
                             conv_param.sparse,
                             conv_param.format,
                             conv_param.pad_h,
                             conv_param.pad_w,
                             conv_param.stride_h,
                             conv_param.stride_w,
                             conv_param.dilate_h,
                             conv_param.dilate_w,
                             conv_param.compute_mode};
    convbias_opr->execution_policy() = {this->execution_policy().algo, {}};
    return static_cast<ConvBiasForwardImpl*>(convbias_opr.get())
            ->get_algorithm_from_desc(desc);
    AlgoBase::SizeArgs args{this, src, filter, dst};
    MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes);
    MEGDNN_MARK_USED_VAR(reproducible);
    return &sm_algo_pack.algo_default;
}
std::vector<ConvolutionForwardImpl::Algorithm*>
ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src,
                                           const TensorLayout& filter,
                                           const TensorLayout& dst) {
    auto extra_data = conv_bias_extra_data(src, filter, dst);
    return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
            ->get_all_algorithms(src, filter, extra_data.bias_layout,
                                 extra_data.z_layout, dst);
    AlgoBase::SizeArgs args{this, src, filter, dst};
    return megdnn::get_all_algorithms<ConvolutionForwardImpl>(args);
}

size_t ConvolutionForwardImpl::get_workspace_in_bytes(
        const TensorLayout& src, const TensorLayout& filter,
        const TensorLayout& dst,
        const PreprocessedFilter* preprocessed_filter) {
    auto extra_data = conv_bias_extra_data(src, filter, dst);
    return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
            ->get_workspace_in_bytes(
                    src, filter, extra_data.bias_layout, extra_data.z_layout,
                    dst,
                    reinterpret_cast<const ConvolutionBase<
                            param::ConvBias>::PreprocessedFilter*>(
                            preprocessed_filter));
    MEGDNN_MARK_USED_VAR(preprocessed_filter);
    AlgoBase::SizeArgs args{this, src, filter, dst};
    return megdnn::get_algorithm(this, src, filter, dst)
            ->get_workspace_in_bytes(args);
}
void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
@@ -137,20 +64,15 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
                                  _megdnn_tensor_out dst,
                                  const PreprocessedFilter* preprocessed_filter,
                                  _megdnn_workspace workspace) {
    auto extra_data =
            conv_bias_extra_data(src.layout, filter.layout, dst.layout);
    TensorND bias(nullptr, extra_data.bias_layout);
    TensorND z(nullptr, extra_data.z_layout);
    return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
            ->exec(src, filter, bias, z, dst,
                   reinterpret_cast<const ConvolutionBase<
                           param::ConvBias>::PreprocessedFilter*>(
                           preprocessed_filter),
                   workspace);
    check_exec(src.layout, filter.layout, dst.layout, workspace.size,
               preprocessed_filter);
    AlgoBase::ExecArgs args(this, src, filter, dst, workspace);
    auto&& algo = get_algorithm(this, src.layout, filter.layout, dst.layout);
    algo->check_workspace(args, workspace).exec(args);
}

const char* ConvolutionForwardImpl::get_algorithm_set_name() const {
    return "CUDACONV0+CUDNN" CUDNN_VERSION_STR;
    return "CUDA CONVOLUTION_FORWARD";
}
/* ============== ConvolutionBackwardDataImpl ============== */
@@ -6,7 +6,8 @@
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#pragma once
@@ -16,58 +17,56 @@
namespace megdnn {
namespace cuda {

class ConvolutionForwardImpl: public ConvolutionForward {
    public:
        using ConvolutionForward::ConvolutionForward;
        void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
                  _megdnn_tensor_out dst,
                  const PreprocessedFilter* preprocessed_filter,
                  _megdnn_workspace workspace) override;
        size_t get_workspace_in_bytes(
                const TensorLayout& src, const TensorLayout& filter,
                const TensorLayout& dst,
                const PreprocessedFilter* preprocessed_filter) override;
        const char* get_algorithm_set_name() const override;
        SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
                const TensorLayout&, const TensorLayout&,
                const TensorLayout&) override {
            return {};
        }
        size_t get_preprocess_workspace_in_bytes(
                const TensorLayout& , const TensorLayout& ,
                const TensorLayout& ) override{
            return 0;
        }
        void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
                             const TensorLayout&, PreprocessedFilter*,
                             _megdnn_workspace) override {
            megdnn_throw("cuda exec_preprocess has not been implemented yet");
        }
        Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;

    protected:
        struct ConvBiasExtraData{
            std::unique_ptr<ConvBiasForward> convbias_opr;
            TensorLayout bias_layout;
            TensorLayout z_layout;
        };

        std::vector<Algorithm*> get_all_algorithms(
                const TensorLayout& src, const TensorLayout& filter,
                const TensorLayout& dst) override;
        Algorithm* get_algorithm_heuristic(const TensorLayout& src,
                                           const TensorLayout& filter,
                                           const TensorLayout& dst,
                                           size_t workspace_limit_in_bytes,
                                           bool reproducible) override;

    private:
        ConvBiasExtraData conv_bias_extra_data(const TensorLayout&,
                                               const TensorLayout&,
                                               const TensorLayout&);
class ConvolutionForwardImpl : public ConvolutionForward {
public:
    using ConvolutionForward::ConvolutionForward;
    void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
              _megdnn_tensor_out dst,
              const PreprocessedFilter* preprocessed_filter,
              _megdnn_workspace workspace) override;
    size_t get_workspace_in_bytes(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst,
            const PreprocessedFilter* preprocessed_filter) override;
    const char* get_algorithm_set_name() const override;

    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const TensorLayout&, const TensorLayout&,
            const TensorLayout&) override {
        return {};
    }

    size_t get_preprocess_workspace_in_bytes(const TensorLayout&,
                                             const TensorLayout&,
                                             const TensorLayout&) override {
        return 0;
    }

    void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
                         const TensorLayout&, PreprocessedFilter*,
                         _megdnn_workspace) override {
        megdnn_throw("cuda exec_preprocess has not been implemented yet");
    }

    Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;

    class AlgoBase;
    class AlgoDefault;
    class AlgoPack;

    static const AlgoPack& algo_pack() { return sm_algo_pack; }

protected:
    std::vector<Algorithm*> get_all_algorithms(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst) override;
    Algorithm* get_algorithm_heuristic(const TensorLayout& src,
                                       const TensorLayout& filter,
                                       const TensorLayout& dst,
                                       size_t workspace_limit_in_bytes,
                                       bool reproducible) override;

private:
    static AlgoPack sm_algo_pack;
};
class ConvolutionBackwardDataImpl : public ConvolutionBackwardData {
@@ -122,6 +121,7 @@ protected:
                                       const TensorLayout& grad,
                                       size_t workspace_limit_in_bytes,
                                       bool reproducible) override;

private:
    Algorithm* get_algorithm_heuristic(const TensorLayout& filter,
                                       const CanonizedFilterMeta& filter_meta,
@@ -141,12 +141,10 @@ public:
    size_t get_workspace_in_bytes(const TensorLayout& src,
                                  const TensorLayout& diff,
                                  const TensorLayout& grad) override;
    AlgorithmInfo get_algorithm_info_heuristic(const TensorLayout& src,
                                               const TensorLayout& diff,
                                               const TensorLayout& grad,
                                               const CanonizedFilterMeta& grad_meta,
                                               size_t workspace_limit_in_bytes,
                                               bool reproducible) {
    AlgorithmInfo get_algorithm_info_heuristic(
            const TensorLayout& src, const TensorLayout& diff,
            const TensorLayout& grad, const CanonizedFilterMeta& grad_meta,
            size_t workspace_limit_in_bytes, bool reproducible) {
        return get_algorithm_heuristic(src, diff, grad, grad_meta,
                                       workspace_limit_in_bytes, reproducible)
                ->info();
@@ -162,7 +160,6 @@ public:
                ->info();
    }
    const char* get_algorithm_set_name() const override;

    class AlgoBase;
@@ -187,6 +184,7 @@ protected:
                                       const TensorLayout& grad,
                                       size_t workspace_limit_in_bytes,
                                       bool reproducible) override;

private:
    Algorithm* get_algorithm_heuristic(const TensorLayout& src,
                                       const TensorLayout& diff,
@@ -532,6 +532,30 @@ private:
    bool* m_require_algo;
};

template <typename Opr>
void construct_sub_execution_policy_heuristic(ExecutionPolicy& policy,
                                              const TensorLayoutArray& layouts,
                                              const std::string& param,
                                              Handle* handle) {
    megdnn_assert(layouts.size() == OprTrait<Opr>::arity);
    auto opr = handle->create_operator<Opr>();
    opr->param() = Algorithm::deserialize_read_pod<typename Opr::Param>(param);
    if (!policy.algo.valid()) {
        policy.algo = AlgoProxy<Opr, OprTrait<Opr>::arity>::
                get_algorithm_info_heuristic(opr.get(), layouts).desc;
    }

    Algorithm* algo = opr->get_algorithm_from_desc(policy.algo);
    std::vector<Algorithm::SearchItem>&& sub_items =
            algo->get_subopr_list(layouts, opr.get());
    FOREACH_OPR_TYPE_DISPATCH(sub_items, {
        policy.sub_policy.push_back(ExecutionPolicy{});
        construct_sub_execution_policy_heuristic<_Opr>(
                policy.sub_policy.back(), _item.layouts, _item.param,
                handle);
    });
}

}  // namespace test
}  // namespace megdnn
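// A sketch of how the helper above is meant to be driven (it mirrors the test
// changes below; `opr`, `algo`, `ily`, `fly` and `oly` are the operator,
// algorithm info and layouts from the surrounding test): the top-level algo
// is fixed first, then the helper recursively fills in heuristic sub-policies
// for every sub-operator reported by get_subopr_list():
//
//     std::string param_str;
//     Algorithm::serialize_write_pod(opr->param(), param_str);
//     opr->execution_policy().algo = algo.desc;  // chosen top-level algo
//     construct_sub_execution_policy_heuristic<ConvolutionForward>(
//             opr->execution_policy(), {ily, fly, oly}, param_str,
//             opr->handle());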
@@ -570,6 +570,8 @@ void convolution::test_conv_config_combinations(int k_size,
                .set_param(param);
        auto opr = checker.opr();
        opr->param() = param;
        std::string param_str;
        Algorithm::serialize_write_pod(opr->param(), param_str);
        TensorLayout ily{ishp, inp_type}, fly{fshp, inp_type}, oly;
        oly.dtype = out_type;
        opr->deduce_layout(ily, fly, oly);
@@ -581,10 +583,14 @@ void convolution::test_conv_config_combinations(int k_size,
        for (auto algo : opr->get_all_algorithms_info(ily, fly, oly)) {
            used_algos.insert(algo.desc);
            opr->execution_policy().algo = algo.desc;
            construct_sub_execution_policy_heuristic<ConvolutionForward>(
                    opr->execution_policy(), {ily, fly, oly}, param_str,
                    opr->handle());
            checker
                    .set_epsilon(eps_getter(dtype == 1, 0, algo.name.c_str()))
                    .execs({ishp, fshp, {}});
            opr->execution_policy().algo.reset();
            opr->execution_policy() = {};
            ASSERT_TRUE(checker.prev_succ()) << errmsg(algo.name.c_str());
        }
@@ -597,13 +603,19 @@ void convolution::test_conv_config_combinations(int k_size,
        auto opr = checker_bwd_data.opr();
        opr->param() = param;
        std::string param_str;
        Algorithm::serialize_write_pod(opr->param(), param_str);
        for (auto algo: opr->get_all_algorithms_info(fly, oly, ily)) {
            used_algos_bwd_data.insert(algo.desc);
            opr->execution_policy().algo = algo.desc;
            construct_sub_execution_policy_heuristic<
                    ConvolutionBackwardData>(opr->execution_policy(),
                                             {fly, oly, ily}, param_str,
                                             opr->handle());
            checker_bwd_data
                    .set_epsilon(eps_getter(dtype == 1, 1, algo.name.c_str()))
                    .execl({fly, oly, ily});
            opr->execution_policy().algo.reset();
            opr->execution_policy() = {};
            ASSERT_TRUE(checker_bwd_data.prev_succ()) <<
                errmsg(algo.name.c_str());
        }

@@ -618,13 +630,19 @@ void convolution::test_conv_config_combinations(int k_size,
        auto opr = checker_bwd_filter.opr();
        opr->param() = param;
        std::string param_str;
        Algorithm::serialize_write_pod(opr->param(), param_str);
        for (auto algo: opr->get_all_algorithms_info(ily, oly, fly)) {
            used_algos_bwd_flt.insert(algo.desc);
            opr->execution_policy().algo = algo.desc;
            construct_sub_execution_policy_heuristic<
                    ConvolutionBackwardFilter>(opr->execution_policy(),
                                               {ily, oly, fly}, param_str,
                                               opr->handle());
            checker_bwd_filter
                    .set_epsilon(eps_getter(dtype == 1, 2, algo.name.c_str()))
                    .execl({ily, oly, fly});
            opr->execution_policy().algo.reset();
            opr->execution_policy() = {};
            ASSERT_TRUE(checker_bwd_filter.prev_succ()) <<
                errmsg(algo.name.c_str());
        }
@@ -338,6 +338,7 @@ struct OprProxyProfilingBase
                              FastRunCache& cache) {
        megdnn_assert(layouts.size() == arity);
        auto opr = handle->create_operator<Opr>();
        opr->param() =
                Algorithm::deserialize_read_pod<typename Opr::Param>(param);
        SmallVector<size_t> sizes_in_bytes;
@@ -427,9 +428,9 @@ struct OprProxyProfilingBase
        auto&& search_items =
                flatten_search_space(layouts, param_str, opr->handle());
        FOREACH_OPR_TYPE_DISPATCH(search_items, {
            OprProxyProfilingBase<_Opr>::search(_item.layouts, param_str, W,
                                                opr->handle(), warmup_times,
                                                exec_times, cache);
            OprProxyProfilingBase<_Opr>::search(
                    _item.layouts, _item.param, W, opr->handle(),
                    warmup_times, exec_times, cache);
        });
        construct_execution_policy(layouts, param_str, opr->handle(), cache,
@@ -273,10 +273,14 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "CHANNEL_WISE", {})
                    .c_str(),
            ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE", {})
                              .c_str(),
                      {}}}},
            &require_algo));
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
@@ -306,8 +310,12 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_SMALL) {
    Checker<Convolution> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "CHANNEL_WISE_SMALL", {}).c_str(),
            ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE_SMALL", {})
                              .c_str(),
                      {}}}},
            &require_algo));
    for (auto dtype : std::vector<DType> {
             dtype::Float32(),
@@ -338,6 +346,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) {
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CHANNEL_WISE", &require_algo));

    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        if (dtype.enumv() == DTypeEnum::Float16)
@@ -368,9 +377,8 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA) {
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_SMALL) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>(
                    "CHANNEL_WISE_SMALL", &require_algo));
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CHANNEL_WISE_SMALL", &require_algo));
    for (auto dtype : std::vector<DType> {
             dtype::Float32(),
#if CUDA_VERSION >= 9000
@@ -396,10 +404,14 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
        "CHANNEL_WISE", &require_algo));
            "CHANNEL_WISE", &require_algo));
    UniformFloatRNG rng(-0.1, 0.1);
    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype).set_rng(0, &rng).set_rng(1, &rng);
        checker.set_dtype(0, dtype)
                .set_dtype(1, dtype)
                .set_dtype(2, dtype)
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        if (dtype.enumv() == DTypeEnum::Float16)
            checker.set_epsilon(2e-1);
        // simple case
@@ -514,7 +526,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) {
    auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
                   size_t FW) {
        checker.proxy()->target_execution_policy.algo.reset();
        checker.proxy()->target_execution_policy = {};
        checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}});
    };

@@ -614,7 +626,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
            .set_dtype(2, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng);
    bencher.proxy()->target_execution_policy.algo.reset();
    bencher.proxy()->target_execution_policy = {};
    auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;

    bencher.set_param(param)
@@ -623,7 +635,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
            .set_dtype(2, dtype::Float16())
            .set_rng(0, &rng)
            .set_rng(1, &rng);
    bencher.proxy()->target_execution_policy.algo.reset();
    bencher.proxy()->target_execution_policy = {};
    auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;

    bencher.proxy()->target_execution_policy.algo.reset();
@@ -677,10 +689,13 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 1;
    bencher.set_display(false).set_times(RUNS);
    bencher.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "CHANNEL_WISE", {})
                    .c_str()));
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                              "CHANNEL_WISE", {})
                              .c_str(),
                      {}}}}));

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
@@ -783,17 +798,24 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT_SMALL) {
            .set_dtype(2, dtype::Float32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_before_exec_callback(AlgoChecker<ConvolutionForward>(
                    ConvBiasForward::algo_name<
                            ConvBiasForward::DirectParam>("CHANNEL_WISE",
                                                          {})
                            .c_str()));
            .set_before_exec_callback(
                    AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                            "DEFAULT",
                            {{ConvBiasForward::algo_name<
                                      ConvBiasForward::DirectParam>(
                                      "CHANNEL_WISE", {})
                                      .c_str(),
                              {}}}}));
    auto time_in_ms_fp32_normal = bencher.execs({src, filter, {}}) / RUNS;

    bencher.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "CHANNEL_WISE", {})
                    .c_str()));
            ExecutionPolicyAlgoName{"DEFAULT",
                                    {{ConvBiasForward::algo_name<
                                              ConvBiasForward::DirectParam>(
                                              "CHANNEL_WISE", {})
                                              .c_str(),
                                      {}}}}));
    auto time_in_ms_fp32_small = bencher.execs({src, filter, {}}) / RUNS;

    bencher.set_param(param)
@@ -135,10 +135,13 @@ TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) {
            .set_rng(1, &int_rng)
            .set_param(param);
    checker.set_before_exec_callback(AlgoChecker<Convolution>(
            ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                    "MATMUL8X8X32", {})
                    .c_str()));
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                              "MATMUL8X8X32", {})
                              .c_str(),
                      {}}}}));

    param.sparse = Convolution::Param::Sparse::DENSE;
    param.pad_h = param.pad_w = 1;
@@ -30,19 +30,26 @@ TEST_F(CUDA, DILATED_CONVOLUTION_FORWARD)
    auto args = get_dilated_args();
    Checker<ConvolutionForward> checker(handle_cuda());
#if CUDNN_VERSION >= 7500
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
                    "CUDNN:Convolution:CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_"
                    "PRECOMP_"
                    "GEMM" CUDNN_VERSION_STRING,
                    {})
                    .c_str()));
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
                              "CUDNN:Convolution:CUDNN_CONVOLUTION_FWD_ALGO_"
                              "IMPLICIT_"
                              "PRECOMP_"
                              "GEMM" CUDNN_VERSION_STRING,
                              {})
                              .c_str(),
                      {}}}}));
    printf("cudnn version >= 7.5, use cudnn impl for dilated convolution\n");
#else
    checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
            ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>("MATMUL",
                                                                     {})
                    .c_str()));
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                              "MATMUL", {})
                              .c_str(),
                      {}}}}));
#endif
    NormalRNG default_rng;
    for (auto &&arg: args) {
@@ -116,12 +116,17 @@ TEST_F(CUDA, GROUP_CONV_FORWARD_1x1) {
    std::string conv1x1_name =
            ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                    "MATMUL1X1", {});
    checker.set_before_exec_callback(AlgoChecker<Convolution>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    ssprintf("%s:%s", "CUDA:GROUP_CONV",
                             conv1x1_name.c_str()),
                    {})
                    .c_str()));
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<
                              ConvBiasForward::DirectParam>(
                              ssprintf("%s:%s", "CUDA:GROUP_CONV",
                                       conv1x1_name.c_str())
                                      .c_str(),
                              {})
                              .c_str(),
                      {}}}}));
#endif
    Convolution::Param param;
    param.sparse = Convolution::Param::Sparse::GROUP;
@@ -231,7 +231,7 @@ void AlgoChooser<Opr>::profile(ExeContext& ctx, bool require_reproducible) {
                  algo.name.c_str(), str_on_inp_shape.c_str());
        ImplExecutionPolicy policy;
        policy.algo = algo.desc;
        ctx.construct_execution_policy_from_cache(require_reproducible, policy);
        ctx.construct_execution_policy(require_reproducible, policy);
        if (ctx.get_workspace_size_bytes(policy) >= workspace_limit)
            continue;

@@ -302,7 +302,7 @@ AlgoChooser<Opr>::choose_by_profile(ExeContext& ctx, bool require_reproducible,
        });
    }
    typename AlgoChooser<Opr>::ImplExecutionPolicy policy;
    ctx.construct_execution_policy_from_cache(require_reproducible, policy);
    ctx.construct_execution_policy(require_reproducible, policy);
    return policy;
    MIDOUT_E
}
@@ -324,6 +324,11 @@ size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts,
    ImplExecutionPolicy policy;
    if (auto algo_choose_hook = mgb_opr->algo_chooser()) {
        policy = algo_choose_hook(mgb_opr);
        ctx.construct_execution_policy(
                mgb_opr->execution_policy().strategy ==
                        mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::
                                HEURISTIC_REPRODUCIBLE,
                policy, false);
    }
    if (!policy.algo.valid()) {
        policy = get_policy(ctx);
@@ -520,13 +525,26 @@ AlgoChooser<Opr>::ExeContext::get_all_candidates() const {
}

template <typename Opr>
void AlgoChooser<Opr>::ExeContext::construct_execution_policy_from_cache(
void AlgoChooser<Opr>::ExeContext::construct_execution_policy(
        bool require_reproducible,
        typename AlgoChooser<Opr>::ImplExecutionPolicy& policy) const {
        typename AlgoChooser<Opr>::ImplExecutionPolicy& policy,
        bool retrieve_from_cache) const {
    if (!policy.algo.valid()) {
        policy.algo = get_profile_result_from_cache(require_reproducible).desc;
        if (retrieve_from_cache) {
            policy.algo =
                    get_profile_result_from_cache(require_reproducible).desc;
        } else {
            auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit(
                    owner_graph(), m_cn, m_execution_policy.workspace_limit);
            policy.algo = APPLY(m_megdnn_opr->get_algorithm_info_heuristic(
                                        args..., workspace_limit,
                                        require_reproducible),
                                m_layouts)
                                  .desc;
        }
        mgb_assert(policy.algo.valid(),
                   "No cache found, maybe some error occurred");
                   "No algo found from cache or heuristic, maybe some error "
                   "occurred");
    }

    Algorithm* algo = m_megdnn_opr->get_algorithm_from_desc(policy.algo);
@@ -544,8 +562,9 @@ void AlgoChooser<Opr>::ExeContext::construct_execution_policy_from_cache(
                _item.param, m_base_mgb_opr, m_cn, m_execution_policy,
                m_allow_weight_preprocess);
        policy.sub_policy.push_back({});
        sub_ctx.construct_execution_policy_from_cache(require_reproducible,
                                                      policy.sub_policy.back());
        sub_ctx.construct_execution_policy(require_reproducible,
                                           policy.sub_policy.back(),
                                           retrieve_from_cache);
    });

    return;
@@ -672,11 +691,11 @@ AlgoChooser<Opr>::ExeContext::construct_fake_preprocess_filter() const {
            AlgoChooser<megdnn::Opr>::ExeContext::get_workspace_size_bytes(   \
                    const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \
                            policy) const;                                    \
    template void AlgoChooser<megdnn::Opr>::ExeContext::                      \
            construct_execution_policy_from_cache(                            \
                    bool require_reproducible,                                \
                    typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy&   \
                            policy) const;                                    \
    template void                                                             \
    AlgoChooser<megdnn::Opr>::ExeContext::construct_execution_policy(         \
            bool require_reproducible,                                        \
            typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& policy,   \
            bool retrieve_from_cache) const;                                  \
    template Maybe<AlgoChooserProfileCache::ResultEntry>                      \
    AlgoChooser<megdnn::Opr>::ExeContext::profile_single_algo(                \
            const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy&     \
                    policy,                                                   \
@@ -129,13 +129,16 @@ public:
    ImplAlgo get_profile_result_from_cache(bool require_reproducible) const;

    /**
     * \brief construct execution policy from cache.
     * \brief construct execution policy from cache or heuristic.
     *
     * \param require_reproducible select an algo that is reproducible
     * \param policy execution policy
     * \param retrieve_from_cache if true, retrieve the algo from the profile
     *     cache; otherwise pick it heuristically.
     */
    void construct_execution_policy_from_cache(
            bool require_reproducible, ImplExecutionPolicy& policy) const;
    void construct_execution_policy(
            bool require_reproducible, ImplExecutionPolicy& policy,
            bool retrieve_from_cache = true) const;

private:
    Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const;