GitOrigin-RevId: 27abd22295
tags/v1.9.0
| @@ -18,6 +18,7 @@ file( | |||||
| opr/impl/nvof/*.cpp | opr/impl/nvof/*.cpp | ||||
| plugin/impl/*.cpp | plugin/impl/*.cpp | ||||
| serialization/impl/*.cpp | serialization/impl/*.cpp | ||||
| rdnn/impl/*.cpp | |||||
| core/impl/*.inl | core/impl/*.inl | ||||
| gopt/impl/*.inl | gopt/impl/*.inl | ||||
| opr/impl/*.inl | opr/impl/*.inl | ||||
| @@ -53,7 +54,8 @@ set(MGB_INC | |||||
| ${CMAKE_CURRENT_LIST_DIR}/gopt/include | ${CMAKE_CURRENT_LIST_DIR}/gopt/include | ||||
| ${CMAKE_CURRENT_LIST_DIR}/opr/include | ${CMAKE_CURRENT_LIST_DIR}/opr/include | ||||
| ${CMAKE_CURRENT_LIST_DIR}/plugin/include | ${CMAKE_CURRENT_LIST_DIR}/plugin/include | ||||
| ${CMAKE_CURRENT_LIST_DIR}/serialization/include) | |||||
| ${CMAKE_CURRENT_LIST_DIR}/serialization/include | |||||
| ${CMAKE_CURRENT_LIST_DIR}/rdnn/include) | |||||
| if(MGE_WITH_JIT) | if(MGE_WITH_JIT) | ||||
| list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/jit/include) | list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/jit/include) | ||||
| @@ -183,7 +183,7 @@ struct OprWithPolicyMaker<opr::BatchConvBiasForward> | |||||
| MakeOprWithPolicyCaller4<megdnn::BatchConvBiasForward>, | MakeOprWithPolicyCaller4<megdnn::BatchConvBiasForward>, | ||||
| megdnn::param::BatchConvBias> {}; | megdnn::param::BatchConvBias> {}; | ||||
| #include "../../opr/impl/internal/invoke.h" | |||||
| #include "megbrain/utils/invoke.h" | |||||
| template <typename Opr> | template <typename Opr> | ||||
| struct MultiAlgoOprTrait; | struct MultiAlgoOprTrait; | ||||
| @@ -23,8 +23,8 @@ | |||||
| #include "megbrain/opr/imgproc.h" | #include "megbrain/opr/imgproc.h" | ||||
| #include "megbrain/opr/misc.h" | #include "megbrain/opr/misc.h" | ||||
| #include "megbrain/opr/nn_int.h" | #include "megbrain/opr/nn_int.h" | ||||
| #include "megbrain/opr/search_policy/algo_chooser.h" | |||||
| #include "megbrain/opr/search_policy/algo_chooser_helper.h" | #include "megbrain/opr/search_policy/algo_chooser_helper.h" | ||||
| #include "megbrain/opr/search_policy/profiler.h" | |||||
| #include "megbrain/opr/tensor_gen.h" | #include "megbrain/opr/tensor_gen.h" | ||||
| #include "megbrain/opr/tensor_manip.h" | #include "megbrain/opr/tensor_manip.h" | ||||
| #include "megbrain/opr/utility.h" | #include "megbrain/opr/utility.h" | ||||
| @@ -19,7 +19,6 @@ | |||||
| #include "megbrain/opr/tensor_manip.h" | #include "megbrain/opr/tensor_manip.h" | ||||
| #include "megbrain/opr/search_policy/algo_chooser.h" | #include "megbrain/opr/search_policy/algo_chooser.h" | ||||
| #include "megbrain/opr/search_policy/profiler.h" | |||||
| #include "./internal/megdnn_opr_wrapper.inl" | #include "./internal/megdnn_opr_wrapper.inl" | ||||
| #include "./search_policy/workspace_need_limit_getter.inl" | #include "./search_policy/workspace_need_limit_getter.inl" | ||||
| @@ -18,11 +18,11 @@ | |||||
| #include "megbrain/graph/grad_impl.h" | #include "megbrain/graph/grad_impl.h" | ||||
| #include "megbrain/system.h" | #include "megbrain/system.h" | ||||
| #include "megbrain/utils/hash_ct.h" | #include "megbrain/utils/hash_ct.h" | ||||
| #include "megbrain/utils/invoke.h" | |||||
| #include "megbrain/utils/timer.h" | #include "megbrain/utils/timer.h" | ||||
| #include "megdnn/oprs/utils.h" | #include "megdnn/oprs/utils.h" | ||||
| #include "../internal/invoke.h" | |||||
| #include "../internal/megdnn_opr_wrapper.inl" | #include "../internal/megdnn_opr_wrapper.inl" | ||||
| #include "../search_policy/workspace_need_limit_getter.inl" | #include "../search_policy/workspace_need_limit_getter.inl" | ||||
| @@ -25,26 +25,6 @@ using namespace mixin; | |||||
| /* ================== global functions ================== */ | /* ================== global functions ================== */ | ||||
| namespace { | namespace { | ||||
| template <class Opr> | |||||
| class MegDNNGlobalOprContainer final : public UserDataContainer::UserData { | |||||
| MGB_TYPEINFO_OBJ_DECL; | |||||
| std::shared_ptr<megdnn::Handle> m_megdnn_handle; | |||||
| std::unique_ptr<Opr> m_opr; | |||||
| public: | |||||
| MegDNNGlobalOprContainer(CompNode cn) | |||||
| : m_megdnn_handle{get_megdnn_handle_shared(cn)}, | |||||
| m_opr{m_megdnn_handle->create_operator<Opr>()} { | |||||
| mgb_assert(m_opr->is_thread_safe()); | |||||
| } | |||||
| Opr* get() const { return m_opr.get(); } | |||||
| }; | |||||
| template <class Opr> | |||||
| MGB_TYPEINFO_OBJ_IMPL(MegDNNGlobalOprContainer<Opr>); | |||||
| class TempStorageContainer final : public UserDataContainer::UserData { | class TempStorageContainer final : public UserDataContainer::UserData { | ||||
| MGB_TYPEINFO_OBJ_DECL; | MGB_TYPEINFO_OBJ_DECL; | ||||
| @@ -55,34 +35,6 @@ public: | |||||
| MGB_TYPEINFO_OBJ_IMPL(TempStorageContainer); | MGB_TYPEINFO_OBJ_IMPL(TempStorageContainer); | ||||
| } // anonymous namespace | } // anonymous namespace | ||||
| std::shared_ptr<megdnn::Handle> intl::get_megdnn_handle_shared(CompNode comp_node) { | |||||
| auto& handle = MegDNNHandle::get(CompNodeEnv::from_comp_node(comp_node)); | |||||
| return {handle.shared_from_this(), handle.handle()}; | |||||
| } | |||||
| megdnn::Handle* intl::get_megdnn_handle(CompNode comp_node) { | |||||
| return MegDNNHandle::get(CompNodeEnv::from_comp_node(comp_node)).handle(); | |||||
| } | |||||
| template <typename Opr> | |||||
| Opr* intl::get_megdnn_global_opr(CompNode comp_node) { | |||||
| using T = MegDNNGlobalOprContainer<Opr>; | |||||
| auto maker = [comp_node]() { return std::make_shared<T>(comp_node); }; | |||||
| return CompNodeEnv::from_comp_node(comp_node).get_user_data<T>(maker).get(); | |||||
| } | |||||
| namespace mgb { | |||||
| namespace opr { | |||||
| namespace intl { | |||||
| #define INST(o) template o* get_megdnn_global_opr<o>(CompNode) | |||||
| INST(megdnn::AddUpdate); | |||||
| INST(megdnn::Relayout); | |||||
| INST(megdnn::Checksum); | |||||
| #undef INST | |||||
| } // namespace intl | |||||
| } // namespace opr | |||||
| } // namespace mgb | |||||
| DeviceTensorStorage& intl::get_temp_storage(ComputingGraph& graph, CompNode comp_node) { | DeviceTensorStorage& intl::get_temp_storage(ComputingGraph& graph, CompNode comp_node) { | ||||
| auto container = | auto container = | ||||
| graph.options().user_data.get_user_data_or_create<TempStorageContainer>(); | graph.options().user_data.get_user_data_or_create<TempStorageContainer>(); | ||||
| @@ -1,413 +0,0 @@ | |||||
| /** | |||||
| * \file src/opr/impl/search_policy/profile.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "megbrain/opr/search_policy/profiler.h" | |||||
| #include "../internal/invoke.h" | |||||
| #include "../internal/megdnn_opr_wrapper.inl" | |||||
| #include "megdnn/handle.h" | |||||
| #include "megdnn/oprs/base.h" | |||||
| #if MGB_ROCM | |||||
| #include "hcc_detail/hcc_defs_prologue.h" | |||||
| #include "megcore_rocm.h" | |||||
| #endif | |||||
| //! TODO: here has to be know some megdnn::opr when there is produced midout.h | |||||
| //! fix it if there is another graceful way. | |||||
| #include "megdnn/oprs.h" | |||||
| #include "midout.h" | |||||
| MIDOUT_DECL(megbrain_opr_profile) | |||||
| #define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_profile, __VA_ARGS__) { | |||||
| #define MIDOUT_E \ | |||||
| } \ | |||||
| MIDOUT_END(); | |||||
| namespace { | |||||
| std::string serialize_policy(const megdnn::ExecutionPolicy& policy) { | |||||
| std::string ret; | |||||
| //! serialize AlgorithmDesc | |||||
| megdnn::Algorithm::serialize_write_pod(policy.algo.handle_type, ret); | |||||
| megdnn::Algorithm::serialize_write_pod(policy.algo.type, ret); | |||||
| uint32_t param_size = policy.algo.param.size(); | |||||
| uint32_t name_size = policy.algo.name.size(); | |||||
| megdnn::Algorithm::serialize_write_pod<uint32_t>(param_size, ret); | |||||
| megdnn::Algorithm::serialize_write_pod<uint32_t>(name_size, ret); | |||||
| ret += policy.algo.param; | |||||
| ret += policy.algo.name; | |||||
| //! serialize sub_policy | |||||
| uint32_t size = policy.sub_policy.size(); | |||||
| megdnn::Algorithm::serialize_write_pod(size, ret); | |||||
| for (auto&& sub : policy.sub_policy) { | |||||
| ret += serialize_policy(sub); | |||||
| } | |||||
| return ret; | |||||
| } | |||||
| megdnn::ExecutionPolicy deserialize_policy( | |||||
| const char* buf, uint32_t size, uint32_t& offset) { | |||||
| megdnn::ExecutionPolicy ret; | |||||
| #define cb(_val, _type) \ | |||||
| _val = megdnn::Algorithm::deserialize_read_pod<_type>(buf, offset); \ | |||||
| offset += sizeof(_val) | |||||
| cb(ret.algo.handle_type, megdnn::Handle::HandleType); | |||||
| cb(ret.algo.type, uint32_t); | |||||
| uint32_t param_size = 0; | |||||
| uint32_t name_size = 0; | |||||
| cb(param_size, uint32_t); | |||||
| cb(name_size, uint32_t); | |||||
| if (param_size > 0) { | |||||
| ret.algo.param = std::string(buf + offset, param_size); | |||||
| offset += param_size; | |||||
| } | |||||
| if (name_size > 0) { | |||||
| ret.algo.name = std::string(buf + offset, name_size); | |||||
| offset += name_size; | |||||
| } | |||||
| uint32_t nr_policy = 0; | |||||
| cb(nr_policy, uint32_t); | |||||
| #undef cb | |||||
| for (uint32_t i = 0; i < nr_policy; i++) { | |||||
| ret.sub_policy.push_back(deserialize_policy(buf, size, offset)); | |||||
| } | |||||
| return ret; | |||||
| } | |||||
| } // namespace | |||||
| namespace mgb { | |||||
| namespace opr { | |||||
| #define APPLY(statement, ...) \ | |||||
| mgb::apply( \ | |||||
| [&](const auto&... args) { return statement; }, \ | |||||
| std::tuple_cat(__VA_ARGS__)) | |||||
| ////////////// TimedProfiler::Param::ExecutionPolicyBlob ////////////////////// | |||||
| template <typename Opr> | |||||
| typename TimedProfiler<Opr>::Param::ExecutionPolicyBlob TimedProfiler<Opr>::Param:: | |||||
| ExecutionPolicyBlob::serialize(const megdnn::ExecutionPolicy& policy) { | |||||
| ExecutionPolicyBlob ret; | |||||
| std::string serialize_bin = serialize_policy(policy); | |||||
| mgb_assert(serialize_bin.size() < MAX_SIZE_IN_BYTES); | |||||
| memcpy(ret.data, serialize_bin.data(), serialize_bin.size()); | |||||
| ret.size = serialize_bin.size(); | |||||
| return ret; | |||||
| } | |||||
| template <typename Opr> | |||||
| megdnn::ExecutionPolicy TimedProfiler<Opr>::Param::ExecutionPolicyBlob::deserialize() | |||||
| const { | |||||
| uint32_t offset = 0; | |||||
| auto&& ret = deserialize_policy(data, size, offset); | |||||
| mgb_assert(offset == size); | |||||
| return std::move(ret); | |||||
| } | |||||
| #define INST(Opr) \ | |||||
| template typename TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob \ | |||||
| TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::serialize( \ | |||||
| const megdnn::ExecutionPolicy& policy); \ | |||||
| template megdnn::ExecutionPolicy \ | |||||
| TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::deserialize() const; | |||||
| MGB_FOREACH_FASTRUN_OPR(INST) | |||||
| #undef INST | |||||
| ////////////////// TimedProfiler ////////////////////////////// | |||||
| template <typename Opr> | |||||
| const double TimedProfiler<Opr>::timeout_setting = | |||||
| TimedProfiler<Opr>::init_timeout_setting(); | |||||
| template <typename Opr> | |||||
| double TimedProfiler<Opr>::init_timeout_setting() { | |||||
| #if MGB_ENABLE_FASTRUN | |||||
| sys::TimedFuncInvoker::ins().register_func( | |||||
| AlgoChooserFuncId<Opr>::ID, &TimedProfiler<Opr>::prof_impl, | |||||
| &TimedProfiler<Opr>::prof_init_device); | |||||
| auto to_set = MGB_GETENV("MGB_CONV_PROFILING_TIMEOUT"); | |||||
| if (to_set) | |||||
| return std::stod(to_set); | |||||
| #endif | |||||
| return 0; | |||||
| } | |||||
| #define APPLY(statement, ...) \ | |||||
| mgb::apply( \ | |||||
| [&](const auto&... args) { return statement; }, \ | |||||
| std::tuple_cat(__VA_ARGS__)) | |||||
| template <typename Opr> | |||||
| void TimedProfiler<Opr>::preprocess( | |||||
| const TensorLayoutArray&, const megdnn::SmallVector<DeviceTensorND>&, | |||||
| intl::UniqPtrWithCN<Opr>&, megdnn::Workspace&, std::array<TensorLayout, arity>&, | |||||
| std::array<DeviceTensorND, arity_in>&, PreprocessFilter<Opr>&) { | |||||
| // Opr is neither convbias nor convolution.This function do nothing. | |||||
| } | |||||
| //! convbias | |||||
| template <> | |||||
| void TimedProfiler<megdnn::ConvBias>::preprocess( | |||||
| const TensorLayoutArray& preprocessed_layout, | |||||
| const SmallVector<DeviceTensorND>& flt_val, | |||||
| intl::UniqPtrWithCN<megdnn::ConvBias>& megdnn_opr, | |||||
| megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts, | |||||
| std::array<DeviceTensorND, arity_in>& inp_val, | |||||
| PreprocessFilter<megdnn::ConvBias>& prep_flt) { | |||||
| if (!preprocessed_layout.empty()) { | |||||
| auto&& pf = prep_flt; | |||||
| pf.algorithm_id = nullptr; | |||||
| pf.tensors.resize(flt_val.size()); | |||||
| for (size_t i = 0; i < flt_val.size(); i++) { | |||||
| pf.tensors[i] = flt_val[i].as_megdnn(); | |||||
| } | |||||
| APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace), | |||||
| std::forward_as_tuple( | |||||
| layouts[0], inp_val[1].as_megdnn(), inp_val[2].as_megdnn()), | |||||
| array_skip<arity_in - 1>(layouts)); | |||||
| } | |||||
| } | |||||
| //! convolution | |||||
| template <> | |||||
| void TimedProfiler<megdnn::ConvolutionForward>::preprocess( | |||||
| const TensorLayoutArray& preprocessed_layout, | |||||
| const megdnn::SmallVector<DeviceTensorND>& flt_val, | |||||
| intl::UniqPtrWithCN<megdnn::ConvolutionForward>& megdnn_opr, | |||||
| megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts, | |||||
| std::array<DeviceTensorND, arity_in>& inp_val, | |||||
| PreprocessFilter<megdnn::ConvolutionForward>& prep_flt) { | |||||
| if (!preprocessed_layout.empty()) { | |||||
| auto&& pf = prep_flt; | |||||
| pf.algorithm_id = nullptr; | |||||
| pf.tensors.resize(flt_val.size()); | |||||
| for (size_t i = 0; i < flt_val.size(); i++) { | |||||
| pf.tensors[i] = flt_val[i].as_megdnn(); | |||||
| } | |||||
| APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace), | |||||
| std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()), | |||||
| array_skip<2>(layouts)); | |||||
| } | |||||
| } | |||||
| template <typename Opr> | |||||
| typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( | |||||
| const TParam& raw_param) { | |||||
| MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_impl"))) | |||||
| #if MGB_ROCM | |||||
| bool miopen_algo_search_enabled; | |||||
| megcore::getMIOpenAlgoSearchStatus(&miopen_algo_search_enabled); | |||||
| mgb_assert(miopen_algo_search_enabled, "MIOpen algo search not enabled"); | |||||
| #endif | |||||
| auto&& param = raw_param.as_single_pod<Param>(); | |||||
| CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical); | |||||
| auto megdnn_opr = intl::create_megdnn_opr<Opr>(cn); | |||||
| std::array<TensorLayout, arity> layouts; | |||||
| auto from_enum = [&](DTypeEnum enumv) -> DType { | |||||
| switch (enumv) { | |||||
| #define cb(_dt) \ | |||||
| case DTypeTrait<_dt>::enumv: \ | |||||
| return _dt(1.0f, static_cast<uint8_t>(0)) | |||||
| cb(dtype::Quantized8Asymm); | |||||
| cb(dtype::Quantized4Asymm); | |||||
| #undef cb | |||||
| #define cb(_dt) \ | |||||
| case DTypeTrait<_dt>::enumv: \ | |||||
| return _dt(1.0f) | |||||
| cb(dtype::QuantizedS8); | |||||
| cb(dtype::QuantizedS16); | |||||
| cb(dtype::QuantizedS32); | |||||
| cb(dtype::QuantizedS4); | |||||
| default: | |||||
| return DType::from_enum(enumv); | |||||
| #undef cb | |||||
| } | |||||
| }; | |||||
| for (int i = 0; i < arity; ++i) { | |||||
| layouts[i] = {param.shapes[i], from_enum(param.dtypes[i])}; | |||||
| } | |||||
| megdnn_opr->param() = param.opr_param; | |||||
| megdnn_opr->execution_policy() = param.execution_policy.deserialize(); | |||||
| // Allocate preprocessed weight buffers. | |||||
| TensorLayoutArray preprocessed_layout; | |||||
| if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) { | |||||
| if (param.allow_weight_preprocess) { | |||||
| preprocessed_layout = APPLY( | |||||
| _(megdnn_opr)->deduce_preprocessed_filter_layout(args...), layouts); | |||||
| } | |||||
| }); | |||||
| { | |||||
| // first allocate a whole chunk to avoid memory fragmentation (here we | |||||
| // rely on memory allocator to reuse memory) | |||||
| auto align = cn.get_mem_addr_alignment(); | |||||
| size_t tot_size = align; | |||||
| for (int i = 0; i < arity; ++i) { | |||||
| tot_size += layouts[i].span().high_byte + align; | |||||
| } | |||||
| for (const auto& layout : preprocessed_layout) { | |||||
| tot_size += layout.span().high_byte + align; | |||||
| } | |||||
| tot_size += param.workspace; | |||||
| DeviceTensorStorage storage{cn}; | |||||
| storage.ensure_size(tot_size); | |||||
| } | |||||
| // allocate input and output memory | |||||
| std::array<DeviceTensorND, arity_in> inp_val; | |||||
| std::array<DeviceTensorND, arity_out> out_val; | |||||
| DeviceTensorND workspace; | |||||
| for (int i = 0; i < arity_in; ++i) { | |||||
| inp_val[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]); | |||||
| } | |||||
| for (int i = 0; i < arity_out; ++i) { | |||||
| out_val[i] | |||||
| .comp_node(cn) | |||||
| .dtype(layouts[arity_in + i].dtype) | |||||
| .resize(layouts[arity_in + i]); | |||||
| } | |||||
| megdnn::Workspace mdn_workspace; | |||||
| // allocate workspace | |||||
| if (param.workspace) { | |||||
| workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace}); | |||||
| mdn_workspace.size = param.workspace; | |||||
| mdn_workspace.raw_ptr = workspace.raw_ptr(); | |||||
| } | |||||
| // allocate storage for preprocessed filter | |||||
| SmallVector<DeviceTensorND> flt_val(preprocessed_layout.size()); | |||||
| for (size_t i = 0; i < preprocessed_layout.size(); i++) { | |||||
| flt_val[i] = { | |||||
| cn, preprocessed_layout[i], preprocessed_layout[i].dtype, | |||||
| preprocessed_layout[i].format}; | |||||
| } | |||||
| for (int i = 0; i < arity_in; ++i) { | |||||
| fill_zero_dev_tensor(inp_val[i]); | |||||
| } | |||||
| PreprocessFilter<Opr> prep_flt; | |||||
| preprocess( | |||||
| preprocessed_layout, flt_val, megdnn_opr, mdn_workspace, layouts, inp_val, | |||||
| prep_flt); | |||||
| RealTimer timer; | |||||
| auto ev_start = cn.create_event(CompNode::Event::NEED_TIMER), | |||||
| ev_end = cn.create_event(CompNode::Event::NEED_TIMER); | |||||
| ev_start->record(); | |||||
| if_constexpr<opr_supports_preprocess<Opr>()>( | |||||
| [&](auto _) { | |||||
| auto&& opr = _(megdnn_opr); | |||||
| PreprocessFilter<Opr>* pf = | |||||
| preprocessed_layout.empty() ? nullptr : &prep_flt; | |||||
| APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val, | |||||
| out_val); | |||||
| }, | |||||
| /* else */ | |||||
| [&](auto _) { | |||||
| APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val, | |||||
| out_val); | |||||
| }); | |||||
| ev_end->record(); | |||||
| megdnn::Algorithm* algo = | |||||
| megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo); | |||||
| mgb_assert(algo); | |||||
| double next_report_time = 0.5; | |||||
| while (!ev_end->finished()) { | |||||
| if (timer.get_secs() >= next_report_time) { | |||||
| #if MGB_ENABLE_GETENV | |||||
| mgb_log_warn( | |||||
| "profiling conv algo %s already took %.3f/%.3f secs" | |||||
| " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ", | |||||
| algo->name(), timer.get_secs(), param.actual_timeout); | |||||
| #else | |||||
| mgb_log_warn( | |||||
| "profiling conv algo %s already took %.3f/%.3f secs", algo->name(), | |||||
| timer.get_secs(), param.actual_timeout); | |||||
| #endif | |||||
| next_report_time = timer.get_secs() + 1; | |||||
| } | |||||
| using namespace std::literals; | |||||
| #if !__DEPLOY_ON_XP_SP2__ | |||||
| std::this_thread::sleep_for(1000us); | |||||
| #endif | |||||
| } | |||||
| // release all free blocks owned by child process, | |||||
| // in order to avoid main process running out of memory | |||||
| cn.try_coalesce_all_free_memory(); | |||||
| mgb_assert(ev_start->finished()); | |||||
| return TResult::from_pod(Result{ev_start->elapsed_time_until(*ev_end)}); | |||||
| MIDOUT_E | |||||
| }; | |||||
| template <typename Opr> | |||||
| Maybe<typename TimedProfiler<Opr>::Result> TimedProfiler<Opr>::profile( | |||||
| const Param& param, double& timeout) { | |||||
| mgb_assert(timeout >= 0); | |||||
| if (!timeout) { | |||||
| timeout = timeout_setting; | |||||
| } else if (timeout_setting) { | |||||
| timeout = std::min(timeout, timeout_setting); | |||||
| } | |||||
| param.actual_timeout = timeout ? timeout : std::numeric_limits<double>::infinity(); | |||||
| auto res = sys::TimedFuncInvoker::ins().invoke( | |||||
| AlgoChooserFuncId<Opr>::ID, TParam::from_pod(const_cast<Param&>(param)), | |||||
| timeout); | |||||
| if (res.valid()) | |||||
| return res.val().template as_single_pod<Result>(); | |||||
| return None; | |||||
| } | |||||
| template <typename Opr> | |||||
| void TimedProfiler<Opr>::prof_init_device(const TParam& raw_param) { | |||||
| MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_init_device"))) | |||||
| #if MGB_ROCM | |||||
| megcore::enableMIOpenAlgoSearch(true); | |||||
| #endif | |||||
| auto&& param = raw_param.as_single_pod<Param>(); | |||||
| CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical); | |||||
| // wait for cuda init, so its time does not get accounted in timeout | |||||
| cn.sync(); | |||||
| MIDOUT_E | |||||
| } | |||||
| #define INST(Opr) \ | |||||
| template const double TimedProfiler<megdnn::Opr>::timeout_setting; \ | |||||
| template double TimedProfiler<megdnn::Opr>::init_timeout_setting(); \ | |||||
| template typename TimedProfiler<megdnn::Opr>::TResult \ | |||||
| TimedProfiler<megdnn::Opr>::prof_impl(const TParam& raw_param); \ | |||||
| template Maybe<typename TimedProfiler<megdnn::Opr>::Result> \ | |||||
| TimedProfiler<megdnn::Opr>::profile(const Param& param, double& timeout); \ | |||||
| template void TimedProfiler<megdnn::Opr>::prof_init_device(const TParam& raw_param); | |||||
| MGB_FOREACH_FASTRUN_OPR(INST) | |||||
| #undef INST | |||||
| } // namespace opr | |||||
| } // namespace mgb | |||||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||||
| @@ -12,7 +12,7 @@ | |||||
| #pragma once | #pragma once | ||||
| #include "megbrain/opr/search_policy/profiler.h" | |||||
| #include "megbrain/opr/search_policy/algo_chooser.h" | |||||
| #include "../internal/megdnn_opr_wrapper.inl" | #include "../internal/megdnn_opr_wrapper.inl" | ||||
| @@ -25,7 +25,7 @@ namespace intl { | |||||
| struct AutoAddWorkspaceNeedLimitGetter<megdnn::_Opr> { \ | struct AutoAddWorkspaceNeedLimitGetter<megdnn::_Opr> { \ | ||||
| static constexpr bool val = true; \ | static constexpr bool val = true; \ | ||||
| }; | }; | ||||
| MGB_FOREACH_FASTRUN_OPR(cb) | |||||
| DNN_FOREACH_FASTRUN_OPR(cb) | |||||
| #undef cb | #undef cb | ||||
| @@ -13,6 +13,7 @@ | |||||
| #include "megbrain/graph.h" | #include "megbrain/graph.h" | ||||
| #include "megbrain/opr/internal/mixin_base.h" | #include "megbrain/opr/internal/mixin_base.h" | ||||
| #include "megbrain/rdnn/management.h" | |||||
| #include "megdnn/handle.h" | #include "megdnn/handle.h" | ||||
| @@ -20,43 +21,6 @@ namespace mgb { | |||||
| namespace opr { | namespace opr { | ||||
| namespace intl { | namespace intl { | ||||
| //! get megdnn handle from comp node | |||||
| MGE_WIN_DECLSPEC_FUC megdnn::Handle* get_megdnn_handle(CompNode comp_node); | |||||
| MGE_WIN_DECLSPEC_FUC std::shared_ptr<megdnn::Handle> get_megdnn_handle_shared( | |||||
| CompNode comp_node); | |||||
| /*! | |||||
| * \brief get global megdnn operator asscoated with a computing node | |||||
| * \tparam Opr megdnn operator class, must be one of: | |||||
| * * AddUpdate | |||||
| * * Relayout | |||||
| * * Checksum | |||||
| */ | |||||
| template <typename Opr> | |||||
| MGE_WIN_DECLSPEC_FUC Opr* get_megdnn_global_opr(CompNode comp_node); | |||||
| template <class Obj> | |||||
| class UniqPtrWithCN : public std::unique_ptr<Obj> { | |||||
| CompNode m_cn; | |||||
| public: | |||||
| UniqPtrWithCN() = default; | |||||
| template <class RObj> | |||||
| UniqPtrWithCN(UniqPtrWithCN<RObj>&& o) | |||||
| : std::unique_ptr<Obj>(std::move(o)), m_cn(o.comp_node()) {} | |||||
| UniqPtrWithCN(std::unique_ptr<Obj> ptr, CompNode cn) | |||||
| : std::unique_ptr<Obj>{std::move(ptr)}, m_cn{cn} {} | |||||
| CompNode comp_node() const { return m_cn; } | |||||
| }; | |||||
| //! create megdnn opr from megdnn handle in a CompNode | |||||
| template <class Opr> | |||||
| UniqPtrWithCN<Opr> create_megdnn_opr(CompNode comp_node) { | |||||
| return {get_megdnn_handle(comp_node)->create_operator<Opr>(), comp_node}; | |||||
| } | |||||
| /*! | /*! | ||||
| * \brief get temporary storage for oprs | * \brief get temporary storage for oprs | ||||
| @@ -19,7 +19,7 @@ | |||||
| #include "megbrain/opr/dnn/convolution.h" | #include "megbrain/opr/dnn/convolution.h" | ||||
| #include "megbrain/opr/dnn/pooling.h" | #include "megbrain/opr/dnn/pooling.h" | ||||
| #include "megbrain/opr/search_policy/algo_chooser_helper.h" | #include "megbrain/opr/search_policy/algo_chooser_helper.h" | ||||
| #include "megbrain/opr/search_policy/profiler.h" | |||||
| #include "megbrain/rdnn/algo_chooser.h" | |||||
| #include "megdnn/oprs/base.h" | #include "megdnn/oprs/base.h" | ||||
| template <class MegDNNOpr> | template <class MegDNNOpr> | ||||
| @@ -31,18 +31,13 @@ struct MegDNNOpr2MGBOpr; | |||||
| using MGBOpr = mgb::opr::_Opr; \ | using MGBOpr = mgb::opr::_Opr; \ | ||||
| }; | }; | ||||
| MGB_FOREACH_FASTRUN_OPR(cb) | |||||
| DNN_FOREACH_FASTRUN_OPR(cb) | |||||
| #undef cb | #undef cb | ||||
| namespace mgb { | |||||
| //! define logical operation of megdnn::param::ExecutionPolicy::Strategy::Enum | |||||
| //! and megdnn::detail::AlgoAttribute enum | |||||
| using ExecutionStrategy = megdnn::param::ExecutionPolicy::Strategy; | |||||
| using AlgoAttribute = megdnn::AlgoAttribute; | |||||
| #define MGB_FOREACH_FASTRUN_OPR(cb) DNN_FOREACH_FASTRUN_OPR(cb) | |||||
| namespace mgb { | |||||
| namespace opr { | namespace opr { | ||||
| /* =================== AlgoChooser =================== */ | /* =================== AlgoChooser =================== */ | ||||
| @@ -56,138 +51,14 @@ namespace opr { | |||||
| * \tparam Opr megdnn operator impl | * \tparam Opr megdnn operator impl | ||||
| */ | */ | ||||
| template <typename Opr> | template <typename Opr> | ||||
| class AlgoChooser { | |||||
| static constexpr int arity_in = OprArityTrait<Opr>::arity_in; | |||||
| static constexpr int arity_out = OprArityTrait<Opr>::arity_out; | |||||
| static constexpr int arity = OprArityTrait<Opr>::arity; | |||||
| using ImplAlgo = typename Opr::AlgorithmInfo; | |||||
| using ImplAlgoDesc = typename Opr::AlgorithmInfo::Desc; | |||||
| using ImplExecutionPolicy = megdnn::ExecutionPolicy; | |||||
| class AlgoChooser : public rdnn::AlgoChooser<Opr> { | |||||
| using Base = rdnn::AlgoChooser<Opr>; | |||||
| using MGBOpr = typename MegDNNOpr2MGBOpr<Opr>::MGBOpr; | using MGBOpr = typename MegDNNOpr2MGBOpr<Opr>::MGBOpr; | ||||
| using ImplExecutionPolicy = typename Base::ImplExecutionPolicy; | |||||
| public: | public: | ||||
| using FixedTensorLayouts = std::array<TensorLayout, arity>; | |||||
| class AlgoChooserHelper { | |||||
| //! fastrun layouts | |||||
| FixedTensorLayouts m_fastrun_layouts; | |||||
| //! layouts used when get and set cache item | |||||
| FixedTensorLayouts m_incache_layouts; | |||||
| Opr* m_dnn_opr; | |||||
| std::string m_param; | |||||
| const cg::OperatorNodeBase* m_base_mgb_opr; | |||||
| CompNode m_cn; | |||||
| megdnn::param::ExecutionPolicy m_execution_policy; | |||||
| bool m_allow_weight_preprocess; | |||||
| public: | |||||
| AlgoChooserHelper( | |||||
| const FixedTensorLayouts& layouts, Opr* megdnn_opr, | |||||
| const std::string& param_str, const cg::OperatorNodeBase* mgb_opr, | |||||
| const CompNode& cn, | |||||
| const megdnn::param::ExecutionPolicy& execution_policy, | |||||
| bool allow_weight_preprocess); | |||||
| Opr* megdnn_opr() const { return m_dnn_opr; } | |||||
| const cg::OperatorNodeBase* mgb_opr() const { return m_base_mgb_opr; } | |||||
| const TensorLayout& inp_layout(size_t idx) const { | |||||
| return m_fastrun_layouts[idx]; | |||||
| } | |||||
| cg::ComputingGraph* owner_graph() const { | |||||
| return m_base_mgb_opr->owner_graph(); | |||||
| } | |||||
| const megdnn::param::ExecutionPolicy& execution_policy() const { | |||||
| return m_execution_policy; | |||||
| } | |||||
| CompNode comp_node() const { return m_cn; } | |||||
| const std::string& param() const { return m_param; } | |||||
| bool allow_weight_preprocess() const { return m_allow_weight_preprocess; } | |||||
| megdnn::Algorithm* get_algorithm_from_desc( | |||||
| const megdnn::Algorithm::Info::Desc& desc) const { | |||||
| return m_dnn_opr->get_algorithm_from_desc(desc); | |||||
| } | |||||
| const FixedTensorLayouts& fastrun_layouts() const { return m_fastrun_layouts; } | |||||
| const FixedTensorLayouts& incache_layouts() const { return m_incache_layouts; } | |||||
| //! construct algo chain by heuristic | |||||
| ImplExecutionPolicy choose_by_heuristic( | |||||
| const ExecutionStrategy& selected_strategy) const; | |||||
| //! construct algo chain by profiling | |||||
| ImplExecutionPolicy choose_by_profile( | |||||
| const ExecutionStrategy& selected_strategy, bool enable_update) const; | |||||
| //! get all profile algorithm from cache, return invalid if not exists | |||||
| std::pair<ImplAlgoDesc, Maybe<AlgoChooserProfileCache::Result>> | |||||
| get_profile_result_from_cache(const ExecutionStrategy& selected_strategy) const; | |||||
| /** | |||||
| * \brief construct execution policy from cache or heuristic. | |||||
| * | |||||
| * \param selected_strategy select algo which matched this strategy | |||||
| * \param[in,out] policy execution policy | |||||
| * \param retrive_from_cache retrive algo from cache if set True, get | |||||
| * from heuristic otherwise. | |||||
| * \param allow_log no warning log print if set True, print warning info | |||||
| * otherwise. | |||||
| */ | |||||
| void construct_execution_policy( | |||||
| const ExecutionStrategy& selected_strategy, ImplExecutionPolicy& policy, | |||||
| bool retrive_from_cache = true, bool allow_log = true) const; | |||||
| //! get workspace size required for specific execution policy | |||||
| size_t get_workspace_size_bytes( | |||||
| const ImplExecutionPolicy& policy, | |||||
| const FixedTensorLayouts& layouts = {}) const; | |||||
| //! get all candidate algos, and the one choose_by_heuristic() is | |||||
| //! put first | |||||
| std::vector<ImplAlgo> get_all_candidates() const; | |||||
| /*! | |||||
| * \brief profile a single algorithm | |||||
| * | |||||
| * This is actually a wrapper that constructs param and call | |||||
| * TimedProfiler<Opr>::profile for the actual profiling | |||||
| * | |||||
| * \param[in,out] timeout set the timeout, and return the actual | |||||
| * timeout used during profiling | |||||
| */ | |||||
| Maybe<AlgoChooserProfileCache::ResultEntry> profile_single_algo( | |||||
| const ImplExecutionPolicy& policy, double& timeout) const; | |||||
| //! profile and save to cache | |||||
| void profile(const ExecutionStrategy& selected_strategy) const; | |||||
| /** | |||||
| * \brief extract algo attribute from execution strategy and graph | |||||
| * option. | |||||
| * | |||||
| * \param strategy select algo which matched this strategy | |||||
| * \return pair<positive_attr, negative_attr> | |||||
| */ | |||||
| std::pair<AlgoAttribute, AlgoAttribute> extract_algo_attribute( | |||||
| const ExecutionStrategy& strategy) const; | |||||
| private: | |||||
| Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter( | |||||
| const FixedTensorLayouts& layouts = {}) const; | |||||
| }; | |||||
| template <typename U> | |||||
| friend class AlgoChooser; | |||||
| private: | |||||
| //! entrance for getting algorithm according to execution strategy | |||||
| static ImplExecutionPolicy get_policy(const AlgoChooserHelper& helper); | |||||
| public: | |||||
| using AlgoChooserHelper = typename Base::AlgoChooserHelper; | |||||
| using FixedTensorLayouts = typename Base::FixedTensorLayouts; | |||||
| /*! | /*! | ||||
| * \brief setup algorithm and return workspace size | * \brief setup algorithm and return workspace size | ||||
| */ | */ | ||||
| @@ -1,165 +0,0 @@ | |||||
| /** | |||||
| * \file src/opr/include/megbrain/opr/search_policy/profile.h | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #pragma once | |||||
| #include "megbrain/comp_node.h" | |||||
| #include "megbrain/opr/internal/megdnn_opr_wrapper.h" | |||||
| #include "megbrain/system.h" | |||||
| #include "megbrain/tensor.h" | |||||
| #include "megbrain/utils/hash_ct.h" | |||||
| #include "megbrain/utils/timer.h" | |||||
| #include "megdnn/basic_types.h" | |||||
| #include "megdnn/oprs.h" | |||||
| namespace mgb { | |||||
| namespace opr { | |||||
| // clang-format off | |||||
| #define MGB_FOREACH_FASTRUN_OPR(cb) \ | |||||
| cb(ConvolutionForward) \ | |||||
| cb(ConvBiasForward) \ | |||||
| cb(ConvolutionBackwardData) \ | |||||
| cb(ConvolutionBackwardFilter) \ | |||||
| cb(Convolution3DForward) \ | |||||
| cb(Convolution3DBackwardData) \ | |||||
| cb(Convolution3DBackwardFilter) \ | |||||
| cb(LocalShareForward) \ | |||||
| cb(LocalShareBackwardData) \ | |||||
| cb(LocalShareBackwardFilter) \ | |||||
| cb(DeformableConvForward) \ | |||||
| cb(DeformableConvBackwardFilter) \ | |||||
| cb(DeformableConvBackwardData) \ | |||||
| cb(BatchConvBiasForward) \ | |||||
| cb(MatrixMul) \ | |||||
| cb(BatchedMatrixMul) \ | |||||
| cb(PoolingForward) \ | |||||
| cb(PoolingBackward) | |||||
| // clang-format on | |||||
| template <typename Opr> | |||||
| constexpr bool opr_supports_preprocess() { | |||||
| return std::is_same<Opr, megdnn::ConvolutionForward>::value || | |||||
| std::is_same<Opr, megdnn::ConvBias>::value; | |||||
| } | |||||
| template <typename Opr> | |||||
| constexpr bool opr_contain_bias() { | |||||
| return std::is_same<Opr, megdnn::ConvBias>::value; | |||||
| } | |||||
| //! matmul and batchedMatrixMul | |||||
| template <typename Opr> | |||||
| constexpr bool is_matmul() { | |||||
| return std::is_same<Opr, megdnn::MatrixMul>::value || | |||||
| std::is_same<Opr, megdnn::BatchedMatrixMul>::value; | |||||
| } | |||||
| template <typename Opr, bool has_prep> | |||||
| struct PreprocessFilterImpl { | |||||
| using T = union {}; | |||||
| }; | |||||
| template <typename Opr> | |||||
| struct PreprocessFilterImpl<Opr, true> { | |||||
| using T = typename Opr::PreprocessedFilter; | |||||
| }; | |||||
| template <typename Opr> | |||||
| using PreprocessFilter = | |||||
| typename PreprocessFilterImpl<Opr, opr_supports_preprocess<Opr>()>::T; | |||||
| template <typename Opr> | |||||
| struct AlgoChooserFuncId {}; | |||||
| #define DEF_FUNC_ID(func) \ | |||||
| template <> \ | |||||
| struct AlgoChooserFuncId<megdnn::func> { \ | |||||
| __attribute__((unused)) static constexpr sys::TimedFuncInvoker::FuncId ID = \ | |||||
| static_cast<sys::TimedFuncInvoker::FuncId>( \ | |||||
| MGB_HASH_STR("megdnn::" #func)); \ | |||||
| }; | |||||
| MGB_FOREACH_FASTRUN_OPR(DEF_FUNC_ID) | |||||
| #undef DEF_FUNC_ID | |||||
| /* =================== TimedProfiler =================== */ | |||||
| /*! | |||||
| * \brief profile a megdnn opr conv with given param | |||||
| * | |||||
| * This class only provides static methods, and the entry point is | |||||
| * TimedProfiler::profile; it would run profiler in a timed environment by | |||||
| * sys::TimedFuncInvoker | |||||
| * | |||||
| * \tparam Opr megdnn opr impl | |||||
| */ | |||||
| template <typename Opr> | |||||
| class TimedProfiler { | |||||
| static constexpr int arity_in = OprArityTrait<Opr>::arity_in; | |||||
| static constexpr int arity_out = OprArityTrait<Opr>::arity_out; | |||||
| static constexpr int arity = OprArityTrait<Opr>::arity; | |||||
| using TensorShapeArray = std::array<megdnn::TensorShape, arity>; | |||||
| public: | |||||
| struct Param { | |||||
| struct ExecutionPolicyBlob { | |||||
| //! enlarge the max size if needed | |||||
| constexpr static size_t MAX_SIZE_IN_BYTES = 10240; | |||||
| char data[MAX_SIZE_IN_BYTES]; | |||||
| uint32_t size; | |||||
| static ExecutionPolicyBlob serialize(const megdnn::ExecutionPolicy& policy); | |||||
| megdnn::ExecutionPolicy deserialize() const; | |||||
| }; | |||||
| ExecutionPolicyBlob execution_policy; | |||||
| size_t workspace; | |||||
| megdnn::DTypeEnum dtypes[arity]; | |||||
| CompNode::Locator comp_node_physical, comp_node_logical; | |||||
| TensorShapeArray shapes; | |||||
| typename Opr::Param opr_param; | |||||
| bool allow_weight_preprocess; | |||||
| //! filled by profile() | |||||
| mutable double actual_timeout; | |||||
| }; | |||||
| struct Result { | |||||
| double time; | |||||
| }; | |||||
| static Maybe<Result> profile(const Param& param, double& timeout); | |||||
| private: | |||||
| using TParam = sys::TimedFuncInvoker::Param; | |||||
| using TResult = sys::TimedFuncInvoker::Result; | |||||
| static const double timeout_setting; | |||||
| static double init_timeout_setting(); | |||||
| static void preprocess( | |||||
| const megdnn::TensorLayoutArray& preprocessed_layout, | |||||
| const SmallVector<DeviceTensorND>& flt_val, | |||||
| intl::UniqPtrWithCN<Opr>& megdnn_opr, megdnn::Workspace& mdn_workspace, | |||||
| std::array<TensorLayout, arity>& layouts, | |||||
| std::array<DeviceTensorND, arity_in>& inp_val, | |||||
| PreprocessFilter<Opr>& prep_flt); | |||||
| static TResult prof_impl(const TParam& raw_param); | |||||
| static void prof_init_device(const TParam& raw_param); | |||||
| }; | |||||
| } // namespace opr | |||||
| } // namespace mgb | |||||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||||