GitOrigin-RevId: 1fb68a1da2
tags/v1.9.0
@@ -121,22 +121,6 @@ private:
};

MGB_DYN_TYPE_OBJ_FINAL_IMPL(ProxyGraph::InputPlaceholder);

class ProxyGraph::ExecEnv final : public cg::GraphExecutable::ExecEnv {
public:
    void dispatch_on_comp_node(CompNode, Task&& task) override { task(); }

    void dispatch_on_comp_node_with_mask(
            CompNode, Task&& task, cg::ExecutionMask* mask) override {
        mgb_throw_if(
                mask, GraphError, "ExecutionMask not supported in imperative mode");
        task();
    }

    void pause_exec() override {}

    void resume_exec() override {}
};
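The removed ExecEnv dispatches every task inline on the calling thread: in imperative mode ops are issued one at a time, so there is no sequence to schedule. A minimal standalone sketch of inline versus queued dispatch (plain C++; InlineEnv and QueuedEnv are illustrative names, not MegEngine types):

#include <functional>
#include <iostream>
#include <queue>

using Task = std::function<void()>;

// Imperative mode: dispatch degenerates to running the task immediately
// on the caller's thread, as in the removed ExecEnv above.
struct InlineEnv {
    void dispatch(Task&& task) { task(); }
};

// Graph mode, for contrast: tasks are queued and drained later,
// typically by a per-comp-node worker thread.
struct QueuedEnv {
    std::queue<Task> pending;
    void dispatch(Task&& task) { pending.push(std::move(task)); }
    void drain() {
        while (!pending.empty()) {
            pending.front()();
            pending.pop();
        }
    }
};

int main() {
    InlineEnv inline_env;
    inline_env.dispatch([] { std::cout << "runs immediately\n"; });

    QueuedEnv queued_env;
    queued_env.dispatch([] { std::cout << "runs at drain()\n"; });
    queued_env.drain();
}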
class ProxyGraph::StaticInferManager : public cg::static_infer::StaticInferManager {
public:
    using Tag = cg::static_infer::Tag;

@@ -183,26 +167,8 @@ public:
    }

    InferType get_infer_type(Tag var) override {
        // may be called during get_proxy_opr or make_backward_graph
        // don't let opr apply any immediate optimization
        return {InferType::MISSING_INP, InferType::MISSING_INP};
        if (auto opr = var->owner_opr()->try_cast_final<InputPlaceholder>()) {
            return {var->shape().ndim ? InferType::CONST : InferType::MISSING_INP,
                    opr->m_tensor ? InferType::CONST : InferType::MISSING_INP};
        }
        if (cur_opr) {
            auto&& outputs = cur_opr->output();
            auto&& it = std::find(outputs.begin(), outputs.end(), var);
            if (it != outputs.end()) {
                return {infer_shape_fallible(var) ? InferType::CONST
                                                  : InferType::MISSING_INP,
                        // value inference could be expensive
                        InferType::MISSING_INP};
            }
        }
        return {InferType::MISSING_INP, InferType::MISSING_INP};
    }
    void update() {

@@ -471,7 +437,6 @@ std::atomic<size_t> ProxyGraph::ProxyGraphImpl::m_node_id = 0;
ProxyGraph::ProxyGraph()
        : m_graph(ProxyGraphImpl::make(this)),
          m_env{new ExecEnv},
          m_static_infer_manager(new StaticInferManager(this)),
          m_seq_comp_node_optimizer(new SeqCompNodeOptimizer()) {}

@@ -506,32 +471,6 @@ private:
/*********************** Physical Tensor Impl ***********************/

SmallVector<LogicalTensorDesc> ProxyGraph::infer_output_attrs(
        const OpDef& opdef, const SmallVector<Tensor*>& inputs) {
    SmallVector<LogicalTensorDesc> ret;
    CUR_OPR_GUARD(get_proxy_opr(opdef, inputs));
    ::mgb::opr::intl::WorkspaceLimitHook::set_impl(
            m_graph.get(), ProxyGraph::get_workspace_limit);
    do_shape_infer(true);
    for (auto&& i : m_cur_opr->usable_output()) {
        mgb_assert(i->dtype().valid() && i->comp_node().valid());
        mgb_assert(i->shape().ndim || i->contain_flag(VarNode::Flag::NO_SYS_MEM_ALLOC));
        ret.push_back({{i->shape(), i->dtype()}, i->comp_node()});
    }
    return ret;
}
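infer_output_attrs exists so callers can allocate output storage before running the opr. A hedged sketch of that allocate-from-descriptor step (Desc and alloc_like are illustrative stand-ins for LogicalTensorDesc and Tensor::make, not the real API):

#include <cstddef>
#include <vector>

// Illustrative stand-in for LogicalTensorDesc: shape plus bytes per element.
struct Desc {
    std::vector<size_t> shape;
    size_t dtype_bytes;
};

// Allocate raw storage big enough for the inferred layout, as a caller
// would do per descriptor before invoking the op.
std::vector<unsigned char> alloc_like(const Desc& d) {
    size_t elems = 1;
    for (size_t s : d.shape) elems *= s;
    return std::vector<unsigned char>(elems * d.dtype_bytes);
}

int main() {
    // e.g. a float32 4x4 output and an fp16 vector of 8 elements
    std::vector<Desc> descs = {{{4, 4}, 4}, {{8}, 2}};
    std::vector<std::vector<unsigned char>> outputs;
    for (auto&& d : descs) outputs.push_back(alloc_like(d));
    return outputs[0].size() == 64 && outputs[1].size() == 16 ? 0 : 1;
}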
void ProxyGraph::invoke_op(
        const OpDef& opdef, const SmallVector<Tensor*>& inputs,
        const SmallVector<Tensor*>& outputs, const SmallVector<Tensor*>& workspaces) {
    CUR_OPR_GUARD(get_proxy_opr(opdef, inputs));
    init_output_tensor(outputs, workspaces);
    for (auto oup : m_cur_opr->output()) {
        m_graph->add_used_comp_node(oup->comp_node());
    }
    m_cur_opr->execute(*m_env);
}

void ProxyGraph::cleanup() {
    if (m_cur_opr) {
        for (auto&& i : m_cur_opr->input()) {

@@ -545,102 +484,8 @@ void ProxyGraph::cleanup() {
    m_cur_opr = nullptr;
}

void ProxyGraph::init_output_tensor(
        const SmallVector<Tensor*>& outputs, const SmallVector<Tensor*>& workspaces) {
    // get proxy opr
    auto proxy = m_cur_opr;
    auto get_workspace_size = [=](CompNode cn, size_t old_limit) {
        size_t limit = 0;
        for (auto&& var : workspaces) {
            limit += var->dtype().size(var->shape().total_nr_elems());
        }
        return limit;
    };
    ::mgb::opr::intl::WorkspaceLimitHook::set_impl(m_graph.get(), get_workspace_size);
    do_shape_infer(true);
    size_t j = 0;
    size_t k = 0;
    for (auto&& var : proxy->output()) {
        auto&& chk = var->m_mem_plan.reset_from_owner_var().chunk();
        if (var->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) {
            // workspace
            if (workspaces.size()) {
                mgb_assert(k < workspaces.size());
                auto&& layout = workspaces[k]->layout();
                mgb_assert(
                        var->comp_node() == workspaces[k]->comp_node() &&
                        var->shape().eq_shape(layout) && var->dtype() == layout.dtype);
                var->m_dev_tensor = workspaces[k]->dev_tensor();
                ++k;
            } else {
                TensorLayout layout{var->shape(), var->dtype(), var->format()};
                var->m_dev_tensor = BlobManager::inst()->alloc_workspace_with_defrag(
                        var->comp_node(), layout);
            }
        } else {
            mgb_assert(j < outputs.size());
            auto&& tensor = outputs[j];
            auto&& layout = tensor->layout();
            mgb_assert(
                    var->comp_node() == tensor->comp_node() &&
                    var->shape().eq_shape(layout) && var->dtype() == layout.dtype);
            var->assign_dev_tensor_from_tensor(tensor->dev_tensor());
            ++j;
        }
        chk.mem_alloc_status.set_from_owner_var();
    }
    mgb_assert(j == outputs.size());
    mgb_assert(k == workspaces.size());
    // Memory forwarding is bypassed in megbrain when the graph option
    // imperative_proxy_graph is on; here we call mem_plan_fwd_in2out_readonly
    // to initialize some oprs' (e.g. Subtensor) internal state.
    // TODO: implement memory forwarding
    proxy->mem_plan_fwd_in2out_readonly();
    {
        // some oprs (e.g. Reduce) rely on on_mem_status_changed to set
        // input/output tensors correctly; since we bypass var_node_mem_mgr,
        // on_mem_status_changed should be called here
        auto&& cb = proxy->get_opr_event_callback().on_mem_status_changed;
        if (cb.valid()) {
            cb.val()();
        }
    }
}
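The get_workspace_size hook captured above simply sums the byte sizes of the caller-provided workspace tensors, ignoring the old limit. A standalone model of that computation (Workspace is an illustrative type, not the MegEngine API):

#include <cstddef>
#include <vector>

struct Workspace {
    std::vector<size_t> shape;
    size_t dtype_bytes;
    size_t total_nr_elems() const {
        size_t n = 1;
        for (size_t s : shape) n *= s;
        return n;
    }
};

// Mirrors: limit += var->dtype().size(var->shape().total_nr_elems());
size_t workspace_limit(const std::vector<Workspace>& workspaces) {
    size_t limit = 0;
    for (auto&& w : workspaces) limit += w.dtype_bytes * w.total_nr_elems();
    return limit;
}

int main() {
    // Two workspaces: 256 float32 elems and 128 int8 elems -> 1024 + 128 bytes.
    std::vector<Workspace> ws = {{{256}, 4}, {{128}, 1}};
    return workspace_limit(ws) == 1152 ? 0 : 1;
}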
cg::OperatorNodeBase* ProxyGraph::get_proxy_opr(
        const OpDef& opdef, const SmallVector<Tensor*>& inputs) {
    VarNodeArray vinputs(inputs.size());
    for (size_t i = 0; i < inputs.size(); ++i) {
        vinputs[i] = InputPlaceholder::make(*m_graph, *inputs[i]).node();
    }
    auto opr = OpDef::apply_on_var_node(opdef, vinputs)[0]->owner_opr();
    mgb_assert(!opr->same_type<InputPlaceholder>());
    for (auto&& i : opr->input()) {
        mgb_assert(i->owner_opr()->same_type<InputPlaceholder>());
    }
    return opr;
}
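get_proxy_opr rebuilds a tiny graph per call: each concrete input tensor is wrapped in an InputPlaceholder var, and the asserts guarantee the resulting opr consumes only those placeholders. A self-contained model of the same invariant (Node, make_placeholder, and apply_op are illustrative, not MegEngine API):

#include <cassert>
#include <memory>
#include <vector>

// Illustrative node model: every graph input is a placeholder wrapping
// a concrete tensor; the op node may only consume placeholders.
struct Node {
    bool is_placeholder;
    std::vector<Node*> inputs;
};

Node* make_placeholder(std::vector<std::unique_ptr<Node>>& graph) {
    graph.push_back(std::make_unique<Node>(Node{true, {}}));
    return graph.back().get();
}

Node* apply_op(std::vector<std::unique_ptr<Node>>& graph, std::vector<Node*> ins) {
    graph.push_back(std::make_unique<Node>(Node{false, std::move(ins)}));
    return graph.back().get();
}

int main() {
    std::vector<std::unique_ptr<Node>> graph;
    // Wrap two "tensors" as placeholders, as get_proxy_opr does per input.
    std::vector<Node*> vinputs = {make_placeholder(graph), make_placeholder(graph)};
    Node* opr = apply_op(graph, vinputs);
    // Same invariants as the mgb_asserts above.
    assert(!opr->is_placeholder);
    for (auto* i : opr->inputs) assert(i->is_placeholder);
}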
/*********************** Logical Tensor Impl ***********************/

std::tuple<SmallVector<LogicalTensorDesc>, bool> ProxyGraph::
        infer_output_attrs_fallible(
                const OpDef& opdef, const SmallVector<LogicalTensorDesc>& inputs) {
    // this function is just a placeholder; it is overridden by
    // ProxyGraphTypeI::infer_output_attrs_fallible in minigraph
    mgb_assert(0);
}

struct ProxyGraph::GradGraph {
    cg::VarNodeArray inputs;
    cg::VarNodeArray outputs;
    cg::VarNodeArray output_grads;
    cg::VarNode* grad;
};

EncodedSubgraph ProxyGraph::make_backward_graph(
        const OpDef& opdef, const SmallVector<LogicalTensorDesc>& input_descs,
        const SmallVector<bool>& input_requires_grad,

@@ -793,22 +638,6 @@ VarNodeArray ProxyGraph::make_input_place_holders(
/*********************** Common Impl ***********************/

bool ProxyGraph::do_shape_infer(bool sync_value) {
    m_static_infer_manager->update();
    bool validated = true;
    for (auto* var : m_cur_opr->output()) {
        if (sync_value) {
            var->shape(m_static_infer_manager->infer_shape(var));
        } else if (auto* shape = m_static_infer_manager->infer_shape_fallible(var)) {
            var->shape(*shape);
        } else {
            validated = false;
        }
    }
    return validated;
}
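do_shape_infer has two modes: with sync_value it demands a shape for every output, otherwise it tolerates failures and reports them through the return value. A minimal model of the fallible path, with std::optional standing in for the nullable shape pointer (illustrative names throughout):

#include <optional>
#include <vector>

using Shape = std::vector<size_t>;

// Stand-in for infer_shape_fallible: may fail when inputs are unknown.
std::optional<Shape> infer_shape_fallible(bool known) {
    if (known) return Shape{2, 3};
    return std::nullopt;
}

// Mirrors do_shape_infer(false): fill what we can, report whether all succeeded.
bool infer_all(const std::vector<bool>& known, std::vector<Shape>& out) {
    bool validated = true;
    for (bool k : known) {
        if (auto shape = infer_shape_fallible(k)) {
            out.push_back(*shape);
        } else {
            validated = false;  // leave this var's shape for runtime
        }
    }
    return validated;
}

int main() {
    std::vector<Shape> shapes;
    return infer_all({true, false, true}, shapes) == false && shapes.size() == 2 ? 0 : 1;
}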
TensorPtr ProxyGraph::as_tensor(cg::OperatorNodeBase* opr, bool share) {
    // TODO: maybe some tensors should copy the value from the origin opr
    // rather than share the RawStorage

@@ -27,44 +27,22 @@ public:
    static std::unique_ptr<MegBrainError> get_async_error() {
        return std::move(tm_async_error);
    }

    static size_t get_workspace_limit(CompNode cn, size_t old_limit) {
        size_t free = cn.get_free_mem();
        size_t lmt = cn.get_max_block_size_available();
        return std::max(lmt, free);
    }
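get_workspace_limit takes the larger of the comp node's total free memory and its largest allocatable block; presumably the defragmenting allocator can satisfy requests up to the free total even when no single block is that big yet. A worked example under assumed numbers:

#include <algorithm>
#include <cstddef>

// Mirrors get_workspace_limit: prefer whichever bound is larger.
size_t workspace_limit(size_t free_mem, size_t max_block) {
    return std::max(max_block, free_mem);
}

int main() {
    // e.g. 1 GiB free overall, but fragmented into blocks of at most 256 MiB:
    // the limit is still the full 1 GiB.
    size_t free_mem = 1024ull << 20, max_block = 256ull << 20;
    return workspace_limit(free_mem, max_block) == free_mem ? 0 : 1;
}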
    /********************** Physical Tensor API **********************/

    SmallVector<LogicalTensorDesc> infer_output_attrs(
            const OpDef& opdef, const SmallVector<Tensor*>& inputs);

    void invoke_op(
            const OpDef& opdef, const SmallVector<Tensor*>& inputs,
            const SmallVector<Tensor*>& outputs, const SmallVector<Tensor*>& workspace);

    EncodedSubgraph make_backward_graph(
            const OpDef& opdef, const SmallVector<LogicalTensorDesc>& input_descs,
            const SmallVector<bool>& input_requires_grad,
            const SmallVector<bool>& output_has_grad);

    /********************** Logical Tensor API **********************/

    size_t get_opr_output_size(
            const OpDef& opdef, const SmallVector<LogicalTensorDesc>& inputs);

    std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
            const OpDef& opdef, const SmallVector<LogicalTensorDesc>& inputs);

private:
    ProxyGraph();

    class ProxyGraphImpl;
    class ExecEnv;
    class StaticInferManager;
    class SeqCompNodeOptimizer;
    class InputPlaceholder;
    struct ProxyGraphInst;
    struct GradGraph;
    class CurOprGuard;

    void reset();

@@ -73,12 +51,6 @@ private:
    void cleanup();

    void init_output_tensor(
            const SmallVector<Tensor*>& outputs, const SmallVector<Tensor*>& workspace);

    cg::OperatorNodeBase* get_proxy_opr(
            const OpDef& opdef, const SmallVector<Tensor*>& inputs);

    /********************** Logical Tensor Helper **********************/

    cg::VarNodeArray make_input_place_holders(

@@ -86,14 +58,11 @@ private:
    /********************** Common Helper **********************/

    bool do_shape_infer(bool sync_value);

    TensorPtr as_tensor(cg::OperatorNodeBase* opr, bool share = true);

    cg::OperatorNodeBase* m_cur_opr = nullptr;
    std::unique_ptr<ProxyGraphImpl> m_graph;
    size_t m_max_op_cnt = 100;
    std::unique_ptr<ExecEnv> m_env;
    std::unique_ptr<StaticInferManager> m_static_infer_manager;
    std::unique_ptr<SeqCompNodeOptimizer> m_seq_comp_node_optimizer;
@@ -801,18 +801,19 @@ public:
        return ret;
    }

    SmallVector<LogicalTensorDesc> infer_output_attrs(
            const OpDef& def, const SmallVector<Tensor*>& inputs) {
        SmallVector<LogicalTensorDesc> descs;
        auto& minigraph = get_cached_minigraph(def, inputs);
    SmallVector<TensorPtr> apply_on_physical_tensor(
            const OpDef& def, SmallVector<TensorPtr> inputs) {
        auto raw_inputs = to_raw_ptr_array(inputs);
        auto& minigraph = get_cached_minigraph(def, raw_inputs);
        auto _ = scoped_attach(&minigraph);
        auto sess = minigraph.infer_session(inputs);
        auto sess = minigraph.infer_session(raw_inputs);
        ::mgb::opr::intl::WorkspaceLimitHook::set_impl(
                minigraph.opr()->owner_graph(), get_workspace_limit);
        // some output vars in minigraph.opr()->output() may not appear in
        // minigraph.opr()->usable_output(), but execution may use the attrs
        // of those output vars, so we infer attrs for all outputs and only
        // return LogicalTensorDesc for minigraph.opr()->usable_output()
        ::mgb::opr::intl::WorkspaceLimitHook::set_impl(
                minigraph.opr()->owner_graph(), get_workspace_limit);
        SmallVector<LogicalTensorDesc> output_descs;
        for (size_t i = 0; i < minigraph.opr()->output().size(); ++i) {
            auto* shape = sess.infer(sess.output_data[i].shape_infer, true);
            mgb_assert(shape);

@@ -825,15 +826,9 @@ public:
            mgb_assert(
                    ovar->shape().ndim ||
                    ovar->contain_flag(VarNode::Flag::NO_SYS_MEM_ALLOC));
            descs.push_back({{ovar->shape(), ovar->dtype()}, ovar->comp_node()});
            output_descs.push_back({{ovar->shape(), ovar->dtype()}, ovar->comp_node()});
        }
        return descs;
    }

    SmallVector<TensorPtr> apply_on_physical_tensor(
            const OpDef& def, SmallVector<TensorPtr> inputs) {
        auto raw_inputs = to_raw_ptr_array(inputs);
        auto output_descs = infer_output_attrs(def, raw_inputs);
        SmallVector<TensorPtr> outputs(output_descs.size(), {});
        for (size_t i = 0; i < outputs.size(); i++) {
            outputs[i] =

@@ -853,11 +848,8 @@ public:
            }
        }
    }
        auto& minigraph = get_cached_minigraph(def, raw_inputs);
        auto _ = scoped_attach(&minigraph);
        // some oprs (e.g. Subtensor) may invoke infer_value during execution,
        // so we need to create an inference session here
        auto sess = minigraph.infer_session(raw_inputs);
        minigraph.execute(raw_inputs, raw_outputs, m_env);
        for (auto&& cn : used_cns) {
            for (auto&& in : inputs) {
| @@ -10,11 +10,6 @@ | |||
| */ | |||
| #include "./mini_graph.h" | |||
| #if 0 | |||
| // ../proxy_graph.h is deprecated, leave here for debug purpose | |||
| // uncomment #if 0 macro to debug | |||
| #include "../proxy_graph.h" | |||
| #endif | |||
| namespace mgb::imperative::proxy_graph { | |||
| MGB_DYN_TYPE_OBJ_FINAL_IMPL(ProxyGraph::InputPlaceholder); | |||
| @@ -28,18 +23,6 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
| auto ret = proxy_graph::ProxyGraphTypeI::inst().infer_output_attrs_fallible( | |||
| def, inputs); | |||
| #if 0 | |||
| // delete me after the new implementation is stable | |||
| auto ref = ProxyGraph::get_default_graph()->infer_output_attrs_fallible(def, inputs); | |||
| auto& [a, _1] = ret; | |||
| auto& [b, _2] = ref; | |||
| if (a.size() != b.size()) mgb_trap(); | |||
| for (size_t i = 0; i < a.size(); ++i) { | |||
| if (a[i].layout.dtype != b[i].layout.dtype) mgb_trap(); | |||
| if (a[i].comp_node != b[i].comp_node) mgb_trap(); | |||
| if (!a[i].layout.eq_shape(b[i].layout)) mgb_trap(); | |||
| } | |||
| #endif | |||
| return ret; | |||
| } | |||
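The disabled block above is a differential check: run the new minigraph implementation and the deprecated ProxyGraph side by side and trap on any disagreement in size, dtype, comp node, or shape. A generic standalone model of that pattern (checked, fast, and reference are illustrative names):

#include <cassert>
#include <vector>

// Run the new implementation and the reference side by side and abort on
// any divergence (the original uses mgb_trap() instead of assert).
template <typename F, typename G, typename In>
auto checked(F&& fast, G&& reference, const In& in) {
    auto a = fast(in);
    auto b = reference(in);
    assert(a == b);
    return a;
}

int main() {
    auto fast = [](int x) { return std::vector<int>{x, x * 2}; };
    auto ref = [](int x) { return std::vector<int>{x, 2 * x}; };
    return checked(fast, ref, 21)[1] == 42 ? 0 : 1;
}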
@@ -17,83 +17,6 @@ namespace mgb {
namespace imperative {
namespace proxy_graph_detail {

// these functions are reimplemented with an opr cache
// in ./proxy_graph/mini_graph.h
#if 0
namespace {

SmallVector<Tensor*> to_raw_ptr_array(
        const SmallVector<TensorPtr>& inputs, bool ensure_storage = true) {
    SmallVector<Tensor*> ret;
    for (auto&& i : inputs) {
        mgb_assert(i);
        ret.push_back(i.get());
        if (ensure_storage) {
            // apply lazy allocation
            i->blob()->storage();
        }
    }
    return ret;
}

SmallVector<LogicalTensorDesc> infer_output_attrs(
        const OpDef& def, const SmallVector<TensorPtr>& inputs) {
    auto&& graph = ProxyGraph::get_default_graph();
    return graph->infer_output_attrs(def, to_raw_ptr_array(inputs));
}

}  // anonymous namespace

void exec(
        const OpDef& def, const SmallVector<TensorPtr>& inputs,
        const SmallVector<TensorPtr>& outputs,
        const SmallVector<TensorPtr>& workspaces) {
    auto&& graph = ProxyGraph::get_default_graph();
    auto raw_inputs = to_raw_ptr_array(inputs), raw_outputs = to_raw_ptr_array(outputs),
         raw_workspaces = to_raw_ptr_array(workspaces);
    CompNode::UnorderedSet used_cns;
    for (auto&& out : raw_outputs) {
        auto cn = out->comp_node();
        if (used_cns.insert(cn).second) {
            for (auto&& in : inputs) {
                if (in->comp_node() != cn) {
                    auto&& e = in->get_or_create_event();
                    e->device_wait_by(cn);
                }
            }
        }
    }
    graph->invoke_op(def, raw_inputs, raw_outputs, raw_workspaces);
    for (auto&& cn : used_cns) {
        for (auto&& in : inputs) {
            if (in->comp_node() != cn) {
                in->add_release_callback(cn);
            }
        }
    }
}

SmallVector<TensorPtr> apply_on_physical_tensor(
        const OpDef& def, SmallVector<TensorPtr> inputs) {
    auto output_descs = infer_output_attrs(def, inputs);
    SmallVector<TensorPtr> outputs(output_descs.size(), {});
    for (size_t i = 0; i < outputs.size(); i++) {
        outputs[i] = Tensor::make(output_descs[i].layout, output_descs[i].comp_node);
    }
    exec(def, inputs, outputs, {});
    auto async_error = ProxyGraph::get_async_error();
    if (async_error) {
        throw *async_error;
    }
    return outputs;
}

std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
    auto&& graph = ProxyGraph::get_default_graph();
    return graph->infer_output_attrs_fallible(def, inputs);
}
#endif
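The legacy exec above encodes a simple cross-device protocol: before launch, each comp node that owns an output device-waits on events from inputs living elsewhere; after launch, the release of those inputs is deferred via a callback until the consuming comp node is done. A standalone sketch of the same two-sided handshake using std::promise (illustrative only; not MegEngine's event API):

#include <future>
#include <iostream>
#include <thread>

int main() {
    std::promise<void> ready;     // event recorded on producer device A
    std::promise<void> released;  // consumer device B is done with the input

    std::thread dev_b([&] {
        ready.get_future().wait();  // device_wait_by: block until A's event fires
        std::cout << "B: consume input\n";
        released.set_value();       // add_release_callback: signal A
    });

    std::cout << "A: produce input\n";
    ready.set_value();              // input is ready; B may proceed
    released.get_future().wait();   // A keeps the buffer alive until B is done
    std::cout << "A: free input\n";
    dev_b.join();
}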
EncodedSubgraph make_backward_graph(
        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs,
        const SmallVector<bool>& input_requires_grad,