GitOrigin-RevId: c610c8bf9a
tags/v1.7.0
@@ -207,6 +207,9 @@ void NetworkImplDft::use_tensorrt() {
//! set the callback in async mode
void NetworkImplDft::set_async_callback(const AsyncCallback& callback) {
    LITE_ASSERT(!m_is_cpu_inplace_mode, "cpu inplace mode does not support async mode");
    LITE_ASSERT(
            m_user_config->options.comp_node_seq_record_level == 0,
            "record mode does not support async mode");
    LITE_ASSERT(
            m_user_config->device_type == LiteDeviceType::LITE_CPU ||
                    m_user_config->device_type == LiteDeviceType::LITE_CUDA,
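These asserts make the contract explicit: an async callback is only legal when CPU inplace mode is off and comp-node sequence recording is disabled. A minimal sketch of a valid setup, assuming the usual lite::Config/lite::Network API (the exact option fields shown are illustrative, not taken from this patch):

    // Sketch only: record mode and cpu-inplace mode must both be off
    // before registering an async callback.
    lite::Config config;
    config.device_type = LiteDeviceType::LITE_CPU;
    config.options.comp_node_seq_record_level = 0;  // record mode off
    auto network = std::make_shared<lite::Network>(config);
    network->set_async_callback([]() {
        // inference finished; safe to read outputs here
    });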
@@ -659,4 +659,21 @@ void CompNode::ImplBase::add_callback(megdnn::thin_function<void()>&&) {
            locator().to_string().c_str());
}

void CompNode::ImplBase::enable_dispatch() {
    mgb_throw(
            MegBrainError,
            "Unsupported enable_dispatch on "
            "comp node %s",
            locator().to_string().c_str());
}

void CompNode::ImplBase::disable_dispatch(bool* flag) {
    MGB_MARK_USED_VAR(flag);
    mgb_throw(
            MegBrainError,
            "Unsupported disable_dispatch on "
            "comp node %s",
            locator().to_string().c_str());
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
@@ -810,6 +810,12 @@ public:
            task();
        }
    }

    void enable_dispatch() override { m_env.cpu_env().enable_dispatch(); }

    void disable_dispatch(bool* flag) override {
        m_env.cpu_env().disable_dispatch(flag);
    }
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(CompNodeRecorderImpl);

#if MGB_HAVE_THREAD
@@ -474,4 +474,35 @@ void CompNodeEnv::on_bad_device_type(DeviceType expected) const {
MGB_VERSION_SYMBOL3(MEGDNN, MEGDNN_MAJOR, MEGDNN_MINOR, MEGDNN_PATCH);

void CompNodeEnv::CpuEnv::enable_dispatch() {
    do_task_inplace = nullptr;
}

void CompNodeEnv::CpuEnv::disable_dispatch(bool* flag) {
    do_task_inplace = flag;
}

void CompNodeEnv::CpuEnv::dispatch(Task&& task) const {
    if (do_task_inplace && *do_task_inplace) {
        task();
    } else {
        dispatcher->dispatch(std::move(task));
    }
}

void CompNodeEnv::CpuEnv::dispatch(
        MultiThreadingTask&& task, size_t parallelism) const {
    if (do_task_inplace && *do_task_inplace) {
        for (size_t i = 0; i < parallelism; ++i) {
            task(i, 0);
        }
    } else {
        dispatcher->dispatch(std::move(task), parallelism);
    }
}

#if MGB_HAVE_THREAD
MGB_THREAD_LOCAL_PTR(bool) CompNodeEnv::CpuEnv::do_task_inplace = nullptr;
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
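Because the flag is thread-local when threads are available, toggling inplace execution on one thread does not affect concurrent users of the same CpuEnv. A minimal sketch of the intended toggle pattern, using the CompNode::disable_dispatch/enable_dispatch wrappers added in this patch (the helper itself is hypothetical):

    // Hypothetical helper: while the flag registered via disable_dispatch()
    // is true, any work the body dispatches onto `cn` runs inline on the
    // calling thread instead of being queued on the dispatcher.
    void with_inline_dispatch(mgb::CompNode cn, megdnn::thin_function<void()> body) {
        bool inplace = true;
        cn.disable_dispatch(&inplace);
        body();  // e.g. a tensor copy whose kernel would normally be queued
        cn.enable_dispatch();  // restore normal queued dispatching
    }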
@@ -168,12 +168,32 @@ MGB_DEFINE_OPR_CLASS(
        ComputingGraphImpl::CallbackCaller, SingleCNOperatorNodeBase) // {
    std::vector<std::vector<ComputingGraph::Callback>> m_cb;

    //! In record mode (CPU only), CallbackCaller supports changing the memory
    //! address of the output tensor: the whole callback is dispatched (i.e. a
    //! tensor copy is dispatched instead of a raw memcpy).
    //! Side effect: sync() is no longer supported inside the callback; users
    //! should call func->wait() outside the callback instead to sync data from
    //! device to host.
    //! Note: only record level 1 supports changing the output tensor's memory
    //! address, and a HostTensorND captured in the callback must not be on the
    //! default CPU comp node.
    void scn_do_execute() override {
        for (size_t i = 0; i < input().size(); ++i) {
            auto&& in = input(i)->dev_tensor();
            for (auto&& callback : m_cb[i]) {
                if (this->owner_graph()->options().comp_node_seq_record_level == 1 &&
                    in.comp_node().device_type() == CompNode::DeviceType::CPU &&
                    in.comp_node() != CompNode::default_cpu()) {
                    auto record_cb = [&in, &callback]() {
                        auto comp_node = in.comp_node();
                        bool do_task_inplace = true;
                        comp_node.disable_dispatch(&do_task_inplace);
                        callback(const_cast<DeviceTensorND&>(in));
                        comp_node.enable_dispatch();
                    };
                    in.comp_node().add_callback(record_cb);
                } else {
                    // const cast for backward API compatibility
                    callback(const_cast<DeviceTensorND&>(in));
                }
            }
        }
    }
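For user code this means that with record level 1 on a non-default CPU comp node, the copy inside the callback is asynchronous and the compiled function must be waited on explicitly. A minimal sketch, assuming the usual graph setup (`graph` and output var `y` as in the tests further below):

    // Sketch: in record mode the callback is itself dispatched, so do not
    // sync inside it; wait on the function instead.
    HostTensorND host_y;
    graph->options().comp_node_seq_record_level = 1;
    auto cb = [&](const DeviceTensorND& dv) {
        host_y.copy_from(dv);  // async copy; no dv.sync() here
    };
    auto func = graph->compile({{y, cb}});
    func->execute();
    func->wait();  // sync device-to-host outside the callback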
@@ -412,6 +412,16 @@ public:
        return m_impl->add_callback(std::move(cb));
    }

    /*!
     * enable the dispatcher on this comp node
     */
    void enable_dispatch() { m_impl->enable_dispatch(); }

    /*!
     * disable the dispatcher so that tasks are executed inplace on the
     * calling thread
     */
    void disable_dispatch(bool* flag) { m_impl->disable_dispatch(flag); }

    enum class Flag : uint32_t {
        //! Whether computing recorder is supported on this comp node (i.e.
        //! whether non-zero comp_node_seq_record_level is allowed)
@@ -532,6 +542,10 @@ protected:
    virtual void add_callback(megdnn::thin_function<void()>&&);
    virtual void enable_dispatch();
    virtual void disable_dispatch(bool* flag);

    virtual uint64_t get_uid() {
        mgb_throw(MegBrainError, "get_uid is not implemented yet");
    }
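Backends that cannot run tasks inline simply inherit the throwing defaults; a CPU-backed implementation overrides both hooks and forwards to its env, as CompNodeRecorderImpl does above. A minimal sketch of such an override (the class name is hypothetical):

    // Hypothetical subclass sketch mirroring CompNodeRecorderImpl:
    // forward the inplace-dispatch toggle to the underlying CpuEnv.
    class MyCpuImpl final : public CompNode::ImplBase {
        CompNodeEnv m_env;
        // ...
    public:
        void enable_dispatch() override { m_env.cpu_env().enable_dispatch(); }
        void disable_dispatch(bool* flag) override {
            m_env.cpu_env().disable_dispatch(flag);
        }
    };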
@@ -503,12 +503,19 @@ public:
    using AffinityCallBack = thin_function<void(size_t)>;
    std::shared_ptr<CPUDispatcher> dispatcher;
#if MGB_HAVE_THREAD
    static MGB_THREAD_LOCAL_PTR(bool) do_task_inplace;
#else
    bool* do_task_inplace = nullptr;
#endif

    void enable_dispatch();
    void disable_dispatch(bool* flag);

    void dispatch(Task&& task) const;
    void dispatch(MultiThreadingTask&& task, size_t parallelism) const;

    void set_affinity(AffinityCallBack&& cb) const {
        dispatcher->set_affinity(std::move(cb));
@@ -521,6 +528,12 @@ public:
        return m_cpu_env;
    }

    CpuEnv& cpu_env() {
        if (mgb_unlikely(m_property.type != DeviceType::CPU))
            on_bad_device_type(DeviceType::CPU);
        return m_cpu_env;
    }

    //! init this as a cpu env
    void init_cpu(const CpuEnv& env, CompNode comp_node);
@@ -44,7 +44,7 @@ void run_comp_seq_rec_basic(CompNode cn, bool fake_first) {
        graph->options().fake_next_exec = true;
        graph->options().var_sanity_check_first_run = false;
    }
    auto func = graph->compile({make_callback_copy(z, host_z, false)});
    if (fake_first) {
        func->execute();  // first exec
    }
@@ -55,6 +55,8 @@
        }
        host_x->copy_from_fixlayout(*gen(host_x->shape(), cn));
        func->execute();
        func->wait();
        host_z.sync();
        auto expect = eval_conv_cpu<opr::Convolution>(*host_x, *host_y, param);
        MGB_ASSERT_TENSOR_NEAR(expect, host_z, 1e-3) << "iter " << iter;
    }
@@ -70,6 +72,28 @@
    ASSERT_EQ(executed[2], change);
    // create new recorder, exec with recorder
    ASSERT_EQ(executed[3], change + 1);

    //! then change host_z's underlying storage pointer on each iteration and
    //! check that the result is still written to the new address
    HostTensorND host_iter;
    host_iter.copy_from(host_z);
    std::vector<std::shared_ptr<HostTensorND>> m_hosts(10);
    for (size_t i = 0; i < 10; i++) {
        m_hosts[i] = gen(host_z.shape(), host_z.comp_node());
    }
    iter = 0;
    for (; iter < 10; ++iter) {
        auto host_tmp = m_hosts[iter];
        auto host_z_storage = host_z.storage();
        auto origin_ptr = host_z_storage.raw_storage();
        host_z_storage.reset(
                host_z.comp_node(), host_z_storage.size(),
                host_tmp->storage().raw_storage());
        auto changed_ptr = host_z_storage.raw_storage();
        ASSERT_TRUE(origin_ptr != changed_ptr);
        func->execute();
        func->wait();
        MGB_ASSERT_TENSOR_NEAR(host_iter, host_z, 1e-3) << "iter " << iter;
    }
}

void run_comp_seq_rec_basic_level2(CompNode cn) {
@@ -154,7 +178,7 @@ void run_comp_seq_rec_dyn_elemwise(CompNode cn, bool fake_first) {
    w = opr::Elemwise::make({x, y, z}, opr::Elemwise::Mode::FUSE_MUL_ADD3);
    HostTensorND host_w;
    auto func = graph->compile({make_callback_copy(w, host_w, false)});
    if (fake_first) {
        func->execute();
    }
@@ -166,9 +190,30 @@
        }
        host_x->copy_from(*gen(host_x->shape(), cn));
        func->execute();
        func->wait();
        auto expect = check();
        MGB_ASSERT_TENSOR_EQ(expect, host_w) << "iter " << i;
    }
    //! then change host_w's underlying storage pointer on each iteration and
    //! check that the result is still written to the new address
    HostTensorND host_iter;
    host_iter.copy_from(host_w);
    std::vector<std::shared_ptr<HostTensorND>> m_hosts(10);
    for (size_t i = 0; i < 10; i++) {
        m_hosts[i] = gen(host_w.shape(), host_w.comp_node());
    }
    for (size_t iter = 0; iter < 10; ++iter) {
        auto host_tmp = m_hosts[iter];
        auto host_w_storage = host_w.storage();
        auto origin_ptr = host_w_storage.raw_storage();
        host_w_storage.reset(
                host_w.comp_node(), host_w_storage.size(),
                host_tmp->storage().raw_storage());
        auto changed_ptr = host_w_storage.raw_storage();
        ASSERT_TRUE(origin_ptr != changed_ptr);
        func->execute();
        func->wait();
        MGB_ASSERT_TENSOR_EQ(host_iter, host_w) << "iter " << iter;
    }
}

void run_level2(CompNode cn, bool use_multi_holder) {
@@ -381,6 +426,9 @@ void run<sync_from_func>(CompNode cn) {
    HostTensorND host_y;
    graph->options().var_sanity_check_first_run = false;
    graph->options().comp_node_seq_record_level = level;
    if (level == 1) {
        sync = false;
    }
    auto cb = [&](const DeviceTensorND& dv) {
        host_y.copy_from(dv);
        if (sync) {
@@ -418,6 +466,9 @@ void run<cb_non_contig>(CompNode cn) {
    HostTensorND host_y;
    graph->options().var_sanity_check_first_run = false;
    graph->options().comp_node_seq_record_level = level;
    if (level == 1) {
        sync = false;
    }
    auto cb = [&](const DeviceTensorND& dv) {
        host_y.copy_from(dv);
        if (sync) {
@@ -428,8 +479,8 @@ void run<cb_non_contig>(CompNode cn) {
    if (level == 2) {
        ComputingGraph::assert_destroy(graph);
    }
    for (int k = 0; k < 3; ++k) {
        host_x->copy_from(*gen(host_x->shape(), cn));
        HostTensorND expect{host_x->comp_node(), {5, 4}};
        auto px = host_x->ptr<float>(), py = expect.ptr<float>();
        for (int i = 0; i < 5; ++i) {
@@ -504,14 +555,16 @@ void run<multi_recorder_run>(CompNode cn) {
        y = opr::Host2DeviceCopy::make(*graph, host_y),
        z = opr::Convolution::make(x, y, param);
        graph->options().comp_node_seq_record_level = 1;
        return graph->compile({make_callback_copy(z, host_z_v[graph_id], false)});
    };
    funcs.push_back(gen_graph(0));
    funcs.push_back(gen_graph(1));
    for (int iter = 0; iter < 10; ++iter) {
        host_x->copy_from_fixlayout(*gen(host_x->shape(), cn));
        funcs[0]->execute();
        funcs[0]->wait();
        funcs[1]->execute();
        funcs[1]->wait();
        auto expect = eval_conv_cpu<opr::Convolution>(*host_x, *host_y, param);
        MGB_ASSERT_TENSOR_NEAR(expect, host_z_v[0], 1e-3) << "iter " << iter;
        MGB_ASSERT_TENSOR_NEAR(expect, host_z_v[1], 1e-3) << "iter " << iter;
@@ -1375,13 +1375,17 @@ TEST(TestGraph, CompNodeFinalize) {
        graph->options().var_sanity_check_first_run = false;
        graph->options().comp_node_seq_record_level = rec;
    }
    auto sync = (rec != 1);
    auto func = graph->compile({make_callback_copy(z, host_z, sync)});
    if (rec == 2) {
        ComputingGraph::assert_destroy(graph);
    }
    for (int i = 0; i < 5; ++i) {
        host_x->copy_from(*gen({1}, cn));
        func->execute();
        if (!sync) {
            func->wait();
        }
        MGB_ASSERT_FLOAT_EQ(
                host_x->ptr<float>()[0] + host_y->ptr<float>()[0],
                host_z.ptr<float>()[0]);
@@ -1933,6 +1937,7 @@ void test_free_memory_in_weight_preprocess(int record_level, CompNode cn) {
#endif
    graph->options().graph_opt.weight_preprocess = true;
    graph->options().comp_node_seq_record_level = record_level;
    auto sync = (record_level != 1);
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
@@ -1970,11 +1975,17 @@
    });
    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y, sync)});
    //! flag the vars whose memory is no longer needed
    func->execute();
    if (!sync) {
        func->wait();
    }
    //! free the no-longer-needed memory of those vars
    func->execute();
    if (!sync) {
        func->wait();
    }
    auto check = [&](SymbolVar v) {
        ASSERT_TRUE(v.node()->contain_flag(VarNode::Flag::MEMORY_NO_NEED));
        ASSERT_TRUE(v.node()->dev_tensor().empty());
@@ -213,9 +213,13 @@ TEST(TestGraph, MultiThreadRecorder) {
        z = opr::Convolution::make(x, y, param);
        graph->options().comp_node_seq_record_level = record_level;
        graph->options().var_sanity_check_first_run = false;
        auto sync = (record_level != 1);
        auto func = graph->compile({make_callback_copy(z, host_z, sync)});
        for (int i = 0; i < 5; i++) {
            func->execute();
            if (!sync) {
                func->wait();
            }
        }
        auto expect = eval_conv_cpu<opr::Convolution>(*host_x, *host_y, param);
        MGB_ASSERT_TENSOR_NEAR(expect, host_z, 1e-3);
@@ -50,6 +50,7 @@ void run_test(CompNode cn, const PluginMaker& plugin_maker) {
    graph->options().var_sanity_check_first_run = false;
    graph->options().comp_node_seq_record_level = record;
    graph->options().graph_opt_level = 0;
    auto sync = (record != 1);
    auto plug = plugin_maker(graph.get(), record);

    // make a non-contiguous value, also introduce some shape dependencies
@@ -76,11 +77,14 @@ void run_test(CompNode cn, const PluginMaker& plugin_maker) {
    cg::DepOprIter{cb_rename}.add(y);

    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y, sync)});
    if (record == 2) {
        ComputingGraph::assert_destroy(graph);
    }
    func->execute();
    if (!sync) {
        func->wait();
    }
    plug->flush_lazy();
    MGB_ASSERT_TENSOR_EQ(make_expect(), host_y);
@@ -91,10 +95,16 @@ void run_test(CompNode cn, const PluginMaker& plugin_maker) {
        *host_x = *gen(host_x->shape(), cn);
    }
    func->execute();
    if (!sync) {
        func->wait();
    }
    MGB_ASSERT_TENSOR_EQ(make_expect(), host_y);
    for (int i = 0; i < 2; ++i) {
        host_x->copy_from(*gen(host_x->shape(), cn));
        func->execute();
        if (!sync) {
            func->wait();
        }
        MGB_ASSERT_TENSOR_EQ(make_expect(), host_y);
    }