GitOrigin-RevId: 5b9488cb93
tags/v1.7.0
@@ -93,6 +93,7 @@ struct LITE_API Options {
     bool const_shape = false;
     bool force_dynamic_alloc = false;
     bool force_output_dynamic_alloc = false;
+    bool force_output_use_user_specified_memory = false;
     bool no_profiling_on_shape_change = false;
     uint8_t jit_level = 0;
     uint8_t comp_node_seq_record_level = 0;
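For orientation, a minimal usage sketch of the new option. This is a sketch under assumptions, not code from this commit: the model path, the "data" input name, the input shape and the {1, 1000} output shape are placeholders borrowed from the OutputNoCopy test further down.

#include <memory>
#include <vector>
#include "lite/network.h"

int run_with_user_output_memory() {
    lite::Config config;
    config.options.force_output_use_user_specified_memory = true;

    auto network = std::make_shared<lite::Network>(config);
    network->load_model("./shufflenet.mge");

    // feed the input from a user buffer (shape assumed for illustration)
    std::vector<float> in_buf(1 * 3 * 224 * 224, 0.f);
    lite::Layout in_layout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT};
    network->get_io_tensor("data")->reset(in_buf.data(), in_layout);

    // bind the output tensor to a user buffer; with the new option the graph
    // writes the result straight into it, with no extra output copy
    std::vector<float> out_buf(1000);
    lite::Layout out_layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT};
    network->get_output_tensor(0)->reset(out_buf.data(), out_layout);

    network->forward();
    network->wait();
    // out_buf now holds the inference result
    return 0;
}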
@@ -83,6 +83,7 @@ typedef struct Options {
     int const_shape;
     int force_dynamic_alloc;
     int force_output_dynamic_alloc;
+    int force_output_use_user_specified_memory;
     int no_profiling_on_shape_change;
     int jit_level;
     int comp_node_seq_record_level;

@@ -29,6 +29,7 @@ const LiteOptions default_option = {
        .const_shape = false,
        .force_dynamic_alloc = false,
        .force_output_dynamic_alloc = false,
+       .force_output_use_user_specified_memory = false,
        .no_profiling_on_shape_change = false,
        .jit_level = 0,
        .comp_node_seq_record_level = 0,
@@ -122,7 +123,9 @@ lite::Config convert_to_lite_config(const LiteConfig c_config) {
     lite_config.options.var_sanity_check_first_run =
             c_config.options.var_sanity_check_first_run;
     lite_config.options.const_shape = c_config.options.const_shape;
-    lite_config.options.force_dynamic_alloc = c_config.options.const_shape;
+    lite_config.options.force_dynamic_alloc = c_config.options.force_dynamic_alloc;
+    lite_config.options.force_output_use_user_specified_memory =
+            c_config.options.force_output_use_user_specified_memory;
     lite_config.options.force_output_dynamic_alloc =
             c_config.options.force_output_dynamic_alloc;
     lite_config.options.no_profiling_on_shape_change =
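Note the drive-by bug fix in this hunk: force_dynamic_alloc was previously copied from c_config.options.const_shape; it now reads c_config.options.force_dynamic_alloc as intended.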
@@ -29,6 +29,7 @@ class LiteOptions(Structure):
         ("const_shape", c_int),
         ("force_dynamic_alloc", c_int),
         ("force_output_dynamic_alloc", c_int),
+        ("force_output_use_user_specified_memory", c_int),
         ("no_profiling_on_shape_change", c_int),
         ("jit_level", c_int),
         ("comp_node_seq_record_level", c_int),

@@ -52,6 +53,7 @@ class LiteOptions(Structure):
         self.const_shape = False
         self.force_dynamic_alloc = False
         self.force_output_dynamic_alloc = False
+        self.force_output_use_user_specified_memory = False
         self.no_profiling_on_shape_change = False
         self.jit_level = 0
         self.comp_node_seq_record_level = 0
@@ -67,6 +69,7 @@ class LiteOptions(Structure):
             "const_shape": bool(self.const_shape),
             "force_dynamic_alloc": bool(self.force_dynamic_alloc),
             "force_output_dynamic_alloc": bool(self.force_output_dynamic_alloc),
+            "force_output_use_user_specified_memory": bool(self.force_output_use_user_specified_memory),
             "no_profiling_on_shape_change": bool(self.no_profiling_on_shape_change),
             "jit_level": self.jit_level,
             "comp_node_seq_record_level": self.comp_node_seq_record_level,
@@ -84,6 +84,9 @@ void NetworkImplDft::application_config() {
     m_load_config.const_var_shape = m_user_config->options.const_shape;
     ConfigOption(force_dynamic_alloc, force_dynamic_alloc);
     ConfigOption(force_output_dynamic_alloc, force_output_dynamic_alloc);
+    ConfigOption(
+            force_output_use_user_specified_memory,
+            force_output_use_user_specified_memory);
     ConfigOption(no_profiling_on_shape_change, no_profiling_on_shape_change);
     LITE_ASSERT(
             m_user_config->options.jit_level == 0 ||
@@ -250,7 +253,13 @@ void NetworkImplDft::make_output_spec() {
                 }
             }
         };
-        m_output_spec.emplace_back(load_out, std::move(cb));
+        //! if writing to user-specified memory, the CallbackCaller must be nullptr
+        if (m_user_config->options.force_output_use_user_specified_memory ||
+            m_user_config->options.force_output_dynamic_alloc) {
+            m_output_spec.emplace_back(load_out, nullptr);
+        } else {
+            m_output_spec.emplace_back(load_out, std::move(cb));
+        }
     } else {
         LITE_THROW(ssprintf("no output named : %s in the mode", out.name.c_str()));
    }
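Design note: when the output is written to user-specified memory or allocated dynamically, the output var's storage is no longer owned by the graph's static allocator, so the per-output callback that would otherwise copy the computed value into the lite tensor is replaced by nullptr; the copy is instead avoided entirely via the callbacks installed in output_tensor_copy_optimize further down.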
@@ -444,8 +453,7 @@ void NetworkImplDft::set_io(const NetworkIO& network_io) {
     }
 }

-void NetworkImplDft::try_infer_tensor_layout(
-        std::shared_ptr<Tensor> tensor, mgb::cg::SymbolVar var) {
+void NetworkImplDft::try_infer_tensor_layout(std::shared_ptr<Tensor> tensor, Var var) {
     auto&& static_infer_mgr = m_load_config.comp_graph->static_infer_manager();
     auto infer_trait = var.node()->get_static_infer_trait();
     if (std::get<0>(infer_trait)) {
@@ -455,9 +463,13 @@ void NetworkImplDft::try_infer_tensor_layout(
                     "Lite infer output shape failed, maybe the model is "
                     "dynamic "
                     "shape.\n");
+            LITE_ASSERT(
+                    !m_user_config->options.force_output_use_user_specified_memory,
+                    "force_output_use_user_specified_memory can't be used when output "
+                    "shape can't be derived.");
             return;
         }
-        Layout layout = to_lite_layout(mgb::TensorLayout{*shape, var.dtype()});
+        Layout layout = to_lite_layout(TensorLayout{*shape, var.dtype()});
         tensor->set_layout(layout);
     }
 }
@@ -559,8 +571,7 @@ void NetworkImplDft::update_output() {
          out_it != m_network_io->outputs.end();) {
         if (std::find_if(
                     m_load_result.output_var_list.begin(),
-                    m_load_result.output_var_list.end(),
-                    [out_it](const mgb::SymbolVar var) {
+                    m_load_result.output_var_list.end(), [out_it](const SymbolVar var) {
                         return var.node()->name() == out_it->name;
                     }) == m_load_result.output_var_list.end()) {
             LITE_LOG("%s is not the network output, ignore it.", out_it->name.c_str());
@@ -584,7 +595,7 @@ void NetworkImplDft::update_output() {
                 out_it->lite_tensor =
                         std::make_shared<Tensor>(device_id, stream_id, device_type);
             }
-            mgb::SymbolVar var;
+            SymbolVar var;
             for (auto&& out_var : m_load_result.output_var_list) {
                 if (out_var.node()->name() == out_it->name) {
                     var = out_var;
@@ -592,10 +603,12 @@ void NetworkImplDft::update_output() {
                 }
             }
             try_infer_tensor_layout(out_it->lite_tensor, var);
+            output_tensor_copy_optimize(var, out_it->lite_tensor);
         }
         //! user not set, use default output
     } else {
         for (auto&& out : m_load_result.output_var_list) {
+            std::shared_ptr<Tensor> lite_tensor = nullptr;
             auto it = std::find_if(
                     m_network_io->outputs.begin(), m_network_io->outputs.end(),
                     [&out](const IOInner io) { return io.name == out.node()->name(); });

@@ -608,6 +621,7 @@ void NetworkImplDft::update_output() {
                             std::make_shared<Tensor>(device_id, stream_id, device_type);
                 }
                 try_infer_tensor_layout(it->lite_tensor, out);
+                lite_tensor = it->lite_tensor;
             } else {
                 IOInner output;
                 output.name = out.node()->name();
| @@ -615,11 +629,47 @@ void NetworkImplDft::update_output() { | |||
| device_id, stream_id, device_type, true); | |||
| m_network_io->outputs.push_back({output}); | |||
| try_infer_tensor_layout(output.lite_tensor, out); | |||
| lite_tensor = output.lite_tensor; | |||
| } | |||
| output_tensor_copy_optimize(out, lite_tensor); | |||
| } | |||
| } | |||
| } | |||
| void NetworkImplDft::output_tensor_copy_optimize( | |||
| Var var, std::shared_ptr<Tensor> tensor) { | |||
| LITE_ASSERT( | |||
| !(m_user_config->options.force_output_use_user_specified_memory && | |||
| m_user_config->options.force_output_dynamic_alloc), | |||
| "Can't set force_output_use_user_specified_memory and " | |||
| "force_output_dynamic_alloc at the same time."); | |||
| if (m_user_config->options.force_output_use_user_specified_memory) { | |||
| TensorHelper::implement(tensor) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .set_reset_callback([var](TensorImplDft* dft_tensor) { | |||
| dft_tensor->device_share_host_memory(); | |||
| auto dv = dft_tensor->dev_tensor().get(); | |||
| dv->comp_node(var.node()->comp_node(), true); | |||
| var.node()->init_mem_plan(dv); | |||
| var.node()->reset_dev_tensor_from_tensor(*dv); | |||
| }); | |||
| } | |||
| if (m_user_config->options.force_output_dynamic_alloc) { | |||
| TensorHelper::implement(tensor) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .set_get_memory_callback([var](TensorImplDft* dft_tensor) { | |||
| if (dft_tensor->is_host()) { | |||
| auto host_tensor = dft_tensor->m_host_tensor; | |||
| *host_tensor = | |||
| HostTensorND::make_proxy(var.node()->dev_tensor()); | |||
| } else { | |||
| auto dev_tensor = dft_tensor->m_dev_tensor; | |||
| *dev_tensor = var.node()->dev_tensor(); | |||
| } | |||
| }); | |||
| } | |||
| } | |||
| std::shared_ptr<Tensor> NetworkImplDft::get_io_tensor( | |||
| std::string io_name, LiteTensorPhase phase) { | |||
| if (phase == LiteTensorPhase::LITE_INPUT || phase == LiteTensorPhase::LITE_IO) { | |||
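The two callbacks installed above split the work by mode. With force_output_use_user_specified_memory, the reset callback fires whenever the caller rebinds the lite tensor's memory (see TensorImplDft::reset below) and re-points the graph's output var at that buffer through init_mem_plan and reset_dev_tensor_from_tensor. With force_output_dynamic_alloc, the get-memory callback instead proxies the var's dynamically allocated dev tensor the moment a pointer is requested. The leading LITE_ASSERT keeps the two modes mutually exclusive.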
@@ -12,6 +12,7 @@
 #pragma once

 #include "lite_build_config.h"
+#include "megbrain/graph.h"

 #if LITE_BUILD_WITH_MGE
 #include "lite/network.h"

@@ -41,6 +42,7 @@ class NetworkImplDft final : public Network::NetworkImplBase {
 public:
     NetworkImplDft() { m_load_config.comp_graph = mgb::ComputingGraph::make(); }
     using S = megdnn::param::ExecutionPolicy::Strategy;
+    using Var = mgb::cg::SymbolVar;
     //! set the config of the network, include:
     //! the inference device
     //! the other inference options, such as record_level, weight_preprocess...
@@ -207,8 +209,10 @@ private:
     void compile_graph();

     //! try to infer output tensor layout
-    void try_infer_tensor_layout(
-            std::shared_ptr<Tensor> tensor, mgb::cg::SymbolVar var);
+    void try_infer_tensor_layout(std::shared_ptr<Tensor> tensor, Var var);
+
+    //! optimize the output tensor copy
+    void output_tensor_copy_optimize(Var var, std::shared_ptr<Tensor> tensor);

 private:
     bool m_async = false;
@@ -149,6 +149,9 @@ Layout TensorImplDft::get_layout() const {
 }

 void* TensorImplDft::get_memory_ptr() const {
+    if (m_get_memory_callback) {
+        m_get_memory_callback(const_cast<TensorImplDft*>(this));
+    }
     if (is_host()) {
         return static_cast<void*>(m_host_tensor->raw_ptr());
     } else {

@@ -157,6 +160,9 @@ void* TensorImplDft::get_memory_ptr() const {
 }

 void* TensorImplDft::get_memory_ptr(const std::vector<size_t>& idx) const {
+    if (m_get_memory_callback) {
+        m_get_memory_callback(const_cast<TensorImplDft*>(this));
+    }
     if (is_host()) {
         auto elemsize_log = m_host_tensor->layout().dtype.size_log();
         switch (elemsize_log) {

@@ -317,6 +323,9 @@ void TensorImplDft::reset(void* prepared_data) {
         storage.reset(cn, size, raw_storage);
         m_dev_tensor->reset(storage, mge_layout);
     }
+    if (m_reset_callback) {
+        m_reset_callback(this);
+    }
 }

 void TensorImplDft::reset(void* prepared_data, const Layout& layout) {
@@ -430,6 +439,34 @@ void TensorImplDft::copy_from_mge_tensor(const mgb::DeviceTensorND& dv) {
     }
 }

+void TensorImplDft::set_reset_callback(const std::function<void(TensorImplDft*)>& cb) {
+    m_reset_callback = cb;
+}
+
+void TensorImplDft::set_get_memory_callback(
+        const std::function<void(TensorImplDft*)>& cb) {
+    m_get_memory_callback = cb;
+}
+
+void TensorImplDft::device_share_host_memory() {
+    if (is_host()) {
+        if (!m_dev_tensor) {
+            m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(
+                    m_host_tensor->comp_node(), m_host_tensor->layout());
+        }
+        if (m_host_tensor->raw_ptr() != m_dev_tensor->raw_ptr()) {
+            auto raw_storage = std::shared_ptr<mgb::dt_byte>(
+                    m_host_tensor->raw_ptr(), [](void*) {});
+            auto cn = m_host_tensor->comp_node();
+            auto mge_layout = m_host_tensor->layout();
+            size_t size = mge_layout.span().dist_byte();
+            mgb::DeviceTensorStorage storage;
+            storage.reset(cn, size, raw_storage);
+            m_dev_tensor->reset(storage, mge_layout);
+        }
+    }
+}
+
 #endif

 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
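device_share_host_memory above makes the device tensor alias the host buffer by wrapping the raw pointer in a shared_ptr with a no-op deleter, so the storage never frees memory it does not own. A self-contained sketch of that idiom in plain C++ (illustration only, no MegEngine types):

#include <cstdio>
#include <memory>
#include <vector>

int main() {
    std::vector<float> user_buf(16, 1.f);  // memory owned by someone else
    // aliasing handle: shares the pointer, but the no-op deleter never frees it
    std::shared_ptr<float> alias(user_buf.data(), [](float*) {});
    alias.get()[0] = 2.f;                // writes through to user_buf
    std::printf("%.1f\n", user_buf[0]);  // prints 2.0
    return 0;
}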
@@ -97,11 +97,22 @@ public:
     //! get host tensor
     std::shared_ptr<mgb::HostTensorND> host_tensor() const { return m_host_tensor; }

     //! get device tensor
     std::shared_ptr<mgb::DeviceTensorND> dev_tensor() const { return m_dev_tensor; }

     //! copy from mgb tensor
     void copy_from_mge_tensor(const mgb::DeviceTensorND& dv);

+    //! set tensor reset callback
+    void set_reset_callback(const std::function<void(TensorImplDft*)>& cb);
+
+    //! set tensor get memory callback
+    void set_get_memory_callback(const std::function<void(TensorImplDft*)>& cb);
+
+    //! share the same memory between the host and device tensors
+    void device_share_host_memory();
+
 public:
     friend class NetworkImplDft;

@@ -115,6 +126,8 @@ private:
     void set_mge_tensor_compnode(const mgb::CompNode& comp_node);

 private:
+    std::function<void(TensorImplDft*)> m_get_memory_callback;
+    std::function<void(TensorImplDft*)> m_reset_callback;
     std::shared_ptr<mgb::HostTensorND> m_host_tensor;
     std::shared_ptr<mgb::DeviceTensorND> m_dev_tensor;
 };
@@ -153,6 +153,10 @@ std::shared_ptr<Tensor> Network::get_output_tensor(size_t index) {

 Network& Network::set_async_callback(const AsyncCallback& callback) {
     LITE_ERROR_HANDLER_BEGIN
+    LITE_ASSERT(
+            !m_config.options.force_output_use_user_specified_memory,
+            "Async mode can't run with force_output_use_user_specified_memory, "
+            "which writes output data to user-specified memory.");
     LITE_CHECK_NON_NULL_POINTER(m_impl);
     m_impl->set_async_callback(std::move(callback));
     return *this;
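(Presumably the guard exists because an async callback can complete before the output would be synchronized into the caller's buffer; the combination now fails loudly at setup instead of producing undefined results.)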
@@ -397,6 +397,73 @@ TEST(TestNetWork, ResetOutput) {
     compare_lite_tensor<float>(output_tensor, result_mgb);
 }

+TEST(TestNetWork, OutputNoCopy) {
+    Config config;
+    config.options.force_output_use_user_specified_memory = true;
+    auto tensor = get_input_data("./input_data.npy");
+    std::string model_path = "./shufflenet.mge";
+    std::string input_name = "data";
+    auto result_mgb = mgb_lar(model_path, config, input_name, tensor);
+
+    std::shared_ptr<Network> network = std::make_shared<Network>(config);
+    network->load_model(model_path);
+
+    std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);
+    auto src_ptr = tensor->get_memory_ptr();
+    auto src_layout = tensor->get_layout();
+    input_tensor->reset(src_ptr, src_layout);
+
+    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
+
+    size_t times = 5;
+    std::vector<std::shared_ptr<Tensor>> result_tensors;
+    for (size_t i = 0; i < times; i++) {
+        auto tmp = std::make_shared<Tensor>(
+                LiteDeviceType::LITE_CPU,
+                Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});
+        result_tensors.push_back(tmp);
+    }
+
+    for (size_t i = 0; i < times; i++) {
+        void* out_data = result_tensors[i]->get_memory_ptr();
+        output_tensor->reset(out_data, result_tensors[i]->get_layout());
+
+        network->forward();
+        network->wait();
+
+        ASSERT_EQ(output_tensor->get_memory_ptr(), out_data);
+        compare_lite_tensor<float>(output_tensor, result_mgb);
+    }
+    for (size_t i = 0; i < times; i++) {
+        compare_lite_tensor<float>(result_tensors[i], result_mgb);
+    }
+}
+
+TEST(TestNetWork, OutputDynamicAlloc) {
+    Config config;
+    config.options.force_output_dynamic_alloc = true;
+    auto tensor = get_input_data("./input_data.npy");
+    std::string model_path = "./shufflenet.mge";
+    std::string input_name = "data";
+    auto result_mgb = mgb_lar(model_path, config, input_name, tensor);
+
+    std::shared_ptr<Network> network = std::make_shared<Network>(config);
+    network->load_model(model_path);
+
+    std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);
+    auto src_ptr = tensor->get_memory_ptr();
+    auto src_layout = tensor->get_layout();
+    input_tensor->reset(src_ptr, src_layout);
+
+    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
+
+    size_t times = 5;
+    for (size_t i = 0; i < times; i++) {
+        network->forward();
+        network->wait();
+        compare_lite_tensor<float>(output_tensor, result_mgb);
+    }
+}
+
 TEST(TestNetWork, AsyncExec) {
     Config config;
     config.options.var_sanity_check_first_run = false;
@@ -507,13 +507,12 @@ void ComputingGraphImpl::dest_var_optimize(VarNodeArray& dest_vars) {
             i->add_flag(F::NO_MEM_RECLAIM);
         }
     }
-    if (dest_vars[0]->owner_graph()->options().force_output_write_to_user_memory) {
+    if (dest_vars[0]->owner_graph()->options().force_output_use_user_specified_memory) {
         for (auto&& i : dest_vars) {
             mgb_assert(
                     !i->contain_flag(F::RT_FORCE_DYNAMIC_MEM_ALLOC),
-                    "var %s with force dynamic allocate should be set to write output "
-                    "to "
-                    "user memory",
+                    "var %s with the RT_FORCE_DYNAMIC_MEM_ALLOC flag can not be "
+                    "forced to write its output to user memory",
                     i->cname());
             i->add_flag(
                     F::NO_SYS_MEM_ALLOC | F::NO_SYS_STATIC_MEM_ALLOC |
@@ -574,6 +574,10 @@ MemAllocPlan& VarNode::init_mem_plan(const DeviceTensorND* fixed_alloc) {
     return m_mem_plan;
 }

+bool VarNode::is_graph_dest_varnode() {
+    return ComputingGraphImpl::downcast(owner_graph())->var_receiver(this).size() == 0;
+}
+
 VarNode& VarNode::add_flag(Flag flag) {
     modify_flag(flag, m_flag | flag);
     return *this;
@@ -582,10 +586,13 @@ VarNode& VarNode::add_flag(Flag flag) {
 void VarNode::modify_flag(Flag delta, Flag new_flag) {
     if (contain_flag(Flag::FLAG_FREEZED)) {
         mgb_assert(
-                (delta & (Flag::NO_SYS_MEM_ALLOC | Flag::NO_MEM_RECLAIM |
-                          Flag::NO_SYS_STATIC_MEM_ALLOC |
-                          Flag::RT_FORCE_DYNAMIC_MEM_ALLOC)) == delta ||
-                        (new_flag & Flag::MEMORY_NO_NEED));
+                (delta & (Flag::NO_MEM_RECLAIM | Flag::NO_SYS_STATIC_MEM_ALLOC |
+                          Flag::RT_FORCE_DYNAMIC_MEM_ALLOC | Flag::MEMORY_NO_NEED)) ==
+                                delta ||
+                        is_graph_dest_varnode(),
+                "after FLAG_FREEZED is set, a var can only modify the "
+                "NO_MEM_RECLAIM, NO_SYS_STATIC_MEM_ALLOC, RT_FORCE_DYNAMIC_MEM_ALLOC "
+                "and MEMORY_NO_NEED flags, unless it is a graph dest var");
         mgb_assert(
                 !ComputingGraphImpl::downcast(owner_graph())
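Note the relaxation here: NO_SYS_MEM_ALLOC is dropped from the generally permitted post-freeze delta, but graph dest vars (vars with no receivers, per is_graph_dest_varnode above) skip the check entirely; that is what allows dest_var_optimize to add NO_SYS_MEM_ALLOC to frozen output vars when force_output_use_user_specified_memory is enabled.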
@@ -421,7 +421,7 @@ public:
              * Force the output to be written to the user specified memory, which
              * can optimize the copy of output data at one time
              */
-            bool force_output_write_to_user_memory = false;
+            bool force_output_use_user_specified_memory = false;

             //! whether to perform var sanity check on first run
             bool var_sanity_check_first_run = true;

@@ -549,6 +549,10 @@ private:
     MGE_WIN_DECLSPEC_FUC void modify_flag(Flag delta, Flag new_flag);

+    //! whether the var is a graph output; if so, flags such as
+    //! NO_SYS_MEM_ALLOC can still be modified after freezing
+    bool is_graph_dest_varnode();
+
     MGE_WIN_DECLSPEC_FUC void assign_dev_tensor_from_tensor(
             const DeviceTensorND& value);
@@ -82,7 +82,7 @@ TEST(TestNoCopy, BasicInputNoCopy) {
 TEST(TestNoCopy, IONoCopyPtrEQ) {
     auto test_graph = TestGraph();
     auto compute_graph = test_graph.m_network->graph;
-    compute_graph->options().force_output_write_to_user_memory = true;
+    compute_graph->options().force_output_use_user_specified_memory = true;
     test_graph.create_graph();
     auto func = test_graph.compile_without_copy();
     auto&& outvar = func->get_output_vars()[0];

@@ -123,7 +123,7 @@ TEST(TestNoCopy, IONoCopyPtrEQ) {
 TEST(TestNoCopy, IONoCopyCorrect) {
     auto test_graph = TestGraph();
     auto compute_graph = test_graph.m_network->graph;
-    compute_graph->options().force_output_write_to_user_memory = true;
+    compute_graph->options().force_output_use_user_specified_memory = true;
     test_graph.create_graph();
     HostTensorND truth;
     auto func = test_graph.compile_without_copy();