GitOrigin-RevId: b12f1c4a66
tags/v1.7.0
@@ -15,6 +15,8 @@
 #include "megdnn/dtype.h"
 #include "megdnn/internal/defs.h"
 
+#include <memory>
+
 #if MEGDNN_CC_HOST
 #include <cstdarg>
 #include <string>
@@ -402,31 +404,94 @@ struct TensorLayout : public TensorShape {
     MGE_WIN_DECLSPEC_FUC size_t access_bytes() const;
 };
 
+class RefPtr {
+    std::shared_ptr<void*> m_ref;
+    size_t m_offset;
+    bool m_mutable;
+
+public:
+    RefPtr() {
+        m_ref = std::make_shared<void*>((void*)nullptr);
+        m_offset = 0;
+        m_mutable = true;
+    }
+
+    RefPtr(void* ref_ptr, const size_t offset = 0) {
+        m_ref = std::make_shared<void*>(ref_ptr);
+        m_offset = offset;
+        m_mutable = true;
+    }
+
+    explicit RefPtr(
+            std::shared_ptr<void*> ref_ptr, const size_t offset = 0,
+            bool is_mutable = true) {
+        m_ref = ref_ptr;
+        m_offset = offset;
+        m_mutable = is_mutable;
+    }
+
+    void* get_ptr() const {
+        return static_cast<void*>(
+                (*m_ref != NULL) ? static_cast<dt_byte*>(*m_ref) + m_offset : nullptr);
+    }
+
+    bool is_mutable() const { return m_mutable; }
+
+    void reset(const void* ptr, size_t offset = 0);
+
+    RefPtr& operator+=(size_t offset) {
+        m_offset += offset;
+        return *this;
+    }
+
+    bool operator==(const RefPtr& other) const {
+        return *m_ref == *other.m_ref && m_offset == other.m_offset;
+    }
+
+    template <typename T>
+    T* ptr() const {
+        return static_cast<T*>(get_ptr());
+    }
+};
+
 /**
  * \brief A simple encapsulation class for n-dimensional tensor.
  */
 struct TensorND {
-    void* raw_ptr;
     TensorLayout layout;
 
-    TensorND() : raw_ptr(NULL) {}
+    TensorND() : m_ref_ptr(RefPtr((void*)nullptr)) {}
 
     TensorND(void* raw_ptr_, const TensorLayout& layout_)
-            : raw_ptr(raw_ptr_), layout(layout_) {}
+            : layout(layout_), m_ref_ptr(raw_ptr_) {}
+
+    TensorND(const TensorLayout& layout_, const RefPtr& ref_ptr)
+            : layout(layout_), m_ref_ptr(ref_ptr) {}
+
+    MGE_WIN_DECLSPEC_FUC void reset_ptr(void* ptr, size_t offset = 0);
+
+    void* raw_ptr() const { return m_ref_ptr.get_ptr(); }
+
+    const RefPtr get_ref_ptr() const { return m_ref_ptr; }
+
+    RefPtr& get_ref_ptr() { return m_ref_ptr; }
 
     //! get typed pointer; type check is performed
     template <typename T>
     T* ptr() const {
         layout.dtype.assert_is_ctype<T>();
-        return static_cast<T*>(raw_ptr);
+        return static_cast<T*>(m_ref_ptr.get_ptr());
     }
 
     //! get typed pointer of compatible type
     template <typename T>
     T* compatible_ptr() const {
         layout.dtype.assert_is_compatible_ctype<T>();
-        return reinterpret_cast<T*>(raw_ptr);
+        return reinterpret_cast<T*>(m_ref_ptr.get_ptr());
     }
 
+private:
+    RefPtr m_ref_ptr;
 };
 
 #if MEGDNN_CC_HOST
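
The RefPtr introduced above replaces TensorND's bare `void* raw_ptr` with a
shared slot (a `std::shared_ptr<void*>`) plus a byte offset, so every TensorND
copied from the same RefPtr observes later pointer updates. A minimal sketch of
that sharing, assuming hypothetical buffers `buf_a`/`buf_b` and an
already-initialized `layout`:

    megdnn::RefPtr ref(buf_a);         // the shared slot holds buf_a
    megdnn::TensorND t0(layout, ref);  // t0 and t1 copy the RefPtr, i.e.
    megdnn::TensorND t1(layout, ref);  // they share the same slot
    ref.reset(buf_b);                  // rebind the slot to buf_b
    assert(t0.raw_ptr() == buf_b && t1.raw_ptr() == buf_b);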
@@ -605,4 +605,14 @@ std::string TensorLayout::serialize() const {
     return rst;
 }
 
+void RefPtr::reset(const void* ptr, size_t offset) {
+    megdnn_assert(m_mutable, "this RefPtr can't change.");
+    *m_ref = const_cast<void*>(ptr);
+    m_offset = offset;
+}
+
+void TensorND::reset_ptr(void* ptr, size_t offset) {
+    m_ref_ptr.reset(ptr, offset);
+}
+
 // vim: syntax=cpp.doxygen
@@ -342,7 +342,7 @@ void param_pack_concat_execute(
             [comp_node](dt_byte* ptr) { comp_node.free_host(ptr); }};
     TensorLayout srcs_layout = TensorLayout{{nr_inputs}, dtype::Int32()};
     for (size_t i = 0; i < nr_inputs; ++i) {
-        srcs_raw_ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr;
+        srcs_raw_ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr();
     }
     HostTensorStorage srcs_storage;
     srcs_storage.reset(comp_node, srcs_size, srcs_ptr);
@@ -392,7 +392,7 @@ SmallVector<TensorPtr> param_pack_concat_apply_on_physical_tensor(
                 src_shapes, inputs.back()->shape(), TensorShape{});
     }
     for (size_t i = 0; i < nr_inputs; ++i) {
-        srcs_raw_ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr;
+        srcs_raw_ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr();
    }
    HostTensorStorage srcs_storage;
    srcs_storage.reset(comp_node, srcs_size, srcs_ptr);
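
Since `raw_ptr` is now an accessor instead of a public member, call sites change
mechanically from a member read to a call; the remaining hunks of this kind
below are the same one-token migration:

    void* before = tensor.raw_ptr;    // old API: direct member read
    void* after  = tensor.raw_ptr();  // new API: resolved via RefPtr::get_ptr()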
@@ -26,7 +26,7 @@ TensorChecksumCalc::ChecksumResult TensorChecksumCalc::calc(TensorPtr ptr) {
     auto span = dt.layout().span();
 
     megdnn::TensorND tensor;
-    tensor.raw_ptr = dt.raw_ptr() + span.low_byte;
+    tensor.reset_ptr(dt.raw_ptr() + span.low_byte);
     tensor.layout.init_contiguous_stride({span.dist_byte()});
     tensor.layout.dtype = dtype::Byte();
@@ -527,6 +527,10 @@ void NetworkImplDft::update_input() {
             config_in.lite_tensor->set_layout(
                     to_lite_layout(in_tensor_iter.second->layout()));
         }
+        TensorHelper::implement(config_in.lite_tensor)
+                ->cast_final_safe<TensorImplDft>()
+                .m_record_reset =
+                m_user_config->options.comp_node_seq_record_level > 0;
         if (config_in.config_layout.ndim &&
             !(config_in.config_layout == config_in.lite_tensor->get_layout())) {
             config_in.lite_tensor->set_layout(config_in.config_layout);
@@ -541,6 +545,10 @@ void NetworkImplDft::update_input() {
             TensorHelper::implement(io_in.lite_tensor)
                     ->cast_final_safe<TensorImplDft>()
                     .m_host_tensor = in_tensor_iter.second;
+            TensorHelper::implement(io_in.lite_tensor)
+                    ->cast_final_safe<TensorImplDft>()
+                    .m_record_reset =
+                    m_user_config->options.comp_node_seq_record_level > 0;
             io_in.lite_tensor->update_from_implement();
             m_network_io->inputs.push_back(io_in);
         }
@@ -603,6 +611,10 @@ void NetworkImplDft::update_output() {
             }
             try_infer_tensor_layout(out_it->lite_tensor, var);
             output_tensor_copy_optimize(var, out_it->lite_tensor);
+            TensorHelper::implement(out_it->lite_tensor)
+                    ->cast_final_safe<TensorImplDft>()
+                    .m_record_reset =
+                    m_user_config->options.comp_node_seq_record_level > 0;
         }
         //! user not set, use default output
     } else {
@@ -631,6 +643,10 @@
                 lite_tensor = output.lite_tensor;
             }
             output_tensor_copy_optimize(out, lite_tensor);
+            TensorHelper::implement(lite_tensor)
+                    ->cast_final_safe<TensorImplDft>()
+                    .m_record_reset =
+                    m_user_config->options.comp_node_seq_record_level > 0;
         }
     }
 }
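
The four hunks above propagate a single bit of configuration: each lite tensor
remembers whether the network was loaded with comp-node sequence recording, so
its reset() can pick the record-safe path. A sketch of the user-side switch,
mirroring the tests further below:

    Config config;
    config.options.force_output_use_user_specified_memory = true;
    config.options.comp_node_seq_record_level = 1;  // 0 = off, 1 = record
    // tensors handed out by a network loaded with this config now take the
    // only_reset_raw_storage() path when their memory is reset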
@@ -643,14 +659,20 @@ void NetworkImplDft::output_tensor_copy_optimize(
             "Can't set force_output_use_user_specified_memory and "
             "force_output_dynamic_alloc at the same time.");
     if (m_user_config->options.force_output_use_user_specified_memory) {
+        bool in_record = m_user_config->options.comp_node_seq_record_level > 0;
         TensorHelper::implement(tensor)
                 ->cast_final_safe<TensorImplDft>()
-                .set_reset_callback([var](TensorImplDft* dft_tensor) {
+                .set_reset_callback([var, in_record](TensorImplDft* dft_tensor) {
                     dft_tensor->device_share_host_memory();
                     auto dv = dft_tensor->dev_tensor().get();
                     dv->comp_node(var.node()->comp_node(), true);
                     var.node()->init_mem_plan(dv);
-                    var.node()->reset_dev_tensor_from_tensor(*dv);
+                    if (in_record) {
+                        auto&& device_tensor = var.node()->mutable_dev_tensor();
+                        device_tensor.only_reset_raw_storage(dv->storage());
+                    } else {
+                        var.node()->reset_dev_tensor_from_tensor(*dv);
+                    }
                 });
     }
     if (m_user_config->options.force_output_dynamic_alloc) {
@@ -314,14 +314,22 @@ void TensorImplDft::reset(void* prepared_data) {
         size_t size = mge_layout.span().dist_byte();
         mgb::HostTensorStorage storage;
         storage.reset(cn, size, raw_storage);
-        m_host_tensor->reset(storage, mge_layout);
+        if (m_record_reset) {
+            m_host_tensor->only_reset_raw_storage(storage);
+        } else {
+            m_host_tensor->reset(storage, mge_layout);
+        }
     } else {
         auto cn = m_dev_tensor->comp_node();
         auto mge_layout = m_dev_tensor->layout();
         size_t size = mge_layout.span().dist_byte();
         mgb::DeviceTensorStorage storage;
         storage.reset(cn, size, raw_storage);
-        m_dev_tensor->reset(storage, mge_layout);
+        if (m_record_reset) {
+            m_dev_tensor->only_reset_raw_storage(storage);
+        } else {
+            m_dev_tensor->reset(storage, mge_layout);
+        }
     }
     if (m_reset_callback) {
         m_reset_callback(this);
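
With recording enabled, a full reset() would re-create the storage holder and
detach every RefPtr the recorded kernels captured; only_reset_raw_storage()
instead rewrites the shared slot in place, so the recorded sequence keeps
reading the current address. The user-visible contract, sketched under the
assumption that `input_tensor` comes from a network loaded with record level 1
and `fresh_buffer` is a user allocation of the recorded size:

    input_tensor->reset(fresh_buffer, layout);  // lands in
                                                // only_reset_raw_storage()
    // the new buffer must still satisfy the recorded layout, since the
    // layout itself is not re-applied on this path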
@@ -455,14 +463,9 @@ void TensorImplDft::device_share_host_memory() {
                 m_host_tensor->comp_node(), m_host_tensor->layout());
     }
     if (m_host_tensor->raw_ptr() != m_dev_tensor->raw_ptr()) {
-        auto raw_storage = std::shared_ptr<mgb::dt_byte>(
-                m_host_tensor->raw_ptr(), [](void*) {});
-        auto cn = m_host_tensor->comp_node();
-        auto mge_layout = m_host_tensor->layout();
-        size_t size = mge_layout.span().dist_byte();
-        mgb::DeviceTensorStorage storage;
-        storage.reset(cn, size, raw_storage);
-        m_dev_tensor->reset(storage, mge_layout);
+        auto&& storage =
+                mgb::DeviceTensorStorage::make_proxy(m_host_tensor->storage());
+        m_dev_tensor->only_reset_raw_storage(storage);
     }
 }
@@ -126,6 +126,7 @@ private:
     void set_mge_tensor_compnode(const mgb::CompNode& comp_node);
 
 private:
+    bool m_record_reset = false;
     std::function<void(TensorImplDft*)> m_get_memory_callback;
     std::function<void(TensorImplDft*)> m_reset_callback;
     std::shared_ptr<mgb::HostTensorND> m_host_tensor;
@@ -412,9 +412,12 @@ TEST(TestNetWork, ResetOutput) {
     compare_lite_tensor<float>(output_tensor, result_mgb);
 }
 
-TEST(TestNetWork, OutputNoCopy) {
+namespace {
+void test_output_no_copy(int record) {
     Config config;
     config.options.force_output_use_user_specified_memory = true;
+    config.options.comp_node_seq_record_level = record;
     auto tensor = get_input_data("./input_data.npy");
     std::string model_path = "./shufflenet.mge";
     std::string input_name = "data";
@@ -453,6 +456,65 @@ TEST(TestNetWork, OutputNoCopy) {
     }
 }
 
+void test_input_no_copy(int record) {
+    Config config;
+    config.options.force_output_use_user_specified_memory = true;
+    config.options.comp_node_seq_record_level = record;
+
+    std::string model_path = "./shufflenet.mge";
+    std::string input_name = "data";
+    Layout layout_in{{1, 3, 224, 224}, 4};
+    std::vector<std::shared_ptr<Tensor>> inputs;
+    std::vector<std::shared_ptr<Tensor>> outputs;
+
+    for (int i = 0; i < 3; i++) {
+        auto tmp_in = std::make_shared<Tensor>(LiteDeviceType::LITE_CPU, layout_in);
+        auto ptr = static_cast<float*>(tmp_in->get_memory_ptr());
+        for (size_t id = 0; id < 2 * 224 * 224; id++) {
+            ptr[id] = i + 1;
+        }
+        inputs.push_back(tmp_in);
+        outputs.push_back(mgb_lar(model_path, config, input_name, tmp_in));
+    }
+
+    std::shared_ptr<Network> network = std::make_shared<Network>(config);
+    network->load_model(model_path);
+
+    std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);
+    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
+
+    for (int i = 0; i < 3; i++) {
+        auto ptr = inputs[i]->get_memory_ptr();
+        input_tensor->reset(ptr, layout_in);
+        auto tmp_out = std::make_shared<Tensor>(
+                LiteDeviceType::LITE_CPU,
+                Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});
+        output_tensor->reset(tmp_out->get_memory_ptr(), output_tensor->get_layout());
+        network->forward();
+        network->wait();
+
+        compare_lite_tensor<float>(output_tensor, outputs[i]);
+    }
+}
+}  // namespace
+
+TEST(TestNetWork, OutputNoCopy) {
+    test_output_no_copy(0);
+}
+
+TEST(TestNetWork, OutputNoCopyRecord) {
+    test_output_no_copy(1);
+}
+
+TEST(TestNetWork, IONoCopy) {
+    test_input_no_copy(0);
+}
+
+TEST(TestNetWork, IONoCopyRecord) {
+    test_input_no_copy(1);
+}
+
 TEST(TestNetWork, OutputDynamicAlloc) {
     Config config;
     config.options.force_output_dynamic_alloc = true;
@@ -250,9 +250,14 @@ std::unique_ptr<CompNodeSeqRecorder> ComputingGraphImpl::ComputingSequence::
                 "graph.");
         return {};
     }
+    auto is_graph_dest_varnode = [&](VarNode* var) {
+        return ComputingGraphImpl::downcast(owner_graph())->var_receiver(var).size() ==
+               0;
+    };
+
     for (auto i : *m_opr_seq) {
         for (auto j : i->output()) {
-            if (!is_static_var_storage(j)) {
+            if (!is_static_var_storage(j) && !is_graph_dest_varnode(j)) {
                 mgb_log_error(
                         "can not enable CompNodeSeqRecorder because var "
                         "storage not static: %s",
@@ -319,7 +319,7 @@ bool VarNodeMemManager::DynamicAllocOprInfo::check_if_mem_status_change() {
     for (size_t i = 0; i < dev_val_input.size(); i++) {
         auto&& t = prev_dev_val_input[i];
         auto s = dev_val_input[i]->dev_tensor().as_megdnn();
-        if (t.raw_ptr != s.raw_ptr || !t.layout.eq_layout(s.layout)) {
+        if (t.raw_ptr() != s.raw_ptr() || !t.layout.eq_layout(s.layout)) {
             same = false;
             t = s;
         }
@@ -233,6 +233,7 @@ TensorStorage<Trait>& TensorStorage<Trait>::operator=(const TensorStorage& rhs)
     m_capacity = rhs.m_capacity;
     m_offset = rhs.m_offset;
     m_data = rhs.m_data;
+    m_ref_ptr = rhs.m_ref_ptr;
     return *this;
 }
@@ -264,7 +265,8 @@ TensorStorage<Trait> TensorStorage<Trait>::sub(ptrdiff_t offset) const {
             m_size - offset,
             m_capacity - offset,
             static_cast<size_t>(toff),
-            m_data};
+            m_data,
+            m_ref_ptr};
 }
 
 template <class Trait>
@@ -278,8 +280,10 @@ dt_byte* TensorStorage<Trait>::apply_lazy_and_get_ptr() {
         mgb_throw_if(!ptr, SystemError, "failed to allocate memory");
         CompNode cn = m_comp_node;
         m_data.reset(ptr, [cn](void* p) { Trait::free(cn, p); });
+        m_ref_ptr = std::make_shared<void*>(static_cast<void*>(nullptr));
         m_capacity = m_size;
     }
+    *m_ref_ptr = static_cast<void*>(m_data.get());
     return m_data.get() + m_offset;
 }
@@ -305,6 +309,19 @@ void TensorStorage<Trait>::reset(CompNode node, size_t size, RawStorage data) {
     m_capacity = size;
     m_offset = 0;
     m_data = std::move(data);
+    m_ref_ptr = std::make_shared<void*>(static_cast<void*>(m_data.get()));
+}
+
+template <class Trait>
+void TensorStorage<Trait>::only_reset_raw_storage(
+        CompNode node, size_t size, RawStorage data, size_t offset) {
+    mgb_assert(m_allow_realloc);
+    m_comp_node = node;
+    m_size = size;
+    m_capacity = size;
+    m_offset = offset;
+    m_data = std::move(data);
+    *m_ref_ptr = static_cast<void*>(m_data.get());
 }
 
 template <class Trait>
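
reset() and only_reset_raw_storage() differ exactly in how they treat the
shared slot. A sketch of the aliasing behaviour, assuming two RawStorage
handles `data_a` and `data_b` already exist:

    storage.reset(cn, size, data_a);
    // fresh m_ref_ptr: RefPtrs handed out earlier stop tracking `storage`
    storage.only_reset_raw_storage(cn, size, data_b, /*offset=*/0);
    // same m_ref_ptr, new value: earlier RefPtrs now resolve into data_b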
@@ -316,8 +333,8 @@ TensorStorage<Trait> TensorStorage<Trait>::make_proxy(
             "proxy source should be on CPU; got %s",
             src.comp_node().to_string().c_str());
     src.ptr();
-    return {true, src.m_comp_node, src.m_size,
-            src.m_capacity, src.m_offset, src.m_data};
+    return {true, src.m_comp_node, src.m_size, src.m_capacity,
+            src.m_offset, src.m_data, src.m_ref_ptr};
 }
 
 template <class Trait>
@@ -481,6 +498,17 @@ DEF(reset, &)(TensorStorage storage, const TensorLayout& layout) {
     return static_cast<ChainReturnType&>(*this);
 }
 
+DEF(only_reset_raw_storage, &)(TensorStorage storage) {
+    //! The storage being reset must either satisfy the layout or be empty.
+    //! Empty storage is used after weight preprocessing to save memory and
+    //! to check the layout when running.
+    mgb_assert(storage.valid_span(m_layout.span()) || storage.empty());
+    m_storage.only_reset_raw_storage(
+            storage.comp_node(), storage.size(), storage.raw_storage(),
+            storage.offset());
+    return static_cast<ChainReturnType&>(*this);
+}
+
 DEF(comp_node, &)(CompNode comp_node, bool allow_mem_node_change) {
     auto orig_cn = m_storage.comp_node_allow_invalid();
     m_storage.comp_node(comp_node, allow_mem_node_change);
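
At the TensorND level this is the hook the record path uses: swap the storage
of a tensor that a recorded sequence already holds RefPtrs into. Usage as
exercised by the tests further below:

    DeviceTensorND dv(cn, {1, 8, 7, 7});
    dv.raw_ptr();  // force allocation before handing the storage over
    outvar->mutable_dev_tensor().only_reset_raw_storage(dv.storage());
    func->execute();  // the recorded kernels now read/write dv's buffer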
@@ -225,6 +225,12 @@ public:
      */
     MGE_WIN_DECLSPEC_FUC void reset(CompNode node, size_t size, RawStorage data);
 
+    /*!
+     * \brief reset the tensor storage to the given memory area
+     */
+    MGE_WIN_DECLSPEC_FUC void only_reset_raw_storage(
+            CompNode node, size_t size, RawStorage data, size_t offset);
+
     /*!
      * \brief make a TensorStorage that shares memory with another
      *      TensorStorage of a different storage type
@@ -270,6 +276,11 @@ public:
         return m_data;
     }
 
+    std::shared_ptr<void*> get_ref_ptr() const {
+        ptr();
+        return m_ref_ptr;
+    }
+
 private:
     template <class T>
     friend class TensorStorage;
@@ -289,16 +300,20 @@ private:
     RawStorage m_data;
 
+    std::shared_ptr<void*> m_ref_ptr = std::make_shared<void*>((void*)nullptr);
+
     //! used internally for returning a predefined TensorStorage
     TensorStorage(
             bool allow_realloc, CompNode comp_node, size_t size, size_t capacity,
-            size_t offset, const RawStorage& data)
+            size_t offset, const RawStorage& data,
+            std::shared_ptr<void*> ref_ptr = std::make_shared<void*>((void*)nullptr))
             : m_allow_realloc(allow_realloc),
               m_comp_node(comp_node),
               m_size(size),
               m_capacity(capacity),
               m_offset(offset),
-              m_data(data) {}
+              m_data(data),
+              m_ref_ptr(ref_ptr) {}
 
     void check_comp_node_valid() const {
         if (mgb_unlikely(!m_comp_node.valid()))
@@ -423,6 +438,8 @@ public:
     MGE_WIN_DECLSPEC_FUC ChainReturnType& reset(
             TensorStorage storage, const TensorLayout& layout);
 
+    MGE_WIN_DECLSPEC_FUC ChainReturnType& only_reset_raw_storage(TensorStorage storage);
+
     /* ================= getter and setters ================= */
 
     /*!
@@ -501,7 +518,8 @@ public:
     //! convert to megdnn::TensorND
     megdnn::TensorND as_megdnn() const {
-        return {const_cast<void*>(static_cast<const void*>(raw_ptr())), m_layout};
+        megdnn::RefPtr ref_ptr(m_storage.get_ref_ptr(), m_storage.offset(), false);
+        return {m_layout, ref_ptr};
     }
 
     /* ================= misc ================= */
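
as_megdnn() no longer snapshots a raw address; it hands megdnn a read-only
RefPtr into the storage's shared slot, so storage swaps made after the
conversion stay visible. A sketch of the consequence, assuming a second
storage `storage2`:

    auto dnn = tensor.as_megdnn();            // shares the storage's slot
    tensor.only_reset_raw_storage(storage2);  // swap the backing memory
    // dnn.raw_ptr() now resolves into storage2; dnn.reset_ptr(p) would hit
    // the mutability assert, because the RefPtr was built with
    // is_mutable = false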
@@ -816,4 +816,79 @@ TYPED_TEST(TestCPUCompSeqRec, run_multi_thread_default) {
 }
 }  // anonymous namespace
 
+#include "megbrain/opr/basic_arith_wrapper.h"
+#include "megbrain/opr/io.h"
+#include "megbrain/opr/tensor_manip.h"
+#include "megbrain/opr/utility.h"
+
+TEST(TestCPUCompSeqRec, run_dyn_ptr) {
+    CompNode cn = CompNode::load("cpux");
+    HostTensorGenerator<> gen;
+    auto host_x0 = gen({4, 1}, cn), host_y0 = gen({4, 1}, cn),
+         host_z0 = gen({4, 1}, cn);
+    auto host_x1 = gen({4, 1}, cn), host_y1 = gen({4, 1}, cn),
+         host_z1 = gen({4, 1}, cn);
+    auto dev_x0 = std::make_shared<DeviceTensorND>(cn);
+    auto dev_y0 = std::make_shared<DeviceTensorND>(cn);
+    auto dev_z0 = std::make_shared<DeviceTensorND>(cn);
+    auto dev_x1 = std::make_shared<DeviceTensorND>(cn);
+    auto dev_y1 = std::make_shared<DeviceTensorND>(cn);
+    auto dev_z1 = std::make_shared<DeviceTensorND>(cn);
+    (*dev_x0).comp_node(cn).copy_from(*host_x0).sync();
+    (*dev_y0).comp_node(cn).copy_from(*host_y0).sync();
+    (*dev_z0).comp_node(cn).copy_from(*host_z0).sync();
+    (*dev_x1).comp_node(cn).copy_from(*host_x1).sync();
+    (*dev_y1).comp_node(cn).copy_from(*host_y1).sync();
+    (*dev_z1).comp_node(cn).copy_from(*host_z1).sync();
+
+    auto check = [&]() {
+        HostTensorND ret(CompNode::load("cpux"), host_x0->shape());
+        auto px = host_x0->ptr<float>(), py = host_y0->ptr<float>(),
+             pz = host_z0->ptr<float>(), pw = ret.ptr<float>();
+        auto sz0 = host_x0->shape()[0], sz1 = host_x0->shape()[1];
+        for (size_t i = 0; i < sz0; ++i) {
+            for (size_t j = 0; j < sz1; ++j) {
+                pw[i * sz1 + j] = px[i * sz1 + j] * py[i * sz1 + j] + pz[i * sz1 + j];
+            }
+        }
+        return ret;
+    };
+
+    auto graph = ComputingGraph::make();
+    // test record on first run
+    graph->options().var_sanity_check_first_run = false;
+    graph->options().graph_opt_level = 0;
+    graph->options().comp_node_seq_record_level = 1;
+    graph->options().fake_next_exec = true;
+    auto x = opr::VolatileSharedDeviceTensor::make(*graph, dev_x0),
+         y = opr::VolatileSharedDeviceTensor::make(*graph, dev_y0),
+         z = opr::VolatileSharedDeviceTensor::make(*graph, dev_z0),
+         w = opr::Elemwise::make({x, y, z}, opr::Elemwise::Mode::FUSE_MUL_ADD3);
+    HostTensorND host_w;
+    auto func = graph->compile({{w, [&host_w](DeviceTensorND& d) {
+                                     host_w = mgb::HostTensorND::make_proxy(d);
+                                 }}});
+    func->execute();
+
+    for (int i = 0; i < 4; ++i) {
+        if (i == 2) {
+            *host_x0 = *host_x1;
+            *host_y0 = *host_y1;
+            *host_z0 = *host_z1;
+            dev_x0->only_reset_raw_storage(dev_x1->storage());
+            dev_y0->only_reset_raw_storage(dev_y1->storage());
+            dev_z0->only_reset_raw_storage(dev_z1->storage());
+        }
+        func->execute();
+        auto expect = check();
+        MGB_ASSERT_TENSOR_EQ(expect, host_w) << "iter " << i;
+    }
+}
+
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
@@ -13,6 +13,7 @@
 #include "./network.h"
 #include "megbrain/comp_node_env.h"
 #include "megbrain/opr/basic_arith.h"
+#include "megbrain/opr/tensor_manip.h"
 #include "megbrain/test/helper.h"
 
 using namespace mgb;
@@ -20,9 +21,11 @@ using namespace mgb;
 struct TestGraph {
     CompNode m_cn;
     HostTensorGenerator<> m_gen;
+    HostTensorGenerator<dtype::Int32> m_gen_int;
     std::unique_ptr<Network> m_network;
     SymbolVar m_out_var;
     std::shared_ptr<HostTensorND> input_tensor;
+    std::shared_ptr<HostTensorND> input_tensor2;
 
     TestGraph() {
         m_cn = CompNode::load("cpu0");
@@ -41,6 +44,78 @@ struct TestGraph {
         m_out_var = m_network->add_pooling(f, {2, 2}, {2, 2});
     }
 
+    void create_graph_with_subtensor_forward() {
+        input_tensor = m_gen({2, 3, 32, 32}, m_cn);
+        auto input = opr::Host2DeviceCopy::make(*m_network->graph, input_tensor, m_cn)
+                             .rename("input");
+        auto cv = [&](int v) {
+            auto rst = input.make_scalar(v);
+            return rst;
+        };
+        using Ad = opr::Subtensor::AxisIndexer;
+        auto sub =
+                opr::Subtensor::make(input, {Ad::make_interval(0, cv(1), cv(2), None)});
+        auto f = m_network->add_conv(
+                sub, 4, {3, 3}, dtype::Float32(), true, {2, 2}, {0, 0});
+        f = m_network->add_elemwise(
+                {f}, dtype::Float32(), opr::Elemwise::Param::Mode::EXP);
+        f = m_network->add_conv(f, 8, {3, 3}, dtype::Float32(), true, {1, 1}, {1, 1});
+        m_out_var = m_network->add_pooling(f, {2, 2}, {2, 2});
+    }
+
+    void create_graph_with_subtensor_relayout() {
+        input_tensor = m_gen({2, 3, 32, 40}, m_cn);
+        auto input = opr::Host2DeviceCopy::make(*m_network->graph, input_tensor, m_cn)
+                             .rename("input");
+        auto cv = [&](int v) {
+            auto rst = input.make_scalar(v);
+            return rst;
+        };
+        using Ad = opr::Subtensor::AxisIndexer;
+        auto sub = opr::Subtensor::make(
+                input, {Ad::make_interval(0, cv(1), cv(2), None),
+                        Ad::make_interval(3, cv(0), cv(32), None)});
+        auto f = m_network->add_conv(
+                sub, 4, {3, 3}, dtype::Float32(), true, {2, 2}, {0, 0});
+        f = m_network->add_elemwise(
+                {f}, dtype::Float32(), opr::Elemwise::Param::Mode::EXP);
+        f = m_network->add_conv(f, 8, {3, 3}, dtype::Float32(), true, {1, 1}, {1, 1});
+        m_out_var = m_network->add_pooling(f, {2, 2}, {2, 2});
+    }
+
+    void create_graph_with_setsubtensor() {
+        input_tensor = m_gen({1, 3, 32, 32}, m_cn);
+        input_tensor2 = m_gen({1, 1, 32, 32}, m_cn);
+        auto input = opr::Host2DeviceCopy::make(*m_network->graph, input_tensor, m_cn)
+                             .rename("input");
+        auto input_sub =
+                opr::Host2DeviceCopy::make(*m_network->graph, input_tensor2, m_cn)
+                        .rename("input2");
+        auto cv = [&](int v) {
+            auto rst = input.make_scalar(v);
+            return rst;
+        };
+        using Ad = opr::Subtensor::AxisIndexer;
+        input = opr::SetSubtensor::make(
+                input, input_sub, {Ad::make_interval(1, cv(1), cv(2), None)});
+        auto f = m_network->add_conv(
+                input, 4, {3, 3}, dtype::Float32(), true, {2, 2}, {0, 0});
+        f = m_network->add_elemwise(
+                {f}, dtype::Float32(), opr::Elemwise::Param::Mode::EXP);
+        f = m_network->add_conv(f, 8, {3, 3}, dtype::Float32(), true, {1, 1}, {1, 1});
+        m_out_var = m_network->add_pooling(f, {2, 2}, {2, 2});
+    }
+
     std::unique_ptr<cg::AsyncExecutable> compile_without_copy() {
         return m_network->graph->compile({{m_out_var, nullptr}});
     }
@@ -51,8 +126,11 @@ struct TestGraph {
     }
 };
 
-TEST(TestNoCopy, BasicInputNoCopy) {
+namespace {
+void test_basic_input_no_copy(bool record) {
     auto test_graph = TestGraph();
+    auto compute_graph = test_graph.m_network->graph;
+    compute_graph->options().comp_node_seq_record_level = record;
     test_graph.create_graph();
     HostTensorND out, out_pre;
     auto func = test_graph.compile_with_copy(out);
@@ -68,7 +146,11 @@ TEST(TestNoCopy, BasicInputNoCopy) {
         for (size_t d = 0; d < length; d++) {
             ptr[d] = i;
         }
-        input_tensor->reset(storage, layout);
+        if (record) {
+            input_tensor->only_reset_raw_storage(storage);
+        } else {
+            input_tensor->reset(storage, layout);
+        }
     }
     func->execute();
     func->wait();
@@ -78,6 +160,11 @@
         out_pre.copy_from(out).sync();
     }
 }
+}  // namespace
+
+TEST(TestNoCopy, InputNoCopyPtrEQ) {
+    test_basic_input_no_copy(0);
+}
 
 TEST(TestNoCopy, IONoCopyPtrEQ) {
     auto test_graph = TestGraph();
@@ -158,8 +245,112 @@ TEST(TestNoCopy, IONoCopyCorrect) {
     }
 }
 
-TEST(TestNoCopy, InputNoCopyRecord) {}
+TEST(TestNoCopy, InputNoCopyRecord) {
+    test_basic_input_no_copy(1);
+}
+
+TEST(TestNoCopy, IONoCopyRecord) {
+    auto test_graph = TestGraph();
+    auto compute_graph = test_graph.m_network->graph;
+    compute_graph->options().force_output_use_user_specified_memory = true;
+    compute_graph->options().comp_node_seq_record_level = 1;
+    test_graph.create_graph();
+    HostTensorND truth;
+    auto func = test_graph.compile_without_copy();
+    auto&& outvar = func->get_output_vars()[0];
+    DeviceTensorND tmp(test_graph.m_cn, {1, 8, 7, 7});
+    outvar->init_mem_plan(&tmp);
+    size_t times = 10;
+    for (size_t i = 0; i < times; i++) {
+        auto input_tensor = test_graph.input_tensor;
+        auto layout = input_tensor->layout();
+        size_t length = layout.total_nr_elems();
+        auto storage = TensorStorage<HostTensorStorageTrait>(test_graph.m_cn);
+        storage.ensure_size(length * sizeof(float));
+        float* ptr = storage.ptr()->as<float>();
+        for (size_t d = 0; d < length; d++) {
+            ptr[d] = i / 5 + 3;
+        }
+        input_tensor->only_reset_raw_storage(storage);
+
+        DeviceTensorND dv(test_graph.m_cn, {1, 8, 7, 7});
+        dv.raw_ptr();
+        auto& dev_tensor = outvar->mutable_dev_tensor();
+        dev_tensor.only_reset_raw_storage(dv.storage());
+
+        func->execute();
+        func->wait();
+        if (i % 5 == 0) {
+            truth.copy_from(dv).sync();
+            continue;
+        }
+        HostTensorND to_check;
+        to_check.copy_from(dv).sync();
+        MGB_ASSERT_TENSOR_EQ(to_check, truth);
+    }
+}
+
+namespace {
+void test_subtensor_record(int level) {
+    auto test_graph = TestGraph();
+    auto compute_graph = test_graph.m_network->graph;
+    compute_graph->options().force_output_use_user_specified_memory = true;
+    compute_graph->options().comp_node_seq_record_level = 1;
+    if (level == 2) {
+        test_graph.create_graph_with_setsubtensor();
+    } else if (level == 1) {
+        test_graph.create_graph_with_subtensor_forward();
+    } else {
+        test_graph.create_graph_with_subtensor_relayout();
+    }
+    HostTensorND truth;
+    auto func = test_graph.compile_without_copy();
+    auto&& outvar = func->get_output_vars()[0];
+    DeviceTensorND tmp(test_graph.m_cn, {1, 8, 7, 7});
+    outvar->init_mem_plan(&tmp);
+    size_t times = 10;
+    for (size_t i = 0; i < times; i++) {
+        auto input_tensor = test_graph.input_tensor;
+        auto layout = input_tensor->layout();
+        size_t length = layout.total_nr_elems();
+        auto storage = TensorStorage<HostTensorStorageTrait>(test_graph.m_cn);
+        storage.ensure_size(length * sizeof(float));
+        float* ptr = storage.ptr()->as<float>();
+        for (size_t d = 0; d < length; d++) {
+            ptr[d] = i / 5 + 3;
+        }
+        input_tensor->only_reset_raw_storage(storage);
+
+        DeviceTensorND dv(test_graph.m_cn, {1, 8, 7, 7});
+        dv.raw_ptr();
+        auto& dev_tensor = outvar->mutable_dev_tensor();
+        dev_tensor.only_reset_raw_storage(dv.storage());
+
+        func->execute();
+        func->wait();
+        if (i % 5 == 0) {
+            truth.copy_from(dv).sync();
+            continue;
+        }
+        HostTensorND to_check;
+        to_check.copy_from(dv).sync();
+        MGB_ASSERT_TENSOR_EQ(to_check, truth);
+    }
+}
+}  // namespace
+
+TEST(TestNoCopy, IONoCopyRecordSubTensor) {
+    test_subtensor_record(0);
+}
+
+TEST(TestNoCopy, IONoCopyRecordSubTensorRelayout) {
+    test_subtensor_record(1);
+}
+
+TEST(TestNoCopy, OutputNoCopyRecord) {}
+
+//! TODO: comp-node memory copy does not yet record the reference ptr; once it
+//! does, the test below should be enabled and will pass
+/*TEST(TestNoCopy, IONoCopyRecordSetSubTensor) {
+    test_subtensor_record(2);
+}*/
+
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
@@ -133,7 +133,7 @@ void setup_and_launch(const JITExecutor* fusion_opr, CUfunction func, int block_
         host_init_pvisitor<out_dim>(pvisitors[i], args.inputs[i].layout);
     }
     datum[nr_inps] = reinterpret_cast<CUdeviceptr>(
-            args.outputs[0].from->dev_tensor().as_megdnn().raw_ptr);
+            args.outputs[0].from->dev_tensor().as_megdnn().raw_ptr());
     size_t num_elements = args.outputs[0].layout.total_nr_elems();
     mgb_assert(
             num_elements <= UINT32_MAX,
@@ -152,11 +152,11 @@ void setup_and_launch(const JITExecutor* fusion_opr, CUfunction func, int block_
         exec_args[0] = datum.data();
         exec_args[2] = pvisitors.data();
     } else {
-        datum_dev = args.outputs[1].from->dev_tensor().as_megdnn().raw_ptr;
+        datum_dev = args.outputs[1].from->dev_tensor().as_megdnn().raw_ptr();
         MGB_CUDA_CHECK(cudaMemcpyAsync(
                 datum_dev, datum.data(), (nr_inps + 1) * sizeof(CUdeviceptr),
                 cudaMemcpyHostToDevice, env.cuda_env().stream));
-        p_visitors_dev = args.outputs[2].from->dev_tensor().as_megdnn().raw_ptr;
+        p_visitors_dev = args.outputs[2].from->dev_tensor().as_megdnn().raw_ptr();
         MGB_CUDA_CHECK(cudaMemcpyAsync(
                 p_visitors_dev, pvisitors.data(),
                 nr_inps * sizeof(ParamElemVisitor<out_dim>), cudaMemcpyHostToDevice,
@@ -1269,7 +1269,9 @@ void Reduce::KernScheduler::update_ptr(
     mgb_assert(
             dest.shape().total_nr_elems() ==
             m_kern_param.back().output.layout.total_nr_elems());
-    m_kern_param[0].input.raw_ptr = const_cast<dt_byte*>(input.raw_ptr());
+    auto in_tensor = input.as_megdnn();
+    in_tensor.layout = m_kern_param[0].input.layout;
+    m_kern_param[0].input = in_tensor;
 
     dt_byte *workspace_begin = workspace_size()
             ? const_cast<dt_byte*>(workspace.raw_ptr())
@@ -1280,12 +1282,14 @@ void Reduce::KernScheduler::update_ptr(
     *kern_workspace = workspace_begin + m_workspace_spec[2].offset;
     for (size_t i = 0; i < m_kern_param.size() - 1; ++i) {
         auto optr = tmp_reduce_ptr[i % 2];
-        m_kern_param[i].output.raw_ptr = optr;
-        m_kern_param[i + 1].input.raw_ptr = optr;
+        m_kern_param[i].output.reset_ptr(optr);
+        m_kern_param[i + 1].input.reset_ptr(optr);
     }
     for (auto&& i : m_kern_param)
         i.workspace.raw_ptr = kern_workspace;
-    m_kern_param.back().output.raw_ptr = const_cast<dt_byte*>(dest.raw_ptr());
+    auto out_tensor = dest.as_megdnn();
+    out_tensor.layout = m_kern_param.back().output.layout;
+    m_kern_param.back().output = out_tensor;
 }
 
 void Reduce::KernScheduler::execute(
@@ -1343,8 +1347,8 @@ void Reduce::KernScheduler::execute(
     }
     mgb_assert(
             input.layout().is_contiguous() &&
-            input.raw_ptr() == m_kern_param[0].input.raw_ptr &&
-            dest.raw_ptr() == m_kern_param.back().output.raw_ptr);
+            input.raw_ptr() == m_kern_param[0].input.raw_ptr() &&
+            dest.raw_ptr() == m_kern_param.back().output.raw_ptr());
     for (auto&& i : m_kern_param) {
         opr->param() = i.KernParam::kparam;
         opr->exec(i.input, i.output, i.workspace);
@@ -1157,7 +1157,7 @@ void CondExecMerge::scn_do_execute() {
             if (forwarded[oidx]) {
                 ovar->shape_alloc(ovar->shape());
                 auto&& own_dest = ovar->dev_tensor().as_megdnn();
-                mgb_assert(own_dest.raw_ptr != dest.raw_ptr);
+                mgb_assert(own_dest.raw_ptr() != dest.raw_ptr());
                 dnn_opr->exec({dest, src}, own_dest);
                 forwarded[oidx] = false;
             } else {
@@ -241,9 +241,9 @@ void NvOf::scn_do_execute() {
     }
     nv_flow_extractor->extract_flow(
-            static_cast<unsigned char*>(input(0)->dev_tensor().as_megdnn().raw_ptr),
+            static_cast<unsigned char*>(input(0)->dev_tensor().as_megdnn().raw_ptr()),
             vshape,
-            reinterpret_cast<int16_t*>(output(0)->dev_tensor().as_megdnn().raw_ptr));
+            reinterpret_cast<int16_t*>(output(0)->dev_tensor().as_megdnn().raw_ptr()));
 }
 
 void NvOf::init_output_static_infer_desc() {
@@ -1425,7 +1425,7 @@ void ParamPackConcat::scn_do_execute() {
     m_inp_ptr.resize(inputs.size() - 1);
     auto ptr = m_inp_ptr.data();
     for (size_t i = 0; i < inputs.size() - 1; i++) {
-        ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr;
+        ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr();
     }
     auto offsets = inputs.back()->dev_tensor().as_megdnn();
     megdnn::TensorND srcs(
@@ -2572,8 +2572,8 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
                 ASSERT_EQ(pf->tensors.size(), 2);
                 ASSERT_TRUE(pf->tensors[0].layout.eq_shape({1, 2, 3, 4}));
                 ASSERT_TRUE(pf->tensors[1].layout.eq_shape({5, 6, 7, 8}));
-                ASSERT_NE(pf->tensors[0].raw_ptr, nullptr);
-                ASSERT_NE(pf->tensors[1].raw_ptr, nullptr);
+                ASSERT_NE(pf->tensors[0].raw_ptr(), nullptr);
+                ASSERT_NE(pf->tensors[1].raw_ptr(), nullptr);
                 pf->tensors[0].ptr<float>()[0] = 114.514f;
                 pf->tensors[1].ptr<float>()[0] = 1926.0817f;
             }));
@@ -178,7 +178,7 @@ VarSanityCheck::ChecksumResult VarSanityCheck::calc_checksum(VarNode* var) {
     auto span = dt.layout().span();
 
     megdnn::TensorND tensor;
-    tensor.raw_ptr = dt.raw_ptr() + span.low_byte;
+    tensor.reset_ptr(dt.raw_ptr() + span.low_byte);
     tensor.layout.init_contiguous_stride({span.dist_byte()});
     tensor.layout.dtype = dtype::Byte();