From 22972a89a73cc333c42d8f25beca5c031f6ea5d6 Mon Sep 17 00:00:00 2001 From: limingqi107 Date: Fri, 9 Apr 2021 09:38:04 +0800 Subject: [PATCH] support the output address of graph reapply --- .../backend/session/anf_runtime_algorithm.h | 15 ++++- mindspore/ccsrc/backend/session/executor.cc | 34 +---------- .../ccsrc/backend/session/gpu_session.cc | 43 ++++++++++++++ mindspore/ccsrc/backend/session/gpu_session.h | 3 + .../ccsrc/backend/session/session_basic.cc | 59 +++++++++++++++++-- .../ccsrc/backend/session/session_basic.h | 3 + 6 files changed, 117 insertions(+), 40 deletions(-) diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h index 9d3fcf8f70..57eb49e8b4 100644 --- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h +++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h @@ -38,9 +38,22 @@ namespace mindspore { namespace session { using AnfVisitFuncion = std::function; -using KernelWithIndex = std::pair; using DeviceAddress = device::DeviceAddress; using DeviceAddressPtr = device::DeviceAddressPtr; + +using KernelWithIndex = std::pair; +struct KernelWithIndexCmp { + bool operator()(const KernelWithIndex &key1, const KernelWithIndex &key2) const { + if (key1.first != key2.first) { + return key1.first < key2.first; + } + if (key1.second != key2.second) { + return key1.second < key2.second; + } + return false; + } +}; + class AnfRuntimeAlgorithm { public: static AnfNodePtr MakeMonadValueNode(const KernelGraphPtr &kg); diff --git a/mindspore/ccsrc/backend/session/executor.cc b/mindspore/ccsrc/backend/session/executor.cc index 1c915ebd84..cb7cf3777e 100644 --- a/mindspore/ccsrc/backend/session/executor.cc +++ b/mindspore/ccsrc/backend/session/executor.cc @@ -29,38 +29,6 @@ using mindspore::tensor::TensorPy; namespace mindspore { namespace session { namespace { -void UpdateOutputTensors(const VectorRef *outputs, - const std::map &tensor_to_node) { - MS_EXCEPTION_IF_NULL(outputs); - for (auto &item : *outputs) { - if (utils::isa(item)) { - auto vector_ref = utils::cast(item); - UpdateOutputTensors(&vector_ref, tensor_to_node); - } else if (utils::isa(item)) { - auto tensor = utils::cast(item); - MS_EXCEPTION_IF_NULL(tensor); - auto iter = tensor_to_node.find(tensor); - if (iter != tensor_to_node.end()) { - auto &node = iter->second.first; - auto &output_index = iter->second.second; - auto address = AnfAlgo::GetMutableOutputAddr(node, output_index); - tensor->set_device_address(address); - if (AnfAlgo::IsDynamicShape(node)) { - auto updated_shape = AnfAlgo::GetOutputInferShape(node, output_index); - ShapeVector int_shape; - std::transform(updated_shape.begin(), updated_shape.end(), std::back_inserter(int_shape), SizeToInt); - tensor->set_shape(int_shape); - } - } - if (tensor->NeedSyncDeviceToHostImmediately()) { - tensor->data_sync(false); - tensor->set_device_address(nullptr); - tensor->set_sync_status(kNeedSyncHostToDevice); - } - } - } -} - void SetOutputTensorsWaitStatus(const VectorRef *outputs) { MS_EXCEPTION_IF_NULL(outputs); for (auto &item : *outputs) { @@ -163,7 +131,7 @@ void RunGraphTask::Run() { try { session_->LoadInputs(graph_id_, input_tensors_); session_->RunGraphImpl(graph_id_, input_tensors_, &outputs_); - UpdateOutputTensors(&outputs_, tensor_to_node_); + session_->UpdateOutputTensors(&outputs_, tensor_to_node_); } catch (const std::exception &e) { ExecutorManager::Instance().OnEvent(ExecutorEvent::kException); MsException::Instance().SetException(); diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index 5a9242e02a..7229fa8a3b 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -66,6 +66,7 @@ #include "runtime/device/gpu/cuda_driver.h" #include "runtime/device/gpu/distribution/collective_init.h" #include "runtime/device/gpu/gpu_bucket.h" +#include "runtime/device/gpu/gpu_device_address.h" #include "utils/ms_utils.h" #include "utils/config_manager.h" #include "utils/ms_context.h" @@ -456,6 +457,48 @@ void GPUSession::ExecuteGraph(const std::shared_ptr &kernel_graph) } } +void GPUSession::UpdateOutputTensors(const VectorRef *outputs, + const std::map &tensor_to_node) { + MS_EXCEPTION_IF_NULL(outputs); + for (const auto &item : *outputs) { + if (utils::isa(item)) { + const auto &vector_ref = utils::cast(item); + UpdateOutputTensors(&vector_ref, tensor_to_node); + } else if (utils::isa(item)) { + const auto &tensor = utils::cast(item); + MS_EXCEPTION_IF_NULL(tensor); + const auto &iter = tensor_to_node.find(tensor); + if (iter != tensor_to_node.end()) { + const auto &node = iter->second.first; + const auto &output_index = iter->second.second; + const auto &address = AnfAlgo::GetMutableOutputAddr(node, output_index); + // The outputs may have the same tensor, so need skip when the tensor has been set to device address. + if ((address == nullptr) || (address->GetPtr() == nullptr)) { + return; + } + tensor->set_device_address(address); + + // When the device address of graph output is set in tensor, the graph output need be set new device address, + // to avoid that the device address context of tensor be rewritten in the next step or next loop. + auto new_address = std::make_shared(nullptr, address->GetSize()); + AnfAlgo::SetOutputAddr(new_address, output_index, node.get()); + + if (AnfAlgo::IsDynamicShape(node)) { + const auto &updated_shape = AnfAlgo::GetOutputInferShape(node, output_index); + ShapeVector int_shape; + std::transform(updated_shape.begin(), updated_shape.end(), std::back_inserter(int_shape), SizeToInt); + tensor->set_shape(int_shape); + } + } + if (tensor->NeedSyncDeviceToHostImmediately()) { + tensor->data_sync(false); + tensor->set_device_address(nullptr); + tensor->set_sync_status(kNeedSyncHostToDevice); + } + } + } +} + void GPUSession::Execute(const std::shared_ptr &kernel_graph) const { auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); diff --git a/mindspore/ccsrc/backend/session/gpu_session.h b/mindspore/ccsrc/backend/session/gpu_session.h index 3cd54ec6e4..c9089c5113 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.h +++ b/mindspore/ccsrc/backend/session/gpu_session.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "backend/session/session_basic.h" #include "backend/session/kernel_graph.h" #include "backend/session/session_factory.h" @@ -53,6 +54,8 @@ class GPUSession : public SessionBasic { std::string GetCommWorldGroup() override { return kNcclWorldGroup; } void LoadInputData(const std::shared_ptr &kernel_graph, const std::vector &inputs_const) const override; + void UpdateOutputTensors(const VectorRef *outputs, + const std::map &tensor_to_node) override; private: void SelectKernel(const std::shared_ptr &kernel_graph) const; diff --git a/mindspore/ccsrc/backend/session/session_basic.cc b/mindspore/ccsrc/backend/session/session_basic.cc index a7e8dda65a..14b32f409a 100644 --- a/mindspore/ccsrc/backend/session/session_basic.cc +++ b/mindspore/ccsrc/backend/session/session_basic.cc @@ -210,9 +210,11 @@ BaseRef CreateNodeOutputTensor(const session::KernelWithIndex &node_output_pair, BaseRef CreateNodeOutputTensors(const AnfNodePtr &anf, const KernelGraphPtr &graph, const std::vector &input_tensors, - std::map *tensor_to_node) { + std::map *tensor_to_node, + KernelMapTensor *node_to_tensor) { MS_EXCEPTION_IF_NULL(anf); MS_EXCEPTION_IF_NULL(tensor_to_node); + MS_EXCEPTION_IF_NULL(node_to_tensor); MS_LOG(INFO) << "Create tensor for output[" << anf->DebugString() << "]"; auto item_with_index = AnfAlgo::VisitKernelWithReturnType(anf, 0); MS_EXCEPTION_IF_NULL(item_with_index.first); @@ -223,7 +225,7 @@ BaseRef CreateNodeOutputTensors(const AnfNodePtr &anf, const KernelGraphPtr &gra MS_EXCEPTION_IF_NULL(cnode); VectorRef ret; for (size_t i = 1; i < cnode->inputs().size(); ++i) { - auto out = CreateNodeOutputTensors(cnode->input(i), graph, input_tensors, tensor_to_node); + auto out = CreateNodeOutputTensors(cnode->input(i), graph, input_tensors, tensor_to_node, node_to_tensor); ret.push_back(out); } return ret; @@ -233,7 +235,16 @@ BaseRef CreateNodeOutputTensors(const AnfNodePtr &anf, const KernelGraphPtr &gra if (size == 0) { return VectorRef(); } - return CreateNodeOutputTensor(item_with_index, graph, input_tensors, tensor_to_node); + + // The outputs of graph may have the same kernel node, no need to create new tensor. + const auto &iter = node_to_tensor->find(item_with_index); + if (iter != node_to_tensor->end()) { + return iter->second; + } + + const auto &tensor = CreateNodeOutputTensor(item_with_index, graph, input_tensors, tensor_to_node); + (*node_to_tensor)[item_with_index] = tensor; + return tensor; } ValueNodePtr CreateNewValueNode(const AnfNodePtr &anf, KernelGraph *graph) { @@ -1503,11 +1514,12 @@ void SessionBasic::UpdateOutputs(const std::shared_ptr &kernel_grap MS_EXCEPTION_IF_NULL(kernel_graph); MS_EXCEPTION_IF_NULL(outputs); std::map tensor_to_node; + KernelMapTensor node_to_tensor; auto anf_outputs = kernel_graph->outputs(); for (auto &item : anf_outputs) { MS_EXCEPTION_IF_NULL(item); MS_LOG(INFO) << "Update output[" << item->DebugString() << "]"; - outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, &tensor_to_node)); + outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, &tensor_to_node, &node_to_tensor)); } auto ms_context = MsContext::GetInstance(); @@ -1572,10 +1584,44 @@ void SessionBasic::CreateOutputTensors(const GraphId &graph_id, const std::vecto MS_EXCEPTION_IF_NULL(outputs); MS_EXCEPTION_IF_NULL(tensor_to_node); auto anf_outputs = kernel_graph->outputs(); + KernelMapTensor node_to_tensor; for (auto &item : anf_outputs) { MS_EXCEPTION_IF_NULL(item); MS_LOG(INFO) << "Create node output[" << item->DebugString() << "]"; - outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, tensor_to_node)); + outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, tensor_to_node, &node_to_tensor)); + } +} + +void SessionBasic::UpdateOutputTensors(const VectorRef *outputs, + const std::map &tensor_to_node) { + MS_EXCEPTION_IF_NULL(outputs); + for (const auto &item : *outputs) { + if (utils::isa(item)) { + const auto &vector_ref = utils::cast(item); + UpdateOutputTensors(&vector_ref, tensor_to_node); + } else if (utils::isa(item)) { + const auto &tensor = utils::cast(item); + MS_EXCEPTION_IF_NULL(tensor); + const auto &iter = tensor_to_node.find(tensor); + if (iter != tensor_to_node.end()) { + const auto &node = iter->second.first; + const auto &output_index = iter->second.second; + const auto &address = AnfAlgo::GetMutableOutputAddr(node, output_index); + tensor->set_device_address(address); + + if (AnfAlgo::IsDynamicShape(node)) { + const auto &updated_shape = AnfAlgo::GetOutputInferShape(node, output_index); + ShapeVector int_shape; + std::transform(updated_shape.begin(), updated_shape.end(), std::back_inserter(int_shape), SizeToInt); + tensor->set_shape(int_shape); + } + } + if (tensor->NeedSyncDeviceToHostImmediately()) { + tensor->data_sync(false); + tensor->set_device_address(nullptr); + tensor->set_sync_status(kNeedSyncHostToDevice); + } + } } } @@ -1622,11 +1668,12 @@ void SessionBasic::GetModelOutputsInfo(uint32_t graph_id, std::vector tensor_to_node; + KernelMapTensor node_to_tensor; auto anf_outputs = kernel_graph->outputs(); for (auto &item : anf_outputs) { MS_EXCEPTION_IF_NULL(item); MS_LOG(INFO) << "Create node output[" << item->DebugString() << "]"; - vector_outputs.emplace_back(CreateNodeOutputTensors(item, kernel_graph, inputs, &tensor_to_node)); + vector_outputs.emplace_back(CreateNodeOutputTensors(item, kernel_graph, inputs, &tensor_to_node, &node_to_tensor)); } *outputs = TransformVectorRefToMultiTensor(vector_outputs); for (size_t i = 0; i < outputs->size(); i++) { diff --git a/mindspore/ccsrc/backend/session/session_basic.h b/mindspore/ccsrc/backend/session/session_basic.h index a44fa56c2b..429048391c 100644 --- a/mindspore/ccsrc/backend/session/session_basic.h +++ b/mindspore/ccsrc/backend/session/session_basic.h @@ -73,6 +73,7 @@ struct OutputTensorInfo { }; using OpRunInfoPtr = std::shared_ptr; +using KernelMapTensor = std::map; class Executor; class SessionBasic : public std::enable_shared_from_this { @@ -169,6 +170,8 @@ class SessionBasic : public std::enable_shared_from_this { virtual void CreateOutputTensors(const GraphId &graph_id, const std::vector &input_tensors, VectorRef *outputs, std::map *tensor_to_node); + virtual void UpdateOutputTensors(const VectorRef *outputs, + const std::map &tensor_to_node); virtual void UnifyMindIR(const KernelGraphPtr &graph) {} virtual GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { return 0; } virtual GraphId CompileGraphImpl(NotNull func_graph) { return kInvalidGraphId; }