From: @limingqi107 Reviewed-by: @cristoval,@cristoval,@wilfchen Signed-off-by: @wilfchenpull/14827/MERGE
| @@ -38,9 +38,22 @@ | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace session { | namespace session { | ||||
| using AnfVisitFuncion = std::function<Any(const AnfNodePtr &node, int index)>; | using AnfVisitFuncion = std::function<Any(const AnfNodePtr &node, int index)>; | ||||
| using KernelWithIndex = std::pair<AnfNodePtr, size_t>; | |||||
| using DeviceAddress = device::DeviceAddress; | using DeviceAddress = device::DeviceAddress; | ||||
| using DeviceAddressPtr = device::DeviceAddressPtr; | using DeviceAddressPtr = device::DeviceAddressPtr; | ||||
| using KernelWithIndex = std::pair<AnfNodePtr, size_t>; | |||||
| struct KernelWithIndexCmp { | |||||
| bool operator()(const KernelWithIndex &key1, const KernelWithIndex &key2) const { | |||||
| if (key1.first != key2.first) { | |||||
| return key1.first < key2.first; | |||||
| } | |||||
| if (key1.second != key2.second) { | |||||
| return key1.second < key2.second; | |||||
| } | |||||
| return false; | |||||
| } | |||||
| }; | |||||
| class AnfRuntimeAlgorithm { | class AnfRuntimeAlgorithm { | ||||
| public: | public: | ||||
| static AnfNodePtr MakeMonadValueNode(const KernelGraphPtr &kg); | static AnfNodePtr MakeMonadValueNode(const KernelGraphPtr &kg); | ||||
| @@ -29,38 +29,6 @@ using mindspore::tensor::TensorPy; | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace session { | namespace session { | ||||
| namespace { | namespace { | ||||
| void UpdateOutputTensors(const VectorRef *outputs, | |||||
| const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) { | |||||
| MS_EXCEPTION_IF_NULL(outputs); | |||||
| for (auto &item : *outputs) { | |||||
| if (utils::isa<VectorRefPtr>(item)) { | |||||
| auto vector_ref = utils::cast<VectorRef>(item); | |||||
| UpdateOutputTensors(&vector_ref, tensor_to_node); | |||||
| } else if (utils::isa<tensor::TensorPtr>(item)) { | |||||
| auto tensor = utils::cast<tensor::TensorPtr>(item); | |||||
| MS_EXCEPTION_IF_NULL(tensor); | |||||
| auto iter = tensor_to_node.find(tensor); | |||||
| if (iter != tensor_to_node.end()) { | |||||
| auto &node = iter->second.first; | |||||
| auto &output_index = iter->second.second; | |||||
| auto address = AnfAlgo::GetMutableOutputAddr(node, output_index); | |||||
| tensor->set_device_address(address); | |||||
| if (AnfAlgo::IsDynamicShape(node)) { | |||||
| auto updated_shape = AnfAlgo::GetOutputInferShape(node, output_index); | |||||
| ShapeVector int_shape; | |||||
| std::transform(updated_shape.begin(), updated_shape.end(), std::back_inserter(int_shape), SizeToInt); | |||||
| tensor->set_shape(int_shape); | |||||
| } | |||||
| } | |||||
| if (tensor->NeedSyncDeviceToHostImmediately()) { | |||||
| tensor->data_sync(false); | |||||
| tensor->set_device_address(nullptr); | |||||
| tensor->set_sync_status(kNeedSyncHostToDevice); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| void SetOutputTensorsWaitStatus(const VectorRef *outputs) { | void SetOutputTensorsWaitStatus(const VectorRef *outputs) { | ||||
| MS_EXCEPTION_IF_NULL(outputs); | MS_EXCEPTION_IF_NULL(outputs); | ||||
| for (auto &item : *outputs) { | for (auto &item : *outputs) { | ||||
| @@ -163,7 +131,7 @@ void RunGraphTask::Run() { | |||||
| try { | try { | ||||
| session_->LoadInputs(graph_id_, input_tensors_); | session_->LoadInputs(graph_id_, input_tensors_); | ||||
| session_->RunGraphImpl(graph_id_, input_tensors_, &outputs_); | session_->RunGraphImpl(graph_id_, input_tensors_, &outputs_); | ||||
| UpdateOutputTensors(&outputs_, tensor_to_node_); | |||||
| session_->UpdateOutputTensors(&outputs_, tensor_to_node_); | |||||
| } catch (const std::exception &e) { | } catch (const std::exception &e) { | ||||
| ExecutorManager::Instance().OnEvent(ExecutorEvent::kException); | ExecutorManager::Instance().OnEvent(ExecutorEvent::kException); | ||||
| MsException::Instance().SetException(); | MsException::Instance().SetException(); | ||||
| @@ -66,6 +66,7 @@ | |||||
| #include "runtime/device/gpu/cuda_driver.h" | #include "runtime/device/gpu/cuda_driver.h" | ||||
| #include "runtime/device/gpu/distribution/collective_init.h" | #include "runtime/device/gpu/distribution/collective_init.h" | ||||
| #include "runtime/device/gpu/gpu_bucket.h" | #include "runtime/device/gpu/gpu_bucket.h" | ||||
| #include "runtime/device/gpu/gpu_device_address.h" | |||||
| #include "utils/ms_utils.h" | #include "utils/ms_utils.h" | ||||
| #include "utils/config_manager.h" | #include "utils/config_manager.h" | ||||
| #include "utils/ms_context.h" | #include "utils/ms_context.h" | ||||
| @@ -456,6 +457,48 @@ void GPUSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) | |||||
| } | } | ||||
| } | } | ||||
| void GPUSession::UpdateOutputTensors(const VectorRef *outputs, | |||||
| const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) { | |||||
| MS_EXCEPTION_IF_NULL(outputs); | |||||
| for (const auto &item : *outputs) { | |||||
| if (utils::isa<VectorRefPtr>(item)) { | |||||
| const auto &vector_ref = utils::cast<VectorRef>(item); | |||||
| UpdateOutputTensors(&vector_ref, tensor_to_node); | |||||
| } else if (utils::isa<tensor::TensorPtr>(item)) { | |||||
| const auto &tensor = utils::cast<tensor::TensorPtr>(item); | |||||
| MS_EXCEPTION_IF_NULL(tensor); | |||||
| const auto &iter = tensor_to_node.find(tensor); | |||||
| if (iter != tensor_to_node.end()) { | |||||
| const auto &node = iter->second.first; | |||||
| const auto &output_index = iter->second.second; | |||||
| const auto &address = AnfAlgo::GetMutableOutputAddr(node, output_index); | |||||
| // The outputs may have the same tensor, so need skip when the tensor has been set to device address. | |||||
| if ((address == nullptr) || (address->GetPtr() == nullptr)) { | |||||
| return; | |||||
| } | |||||
| tensor->set_device_address(address); | |||||
| // When the device address of graph output is set in tensor, the graph output need be set new device address, | |||||
| // to avoid that the device address context of tensor be rewritten in the next step or next loop. | |||||
| auto new_address = std::make_shared<device::gpu::GPUDeviceAddress>(nullptr, address->GetSize()); | |||||
| AnfAlgo::SetOutputAddr(new_address, output_index, node.get()); | |||||
| if (AnfAlgo::IsDynamicShape(node)) { | |||||
| const auto &updated_shape = AnfAlgo::GetOutputInferShape(node, output_index); | |||||
| ShapeVector int_shape; | |||||
| std::transform(updated_shape.begin(), updated_shape.end(), std::back_inserter(int_shape), SizeToInt); | |||||
| tensor->set_shape(int_shape); | |||||
| } | |||||
| } | |||||
| if (tensor->NeedSyncDeviceToHostImmediately()) { | |||||
| tensor->data_sync(false); | |||||
| tensor->set_device_address(nullptr); | |||||
| tensor->set_sync_status(kNeedSyncHostToDevice); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const { | void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const { | ||||
| auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); | auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); | ||||
| MS_EXCEPTION_IF_NULL(runtime_instance); | MS_EXCEPTION_IF_NULL(runtime_instance); | ||||
| @@ -20,6 +20,7 @@ | |||||
| #include <memory> | #include <memory> | ||||
| #include <algorithm> | #include <algorithm> | ||||
| #include <string> | #include <string> | ||||
| #include <map> | |||||
| #include "backend/session/session_basic.h" | #include "backend/session/session_basic.h" | ||||
| #include "backend/session/kernel_graph.h" | #include "backend/session/kernel_graph.h" | ||||
| #include "backend/session/session_factory.h" | #include "backend/session/session_factory.h" | ||||
| @@ -53,6 +54,8 @@ class GPUSession : public SessionBasic { | |||||
| std::string GetCommWorldGroup() override { return kNcclWorldGroup; } | std::string GetCommWorldGroup() override { return kNcclWorldGroup; } | ||||
| void LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph, | void LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph, | ||||
| const std::vector<tensor::TensorPtr> &inputs_const) const override; | const std::vector<tensor::TensorPtr> &inputs_const) const override; | ||||
| void UpdateOutputTensors(const VectorRef *outputs, | |||||
| const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) override; | |||||
| private: | private: | ||||
| void SelectKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const; | void SelectKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const; | ||||
| @@ -210,9 +210,11 @@ BaseRef CreateNodeOutputTensor(const session::KernelWithIndex &node_output_pair, | |||||
| BaseRef CreateNodeOutputTensors(const AnfNodePtr &anf, const KernelGraphPtr &graph, | BaseRef CreateNodeOutputTensors(const AnfNodePtr &anf, const KernelGraphPtr &graph, | ||||
| const std::vector<tensor::TensorPtr> &input_tensors, | const std::vector<tensor::TensorPtr> &input_tensors, | ||||
| std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) { | |||||
| std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node, | |||||
| KernelMapTensor *node_to_tensor) { | |||||
| MS_EXCEPTION_IF_NULL(anf); | MS_EXCEPTION_IF_NULL(anf); | ||||
| MS_EXCEPTION_IF_NULL(tensor_to_node); | MS_EXCEPTION_IF_NULL(tensor_to_node); | ||||
| MS_EXCEPTION_IF_NULL(node_to_tensor); | |||||
| MS_LOG(INFO) << "Create tensor for output[" << anf->DebugString() << "]"; | MS_LOG(INFO) << "Create tensor for output[" << anf->DebugString() << "]"; | ||||
| auto item_with_index = AnfAlgo::VisitKernelWithReturnType(anf, 0); | auto item_with_index = AnfAlgo::VisitKernelWithReturnType(anf, 0); | ||||
| MS_EXCEPTION_IF_NULL(item_with_index.first); | MS_EXCEPTION_IF_NULL(item_with_index.first); | ||||
| @@ -223,7 +225,7 @@ BaseRef CreateNodeOutputTensors(const AnfNodePtr &anf, const KernelGraphPtr &gra | |||||
| MS_EXCEPTION_IF_NULL(cnode); | MS_EXCEPTION_IF_NULL(cnode); | ||||
| VectorRef ret; | VectorRef ret; | ||||
| for (size_t i = 1; i < cnode->inputs().size(); ++i) { | for (size_t i = 1; i < cnode->inputs().size(); ++i) { | ||||
| auto out = CreateNodeOutputTensors(cnode->input(i), graph, input_tensors, tensor_to_node); | |||||
| auto out = CreateNodeOutputTensors(cnode->input(i), graph, input_tensors, tensor_to_node, node_to_tensor); | |||||
| ret.push_back(out); | ret.push_back(out); | ||||
| } | } | ||||
| return ret; | return ret; | ||||
| @@ -233,7 +235,16 @@ BaseRef CreateNodeOutputTensors(const AnfNodePtr &anf, const KernelGraphPtr &gra | |||||
| if (size == 0) { | if (size == 0) { | ||||
| return VectorRef(); | return VectorRef(); | ||||
| } | } | ||||
| return CreateNodeOutputTensor(item_with_index, graph, input_tensors, tensor_to_node); | |||||
| // The outputs of graph may have the same kernel node, no need to create new tensor. | |||||
| const auto &iter = node_to_tensor->find(item_with_index); | |||||
| if (iter != node_to_tensor->end()) { | |||||
| return iter->second; | |||||
| } | |||||
| const auto &tensor = CreateNodeOutputTensor(item_with_index, graph, input_tensors, tensor_to_node); | |||||
| (*node_to_tensor)[item_with_index] = tensor; | |||||
| return tensor; | |||||
| } | } | ||||
| ValueNodePtr CreateNewValueNode(const AnfNodePtr &anf, KernelGraph *graph) { | ValueNodePtr CreateNewValueNode(const AnfNodePtr &anf, KernelGraph *graph) { | ||||
| @@ -1503,11 +1514,12 @@ void SessionBasic::UpdateOutputs(const std::shared_ptr<KernelGraph> &kernel_grap | |||||
| MS_EXCEPTION_IF_NULL(kernel_graph); | MS_EXCEPTION_IF_NULL(kernel_graph); | ||||
| MS_EXCEPTION_IF_NULL(outputs); | MS_EXCEPTION_IF_NULL(outputs); | ||||
| std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node; | std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node; | ||||
| KernelMapTensor node_to_tensor; | |||||
| auto anf_outputs = kernel_graph->outputs(); | auto anf_outputs = kernel_graph->outputs(); | ||||
| for (auto &item : anf_outputs) { | for (auto &item : anf_outputs) { | ||||
| MS_EXCEPTION_IF_NULL(item); | MS_EXCEPTION_IF_NULL(item); | ||||
| MS_LOG(INFO) << "Update output[" << item->DebugString() << "]"; | MS_LOG(INFO) << "Update output[" << item->DebugString() << "]"; | ||||
| outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, &tensor_to_node)); | |||||
| outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, &tensor_to_node, &node_to_tensor)); | |||||
| } | } | ||||
| auto ms_context = MsContext::GetInstance(); | auto ms_context = MsContext::GetInstance(); | ||||
| @@ -1572,10 +1584,44 @@ void SessionBasic::CreateOutputTensors(const GraphId &graph_id, const std::vecto | |||||
| MS_EXCEPTION_IF_NULL(outputs); | MS_EXCEPTION_IF_NULL(outputs); | ||||
| MS_EXCEPTION_IF_NULL(tensor_to_node); | MS_EXCEPTION_IF_NULL(tensor_to_node); | ||||
| auto anf_outputs = kernel_graph->outputs(); | auto anf_outputs = kernel_graph->outputs(); | ||||
| KernelMapTensor node_to_tensor; | |||||
| for (auto &item : anf_outputs) { | for (auto &item : anf_outputs) { | ||||
| MS_EXCEPTION_IF_NULL(item); | MS_EXCEPTION_IF_NULL(item); | ||||
| MS_LOG(INFO) << "Create node output[" << item->DebugString() << "]"; | MS_LOG(INFO) << "Create node output[" << item->DebugString() << "]"; | ||||
| outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, tensor_to_node)); | |||||
| outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, tensor_to_node, &node_to_tensor)); | |||||
| } | |||||
| } | |||||
| void SessionBasic::UpdateOutputTensors(const VectorRef *outputs, | |||||
| const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) { | |||||
| MS_EXCEPTION_IF_NULL(outputs); | |||||
| for (const auto &item : *outputs) { | |||||
| if (utils::isa<VectorRefPtr>(item)) { | |||||
| const auto &vector_ref = utils::cast<VectorRef>(item); | |||||
| UpdateOutputTensors(&vector_ref, tensor_to_node); | |||||
| } else if (utils::isa<tensor::TensorPtr>(item)) { | |||||
| const auto &tensor = utils::cast<tensor::TensorPtr>(item); | |||||
| MS_EXCEPTION_IF_NULL(tensor); | |||||
| const auto &iter = tensor_to_node.find(tensor); | |||||
| if (iter != tensor_to_node.end()) { | |||||
| const auto &node = iter->second.first; | |||||
| const auto &output_index = iter->second.second; | |||||
| const auto &address = AnfAlgo::GetMutableOutputAddr(node, output_index); | |||||
| tensor->set_device_address(address); | |||||
| if (AnfAlgo::IsDynamicShape(node)) { | |||||
| const auto &updated_shape = AnfAlgo::GetOutputInferShape(node, output_index); | |||||
| ShapeVector int_shape; | |||||
| std::transform(updated_shape.begin(), updated_shape.end(), std::back_inserter(int_shape), SizeToInt); | |||||
| tensor->set_shape(int_shape); | |||||
| } | |||||
| } | |||||
| if (tensor->NeedSyncDeviceToHostImmediately()) { | |||||
| tensor->data_sync(false); | |||||
| tensor->set_device_address(nullptr); | |||||
| tensor->set_sync_status(kNeedSyncHostToDevice); | |||||
| } | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| @@ -1622,11 +1668,12 @@ void SessionBasic::GetModelOutputsInfo(uint32_t graph_id, std::vector<tensor::Te | |||||
| VectorRef vector_outputs; | VectorRef vector_outputs; | ||||
| std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node; | std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node; | ||||
| KernelMapTensor node_to_tensor; | |||||
| auto anf_outputs = kernel_graph->outputs(); | auto anf_outputs = kernel_graph->outputs(); | ||||
| for (auto &item : anf_outputs) { | for (auto &item : anf_outputs) { | ||||
| MS_EXCEPTION_IF_NULL(item); | MS_EXCEPTION_IF_NULL(item); | ||||
| MS_LOG(INFO) << "Create node output[" << item->DebugString() << "]"; | MS_LOG(INFO) << "Create node output[" << item->DebugString() << "]"; | ||||
| vector_outputs.emplace_back(CreateNodeOutputTensors(item, kernel_graph, inputs, &tensor_to_node)); | |||||
| vector_outputs.emplace_back(CreateNodeOutputTensors(item, kernel_graph, inputs, &tensor_to_node, &node_to_tensor)); | |||||
| } | } | ||||
| *outputs = TransformVectorRefToMultiTensor(vector_outputs); | *outputs = TransformVectorRefToMultiTensor(vector_outputs); | ||||
| for (size_t i = 0; i < outputs->size(); i++) { | for (size_t i = 0; i < outputs->size(); i++) { | ||||
| @@ -73,6 +73,7 @@ struct OutputTensorInfo { | |||||
| }; | }; | ||||
| using OpRunInfoPtr = std::shared_ptr<OpRunInfo>; | using OpRunInfoPtr = std::shared_ptr<OpRunInfo>; | ||||
| using KernelMapTensor = std::map<session::KernelWithIndex, BaseRef, session::KernelWithIndexCmp>; | |||||
| class Executor; | class Executor; | ||||
| class SessionBasic : public std::enable_shared_from_this<SessionBasic> { | class SessionBasic : public std::enable_shared_from_this<SessionBasic> { | ||||
| @@ -169,6 +170,8 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> { | |||||
| virtual void CreateOutputTensors(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &input_tensors, | virtual void CreateOutputTensors(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &input_tensors, | ||||
| VectorRef *outputs, | VectorRef *outputs, | ||||
| std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node); | std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node); | ||||
| virtual void UpdateOutputTensors(const VectorRef *outputs, | |||||
| const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node); | |||||
| virtual void UnifyMindIR(const KernelGraphPtr &graph) {} | virtual void UnifyMindIR(const KernelGraphPtr &graph) {} | ||||
| virtual GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { return 0; } | virtual GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { return 0; } | ||||
| virtual GraphId CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) { return kInvalidGraphId; } | virtual GraphId CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) { return kInvalidGraphId; } | ||||