From: @limingqi107 Reviewed-by: @cristoval, @wilfchen Signed-off-by: @wilfchen (pull/14827/MERGE)
@@ -38,9 +38,22 @@
namespace mindspore {
namespace session {
using AnfVisitFuncion = std::function<Any(const AnfNodePtr &node, int index)>;
using KernelWithIndex = std::pair<AnfNodePtr, size_t>;
using DeviceAddress = device::DeviceAddress;
using DeviceAddressPtr = device::DeviceAddressPtr;
using KernelWithIndex = std::pair<AnfNodePtr, size_t>;
struct KernelWithIndexCmp {
  bool operator()(const KernelWithIndex &key1, const KernelWithIndex &key2) const {
    if (key1.first != key2.first) {
      return key1.first < key2.first;
    }
    if (key1.second != key2.second) {
      return key1.second < key2.second;
    }
    return false;
  }
};
class AnfRuntimeAlgorithm {
 public:
  static AnfNodePtr MakeMonadValueNode(const KernelGraphPtr &kg);
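Note: `KernelWithIndexCmp` defines the strict weak ordering that lets a `KernelWithIndex` (node pointer, output index) pair serve as an ordered map key; the `KernelMapTensor` alias added later in session_basic.h relies on it. Below is a minimal, self-contained sketch of the same idea, assuming simplified stand-ins (`Node`, `BaseRef`) instead of the real MindSpore types:

```cpp
#include <cstddef>
#include <map>
#include <memory>
#include <string>
#include <utility>

// Hypothetical stand-ins for AnfNodePtr and BaseRef, for illustration only.
struct Node { std::string name; };
using NodePtr = std::shared_ptr<Node>;
using BaseRef = std::string;

using KernelWithIndex = std::pair<NodePtr, size_t>;

// Same shape as the comparator in the diff: order by node pointer first, then by output index.
struct KernelWithIndexCmp {
  bool operator()(const KernelWithIndex &key1, const KernelWithIndex &key2) const {
    if (key1.first != key2.first) {
      return key1.first < key2.first;  // std::shared_ptr provides operator< on the stored pointer
    }
    if (key1.second != key2.second) {
      return key1.second < key2.second;
    }
    return false;
  }
};

// The comparator makes (node, output index) usable as an ordered map key,
// mirroring the KernelMapTensor alias introduced in session_basic.h.
using KernelMapTensor = std::map<KernelWithIndex, BaseRef, KernelWithIndexCmp>;

int main() {
  auto node = std::make_shared<Node>(Node{"matmul"});
  KernelMapTensor node_to_tensor;
  node_to_tensor[{node, 0}] = "tensor-for-output-0";
  node_to_tensor[{node, 1}] = "tensor-for-output-1";
  return node_to_tensor.size() == 2 ? 0 : 1;
}
```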
@@ -29,38 +29,6 @@ using mindspore::tensor::TensorPy;
namespace mindspore {
namespace session {
namespace {
void UpdateOutputTensors(const VectorRef *outputs,
                         const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) {
  MS_EXCEPTION_IF_NULL(outputs);
  for (auto &item : *outputs) {
    if (utils::isa<VectorRefPtr>(item)) {
      auto vector_ref = utils::cast<VectorRef>(item);
      UpdateOutputTensors(&vector_ref, tensor_to_node);
    } else if (utils::isa<tensor::TensorPtr>(item)) {
      auto tensor = utils::cast<tensor::TensorPtr>(item);
      MS_EXCEPTION_IF_NULL(tensor);
      auto iter = tensor_to_node.find(tensor);
      if (iter != tensor_to_node.end()) {
        auto &node = iter->second.first;
        auto &output_index = iter->second.second;
        auto address = AnfAlgo::GetMutableOutputAddr(node, output_index);
        tensor->set_device_address(address);
        if (AnfAlgo::IsDynamicShape(node)) {
          auto updated_shape = AnfAlgo::GetOutputInferShape(node, output_index);
          ShapeVector int_shape;
          std::transform(updated_shape.begin(), updated_shape.end(), std::back_inserter(int_shape), SizeToInt);
          tensor->set_shape(int_shape);
        }
      }
      if (tensor->NeedSyncDeviceToHostImmediately()) {
        tensor->data_sync(false);
        tensor->set_device_address(nullptr);
        tensor->set_sync_status(kNeedSyncHostToDevice);
      }
    }
  }
}
void SetOutputTensorsWaitStatus(const VectorRef *outputs) {
  MS_EXCEPTION_IF_NULL(outputs);
  for (auto &item : *outputs) {
@@ -163,7 +131,7 @@ void RunGraphTask::Run() {
  try {
    session_->LoadInputs(graph_id_, input_tensors_);
    session_->RunGraphImpl(graph_id_, input_tensors_, &outputs_);
    UpdateOutputTensors(&outputs_, tensor_to_node_);
    session_->UpdateOutputTensors(&outputs_, tensor_to_node_);
  } catch (const std::exception &e) {
    ExecutorManager::Instance().OnEvent(ExecutorEvent::kException);
    MsException::Instance().SetException();
@@ -66,6 +66,7 @@
#include "runtime/device/gpu/cuda_driver.h"
#include "runtime/device/gpu/distribution/collective_init.h"
#include "runtime/device/gpu/gpu_bucket.h"
#include "runtime/device/gpu/gpu_device_address.h"
#include "utils/ms_utils.h"
#include "utils/config_manager.h"
#include "utils/ms_context.h"
@@ -456,6 +457,48 @@ void GPUSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph)
  }
}
void GPUSession::UpdateOutputTensors(const VectorRef *outputs,
                                     const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) {
  MS_EXCEPTION_IF_NULL(outputs);
  for (const auto &item : *outputs) {
    if (utils::isa<VectorRefPtr>(item)) {
      const auto &vector_ref = utils::cast<VectorRef>(item);
      UpdateOutputTensors(&vector_ref, tensor_to_node);
    } else if (utils::isa<tensor::TensorPtr>(item)) {
      const auto &tensor = utils::cast<tensor::TensorPtr>(item);
      MS_EXCEPTION_IF_NULL(tensor);
      const auto &iter = tensor_to_node.find(tensor);
      if (iter != tensor_to_node.end()) {
        const auto &node = iter->second.first;
        const auto &output_index = iter->second.second;
        const auto &address = AnfAlgo::GetMutableOutputAddr(node, output_index);
        // The outputs may share the same tensor, so skip when the tensor has already been given a device address.
        if ((address == nullptr) || (address->GetPtr() == nullptr)) {
          return;
        }
        tensor->set_device_address(address);
        // Once the device address of a graph output has been handed to the tensor, the graph output must be given
        // a new device address, so that the memory now held by the tensor is not rewritten in the next step or loop.
        auto new_address = std::make_shared<device::gpu::GPUDeviceAddress>(nullptr, address->GetSize());
        AnfAlgo::SetOutputAddr(new_address, output_index, node.get());
        if (AnfAlgo::IsDynamicShape(node)) {
          const auto &updated_shape = AnfAlgo::GetOutputInferShape(node, output_index);
          ShapeVector int_shape;
          std::transform(updated_shape.begin(), updated_shape.end(), std::back_inserter(int_shape), SizeToInt);
          tensor->set_shape(int_shape);
        }
      }
      if (tensor->NeedSyncDeviceToHostImmediately()) {
        tensor->data_sync(false);
        tensor->set_device_address(nullptr);
        tensor->set_sync_status(kNeedSyncHostToDevice);
      }
    }
  }
}
void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime_instance);
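The GPU override differs from the base-class logic in one step: after the output tensor takes over the node's device memory, the node is given a fresh, empty `GPUDeviceAddress`, so the next launch allocates new memory instead of rewriting the buffer the tensor now owns. A rough, self-contained sketch of that hand-off pattern follows, using hypothetical `Buffer`, `NodeOutput`, and `Tensor` types rather than the MindSpore API:

```cpp
#include <cstddef>
#include <memory>
#include <vector>

// Hypothetical device buffer: stands in for device::DeviceAddress.
struct Buffer {
  std::vector<char> data;                  // pretend device memory
  size_t size() const { return data.size(); }
};
using BufferPtr = std::shared_ptr<Buffer>;

// Hypothetical graph-output slot and output tensor.
struct NodeOutput { BufferPtr address; };
struct Tensor { BufferPtr device_address; };

// Hand the node's buffer to the tensor, then give the node a fresh buffer of the same
// capacity so the next run writes into new memory, not into the tensor's data.
void HandOffOutputBuffer(NodeOutput *out, Tensor *tensor) {
  if (out->address == nullptr || out->address->data.empty()) {
    return;  // nothing to hand off yet
  }
  tensor->device_address = out->address;
  out->address = std::make_shared<Buffer>();
  out->address->data.resize(tensor->device_address->size());
}

int main() {
  NodeOutput out{std::make_shared<Buffer>()};
  out.address->data.assign(16, 1);         // pretend the kernel wrote 16 bytes
  Tensor result;
  HandOffOutputBuffer(&out, &result);
  // The tensor now owns the original buffer; the node holds a distinct, same-sized one.
  return (result.device_address != out.address) ? 0 : 1;
}
```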
@@ -20,6 +20,7 @@
#include <memory>
#include <algorithm>
#include <string>
#include <map>
#include "backend/session/session_basic.h"
#include "backend/session/kernel_graph.h"
#include "backend/session/session_factory.h"
@@ -53,6 +54,8 @@ class GPUSession : public SessionBasic {
  std::string GetCommWorldGroup() override { return kNcclWorldGroup; }
  void LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
                     const std::vector<tensor::TensorPtr> &inputs_const) const override;
  void UpdateOutputTensors(const VectorRef *outputs,
                           const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) override;
 private:
  void SelectKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const;
@@ -210,9 +210,11 @@ BaseRef CreateNodeOutputTensor(const session::KernelWithIndex &node_output_pair,
BaseRef CreateNodeOutputTensors(const AnfNodePtr &anf, const KernelGraphPtr &graph,
                                const std::vector<tensor::TensorPtr> &input_tensors,
                                std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) {
                                std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node,
                                KernelMapTensor *node_to_tensor) {
  MS_EXCEPTION_IF_NULL(anf);
  MS_EXCEPTION_IF_NULL(tensor_to_node);
  MS_EXCEPTION_IF_NULL(node_to_tensor);
  MS_LOG(INFO) << "Create tensor for output[" << anf->DebugString() << "]";
  auto item_with_index = AnfAlgo::VisitKernelWithReturnType(anf, 0);
  MS_EXCEPTION_IF_NULL(item_with_index.first);
@@ -223,7 +225,7 @@ BaseRef CreateNodeOutputTensors(const AnfNodePtr &anf, const KernelGraphPtr &gra
    MS_EXCEPTION_IF_NULL(cnode);
    VectorRef ret;
    for (size_t i = 1; i < cnode->inputs().size(); ++i) {
      auto out = CreateNodeOutputTensors(cnode->input(i), graph, input_tensors, tensor_to_node);
      auto out = CreateNodeOutputTensors(cnode->input(i), graph, input_tensors, tensor_to_node, node_to_tensor);
      ret.push_back(out);
    }
    return ret;
@@ -233,7 +235,16 @@ BaseRef CreateNodeOutputTensors(const AnfNodePtr &anf, const KernelGraphPtr &gra
  if (size == 0) {
    return VectorRef();
  }
  return CreateNodeOutputTensor(item_with_index, graph, input_tensors, tensor_to_node);
  // The graph outputs may refer to the same kernel node, in which case there is no need to create a new tensor.
  const auto &iter = node_to_tensor->find(item_with_index);
  if (iter != node_to_tensor->end()) {
    return iter->second;
  }
  const auto &tensor = CreateNodeOutputTensor(item_with_index, graph, input_tensors, tensor_to_node);
  (*node_to_tensor)[item_with_index] = tensor;
  return tensor;
}
ValueNodePtr CreateNewValueNode(const AnfNodePtr &anf, KernelGraph *graph) {
@@ -1503,11 +1514,12 @@ void SessionBasic::UpdateOutputs(const std::shared_ptr<KernelGraph> &kernel_grap
  MS_EXCEPTION_IF_NULL(kernel_graph);
  MS_EXCEPTION_IF_NULL(outputs);
  std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node;
  KernelMapTensor node_to_tensor;
  auto anf_outputs = kernel_graph->outputs();
  for (auto &item : anf_outputs) {
    MS_EXCEPTION_IF_NULL(item);
    MS_LOG(INFO) << "Update output[" << item->DebugString() << "]";
    outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, &tensor_to_node));
    outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, &tensor_to_node, &node_to_tensor));
  }
  auto ms_context = MsContext::GetInstance();
@@ -1572,10 +1584,44 @@ void SessionBasic::CreateOutputTensors(const GraphId &graph_id, const std::vecto
  MS_EXCEPTION_IF_NULL(outputs);
  MS_EXCEPTION_IF_NULL(tensor_to_node);
  auto anf_outputs = kernel_graph->outputs();
  KernelMapTensor node_to_tensor;
  for (auto &item : anf_outputs) {
    MS_EXCEPTION_IF_NULL(item);
    MS_LOG(INFO) << "Create node output[" << item->DebugString() << "]";
    outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, tensor_to_node));
    outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, tensor_to_node, &node_to_tensor));
  }
}
void SessionBasic::UpdateOutputTensors(const VectorRef *outputs,
                                       const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) {
  MS_EXCEPTION_IF_NULL(outputs);
  for (const auto &item : *outputs) {
    if (utils::isa<VectorRefPtr>(item)) {
      const auto &vector_ref = utils::cast<VectorRef>(item);
      UpdateOutputTensors(&vector_ref, tensor_to_node);
    } else if (utils::isa<tensor::TensorPtr>(item)) {
      const auto &tensor = utils::cast<tensor::TensorPtr>(item);
      MS_EXCEPTION_IF_NULL(tensor);
      const auto &iter = tensor_to_node.find(tensor);
      if (iter != tensor_to_node.end()) {
        const auto &node = iter->second.first;
        const auto &output_index = iter->second.second;
        const auto &address = AnfAlgo::GetMutableOutputAddr(node, output_index);
        tensor->set_device_address(address);
        if (AnfAlgo::IsDynamicShape(node)) {
          const auto &updated_shape = AnfAlgo::GetOutputInferShape(node, output_index);
          ShapeVector int_shape;
          std::transform(updated_shape.begin(), updated_shape.end(), std::back_inserter(int_shape), SizeToInt);
          tensor->set_shape(int_shape);
        }
      }
      if (tensor->NeedSyncDeviceToHostImmediately()) {
        tensor->data_sync(false);
        tensor->set_device_address(nullptr);
        tensor->set_sync_status(kNeedSyncHostToDevice);
      }
    }
  }
}
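The base-class `UpdateOutputTensors` recursively walks the possibly nested `VectorRef` of run-graph outputs and, for each leaf tensor that maps to a kernel output, re-binds the device address and refreshes the host shape for dynamic-shape nodes. A small self-contained sketch of that recursive traversal, with hypothetical `Output` and `Tensor` types standing in for the MindSpore containers:

```cpp
#include <functional>
#include <memory>
#include <variant>
#include <vector>

struct Tensor { int device_slot = -1; };     // hypothetical leaf output
using TensorPtr = std::shared_ptr<Tensor>;

// A nested output: either a leaf tensor or a list of further outputs (like VectorRef).
struct Output {
  std::variant<TensorPtr, std::vector<Output>> value;
};

// Recurse into nested lists and apply `update` to every leaf tensor,
// mirroring the traversal shape of SessionBasic::UpdateOutputTensors.
void ForEachLeafTensor(const Output &out, const std::function<void(const TensorPtr &)> &update) {
  if (std::holds_alternative<std::vector<Output>>(out.value)) {
    for (const auto &child : std::get<std::vector<Output>>(out.value)) {
      ForEachLeafTensor(child, update);
    }
  } else {
    update(std::get<TensorPtr>(out.value));
  }
}

int main() {
  Output nested{std::vector<Output>{
      Output{std::make_shared<Tensor>()},
      Output{std::vector<Output>{Output{std::make_shared<Tensor>()}}}}};
  int visited = 0;
  ForEachLeafTensor(nested, [&](const TensorPtr &t) {
    t->device_slot = visited++;  // stand-in for set_device_address / set_shape
  });
  return visited == 2 ? 0 : 1;
}
```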
@@ -1622,11 +1668,12 @@ void SessionBasic::GetModelOutputsInfo(uint32_t graph_id, std::vector<tensor::Te
  VectorRef vector_outputs;
  std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node;
  KernelMapTensor node_to_tensor;
  auto anf_outputs = kernel_graph->outputs();
  for (auto &item : anf_outputs) {
    MS_EXCEPTION_IF_NULL(item);
    MS_LOG(INFO) << "Create node output[" << item->DebugString() << "]";
    vector_outputs.emplace_back(CreateNodeOutputTensors(item, kernel_graph, inputs, &tensor_to_node));
    vector_outputs.emplace_back(CreateNodeOutputTensors(item, kernel_graph, inputs, &tensor_to_node, &node_to_tensor));
  }
  *outputs = TransformVectorRefToMultiTensor(vector_outputs);
  for (size_t i = 0; i < outputs->size(); i++) {
@@ -73,6 +73,7 @@ struct OutputTensorInfo {
};
using OpRunInfoPtr = std::shared_ptr<OpRunInfo>;
using KernelMapTensor = std::map<session::KernelWithIndex, BaseRef, session::KernelWithIndexCmp>;
class Executor;
class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
@@ -169,6 +170,8 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
  virtual void CreateOutputTensors(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &input_tensors,
                                   VectorRef *outputs,
                                   std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node);
  virtual void UpdateOutputTensors(const VectorRef *outputs,
                                   const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node);
  virtual void UnifyMindIR(const KernelGraphPtr &graph) {}
  virtual GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { return 0; }
  virtual GraphId CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) { return kInvalidGraphId; }