From 22972a89a73cc333c42d8f25beca5c031f6ea5d6 Mon Sep 17 00:00:00 2001
From: limingqi107 <limingqi@huawei.com>
Date: Fri, 9 Apr 2021 09:38:04 +0800
Subject: [PATCH] support the output address of graph reapply

---
 .../backend/session/anf_runtime_algorithm.h   | 15 ++++-
 mindspore/ccsrc/backend/session/executor.cc   | 34 +----------
 .../ccsrc/backend/session/gpu_session.cc      | 43 ++++++++++++++
 mindspore/ccsrc/backend/session/gpu_session.h |  3 +
 .../ccsrc/backend/session/session_basic.cc    | 59 +++++++++++++++++--
 .../ccsrc/backend/session/session_basic.h     |  3 +
 6 files changed, 117 insertions(+), 40 deletions(-)
diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
index 9d3fcf8f70..57eb49e8b4 100644
--- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
+++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
@@ -38,9 +38,22 @@
 namespace mindspore {
 namespace session {
 using AnfVisitFuncion = std::function<Any(const AnfNodePtr &node, int index)>;
-using KernelWithIndex = std::pair<AnfNodePtr, size_t>;
 using DeviceAddress = device::DeviceAddress;
 using DeviceAddressPtr = device::DeviceAddressPtr;
+
+using KernelWithIndex = std::pair<AnfNodePtr, size_t>;
+struct KernelWithIndexCmp {
+  bool operator()(const KernelWithIndex &key1, const KernelWithIndex &key2) const {
+    if (key1.first != key2.first) {
+      return key1.first < key2.first;
+    }
+    if (key1.second != key2.second) {
+      return key1.second < key2.second;
+    }
+    return false;
+  }
+};
+
 class AnfRuntimeAlgorithm {
  public:
   static AnfNodePtr MakeMonadValueNode(const KernelGraphPtr &kg);
diff --git a/mindspore/ccsrc/backend/session/executor.cc b/mindspore/ccsrc/backend/session/executor.cc
index 1c915ebd84..cb7cf3777e 100644
--- a/mindspore/ccsrc/backend/session/executor.cc
+++ b/mindspore/ccsrc/backend/session/executor.cc
@@ -29,38 +29,6 @@ using mindspore::tensor::TensorPy;
 namespace mindspore {
 namespace session {
 namespace {
-void UpdateOutputTensors(const VectorRef *outputs,
-                         const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) {
-  MS_EXCEPTION_IF_NULL(outputs);
-  for (auto &item : *outputs) {
-    if (utils::isa<VectorRefPtr>(item)) {
-      auto vector_ref = utils::cast<VectorRef>(item);
-      UpdateOutputTensors(&vector_ref, tensor_to_node);
-    } else if (utils::isa<tensor::TensorPtr>(item)) {
-      auto tensor = utils::cast<tensor::TensorPtr>(item);
-      MS_EXCEPTION_IF_NULL(tensor);
-      auto iter = tensor_to_node.find(tensor);
-      if (iter != tensor_to_node.end()) {
-        auto &node = iter->second.first;
-        auto &output_index = iter->second.second;
-        auto address = AnfAlgo::GetMutableOutputAddr(node, output_index);
-        tensor->set_device_address(address);
-        if (AnfAlgo::IsDynamicShape(node)) {
-          auto updated_shape = AnfAlgo::GetOutputInferShape(node, output_index);
-          ShapeVector int_shape;
-          std::transform(updated_shape.begin(), updated_shape.end(), std::back_inserter(int_shape), SizeToInt);
-          tensor->set_shape(int_shape);
-        }
-      }
-      if (tensor->NeedSyncDeviceToHostImmediately()) {
-        tensor->data_sync(false);
-        tensor->set_device_address(nullptr);
-        tensor->set_sync_status(kNeedSyncHostToDevice);
-      }
-    }
-  }
-}
-
 void SetOutputTensorsWaitStatus(const VectorRef *outputs) {
   MS_EXCEPTION_IF_NULL(outputs);
   for (auto &item : *outputs) {
@@ -163,7 +131,7 @@ void RunGraphTask::Run() {
   try {
     session_->LoadInputs(graph_id_, input_tensors_);
     session_->RunGraphImpl(graph_id_, input_tensors_, &outputs_);
-    UpdateOutputTensors(&outputs_, tensor_to_node_);
+    session_->UpdateOutputTensors(&outputs_, tensor_to_node_);
   } catch (const std::exception &e) {
     ExecutorManager::Instance().OnEvent(ExecutorEvent::kException);
     MsException::Instance().SetException();
diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc
index 5a9242e02a..7229fa8a3b 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -66,6 +66,7 @@
 #include "runtime/device/gpu/cuda_driver.h"
 #include "runtime/device/gpu/distribution/collective_init.h"
 #include "runtime/device/gpu/gpu_bucket.h"
+#include "runtime/device/gpu/gpu_device_address.h"
 #include "utils/ms_utils.h"
 #include "utils/config_manager.h"
 #include "utils/ms_context.h"
@@ -456,6 +457,48 @@ void GPUSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph)
   }
 }
 
+void GPUSession::UpdateOutputTensors(const VectorRef *outputs,
+                                     const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) {
+  MS_EXCEPTION_IF_NULL(outputs);
+  for (const auto &item : *outputs) {
+    if (utils::isa<VectorRefPtr>(item)) {
+      const auto &vector_ref = utils::cast<VectorRef>(item);
+      UpdateOutputTensors(&vector_ref, tensor_to_node);
+    } else if (utils::isa<tensor::TensorPtr>(item)) {
+      const auto &tensor = utils::cast<tensor::TensorPtr>(item);
+      MS_EXCEPTION_IF_NULL(tensor);
+      const auto &iter = tensor_to_node.find(tensor);
+      if (iter != tensor_to_node.end()) {
+        const auto &node = iter->second.first;
+        const auto &output_index = iter->second.second;
+        const auto &address = AnfAlgo::GetMutableOutputAddr(node, output_index);
+        // The outputs may have the same tensor, so need skip when the tensor has been set to device address.
+        if ((address == nullptr) || (address->GetPtr() == nullptr)) {
+          return;
+        }
+        tensor->set_device_address(address);
+
+        // When the device address of graph output is set in tensor, the graph output need be set new device address,
+        // to avoid that the device address context of tensor be rewritten in the next step or next loop.
+        auto new_address = std::make_shared<device::gpu::GPUDeviceAddress>(nullptr, address->GetSize());
+        AnfAlgo::SetOutputAddr(new_address, output_index, node.get());
+
+        if (AnfAlgo::IsDynamicShape(node)) {
+          const auto &updated_shape = AnfAlgo::GetOutputInferShape(node, output_index);
+          ShapeVector int_shape;
+          std::transform(updated_shape.begin(), updated_shape.end(), std::back_inserter(int_shape), SizeToInt);
+          tensor->set_shape(int_shape);
+        }
+      }
+      if (tensor->NeedSyncDeviceToHostImmediately()) {
+        tensor->data_sync(false);
+        tensor->set_device_address(nullptr);
+        tensor->set_sync_status(kNeedSyncHostToDevice);
+      }
+    }
+  }
+}
+
 void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
diff --git a/mindspore/ccsrc/backend/session/gpu_session.h b/mindspore/ccsrc/backend/session/gpu_session.h
index 3cd54ec6e4..c9089c5113 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.h
+++ b/mindspore/ccsrc/backend/session/gpu_session.h
@@ -20,6 +20,7 @@
 #include <memory>
 #include <algorithm>
 #include <string>
+#include <map>
 #include "backend/session/session_basic.h"
 #include "backend/session/kernel_graph.h"
 #include "backend/session/session_factory.h"
@@ -53,6 +54,8 @@ class GPUSession : public SessionBasic {
   std::string GetCommWorldGroup() override { return kNcclWorldGroup; }
   void LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
                      const std::vector<tensor::TensorPtr> &inputs_const) const override;
+  void UpdateOutputTensors(const VectorRef *outputs,
+                           const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) override;
 
  private:
   void SelectKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const;
diff --git a/mindspore/ccsrc/backend/session/session_basic.cc b/mindspore/ccsrc/backend/session/session_basic.cc
index a7e8dda65a..14b32f409a 100644
--- a/mindspore/ccsrc/backend/session/session_basic.cc
+++ b/mindspore/ccsrc/backend/session/session_basic.cc
@@ -210,9 +210,11 @@ BaseRef CreateNodeOutputTensor(const session::KernelWithIndex &node_output_pair,
 
 BaseRef CreateNodeOutputTensors(const AnfNodePtr &anf, const KernelGraphPtr &graph,
                                 const std::vector<tensor::TensorPtr> &input_tensors,
-                                std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) {
+                                std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node,
+                                KernelMapTensor *node_to_tensor) {
   MS_EXCEPTION_IF_NULL(anf);
   MS_EXCEPTION_IF_NULL(tensor_to_node);
+  MS_EXCEPTION_IF_NULL(node_to_tensor);
   MS_LOG(INFO) << "Create tensor for output[" << anf->DebugString() << "]";
   auto item_with_index = AnfAlgo::VisitKernelWithReturnType(anf, 0);
   MS_EXCEPTION_IF_NULL(item_with_index.first);
@@ -223,7 +225,7 @@ BaseRef CreateNodeOutputTensors(const AnfNodePtr &anf, const KernelGraphPtr &gra
     MS_EXCEPTION_IF_NULL(cnode);
     VectorRef ret;
     for (size_t i = 1; i < cnode->inputs().size(); ++i) {
-      auto out = CreateNodeOutputTensors(cnode->input(i), graph, input_tensors, tensor_to_node);
+      auto out = CreateNodeOutputTensors(cnode->input(i), graph, input_tensors, tensor_to_node, node_to_tensor);
       ret.push_back(out);
     }
     return ret;
@@ -233,7 +235,16 @@ BaseRef CreateNodeOutputTensors(const AnfNodePtr &anf, const KernelGraphPtr &gra
   if (size == 0) {
     return VectorRef();
   }
-  return CreateNodeOutputTensor(item_with_index, graph, input_tensors, tensor_to_node);
+
+  //  The outputs of graph may have the same kernel node, no need to create new tensor.
+  const auto &iter = node_to_tensor->find(item_with_index);
+  if (iter != node_to_tensor->end()) {
+    return iter->second;
+  }
+
+  const auto &tensor = CreateNodeOutputTensor(item_with_index, graph, input_tensors, tensor_to_node);
+  (*node_to_tensor)[item_with_index] = tensor;
+  return tensor;
 }
 
 ValueNodePtr CreateNewValueNode(const AnfNodePtr &anf, KernelGraph *graph) {
@@ -1503,11 +1514,12 @@ void SessionBasic::UpdateOutputs(const std::shared_ptr<KernelGraph> &kernel_grap
   MS_EXCEPTION_IF_NULL(kernel_graph);
   MS_EXCEPTION_IF_NULL(outputs);
   std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node;
+  KernelMapTensor node_to_tensor;
   auto anf_outputs = kernel_graph->outputs();
   for (auto &item : anf_outputs) {
     MS_EXCEPTION_IF_NULL(item);
     MS_LOG(INFO) << "Update output[" << item->DebugString() << "]";
-    outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, &tensor_to_node));
+    outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, &tensor_to_node, &node_to_tensor));
   }
 
   auto ms_context = MsContext::GetInstance();
@@ -1572,10 +1584,44 @@ void SessionBasic::CreateOutputTensors(const GraphId &graph_id, const std::vecto
   MS_EXCEPTION_IF_NULL(outputs);
   MS_EXCEPTION_IF_NULL(tensor_to_node);
   auto anf_outputs = kernel_graph->outputs();
+  KernelMapTensor node_to_tensor;
   for (auto &item : anf_outputs) {
     MS_EXCEPTION_IF_NULL(item);
     MS_LOG(INFO) << "Create node output[" << item->DebugString() << "]";
-    outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, tensor_to_node));
+    outputs->emplace_back(CreateNodeOutputTensors(item, kernel_graph, input_tensors, tensor_to_node, &node_to_tensor));
+  }
+}
+
+void SessionBasic::UpdateOutputTensors(const VectorRef *outputs,
+                                       const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) {
+  MS_EXCEPTION_IF_NULL(outputs);
+  for (const auto &item : *outputs) {
+    if (utils::isa<VectorRefPtr>(item)) {
+      const auto &vector_ref = utils::cast<VectorRef>(item);
+      UpdateOutputTensors(&vector_ref, tensor_to_node);
+    } else if (utils::isa<tensor::TensorPtr>(item)) {
+      const auto &tensor = utils::cast<tensor::TensorPtr>(item);
+      MS_EXCEPTION_IF_NULL(tensor);
+      const auto &iter = tensor_to_node.find(tensor);
+      if (iter != tensor_to_node.end()) {
+        const auto &node = iter->second.first;
+        const auto &output_index = iter->second.second;
+        const auto &address = AnfAlgo::GetMutableOutputAddr(node, output_index);
+        tensor->set_device_address(address);
+
+        if (AnfAlgo::IsDynamicShape(node)) {
+          const auto &updated_shape = AnfAlgo::GetOutputInferShape(node, output_index);
+          ShapeVector int_shape;
+          std::transform(updated_shape.begin(), updated_shape.end(), std::back_inserter(int_shape), SizeToInt);
+          tensor->set_shape(int_shape);
+        }
+      }
+      if (tensor->NeedSyncDeviceToHostImmediately()) {
+        tensor->data_sync(false);
+        tensor->set_device_address(nullptr);
+        tensor->set_sync_status(kNeedSyncHostToDevice);
+      }
+    }
   }
 }
 
@@ -1622,11 +1668,12 @@ void SessionBasic::GetModelOutputsInfo(uint32_t graph_id, std::vector<tensor::Te
 
   VectorRef vector_outputs;
   std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node;
+  KernelMapTensor node_to_tensor;
   auto anf_outputs = kernel_graph->outputs();
   for (auto &item : anf_outputs) {
     MS_EXCEPTION_IF_NULL(item);
     MS_LOG(INFO) << "Create node output[" << item->DebugString() << "]";
-    vector_outputs.emplace_back(CreateNodeOutputTensors(item, kernel_graph, inputs, &tensor_to_node));
+    vector_outputs.emplace_back(CreateNodeOutputTensors(item, kernel_graph, inputs, &tensor_to_node, &node_to_tensor));
   }
   *outputs = TransformVectorRefToMultiTensor(vector_outputs);
   for (size_t i = 0; i < outputs->size(); i++) {
diff --git a/mindspore/ccsrc/backend/session/session_basic.h b/mindspore/ccsrc/backend/session/session_basic.h
index a44fa56c2b..429048391c 100644
--- a/mindspore/ccsrc/backend/session/session_basic.h
+++ b/mindspore/ccsrc/backend/session/session_basic.h
@@ -73,6 +73,7 @@ struct OutputTensorInfo {
 };
 
 using OpRunInfoPtr = std::shared_ptr<OpRunInfo>;
+using KernelMapTensor = std::map<session::KernelWithIndex, BaseRef, session::KernelWithIndexCmp>;
 class Executor;
 
 class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
@@ -169,6 +170,8 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
   virtual void CreateOutputTensors(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &input_tensors,
                                    VectorRef *outputs,
                                    std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node);
+  virtual void UpdateOutputTensors(const VectorRef *outputs,
+                                   const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node);
   virtual void UnifyMindIR(const KernelGraphPtr &graph) {}
   virtual GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { return 0; }
   virtual GraphId CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) { return kInvalidGraphId; }