From e3a7b7ab923828c14f5373fb03531789b6d0a27f Mon Sep 17 00:00:00 2001
From: wilfChen <chenweifeng720@huawei.com>
Date: Tue, 10 Nov 2020 18:02:58 +0800
Subject: [PATCH] GPU: support dynamic shape

---
 .../kernel_compiler/gpu/gpu_kernel_factory.h  | 28 ++++++--------
 .../pass/convert_const_input_to_attr.cc       |  2 +-
 .../runtime/device/executor/dynamic_kernel.cc | 37 +++++++++++--------
 .../runtime/device/executor/dynamic_kernel.h  |  6 ++-
 .../runtime/device/gpu/gpu_kernel_runtime.cc  | 20 +++-------
 5 files changed, 43 insertions(+), 50 deletions(-)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h
index d131b10e4c..6bb64d5ba5 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h
@@ -78,44 +78,38 @@ class GpuKernelRegister {
 // variable has been created.
 #define uchar unsigned char
 
-#define UNIQUE_KERNEL_NAME(kernel) KERNEL_NAME(kernel, __COUNTER__)
+#define UNIQUE_KERNEL_NAME(kernel) KERNEL_NAME(g_##kernel##_gpu_kernel_reg, __COUNTER__)  // g_<op>_gpu_kernel_reg<N>
 #define KERNEL_NAME(kernel, cnt) MERGE(kernel, cnt)
 #define MERGE(kernel, cnt) kernel##cnt
 
-#define MS_REG_GPU_KERNEL(OPNAME, OPCLASS)                                                              \
-  static_assert(std::is_base_of<GpuKernel, OPCLASS>::value, " must be base of GpuKernel");              \
-  static const GpuKernelRegister UNIQUE_KERNEL_NAME(g_##OPNAME##_gpu_kernel_reg)(#OPNAME, KernelAttr(), \
-                                                                                 []() { return new OPCLASS(); });
+#define MS_REG_GPU_KERNEL(OPNAME, OPCLASS)                                                 \
+  static_assert(std::is_base_of<GpuKernel, OPCLASS>::value, " must be base of GpuKernel"); \
+  static const GpuKernelRegister UNIQUE_KERNEL_NAME(OPNAME)(#OPNAME, KernelAttr(), []() { return new OPCLASS(); });
 
 // regular register of fixed accuracy kernels
-#define MS_REG_GPU_KERNEL_REGULAR(OPNAME, ATTR, OPCLASS)                                        \
-  static_assert(std::is_base_of<GpuKernel, OPCLASS>::value, " must be base of GpuKernel");      \
-  static const GpuKernelRegister UNIQUE_KERNEL_NAME(g_##OPNAME##_gpu_kernel_reg)(#OPNAME, ATTR, \
-                                                                                 []() { return new OPCLASS(); });
+#define MS_REG_GPU_KERNEL_REGULAR(OPNAME, ATTR, OPCLASS)                                   \
+  static_assert(std::is_base_of<GpuKernel, OPCLASS>::value, " must be base of GpuKernel"); \
+  static const GpuKernelRegister UNIQUE_KERNEL_NAME(OPNAME)(#OPNAME, ATTR, []() { return new OPCLASS(); });
 
 // register of mixed accuracy kernels which use template and maintain one typename, ignore input num
 #define MS_REG_GPU_KERNEL_SAME(OPNAME, ATTR, OPCLASS, T)                                      \
   static_assert(std::is_base_of<GpuKernel, OPCLASS<T>>::value, " must be base of GpuKernel"); \
-  static const GpuKernelRegister UNIQUE_KERNEL_NAME(g_##OPNAME##_##T##_gpu_kernel_reg)(       \
-    #OPNAME, ATTR, []() { return new OPCLASS<T>(); });
+  static const GpuKernelRegister UNIQUE_KERNEL_NAME(OPNAME)(#OPNAME, ATTR, []() { return new OPCLASS<T>(); });
 
 // register of mixed accuracy kernels which use template and maintain one typename
 #define MS_REG_GPU_KERNEL_ONE(OPNAME, ATTR, OPCLASS, T)                                       \
   static_assert(std::is_base_of<GpuKernel, OPCLASS<T>>::value, " must be base of GpuKernel"); \
-  static const GpuKernelRegister UNIQUE_KERNEL_NAME(g_##OPNAME##_##T##_gpu_kernel_reg)(       \
-    #OPNAME, ATTR, []() { return new OPCLASS<T>(); });
+  static const GpuKernelRegister UNIQUE_KERNEL_NAME(OPNAME)(#OPNAME, ATTR, []() { return new OPCLASS<T>(); });
 
 // register of mixed accuracy kernels which use template and maintain two typename
 #define MS_REG_GPU_KERNEL_TWO(OPNAME, ATTR, OPCLASS, T, S)                                       \
   static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S>>::value, " must be base of GpuKernel"); \
-  static const GpuKernelRegister UNIQUE_KERNEL_NAME(g_##OPNAME##_##T##_##S##_gpu_kernel_reg)(    \
-    #OPNAME, ATTR, []() { return new OPCLASS<T, S>(); });
+  static const GpuKernelRegister UNIQUE_KERNEL_NAME(OPNAME)(#OPNAME, ATTR, []() { return new OPCLASS<T, S>(); });
 
 // register of mixed accuracy kernels which use template and maintain three typename
 #define MS_REG_GPU_KERNEL_THREE(OPNAME, ATTR, OPCLASS, T, S, G)                                     \
   static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S, G>>::value, " must be base of GpuKernel"); \
-  static const GpuKernelRegister UNIQUE_KERNEL_NAME(g_##OPNAME##_##T##_##S##_##G##_gpu_kernel_reg)( \
-    #OPNAME, ATTR, []() { return new OPCLASS<T, S, G>(); });
+  static const GpuKernelRegister UNIQUE_KERNEL_NAME(OPNAME)(#OPNAME, ATTR, []() { return new OPCLASS<T, S, G>(); });
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_GPUKERNELFACTORY_H_
diff --git a/mindspore/ccsrc/backend/optimizer/pass/convert_const_input_to_attr.cc b/mindspore/ccsrc/backend/optimizer/pass/convert_const_input_to_attr.cc
index 640d8d9801..bd284c1c96 100644
--- a/mindspore/ccsrc/backend/optimizer/pass/convert_const_input_to_attr.cc
+++ b/mindspore/ccsrc/backend/optimizer/pass/convert_const_input_to_attr.cc
@@ -62,7 +62,7 @@ const AnfNodePtr ConvertConstInputToAttr::Process(const FuncGraphPtr &, const An
         continue;
       }
     }
-    if (AnfAlgo::IsDynamicShape(cnode) &&
+    if (AnfAlgo::IsNodeDynamicShape(cnode) &&
         DynamicShapeConstInputToAttr.find(AnfAlgo::GetCNodeName(cnode)) == DynamicShapeConstInputToAttr.end()) {
       MS_LOG(INFO) << "current node is dynamic shape " << cnode->fullname_with_scope();
       continue;
diff --git a/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.cc b/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.cc
index 9f002a1c6c..e4512e3b5c 100644
--- a/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.cc
+++ b/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.cc
@@ -42,24 +42,10 @@ void DynamicKernel::Initialize() {
     return;
   }
   MS_LOG(INFO) << "Have depends";
-  std::vector<int> depends_list;
   std::vector<int64_t> depends_list_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(cnode_ptr_, kDynamicShapeDepends);
-  (void)std::transform(depends_list_me.begin(), depends_list_me.end(), std::back_inserter(depends_list),
+  (void)std::transform(depends_list_me.begin(), depends_list_me.end(), std::back_inserter(depend_list_),
                        [](const int64_t &value) { return static_cast<int>(value); });
-  // Save depend input tensor. Sync data in InferShape.
-  for (auto depend : depends_list) {
-    auto pre_node_with_index = AnfAlgo::GetPrevNodeOutput(cnode_ptr_, depend);
-    auto output_addr = AnfAlgo::GetPrevNodeMutableOutputAddr(cnode_ptr_, depend);
-    std::vector<int64_t> shapes = trans::GetRuntimePaddingShape(pre_node_with_index.first, pre_node_with_index.second);
-    auto host_type = AnfAlgo::GetOutputInferDataType(pre_node_with_index.first, pre_node_with_index.second);
-    auto out_tensor = std::make_shared<tensor::Tensor>(host_type, shapes);
-    out_tensor->set_device_address(output_addr);
 
-    auto ret = depend_tensor_map_.try_emplace(depend, out_tensor);
-    if (!ret.second) {
-      MS_LOG(EXCEPTION) << "Insert map failed";
-    }
-  }
   MS_LOG(INFO) << "Init End";
 }
 
@@ -74,6 +60,22 @@ bool IsTupleGetItem(const AnfNodePtr &anf_node) {
   return IsPrimitive(input0, prim::kPrimTupleGetItem);
 }
 
+void DynamicKernel::RebuildDependTensor() {  // Refills depend_tensor_map_ with one tensor per depend_list_ entry.
+  depend_tensor_map_.clear();  // discard the entries built by the previous call
+  for (auto depend : depend_list_) {  // depend is used as an input index of cnode_ptr_
+    auto pre_node_with_index = AnfAlgo::GetPrevNodeOutput(cnode_ptr_, depend);
+    auto output_addr = AnfAlgo::GetPrevNodeMutableOutputAddr(cnode_ptr_, depend);
+    std::vector<int64_t> shapes = trans::GetRuntimePaddingShape(pre_node_with_index.first, pre_node_with_index.second);
+    auto host_type = AnfAlgo::GetOutputInferDataType(pre_node_with_index.first, pre_node_with_index.second);
+    auto out_tensor = std::make_shared<tensor::Tensor>(host_type, shapes);
+    out_tensor->set_device_address(output_addr);  // tensor carries the address fetched above in this call
+    auto ret = depend_tensor_map_.try_emplace(depend, out_tensor);
+    if (!ret.second) {  // map was cleared above, so this only fails when depend_list_ repeats an index
+      MS_LOG(EXCEPTION) << "Insert map failed";
+    }
+  }
+}
+
 void DynamicKernel::InferShape() {
   if (!is_input_dynamic_shape_ && is_output_dynamic_shape_ && !have_depends()) {
     return;
@@ -88,12 +90,15 @@ void DynamicKernel::InferShape() {
   AbstractBasePtrList args_spec_list;
   auto primitive = GetValueNode<PrimitivePtr>(inputs[0]);
 
+  // Rebuild the depend tensor map on every call; needed for GPU dynamic memory allocation.
+  RebuildDependTensor();
+
   auto input_size = AnfAlgo::GetInputTensorNum(cnode_ptr_);
   for (size_t i = 0; i < input_size; ++i) {
     auto input_with_index = AnfAlgo::GetPrevNodeOutput(cnode_ptr_, i);
     auto real_input = input_with_index.first;
-
     MS_EXCEPTION_IF_NULL(real_input);
+
     auto ret = depend_tensor_map_.find(i);
     if (ret != depend_tensor_map_.end()) {
       auto tensor_ptr = ret->second;
diff --git a/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.h b/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.h
index 4689126af6..649358d1a7 100644
--- a/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.h
+++ b/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.h
@@ -19,6 +19,7 @@
 
 #include <memory>
 #include <string>
+#include <vector>
 #include <map>
 #include "ir/anf.h"
 #include "ir/tensor.h"
@@ -44,16 +45,19 @@ class DynamicKernel {
   bool is_dynamic_shape() const { return is_dynamic_shape_; }
   bool is_input_dynamic_shape() const { return is_input_dynamic_shape_; }
   bool is_output_dynamic_shape() const { return is_output_dynamic_shape_; }
-  bool have_depends() const { return !depend_tensor_map_.empty(); }
+  bool have_depends() const { return !depend_list_.empty(); }
   virtual void Initialize();
   std::string GetKernelName() { return cnode_ptr_->fullname_with_scope(); }
 
  protected:
+  void RebuildDependTensor();
+
   void *stream_;
   const CNodePtr cnode_ptr_;
   bool is_dynamic_shape_;
   bool is_input_dynamic_shape_;
   bool is_output_dynamic_shape_;
+  std::vector<uint32_t> depend_list_;
   std::map<uint32_t, tensor::TensorPtr> depend_tensor_map_;
 };
 using DynamicKernelPtr = std::shared_ptr<DynamicKernel>;
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
index 314a44b32c..c65a1f3dd8 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
@@ -37,7 +37,6 @@
 #include "utils/shape_utils.h"
 #include "debug/data_dump/dump_json_parser.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
-#include "runtime/device/executor/executor_callback.h"
 #ifdef ENABLE_DEBUGGER
 #include "debug/debug_services.h"
 #endif
@@ -369,7 +368,7 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink) {
 bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph) {
   bool ret = true;
   auto graph_id = graph->graph_id();
-  if (!is_first_step_map_[graph_id]) {
+  if (!is_first_step_map_[graph_id] || graph->is_dynamic_shape()) {
     // Normally run graph
     ret = LaunchKernelDynamic(graph);
   } else {
@@ -603,16 +602,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
       dynamic_kernel = gpu_kernel->DynamicKernel();
     }
 
-    if (dynamic_kernel && dynamic_kernel->have_depends()) {
-      MS_LOG(INFO) << "Match Dynamic Kernel, Start SyncStream";
-      if (!SyncStream()) {
-        MS_LOG(ERROR) << "SyncStream failed";
-        return false;
-      }
-    }
-
     if (dynamic_kernel && dynamic_kernel->is_dynamic_shape()) {
-      ExecutorCallback::GetInstance().Consume();
       dynamic_kernel->InferShape();
       dynamic_kernel->UpdateArgs();
     }
@@ -645,9 +635,10 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
         LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs);
       }
 
-      ExecutorCallback::GetInstance().RegistCallback([&gpu_kernel] {
-        if (gpu_kernel) gpu_kernel->PostExecute();
-      });
+      if (gpu_kernel && dynamic_kernel && dynamic_kernel->is_dynamic_shape()) {
+        gpu_kernel->PostExecute();
+      }
+
       // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost)
       LoadKernelData(debugger_.get(), kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_,
                      dump_enabled);
@@ -666,7 +657,6 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
     // collect weights and bias for dump mode
     debugger_->LoadParametersAndConst();
     CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
-    ExecutorCallback::GetInstance().Consume();
   }
   ClearSwapInfo(mock);
   return true;