@@ -171,7 +171,7 @@ GraphId AscendSession::CompileGraph(NotNull<FuncGraphPtr> func_graph) {
   device::KernelAdjust::GetInstance().Profiling(NOT_NULL(root_graph.get()));
   // build kernel
   BuildKernel(root_graph);
-  if (debugger_) {
+  if (debugger_ && debugger_->partial_memory()) {
     debugger_->PreExecute(root_graph);
   }
   SetSummaryNodes(root_graph.get());
@@ -248,7 +248,7 @@ void AscendSession::BuildGraph(GraphId graph_id) {
   BuildKernel(graph);
   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
-  if (debugger_) {
+  if (debugger_ && debugger_->partial_memory()) {
     debugger_->PreExecute(graph);
   }
   if (ms_context->get_param<bool>(MS_CTX_PRECOMPILE_ONLY)) {
@@ -312,6 +312,9 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::
   }
   // load input data from user input
   LoadInputData(kernel_graph, inputs);
+  if (debugger_) {
+    debugger_->PreExecute(kernel_graph);
+  }
 #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
   // Initialize parameter server
   InitPSParamAndOptim(kernel_graph, inputs);
@@ -278,9 +278,9 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
 void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
   auto &kernel_graph = graphs_[graph_id];
-  PreIterationDbg(kernel_graph);
   // Load input data from user input
   LoadInputData(kernel_graph, inputs);
+  PreIterationDbg(kernel_graph);
 #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
   // Initialize parameter server
   InitPSParamAndOptim(kernel_graph, inputs);
@@ -22,7 +22,6 @@
 #include <utility>
 #include <memory>
 #include <map>
 #include "backend/session/session_context.h"
 #include "backend/session/kernel_graph.h"
 #include "backend/session/anf_runtime_algorithm.h"
@@ -30,6 +30,7 @@
 #include "pipeline/jit/pipeline.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #include "runtime/device/kernel_runtime_manager.h"
+#include "runtime/device/kernel_runtime.h"
 
 using debugger::EventReply;
 using debugger::GraphProto;
@@ -47,6 +48,7 @@ namespace mindspore {
 DebuggerPtr Debugger::debugger_ = nullptr;
 std::mutex Debugger::instance_lock_;
+static const size_t PRAMATER_OUTPUT_INDEX = 0;
 
 Debugger::Debugger()
     : grpc_client_(nullptr),
@@ -62,7 +64,26 @@ Debugger::Debugger()
       is_dataset_graph_(false),
       partial_memory_(false),
       last_overflow_bin_(0),
-      overflow_bin_path_("") {}
+      overflow_bin_path_("") {
+  if (CheckDebuggerEnabled()) {
+    // configure partial memory reuse
+    partial_memory_ = CheckDebuggerPartialMemoryEnabled();
+    // switch memory reuse on or off
+    auto context_ptr = MsContext::GetInstance();
+    MS_EXCEPTION_IF_NULL(context_ptr);
+    context_ptr->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, partial_memory_);
+    // print some message about memory reuse to user
+    if (partial_memory_) {
+      MS_LOG(WARNING)
+        << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
+           "step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
+    } else {
+      MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
+                      "usage for large models.";
+    }
+  }
+}
 
 void Debugger::Init(const uint32_t device_id, const std::string device_target) {
   // access lock for public method
@@ -133,27 +154,6 @@ void Debugger::EnableDebugger() {
     MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
     port = "50051";
   }
-  // configure partial memory reuse
-  const char *env_partial_mem_str = std::getenv("MS_DEBUGGER_PARTIAL_MEM");
-  if (env_partial_mem_str != nullptr) {
-    MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << env_partial_mem_str;
-    if (std::strcmp(env_partial_mem_str, "1") == 0) {
-      partial_memory_ = true;
-    }
-  }
-  // switch memory reuse on or off
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
-  context_ptr->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, partial_memory_);
-  // print some message about memory reuse to user
-  if (partial_memory_) {
-    MS_LOG(WARNING) << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
-                       "step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
-  } else {
-    MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
-                    "usage for large models.";
-  }
 #ifdef ENABLE_D
   // set operation overflow info
   overflow_bin_path_ = DumpJsonParser::GetInstance().GetOpOverflowBinPath(graph_ptr_->graph_id(), device_id_);
@@ -195,9 +195,7 @@ void Debugger::EnableDebugger() {
 bool Debugger::CheckDebuggerDumpEnabled() {
   // see if dump is enabled
   if (device_target_ == kGPUDevice) {
-    auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
-    MS_EXCEPTION_IF_NULL(runtime_instance);
-    return runtime_instance->DumpDataEnabled();
+    return device::KernelRuntime::DumpDataEnabled();
   }
   return false;
 }
@@ -213,6 +211,17 @@ bool Debugger::CheckDebuggerEnabled() {
   return false;
 }
 
+bool Debugger::CheckDebuggerPartialMemoryEnabled() {
+  const char *env_partial_mem_str = std::getenv("MS_DEBUGGER_PARTIAL_MEM");
+  if (env_partial_mem_str != nullptr) {
+    MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << env_partial_mem_str;
+    if (std::strcmp(env_partial_mem_str, "1") == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
 bool Debugger::DebuggerBackendEnabled() { return CheckDebuggerDumpEnabled() || CheckDebuggerEnabled(); }
 
 void Debugger::Reset() {
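Note: the partial-memory switch consumed by the new constructor body above is a plain environment-variable flag, read once and treated as on only when the variable equals "1". A minimal standalone sketch of that pattern follows (ReadBoolEnvFlag and the main driver are illustrative names, not MindSpore API):

  // Standalone sketch of an env-var boolean flag, mirroring
  // CheckDebuggerPartialMemoryEnabled() above. Names are illustrative only.
  #include <cstdlib>
  #include <cstring>
  #include <iostream>

  // Returns true only when the variable exists and equals "1".
  static bool ReadBoolEnvFlag(const char *name) {
    const char *value = std::getenv(name);
    if (value == nullptr) {
      return false;
    }
    return std::strcmp(value, "1") == 0;
  }

  int main() {
    // e.g. export MS_DEBUGGER_PARTIAL_MEM=1 before running.
    bool partial_memory = ReadBoolEnvFlag("MS_DEBUGGER_PARTIAL_MEM");
    std::cout << "partial memory reuse: " << (partial_memory ? "on" : "off") << '\n';
    return 0;
  }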
@@ -324,6 +333,7 @@ void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
     // only try to enable debugger if it is not a dataset graph
     EnableDebugger();
     if (debugger_enabled_) {
+      LoadParameters();
       // get graph proto and send to mindinsight
       SendGraphAndSuspend(GetGraphProto());
     }
@@ -839,4 +849,34 @@ bool Debugger::CheckPort(const char *port) {
   return true;
 }
 
+void Debugger::LoadParameters() {
+  if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
+  if (!(num_step_ == 0 || device_target_ == kAscendDevice ||
+        (device_target_ == kGPUDevice && device::KernelRuntime::DumpDataEnabledIteration())))
+    return;
+  MS_EXCEPTION_IF_NULL(graph_ptr_);
+  const auto &parameters = graph_ptr_->inputs();
+  // for parameters, set its execution order to be 0;
+  int exec_order = 0;
+  for (auto &item : parameters) {
+    if (!item->isa<Parameter>()) {
+      continue;
+    }
+    std::string parameter_name = item->fullname_with_scope();
+    auto addr = AnfAlgo::GetOutputAddr(item, PRAMATER_OUTPUT_INDEX);
+    auto type = AnfAlgo::GetOutputInferDataType(item, PRAMATER_OUTPUT_INDEX);
+    auto format = kOpFormat_DEFAULT;
+    string tensor_name = parameter_name + ':' + "0";
+    ShapeVector int_shapes;
+    auto shape = AnfAlgo::GetOutputDeviceShape(item, PRAMATER_OUTPUT_INDEX);
+    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                         [](size_t inner_item) { return SizeToInt(inner_item); });
+    bool ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, true);
+    if (!ret) {
+      MS_LOG(ERROR) << "LoadMemToHost:"
+                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
+    }
+  }
+}
 } // namespace mindspore
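Note: the new Debugger::LoadParameters() reuses the shape-narrowing idiom that appears throughout these files, converting a size_t device shape into int entries via std::transform into a back_inserter. A minimal standalone sketch, with SizeToInt approximated by a plain static_cast (an assumption about its behaviour):

  // Standalone sketch of the size_t-to-int shape conversion used above.
  #include <algorithm>
  #include <cstddef>
  #include <iostream>
  #include <iterator>
  #include <vector>

  int main() {
    std::vector<size_t> device_shape = {1, 224, 224, 3};
    std::vector<int> int_shapes;
    // Same idiom as the diff: transform into a back_inserter so the
    // destination grows as elements are converted.
    (void)std::transform(device_shape.begin(), device_shape.end(), std::back_inserter(int_shapes),
                         [](size_t item) { return static_cast<int>(item); });
    for (int dim : int_shapes) {
      std::cout << dim << ' ';
    }
    std::cout << '\n';
    return 0;
  }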
@@ -103,6 +103,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   void SendMetadata();
 
+  void LoadParameters();
+
  private:
   // private constructor for singleton
   Debugger();
@@ -118,6 +120,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   // check if debugger enabled
   bool CheckDebuggerEnabled();
 
+  bool CheckDebuggerPartialMemoryEnabled();
+
   // check and save graph pointer
   void CheckGraphPtr(const KernelGraphPtr &graph_ptr);
@@ -663,39 +663,25 @@ bool AscendDeviceAddress::DumpMemToFile(bool trans_flag, const std::string &file
 }
 #ifdef ENABLE_DEBUGGER
-bool AscendDeviceAddress::LoadMemToHost(bool trans_flag, const std::string &tensor_name, int execution_order,
+bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order,
                                         const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type,
-                                        size_t slot, Debugger *debugger, bool keep_prev) const {
+                                        size_t slot, bool keep_prev) const {
   bool ret = false;
-  DebugServices *debug_services = debugger->debug_services();
-  MS_EXCEPTION_IF_NULL(debug_services);
-  TensorLoader *tensor_loader = debug_services->tensor_loader();
+  TensorLoader *tensor_loader = Debugger::GetInstance()->debug_services()->tensor_loader();
   MS_EXCEPTION_IF_NULL(tensor_loader);
   // TensorData is freed up in AscendSession class
   auto tensor_data = std::make_shared<mindspore::TensorData>();
   tensor_data->SetName(tensor_name);
   tensor_data->SetExecutionOrder(execution_order);
   tensor_data->SetSlot(slot);
-  if (trans_flag) {
-    MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
-    mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(host_type, host_shape);
-    size_t host_size = out_tensor->data().nbytes();
-    ret = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c());
-    if (!ret) {
-      MS_LOG(ERROR) << "Copy device mem to host failed";
-      return ret;
-    }
-    tensor_data->SetTensor(out_tensor);
-  } else {
-    mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
-    size_t host_size = out_tensor->data().nbytes();
-    auto ret_rt_memcpy = rtMemcpy(out_tensor->data_c(), host_size, ptr_, host_size, RT_MEMCPY_DEVICE_TO_HOST);
-    if (ret_rt_memcpy != RT_ERROR_NONE) {
-      MS_LOG(ERROR) << "SyncDeviceToHost: rtMemcpy mem size[" << size_ << "] fail, ret[" << ret_rt_memcpy << "]";
-    }
-    MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
-    tensor_data->SetTensor(out_tensor);
+  mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
+  size_t host_size = out_tensor->data().nbytes();
+  auto ret_rt_memcpy = rtMemcpy(out_tensor->data_c(), host_size, ptr_, host_size, RT_MEMCPY_DEVICE_TO_HOST);
+  if (ret_rt_memcpy != RT_ERROR_NONE) {
+    MS_LOG(ERROR) << "SyncDeviceToHost: rtMemcpy mem size[" << size_ << "] fail, ret[" << ret_rt_memcpy << "]";
   }
+  MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
+  tensor_data->SetTensor(out_tensor);
   ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
   return ret;
 }
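Note: the rework above drops the Debugger * parameter from LoadMemToHost and reaches the debugger through Debugger::GetInstance() inside the callee. A minimal standalone sketch of that refactor shape, using a simplified mutex-guarded singleton with made-up Logger/LoadMemToHost names (the real Debugger singleton differs in detail):

  // Standalone sketch: removing a pass-through pointer parameter in favour of
  // singleton access, as in the LoadMemToHost change above. Illustrative names.
  #include <iostream>
  #include <memory>
  #include <mutex>
  #include <string>

  class Logger {
   public:
    static std::shared_ptr<Logger> GetInstance() {
      std::lock_guard<std::mutex> lock(instance_lock_);
      if (instance_ == nullptr) {
        instance_ = std::shared_ptr<Logger>(new Logger());
      }
      return instance_;
    }
    void Record(const std::string &tensor_name) { std::cout << "loaded " << tensor_name << '\n'; }

   private:
    Logger() = default;
    static std::shared_ptr<Logger> instance_;
    static std::mutex instance_lock_;
  };
  std::shared_ptr<Logger> Logger::instance_ = nullptr;
  std::mutex Logger::instance_lock_;

  // Before: LoadMemToHost(..., Logger *logger, ...); every caller threaded the
  // pointer through. After: the callee fetches the singleton itself.
  bool LoadMemToHost(const std::string &tensor_name) {
    Logger::GetInstance()->Record(tensor_name);
    return true;
  }

  int main() {
    LoadMemToHost("conv1.weight:0");
    return 0;
  }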
@@ -45,9 +45,8 @@ class AscendDeviceAddress : public DeviceAddress {
   bool DumpMemToFile(bool dump_mode, const std::string &filepath, const std::string &host_fmt,
                      const ShapeVector &host_shape, TypeId host_type) const override;
 #ifdef ENABLE_DEBUGGER
-  bool LoadMemToHost(bool dump_mode, const std::string &tensor_name, int execution_order, const std::string &host_fmt,
-                     const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
-                     bool keep_prev) const;
+  bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
+                     const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const override;
 #endif
 
  private:
@@ -254,15 +254,10 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
       auto ascend_addr = dynamic_cast<const mindspore::device::ascend::AscendDeviceAddress *>(addr);
       MS_EXCEPTION_IF_NULL(ascend_addr);
       ShapeVector int_shapes;
-      if (trans_flag) {
-        int_shapes = trans::GetRuntimePaddingShape(node, j);
-      } else {
-        auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
-        (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
-                             [](size_t inner_item) { return SizeToInt(inner_item); });
-      }
-      auto ret =
-        ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
+      auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
+      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                           [](size_t inner_item) { return SizeToInt(inner_item); });
+      auto ret = ascend_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false);
       if (!ret) {
         MS_LOG(ERROR) << "LoadMemToHost: flag:" << trans_flag << ", tensor_name:" << tensor_name
                       << ", host_format:" << format << ".!";
@@ -272,40 +267,6 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   }
 }
 
-void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger) {
-  MS_EXCEPTION_IF_NULL(graph);
-  // trans_flag: "true" means tensor values will be transfered to host format, otherwise not.
-  bool trans_flag = false;
-  const auto &parameters = graph->inputs();
-  // for parameters, set its execution order to be 0;
-  int exec_order = 0;
-  for (auto &item : parameters) {
-    if (!item->isa<Parameter>()) {
-      continue;
-    }
-    std::string parameter_name = item->fullname_with_scope();
-    auto addr = AnfAlgo::GetOutputAddr(item, PRAMATER_OUTPUT_INDEX);
-    auto type = AnfAlgo::GetOutputInferDataType(item, PRAMATER_OUTPUT_INDEX);
-    auto format = kOpFormat_DEFAULT;
-    string tensor_name = parameter_name + ':' + "0";
-    auto ascend_addr = dynamic_cast<const mindspore::device::ascend::AscendDeviceAddress *>(addr);
-    MS_EXCEPTION_IF_NULL(ascend_addr);
-    ShapeVector int_shapes;
-    if (trans_flag) {
-      int_shapes = trans::GetRuntimePaddingShape(item, PRAMATER_OUTPUT_INDEX);
-    } else {
-      auto shape = AnfAlgo::GetOutputDeviceShape(item, PRAMATER_OUTPUT_INDEX);
-      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
-                           [](size_t inner_item) { return SizeToInt(inner_item); });
-    }
-    auto ret =
-      ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
-    if (!ret) {
-      MS_LOG(ERROR) << "LoadMemToHost Failed: flag:" << trans_flag << ", path:" << tensor_name
-                    << ", host_format:" << format << ".!";
-    }
-  }
-}
 } // namespace
 #endif
@@ -319,7 +280,7 @@ bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debug
   // load output
   LoadOutput(graph, debugger);
   // load parameters
-  LoadParameters(graph, debugger);
+  if (debugger) debugger->LoadParameters();
 #endif
   return true;
 }
@@ -70,6 +70,12 @@ class DeviceAddress : public mindspore::DeviceSync {
                      const ShapeVector &host_shape, TypeId host_type) const {
     return true;
   }
+#ifdef ENABLE_DEBUGGER
+  virtual bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
+                             const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const {
+    return true;
+  }
+#endif
 
  protected:
   const void *ptr() const { return ptr_; }
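Note: giving DeviceAddress a default virtual LoadMemToHost that simply returns true lets generic code call it through the base class while only the Ascend and GPU addresses override it. A minimal standalone sketch of that default-virtual pattern (DeviceBuffer and GpuBuffer are made-up names):

  // Standalone sketch of a base-class default virtual that only some
  // backends override, mirroring DeviceAddress::LoadMemToHost above.
  #include <iostream>
  #include <memory>
  #include <string>

  class DeviceBuffer {
   public:
    virtual ~DeviceBuffer() = default;
    // Default is a no-op "success", so generic code may call it unconditionally.
    virtual bool LoadMemToHost(const std::string &tensor_name) const { return true; }
  };

  class GpuBuffer : public DeviceBuffer {
   public:
    bool LoadMemToHost(const std::string &tensor_name) const override {
      std::cout << "copy " << tensor_name << " to host\n";
      return true;
    }
  };

  int main() {
    std::unique_ptr<DeviceBuffer> plain = std::make_unique<DeviceBuffer>();
    std::unique_ptr<DeviceBuffer> gpu = std::make_unique<GpuBuffer>();
    // Same call site works for both; only the GPU buffer actually copies.
    plain->LoadMemToHost("param:0");
    gpu->LoadMemToHost("param:0");
    return 0;
  }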
@@ -80,14 +80,14 @@ GPUDeviceAddress::~GPUDeviceAddress() {
 }
 #ifdef ENABLE_DEBUGGER
 bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
-                                     const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
+                                     const ShapeVector &host_shape, TypeId host_type, size_t slot,
                                      bool keep_prev) const {
   bool ret = false;
   if (size_ == 0) {
     return true;
   }
-  DebugServices *debug_services = debugger->debug_services();
-  TensorLoader *tensor_loader = debug_services->tensor_loader();
+  TensorLoader *tensor_loader = Debugger::GetInstance()->debug_services()->tensor_loader();
 
   mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
   size_t host_size = out_tensor->data().nbytes();
@@ -44,8 +44,7 @@ class GPUDeviceAddress : public DeviceAddress {
 #ifdef ENABLE_DEBUGGER
   bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
-                     const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
-                     bool keep_prev) const;
+                     const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const override;
 #endif
 
  private:
  DeviceAddressStatus status_{DeviceAddressStatus::kInDevice};
@@ -111,7 +111,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
       auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
       (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                            [](size_t inner_item) { return SizeToInt(inner_item); });
-      auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
+      auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, true);
       if (!ret) {
         MS_LOG(ERROR) << "LoadMemToHost:"
                       << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
@@ -130,7 +130,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
       auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j);
       (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                            [](size_t inner_item) { return SizeToInt(inner_item); });
-      auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
+      auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false);
       if (!ret) {
         MS_LOG(ERROR) << "LoadMemToHost:"
                       << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
@@ -148,36 +148,6 @@ void UpdateStepNum(Debugger *debugger, bool dump_enabled) {
   }
 }
 
-void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
-  MS_EXCEPTION_IF_NULL(graph);
-  if (!(debugger && dump_enabled)) {
-    return;
-  }
-  const auto &parameters = graph->inputs();
-  // for parameters, set its execution order to be 0;
-  int exec_order = 0;
-  for (auto &item : parameters) {
-    if (!item->isa<Parameter>()) {
-      continue;
-    }
-    std::string parameter_name = item->fullname_with_scope();
-    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
-    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
-    auto format = kOpFormat_DEFAULT;
-    string tensor_name = parameter_name + ':' + "0";
-    auto gpu_addr = dynamic_cast<const mindspore::device::gpu::GPUDeviceAddress *>(addr);
-    ShapeVector int_shapes;
-    auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
-    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
-                         [](size_t inner_item) { return SizeToInt(inner_item); });
-    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
-    if (!ret) {
-      MS_LOG(ERROR) << "LoadMemToHost:"
-                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
-    }
-  }
-}
 void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
   if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
     DebugServices *debug_services = debugger->debug_services();
@@ -601,7 +571,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
   }
   if (!mock) {
     // collect weights and bias for dump mode
-    LoadParameters(graph, debugger, dump_enabled);
+    if (debugger) debugger->LoadParameters();
     CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
   }
   ClearSwapInfo(mock);
@@ -53,8 +53,8 @@ class KernelRuntime {
   void RunOpAssignMemory(const ValuePtr &pre_output_value, const std::vector<tensor::TensorPtr> &input_tensors,
                          session::KernelGraph *graph);
   void RunOpClearMemory(const session::KernelGraph *graph);
-  bool DumpDataEnabled();
-  bool DumpDataEnabledIteration();
+  static bool DumpDataEnabled();
+  static bool DumpDataEnabledIteration();
   virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger);
   virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
   virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
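Note: making DumpDataEnabled and DumpDataEnabledIteration static means callers such as Debugger::CheckDebuggerDumpEnabled() no longer need to fetch a KernelRuntime instance from the manager before asking the question. A minimal standalone sketch of the calling convention this enables (Runtime and its members are illustrative only):

  // Standalone sketch: a query that does not depend on per-instance state can
  // become static, so callers no longer need to locate an instance first.
  #include <iostream>

  class Runtime {
   public:
    // After the change: callable as Runtime::DumpDataEnabled().
    static bool DumpDataEnabled() {
      // In the real code this would consult dump configuration; here it is fixed.
      return false;
    }
    // Instance state is still used by other members.
    void Run() { std::cout << "running, dump=" << DumpDataEnabled() << '\n'; }
  };

  int main() {
    // No instance lookup needed for the query itself.
    if (!Runtime::DumpDataEnabled()) {
      std::cout << "dump disabled\n";
    }
    Runtime runtime;
    runtime.Run();
    return 0;
  }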