@@ -1003,18 +1003,9 @@ void AscendSession::DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs)
void AscendSession::LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  MS_LOG(INFO) << "Start!";
  MS_EXCEPTION_IF_NULL(kernel_graph);
#ifdef ENABLE_DEBUGGER
  if (debugger_->DebuggerBackendEnabled()) {
    auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
    MS_EXCEPTION_IF_NULL(runtime_instance);
    // TensorData will be freed up here
    debugger_->EmptyTensor();
    uint32_t iter_num = debugger_->GetTensorLoaderIterNum();
    debugger_->SetTensorLoaderIterNum(++iter_num);
    (void)runtime_instance->LoadData(kernel_graph.get());
    debugger_->EmptyPrevTensor();
  }
#endif
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime_instance);
  (void)runtime_instance->LoadData(kernel_graph.get());
  MS_LOG(INFO) << "Finish!";
}

@@ -360,7 +360,9 @@ void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor:
  SyncValueNodeDeviceAddr(kernel_graph);
  // Load input data from user input
  LoadInputData(kernel_graph, inputs);
  PreIterationDbg(kernel_graph);
  if (debugger_) {
    debugger_->PreExecute(kernel_graph, graph_sum_);
  }
#if ENABLE_CPU && ENABLE_GPU
  // Initialize parameter server
  InitPSParamAndOptim(kernel_graph, inputs);

@@ -372,7 +374,6 @@ void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor:
  for (int64_t i = 0; i < loopsize; i++) {
    Execute(kernel_graph);
  }
  PostLoadTensor(kernel_graph);
  // In pynative mode, device addresses of tensors in value nodes need to be cleaned.
  CleanValueNodeDeviceAddr(kernel_graph);
  // Summary

@@ -443,13 +444,6 @@ bool GPUSession::DumpDataEnabledIteration() const {
  return runtime_instance->DumpDataEnabledIteration();
}

void GPUSession::PreIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  if (debugger_) {
    debugger_->PreExecute(kernel_graph, graph_sum_);
  }
  PreLoadTensor(kernel_graph);
}

void GPUSession::PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  bool dump_enabled = DumpDataEnabledIteration();
  // debug used for dump

@@ -463,30 +457,6 @@ void GPUSession::PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_gra
  }
}

void GPUSession::PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  bool dump_enabled = DumpDataEnabledIteration();
  if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
    return;
  }
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime_instance);
  debugger_->EmptyTensor();
  uint32_t iter_num = debugger_->GetTensorLoaderIterNum();
  debugger_->SetTensorLoaderIterNum(++iter_num);
}

void GPUSession::PostLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  bool dump_enabled = DumpDataEnabledIteration();
  if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
    return;
  }
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime_instance);
  debugger_->EmptyPrevTensor();
}

void GPUSession::SyncValueNodeDeviceAddr(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);

@@ -75,14 +75,8 @@ class GPUSession : public SessionBasic {
  bool DumpDataEnabledIteration() const;
  void PreIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const;
  void PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const;
  void PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
  void PostLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
  void SyncValueNodeDeviceAddr(const std::shared_ptr<KernelGraph> &kernel_graph) const;
  void CleanValueNodeDeviceAddr(const std::shared_ptr<KernelGraph> &kernel_graph) const;

@@ -66,7 +66,7 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
                                     std::vector<std::vector<parameter_t>> *parameters,
                                     std::vector<int32_t> *error_codes, const std::vector<std::string> &op_overflows,
                                     const std::vector<std::shared_ptr<TensorData>> &tensor_list,
                                     const bool init_dbg_suspend) {
                                     const bool init_dbg_suspend, const bool step_end, const bool recheck) {
  std::lock_guard<std::mutex> lg(lock_);
  if (watchpoint_table.empty()) return;

@@ -75,13 +75,26 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor();
    // no elements to analyze
    if (tensor_ptr->DataSize() == 0) continue;
    int tensor_dtype = tensor_ptr->data_type_c();
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    for (auto w_table_item : watchpoint_table) {
      auto wp = std::get<1>(w_table_item);
      if (wp.condition.type == INIT && !init_dbg_suspend) continue;
      // check ONLY init conditions in the initial suspended state;
      // skip all other conditions in the initial suspended state,
      // and skip init conditions in every other state
      if ((wp.condition.type == INIT) ^ init_dbg_suspend) continue;
      if (wp.condition.type != IS_OVERFLOW && tensor_dtype == kNumberTypeBool) continue;
      // check change conditions only at step end
      if (wp.change_condition() && !step_end) continue;
      // on a recheck, ignore the cached results and reanalyze everything;
      // otherwise, check only tensors that have not been analyzed yet
      if (!recheck && wp_id_cache[tensor_name].count(wp.id)) continue;
      std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
      if (!found.empty()) {
        qualified_tensor_name = found;

@@ -174,6 +187,10 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
        error_code = std::get<1>(item);
        parameter_list = std::get<2>(item);
      }
      // add analyzed tensor to cache
      if (!recheck) {
        wp_id_cache[tensor_name].insert(wp.id);
      }
      if (is_hit || error_code) {
        name->push_back(qualified_tensor_name);
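
Review note: the three skip rules added above interact, and the `^` on the INIT check is easy to misread; since both operands are `bool`, it behaves as inequality, so INIT watchpoints run exactly during the initial suspension and at no other time. A minimal standalone model of just this predicate logic, using illustrative names (`SkipModel`, `MarkAnalyzed`) that are not part of the patch:

```cpp
#include <cstdint>
#include <set>
#include <string>
#include <unordered_map>

// Illustrative condition tags; the real enum lives in DebugServices.
enum ConditionType { INIT, IS_OVERFLOW, CHANGE_TOO_LARGE, CHANGE_TOO_SMALL, NOT_CHANGED, OTHER };

struct SkipModel {
  // per tensor, the watchpoint ids already analyzed this step (mirrors wp_id_cache)
  std::unordered_map<std::string, std::set<int32_t>> wp_id_cache;

  bool Skip(ConditionType type, int32_t wp_id, const std::string &tensor_name,
            bool init_dbg_suspend, bool step_end, bool recheck) {
    // (type == INIT) ^ init_dbg_suspend: skip when exactly one side is true,
    // i.e. INIT conditions run only during the initial suspension and the
    // initial suspension runs nothing but INIT conditions.
    if ((type == INIT) != init_dbg_suspend) return true;
    // change-based conditions compare against the previous step's value,
    // which only exists at step end
    bool change = type == CHANGE_TOO_LARGE || type == CHANGE_TOO_SMALL || type == NOT_CHANGED;
    if (change && !step_end) return true;
    // unless rechecking, a (tensor, watchpoint) pair is analyzed at most once per step
    if (!recheck && wp_id_cache[tensor_name].count(wp_id)) return true;
    return false;
  }

  void MarkAnalyzed(const std::string &tensor_name, int32_t wp_id) {
    wp_id_cache[tensor_name].insert(wp_id);
  }
};
```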
@@ -238,28 +255,6 @@ bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNode
  }
}

void DebugServices::AddWeightsBiasInputs(std::vector<std::shared_ptr<TensorData>> *tensor_list,
                                         const CNodePtr &kernel) {
  if (kernel) {
    auto input_size = AnfAlgo::GetInputTensorNum(kernel);
    for (size_t j = 0; j < input_size; ++j) {
      auto input_kernel = kernel->input(j + 1);
      std::string input_kernel_name = input_kernel->fullname_with_scope();
      auto found_dot = input_kernel_name.find_last_of('.');
      if (found_dot != std::string::npos &&
          (input_kernel_name.substr(found_dot + 1) == "weight" || input_kernel_name.substr(found_dot + 1) == "bias")) {
        std::string locate_tensor = input_kernel_name + ":0";
        std::map<std::string, std::shared_ptr<TensorData>> tensor_map = tensor_loader_->GetTensorMap();
        std::map<std::string, std::shared_ptr<TensorData>>::iterator iter;
        iter = tensor_map.find(locate_tensor);
        if (iter != tensor_map.end()) {
          tensor_list->push_back(iter->second);
        }
      }
    }
  }
}

void DebugServices::EmptyTensor() { tensor_loader_->EmptyTensor(); }

std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }

@@ -292,4 +287,32 @@ std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::Get
  return watchpoint_table;
}

void DebugServices::ResetLoadedTensors() {
  wp_id_cache.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
}

std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  std::vector<std::shared_ptr<TensorData>> result;
  auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  auto kernel_name = kernel->fullname_with_scope();
  for (size_t j = 0; j < output_size; ++j) {
    auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
    auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
    if (tensor) result.push_back(tensor);
  }
  return result;
}

bool DebugServices::TensorExistsInCurrent(std::string tensor_name) {
  return tensor_loader_->TensorExistsInCurrent(tensor_name);
}

void DebugServices::MoveTensorCurrentToPrev(std::string tensor_name) {
  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
}  // namespace mindspore
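
Review note: ResetLoadedTensors is a three-step rotation whose net effect is that activations are dropped while parameters survive as the next step's "previous" values. A toy sketch with `std::map<std::string, int>` standing in for the tensor maps; the `:prev` keying convention follows the TensorLoader changes later in this diff:

```cpp
#include <iostream>
#include <map>
#include <string>

int main() {
  // "fc1.weight:0" is a parameter (it has a ":prev" twin); "relu:0" is an activation.
  std::map<std::string, int> current = {{"fc1.weight:0", 3}, {"fc1.weight:0:prev", 2}, {"relu:0", 7}};
  std::map<std::string, int> prev;

  // MoveParametersCurrentToPrev: a key is a parameter iff key + ":prev" exists.
  for (auto it = current.begin(); it != current.end();) {
    if (current.count(it->first + ":prev")) {
      prev.insert(*it);
      it = current.erase(it);
    } else {
      ++it;
    }
  }
  current.clear();     // EmptyCurrentTensor: drop activations and stale ":prev" copies
  current.swap(prev);  // SwapCurrentPrev: parameters become the new "previous" values

  std::cout << current.size() << '\n';  // 1 -- only fc1.weight:0 carried over
}
```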
@@ -22,6 +22,7 @@
#include <memory>
#include <tuple>
#include <unordered_map>
#include <set>
#include <mutex>
#include <map>
#include <limits>

@@ -160,6 +161,10 @@ class DebugServices {
    bool range_enabled() const {
      return condition.type == RANGE && (!parameter_list[0].disabled || !parameter_list[1].disabled);
    }
    bool change_condition() const {
      return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL || condition.type == NOT_CHANGED;
    }
  } watchpoint_t;

  void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,

@@ -171,7 +176,8 @@ class DebugServices {
  void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
                        std::vector<unsigned int> *watchpoint_id, std::vector<std::vector<parameter_t>> *parameters,
                        std::vector<int32_t> *error_code, const std::vector<std::string> &op_overflows,
                        const std::vector<std::shared_ptr<TensorData>> &tensor_list, bool init_dbg_suspend);
                        const std::vector<std::shared_ptr<TensorData>> &tensor_list, bool init_dbg_suspend,
                        const bool step_end, const bool recheck);

  void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
                        std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,

@@ -181,8 +187,6 @@ class DebugServices {
  bool IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const;
  void AddWeightsBiasInputs(std::vector<std::shared_ptr<TensorData>> *tensor_list, const CNodePtr &kernel);
  void EmptyTensor();
  std::vector<std::shared_ptr<TensorData>> GetTensor() const;

@@ -205,9 +209,19 @@ class DebugServices {
  std::unordered_map<unsigned int, watchpoint_t> GetWatchpointTable();
  void ResetLoadedTensors();
  std::vector<std::shared_ptr<TensorData>> GetNodeTensor(const CNodePtr &kernel);
  bool TensorExistsInCurrent(std::string tensor_name);
  void MoveTensorCurrentToPrev(std::string tensor_name);

 private:
  std::mutex lock_;
  // tracks which watchpoints have already been checked for each tensor in the current step
  std::unordered_map<std::string, std::set<int32_t>> wp_id_cache;
  std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
  TensorLoader *tensor_loader_;

@@ -313,20 +313,16 @@ void Debugger::PostExecute() {
  }
  if (debugger_->DebuggerBackendEnabled()) {
    // analyze tensor data and send any watchpoints that were hit
    if (run_level_ == "node") {
      MS_LOG(INFO) << "Debugger is in node level mode ";
      return;
    }
    if (debugger_enabled_ && !is_dataset_graph_) {
      if (device_target_ != kGPUDevice) {
        num_step_++;
        MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
        SendWatchpoints(CheckWatchpoints());
        CommandLoop();
      } else {
        CommandLoop();
      }
      MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
      SendWatchpoints(CheckWatchpoints());
      CommandLoop();
    }
    // Only keep parameters in the current map
    debug_services_->ResetLoadedTensors();
  }
}
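
Review note: with the GPU-only branch gone, every device target now follows the same end-of-step sequence. A hedged sketch of the converged flow using hypothetical stand-in types (the real members live on Debugger and DebugServices):

```cpp
// Stand-ins used only to show the end-of-step ordering.
struct DebugServicesStub {
  void ResetLoadedTensors() { /* move params current->prev, clear current, swap */ }
};

struct DebuggerStub {
  int num_step_ = 0;
  bool debugger_enabled_ = true, is_dataset_graph_ = false;
  DebugServicesStub debug_services_;

  void SendWatchpointsForStep() { /* SendWatchpoints(CheckWatchpoints()); */ }
  void CommandLoop() { /* block until the frontend resumes execution */ }

  void PostExecute() {
    if (debugger_enabled_ && !is_dataset_graph_) {
      num_step_++;                 // same counter on GPU and Ascend after this change
      SendWatchpointsForStep();    // step-end watchpoint analysis for every target
      CommandLoop();               // suspend until a run/continue command arrives
    }
    debug_services_.ResetLoadedTensors();  // only parameters survive into the next step
  }
};
```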
@@ -596,7 +592,7 @@ void Debugger::CommandLoop() {
      MS_LOG(INFO) << "RunCMD";
      if (GetRunLevel(reply) == "recheck") {
        MS_LOG(INFO) << "rechecking all watchpoints";
        SendWatchpoints(CheckWatchpoints());
        SendWatchpoints(CheckWatchpoints("", nullptr, true));
      } else {
        // no longer the initial suspension.
        initial_suspend_ = false;

@@ -705,9 +701,6 @@ void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCon
    return DebugServices::parameter_t{parameter.name(), parameter.disabled(), parameter.value(), parameter.hit()};
  });
  debug_services_->AddWatchpoint(id, condition.condition(), condition.value(), check_node_list, parameter_list);
  if (initial_suspend_ &&
      static_cast<DebugServices::CONDITION_TYPE>(condition.condition()) == DebugServices::CONDITION_TYPE::INIT)
    SendWatchpoints(CheckWatchpoints());
}

void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->RemoveWatchpoint(id); }

@@ -780,7 +773,8 @@ void Debugger::Exit() {
  }
}

std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode, const CNodePtr &kernel) {
std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode, const CNodePtr &kernel,
                                                    bool recheck) {
  std::vector<std::string> name;
  std::vector<std::string> slot;
  std::vector<int> condition;

@@ -795,11 +789,10 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
  if (watchnode.empty()) {
    tensor_list = debug_services_->GetTensor();
  } else {
    tensor_list = debug_services_->GetNodeTensorMap(watchnode);
    debug_services_->AddWeightsBiasInputs(&tensor_list, kernel);
    tensor_list = debug_services_->GetNodeTensor(kernel);
  }
  debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
                                    tensor_list, initial_suspend_);
                                    tensor_list, initial_suspend_, watchnode.empty(), recheck);
  std::list<WatchpointHit> hits;
  for (unsigned int i = 0; i < name.size(); i++) {
    WatchpointHit hit;

@@ -1045,7 +1038,7 @@ std::vector<std::string> Debugger::CheckOpOverflow() {
  }
  closedir(d);
  if (op_names.size()) {
  if (!op_names.empty()) {
    MS_LOG(ERROR) << "These operation overflows are detected " << op_names;
  }

@@ -1091,12 +1084,6 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
  if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) {
    return;
  }
  bool keep_prev;
  if (anf_node->isa<Parameter>()) {
    keep_prev = true;
  } else {
    keep_prev = false;
  }
  // for parameters and value nodes, set its execution order to be 0;
  int exec_order = 0;
  std::string node_name = anf_node->fullname_with_scope();

@@ -1114,6 +1101,13 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
  auto shape = AnfAlgo::GetOutputDeviceShape(anf_node, output_index);
  (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                       [](size_t inner_item) { return SizeToInt(inner_item); });
  bool keep_prev;
  if (anf_node->isa<Parameter>()) {
    keep_prev = true;
    debug_services_->MoveTensorCurrentToPrev(tensor_name);
  } else {
    keep_prev = false;
  }
  bool ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev);
  if (!ret) {
    MS_LOG(ERROR) << "LoadMemToHost:"

@@ -1123,9 +1117,6 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
void Debugger::LoadParametersAndConst() {
  if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
  if (!(num_step_ == 0 || device_target_ == kAscendDevice ||
        (device_target_ == kGPUDevice && device::KernelRuntime::DumpDataEnabledIteration())))
    return;
  MS_EXCEPTION_IF_NULL(graph_ptr_);
  // load parameters
  MS_LOG(INFO) << "Start to load Parameters!";

@@ -1199,5 +1190,8 @@ void Debugger::ClearCurrentData() {
  if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()))
    debug_services_->EmptyCurrentTensor();
}

bool Debugger::TensorExistsInCurrent(std::string tensor_name) {
  return debug_services_->TensorExistsInCurrent(tensor_name);
}
}  // namespace mindspore

@@ -145,6 +145,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  std::list<KernelGraphPtr> GetGraphPtrList() { return graph_ptr_list_; }

  bool TensorExistsInCurrent(std::string tensor_name);

 private:
  // private constructor for singleton
  Debugger();

@@ -197,7 +199,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  // analyze tensors and check watchpoint conditions
  // return names of tensors and what condition they hit
  std::list<WatchpointHit> CheckWatchpoints(const std::string &watchnode = std::string(),
                                            const CNodePtr &kernel = NULL);
                                            const CNodePtr &kernel = nullptr, bool recheck = false);

  // send watchpoints that hit
  void SendWatchpoints(const std::list<WatchpointHit> &points);

@@ -33,6 +33,44 @@ class TensorLoader {
  ~TensorLoader() { EmptyTensor(); }

  void MoveTensorCurrentToPrev(std::string tensor_name) {
    auto handle = tensor_list_map.extract(tensor_name);
    if (!handle.empty()) {
      MS_LOG(INFO) << "Moving " << tensor_name << " from current map to previous map";
      prev_tensor_list_map.insert(std::move(handle));
    }
  }

  void SwapCurrentPrev() { tensor_list_map.swap(prev_tensor_list_map); }

  bool TensorExistsInCurrent(std::string tensor_name) {
    return tensor_list_map.find(tensor_name) != tensor_list_map.end();
  }

  // only parameters will return true
  bool PrevTensorExistsInCurrent(std::string tensor_name) { return TensorExistsInCurrent(tensor_name + ":prev"); }

  void MoveParametersCurrentToPrev() {
    MS_LOG(INFO) << "Moving parameters from current map to previous map";
    auto iter = tensor_list_map.begin();
    while (iter != tensor_list_map.end()) {
      auto key = iter->first;
      if (PrevTensorExistsInCurrent(key)) {
        // a :prev tensor exists only for parameters; move this one to the prev map
        ++iter;
        MoveTensorCurrentToPrev(key);
      } else {
        ++iter;
      }
    }
  }

  bool IsPrevTensor(std::string tensor_name) {
    const std::string suffix = ":prev";
    if (tensor_name.length() <= suffix.length()) return false;
    return std::equal(suffix.rbegin(), suffix.rend(), tensor_name.rbegin());
  }

  bool LoadNewTensor(std::shared_ptr<TensorData> tensor, bool keep_prev) {
    std::lock_guard<std::mutex> lg(lock_);
    if (keep_prev) {
@@ -43,20 +81,32 @@ class TensorLoader {
        tensor_list_map.insert(std::move(handle));
      }
    }
    tensor_list.push_back(tensor);
    tensor_list_map[tensor->GetName()] = tensor;  // use [] instead of insert to ensure latest value
    auto node_name = tensor->GetName();
    node_name = node_name.substr(0, node_name.find_first_of(":"));
    node_tensor_map.insert({node_name, tensor});
    return true;
  }

  std::vector<std::shared_ptr<TensorData>> GetTensor() { return tensor_list; }
  std::vector<std::shared_ptr<TensorData>> GetTensor() {
    std::vector<std::shared_ptr<TensorData>> tensor_list;
    for (auto &it : tensor_list_map) {
      if (!IsPrevTensor(it.first)) tensor_list.push_back(it.second);
    }
    return tensor_list;
  }

  std::shared_ptr<TensorData> GetTensor(const std::string &tensor_name) {
    auto iter = tensor_list_map.find(tensor_name);
    if (iter != tensor_list_map.end()) return iter->second;
    return nullptr;
  }

  uint32_t GetIterNum() { return iter_num; }

  std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map; }

  std::shared_ptr<TensorData> GetPrevTensor(std::string tensor_name) {
  std::shared_ptr<TensorData> GetPrevTensor(const std::string &tensor_name) {
    if (tensor_list_map.find(tensor_name + ":prev") != tensor_list_map.end()) {
      return tensor_list_map[tensor_name + ":prev"];
    }
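
Review note: the inline comment on LoadNewTensor is the crux of this hunk. `std::map::insert` is a no-op when the key already exists, so a reloaded tensor would keep its stale entry, while `operator[]` always stores the latest value. A runnable illustration:

```cpp
#include <iostream>
#include <map>
#include <string>

int main() {
  std::map<std::string, std::string> m;
  m.insert({"conv1:0", "step1"});
  m.insert({"conv1:0", "step2"});     // ignored: the key is already present
  std::cout << m["conv1:0"] << '\n';  // step1
  m["conv1:0"] = "step2";             // operator[] overwrites with the latest value
  std::cout << m["conv1:0"] << '\n';  // step2
}
```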
@@ -91,14 +141,13 @@ class TensorLoader {
    prev_tensor_list_map.clear();
    node_tensor_map.clear();
    tensor_list_map.swap(prev_tensor_list_map);
    tensor_list.clear();
  }

  void EmptyPrevTensor() { prev_tensor_list_map.clear(); }

  void EmptyCurrentTensor() {
    tensor_list_map.clear();
    tensor_list.clear();
    node_tensor_map.clear();
  }

  void set_iter_num(uint32_t iter_num) { this->iter_num = iter_num; }

@@ -142,7 +191,6 @@ class TensorLoader {
  }

 private:
  std::vector<std::shared_ptr<TensorData>> tensor_list;
  std::map<std::string, std::shared_ptr<TensorData>> tensor_list_map;
  std::multimap<std::string, std::shared_ptr<TensorData>> node_tensor_map;
  std::map<std::string, std::shared_ptr<TensorData>> prev_tensor_list_map;
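
Review note: two standard-library details carry the TensorLoader changes above: `std::map::extract` (C++17) moves a map node between the current and previous maps without copying the mapped shared_ptr, and the reverse-iterator `std::equal` in IsPrevTensor is a constant-time suffix test. A self-contained demo with a toy mapped type in place of TensorData:

```cpp
#include <algorithm>
#include <iostream>
#include <map>
#include <memory>
#include <string>

// Same suffix test as IsPrevTensor above.
bool IsPrevTensor(const std::string &name) {
  const std::string suffix = ":prev";
  if (name.length() <= suffix.length()) return false;
  return std::equal(suffix.rbegin(), suffix.rend(), name.rbegin());
}

int main() {
  std::map<std::string, std::shared_ptr<int>> current, prev;
  current["fc1.weight:0"] = std::make_shared<int>(42);

  auto handle = current.extract("fc1.weight:0");  // unlinks the node; no copy of the shared_ptr
  if (!handle.empty()) prev.insert(std::move(handle));

  std::cout << std::boolalpha << IsPrevTensor("fc1.weight:0:prev") << '\n';                 // true
  std::cout << current.count("fc1.weight:0") << ' ' << prev.count("fc1.weight:0") << '\n';  // 0 1
}
```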
@@ -674,6 +674,10 @@ bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int exec
                                        const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type,
                                        size_t slot, bool keep_prev) const {
  bool ret = false;
  if (Debugger::GetInstance()->TensorExistsInCurrent(tensor_name)) {
    MS_LOG(INFO) << tensor_name << " has already been loaded for this step, so it is not loaded again.";
    return true;
  }
  // TensorData is freed up in AscendSession class
  auto tensor_data = std::make_shared<mindspore::TensorData>();
  tensor_data->SetName(tensor_name);
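
Review note: both device addresses now gate LoadMemToHost on TensorExistsInCurrent, so repeated loads of the same tensor within one step become no-ops. A minimal sketch of the guard, with hypothetical stand-ins (`HostTensorCache`, `CopyDeviceToHost`) for the real device-address machinery:

```cpp
#include <set>
#include <string>

// Hypothetical cache standing in for the debugger's current tensor map.
struct HostTensorCache {
  std::set<std::string> loaded_this_step;

  bool CopyDeviceToHost(const std::string & /*tensor_name*/) {
    return true;  // stand-in for the real device-to-host memcpy
  }

  bool LoadMemToHost(const std::string &tensor_name) {
    // already loaded for this step: report success without copying again
    if (loaded_this_step.count(tensor_name)) return true;
    if (!CopyDeviceToHost(tensor_name)) return false;
    loaded_this_step.insert(tensor_name);
    return true;
  }

  void ResetForNextStep() { loaded_this_step.clear(); }  // done by ResetLoadedTensors in the patch
};
```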
@@ -296,8 +296,6 @@ bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
#ifdef ENABLE_DEBUGGER
  MS_LOG(INFO) << "Start load step";
  uint32_t cur_iter = 0;
  MS_LOG(INFO) << "Cur iter is " << cur_iter;
  for (auto graph_ptr : debugger_->GetGraphPtrList()) {
    debugger_->SetGraphPtr(graph_ptr);
    // load output

@@ -87,6 +87,11 @@ bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int executi
    return true;
  }

  if (Debugger::GetInstance()->TensorExistsInCurrent(tensor_name)) {
    MS_LOG(INFO) << tensor_name << " has already been loaded for this step, so it is not loaded again.";
    return true;
  }

  mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
  size_t host_size = out_tensor->data().nbytes();
  auto ret_rt_memcpy = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c());

@@ -154,8 +154,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
  std::vector<int> real_outputs;
  real_outputs = CheckRealOutput(node_name, output_size);
  for (std::vector<int>::iterator it = real_outputs.begin(); it != real_outputs.end(); ++it) {
    auto j = *it;
  for (int j : real_outputs) {
    auto addr = kernel_outputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(kernel, j);
    auto format = kOpFormat_DEFAULT;