Merge pull request !3007 from john_tzanakakis/master_ms1
@@ -279,6 +279,9 @@ checkopts()
   done
 }
 checkopts "$@"
+if [[ "X$ENABLE_GPU" = "Xon" ]] && [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then
+  ENABLE_DEBUGGER="on"
+fi
 echo "---------------- MindSpore: build start ----------------"
 mkdir -pv "${BUILD_PATH}/package/mindspore/lib"
 git submodule update --init graphengine
@@ -37,6 +37,7 @@
 #include "common/trans.h"
 #include "utils/context/ms_context.h"
 #include "utils/base_ref_extends.h"
+#include "debug/tensor_load.h"
 namespace mindspore {
 namespace session {
@@ -164,7 +165,11 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
 void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
+#ifdef ENABLE_DEBUGGER
+  if (!runtime_instance->Run(kernel_graph.get(), debugger_.get())) {
+#else
   if (!runtime_instance->Run(kernel_graph.get())) {
+#endif
     MS_LOG(EXCEPTION) << "GPU execute graph failed!";
   }
 }
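Note on the gating above: the `#ifdef` selects which `Run` overload the call site compiles against, so only the condition line differs and the exception body is shared. A minimal standalone sketch of the same pattern (hypothetical names, not the MindSpore API):

```cpp
#include <iostream>

// Hypothetical stand-ins for the two KernelRuntime::Run overloads.
bool Run(int *graph) { return graph != nullptr; }
bool Run(int *graph, void *debugger) { return graph != nullptr && debugger != nullptr; }

int main() {
  int graph = 0;
#ifdef ENABLE_DEBUGGER
  if (!Run(&graph, nullptr)) {  // debugger-aware overload
#else
  if (!Run(&graph)) {           // plain overload
#endif
    std::cout << "execute graph failed\n";
  }
  return 0;
}
```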
@@ -229,6 +234,9 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
 void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
   auto &kernel_graph = graphs_[graph_id];
+#ifdef ENABLE_DEBUGGER
+  PreIterationDbg(kernel_graph);
+#endif
   // Load input data from user input
   LoadInputData(kernel_graph, inputs);
 #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
@@ -245,6 +253,9 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
     // Run graph on GPU
     Execute(kernel_graph);
   }
+#ifdef ENABLE_DEBUGGER
+  PostLoadTensor(kernel_graph);
+#endif
   // Get result from GPU
   UpdateOutputs(kernel_graph, outputs, inputs);
   // Summary
@@ -253,6 +264,9 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
   if (context_ptr->enable_gpu_summary()) {
     Summary(kernel_graph.get());
   }
+#ifdef ENABLE_DEBUGGER
+  PostIterationDbg(kernel_graph);
+#endif
 }
 void GPUSession::BuildOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
@@ -296,6 +310,70 @@ py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph
   RunOpClearMemory(kernel_graph.get());
   return tuple_tensors;
 }
+#ifdef ENABLE_DEBUGGER
+void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
+#ifdef ENABLE_DUMP_E2E
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
+  MS_EXCEPTION_IF_NULL(runtime_instance);
+  (void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get());
+#endif
+}
+bool GPUSession::DumpDataEnabledIteration() const {
+  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
+  MS_EXCEPTION_IF_NULL(runtime_instance);
+  return runtime_instance->DumpDataEnabledIteration();
+}
+void GPUSession::PreIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
+  if (debugger_) {
+    debugger_->PreExecute(kernel_graph);
+  }
+  PreLoadTensor(kernel_graph);
+}
+void GPUSession::PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
+  bool dump_enabled = DumpDataEnabledIteration();
+  // debug used for dump
+  if (debugger_ && dump_enabled) {
+    Dump(kernel_graph);
+  }
+  if (debugger_) {
+    debugger_->PostExecute();
+  }
+}
+void GPUSession::PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
+  bool dump_enabled = DumpDataEnabledIteration();
+  if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
+    return;
+  }
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
+  MS_EXCEPTION_IF_NULL(runtime_instance);
+  DebugServices *debug_services = debugger_->debug_services();
+  TensorLoader *tensor_loader = debug_services->tensor_loader();
+  tensor_loader->EmptyTensor();
+  uint32_t iter_num = tensor_loader->GetIterNum();
+  tensor_loader->set_iter_num(++iter_num);
+}
+void GPUSession::PostLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
+  bool dump_enabled = DumpDataEnabledIteration();
+  if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
+    return;
+  }
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
+  MS_EXCEPTION_IF_NULL(runtime_instance);
+  DebugServices *debug_services = debugger_->debug_services();
+  TensorLoader *tensor_loader = debug_services->tensor_loader();
+  tensor_loader->EmptyPrevTensor();
+}
+#endif
 }  // namespace gpu
 }  // namespace session
 }  // namespace mindspore
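Taken together, the additions to RunGraph and the helpers above give each iteration a fixed debug bracket: PreIterationDbg (notify the debugger, reset the tensor loader, bump the iteration counter), Execute, PostLoadTensor (drop the previous iteration's tensors), then PostIterationDbg (dump if this iteration is selected, then PostExecute). A compilable mock of that ordering (simplified; the real logic lives on GPUSession):

```cpp
#include <iostream>

// Simplified mock of the per-iteration debug bracket added to GPUSession::RunGraph.
struct MockGpuSession {
  void PreIterationDbg() { std::cout << "PreExecute + EmptyTensor + iter_num++\n"; }
  void Execute() { std::cout << "launch kernels, LoadKernelData per kernel\n"; }
  void PostLoadTensor() { std::cout << "EmptyPrevTensor\n"; }
  void PostIterationDbg() { std::cout << "Dump (if enabled) + PostExecute\n"; }
  void RunGraph() {
    PreIterationDbg();
    Execute();
    PostLoadTensor();
    PostIterationDbg();
  }
};

int main() {
  MockGpuSession{}.RunGraph();
  return 0;
}
```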
@@ -67,6 +67,20 @@ class GPUSession : public SessionBasic {
                  const std::vector<tensor::TensorPtr> &inputs_const) const override;
   void Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+#ifdef ENABLE_DEBUGGER
+  void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+  bool DumpDataEnabledIteration() const;
+  void PreIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+  void PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+  void PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+  void PostLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+#endif
 };
 using GPUSessionPtr = std::shared_ptr<GPUSession>;
 MS_REG_SESSION(kGPUDevice, GPUSession);
@@ -24,7 +24,6 @@
 #include "backend/kernel_compiler/common_utils.h"
 #include "frontend/operator/ops.h"
 #include "common/trans.h"
-#include "utils/context/ms_context.h"
 #include "utils/config_manager.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #include "backend/kernel_compiler/oplib/oplib.h"
@@ -32,6 +32,7 @@
 #include "utils/contract.h"
 #include "pipeline/pynative/pynative_execute.h"
 #include "runtime/device/kernel_info.h"
+#include "utils/context/ms_context.h"
 #ifdef ENABLE_DEBUGGER
 #include "debug/debugger/debugger.h"
 #endif
@@ -112,7 +113,9 @@ class SessionBasic {
   // set debugger
   void SetDebugger() {
     debugger_ = Debugger::GetInstance();
-    debugger_->Init(device_id_);
+    auto ms_context = MsContext::GetInstance();
+    MS_EXCEPTION_IF_NULL(ms_context);
+    debugger_->Init(device_id_, ms_context->device_target());
   }
 #endif
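SetDebugger now forwards the device target from MsContext, so the one Debugger singleton can behave differently per backend (the GPU branch added to EnableDebugger below relies on it). A small sketch of the extended two-argument contract (mock type; the real signature is in debugger.h further down):

```cpp
#include <cstdint>
#include <iostream>
#include <string>

// Mock of the extended Debugger::Init contract: identity is now
// (device_id, device_target) instead of device_id alone.
class MockDebugger {
 public:
  void Init(uint32_t device_id, std::string device_target) {
    device_id_ = device_id;
    device_target_ = std::move(device_target);
  }
  const std::string &device_target() const { return device_target_; }

 private:
  uint32_t device_id_ = 0;
  std::string device_target_;
};

int main() {
  MockDebugger debugger;
  debugger.Init(0, "GPU");  // a session would pass ms_context->device_target()
  std::cout << "debugger target: " << debugger.device_target() << '\n';
  return 0;
}
```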
@@ -16,6 +16,7 @@ if (ENABLE_DEBUGGER)
     "${CMAKE_CURRENT_SOURCE_DIR}/debugger/grpc_client.cc"
     "${CMAKE_CURRENT_SOURCE_DIR}/debugger/proto_exporter.cc"
     "${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
   )
 endif (ENABLE_DEBUGGER)
@@ -21,6 +21,7 @@
 #include "debug/debugger/debugger.h"
 #include "pipeline/jit/pipeline.h"
 #include "backend/session/anf_runtime_algorithm.h"
+#include "runtime/device/kernel_runtime_manager.h"
 using debugger::EventReply;
 using debugger::GraphProto;
@@ -41,17 +42,20 @@ Debugger::Debugger()
     : grpc_client_(nullptr),
       debug_services_(nullptr),
       device_id_(0),
+      device_target_(""),
       num_step_(0),
       debugger_enabled_(false),
       is_dataset_graph_(false),
       partial_memory_(false) {}
-void Debugger::Init(const uint32_t device_id) {
+void Debugger::Init(const uint32_t device_id, const std::string device_target) {
   // access lock for public method
   std::lock_guard<std::mutex> a_lock(access_lock_);
   // save device_id
   MS_LOG(INFO) << "Debugger got device_id: " << device_id;
   device_id_ = device_id;
+  MS_LOG(INFO) << "Debugger got device_target: " << device_target;
+  device_target_ = device_target;
 }
 void Debugger::EnableDebugger() {
@@ -62,6 +66,14 @@ void Debugger::EnableDebugger() {
   grpc_client_ = nullptr;
   debug_services_ = nullptr;
+  // see if dump is enabled
+  bool dump_enabled = false;
+  if (device_target_ == kGPUDevice) {
+    auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
+    MS_EXCEPTION_IF_NULL(runtime_instance);
+    dump_enabled = runtime_instance->DumpDataEnabled();
+  }
   // get env variables to configure debugger
   const char *env_enable_str = std::getenv("ENABLE_MS_DEBUGGER");
   if (env_enable_str != nullptr) {
@@ -70,7 +82,8 @@ void Debugger::EnableDebugger() {
       debugger_enabled_ = true;
     }
   }
-  if (!debugger_enabled_) {
+  if (!debugger_enabled_ && !dump_enabled) {
     MS_LOG(WARNING) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
     return;
   }
@@ -118,7 +131,10 @@ void Debugger::EnableDebugger() {
   }
   // initialize grpc client
-  grpc_client_ = std::make_unique<GrpcClient>(host, port);
+  if (debugger_enabled_) {
+    grpc_client_ = std::make_unique<GrpcClient>(host, port);
+  }
   debug_services_ = std::make_unique<DebugServices>();
 }
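The net effect of the two changes above is a dump-only mode: when only GPU e2e dump is configured, EnableDebugger still constructs DebugServices (and with it the TensorLoader) but skips the gRPC channel. A reduced sketch of that gating (mock types, hard-coded flags for illustration):

```cpp
#include <iostream>
#include <memory>

// Mock stand-ins for GrpcClient and DebugServices.
struct MockGrpcClient {};
struct MockDebugServices {};

int main() {
  bool debugger_enabled = false;  // ENABLE_MS_DEBUGGER not set
  bool dump_enabled = true;       // GPU e2e dump configured
  if (!debugger_enabled && !dump_enabled) {
    return 0;  // neither feature wants the debug infrastructure
  }
  std::unique_ptr<MockGrpcClient> grpc_client;
  if (debugger_enabled) {
    grpc_client = std::make_unique<MockGrpcClient>();  // only for the interactive debugger
  }
  auto debug_services = std::make_unique<MockDebugServices>();  // always, for the tensor loader
  std::cout << "grpc: " << (grpc_client != nullptr) << ", services: " << (debug_services != nullptr) << '\n';
  return 0;
}
```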
@@ -127,6 +143,7 @@ void Debugger::Reset() {
   std::lock_guard<std::mutex> a_lock(access_lock_);
   // reset components
   device_id_ = 0;
+  device_target_ = "";
   num_step_ = 0;
   debugger_enabled_ = false;
   is_dataset_graph_ = false;
@@ -55,7 +55,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   // init
   // only save device_id
-  void Init(const uint32_t device_id);
+  void Init(const uint32_t device_id, const std::string device_target);
   // reset debugger
   void Reset();
@@ -128,6 +128,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   std::unique_ptr<DebugServices> debug_services_;
   KernelGraphPtr graph_ptr_;
   uint32_t device_id_;
+  std::string device_target_;
   int32_t num_step_;
   bool debugger_enabled_;
   bool is_dataset_graph_;
@@ -24,6 +24,10 @@
 #include <string>
 #include <utility>
 #include "debug/tensor_data.h"
+#include "ir/dtype.h"
+#ifdef ENABLE_DUMP_E2E
+#include "debug/e2e_dump.h"
+#endif
 namespace mindspore {
 class TensorLoader {
  public:
@@ -72,8 +76,54 @@ class TensorLoader {
   void EmptyPrevTensor() { prev_tensor_list_map.clear(); }
   void EmptyCurrentTensor() {
     tensor_list_map.clear();
+    tensor_list.clear();
   }
   void set_iter_num(uint32_t iter_num) { this->iter_num = iter_num; }
+#ifdef ENABLE_DUMP_E2E
+  bool DumpTensorToFile(std::string tensor_name, bool trans_flag, const std::string &filepath,
+                        const std::string &host_fmt, const std::vector<int> &host_shape, TypeId host_type,
+                        TypeId addr_type_id, std::string addr_format, size_t slot) const {
+    bool ret = false;
+    if (filepath.empty()) {
+      MS_LOG(ERROR) << "Dump file path is null!";
+      return ret;
+    }
+    std::string shape = "shape";
+    if (host_shape.size()) {
+      for (auto &value : host_shape) {
+        shape = shape + '_' + std::to_string(value);
+      }
+    } else {
+      shape = shape + "_0";
+    }
+    std::string file_extension = ".bin";
+    std::string path = "";
+    if (trans_flag) {
+      path = filepath + '_' + shape + '_' + TypeIdLabel(host_type) + '_' + host_fmt + file_extension;
+    } else {
+      path = filepath + '_' + shape + '_' + TypeIdToType(addr_type_id)->ToString() + '_' + addr_format + file_extension;
+    }
+    MS_LOG(INFO) << "Dump path is " << path;
+    std::string tensor_loader_name = tensor_name + ":" + std::to_string(slot);
+    auto iter = tensor_list_map.find(tensor_loader_name);
+    if (iter != tensor_list_map.end()) {
+      std::shared_ptr<TensorData> node = iter->second;
+      mindspore::tensor::TensorPtr out_tensor = node->GetTensor();
+      size_t host_size = out_tensor->data().nbytes();
+      ret = mindspore::Dump::DumpToFile(path, out_tensor->data_c(), host_size);
+    }
+    return ret;
+  }
+#endif
  private:
   std::vector<std::shared_ptr<TensorData>> tensor_list;
   std::map<std::string, std::shared_ptr<TensorData>> tensor_list_map;
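For reference, the file name built by DumpTensorToFile encodes shape, data type, and format. A standalone reproduction of the path construction (hypothetical input values; the type and format labels are hard-coded stand-ins for TypeIdLabel and host_fmt):

```cpp
#include <iostream>
#include <string>
#include <vector>

// Reproduces the dump path construction from DumpTensorToFile above.
int main() {
  std::string filepath = "/tmp/net/1/conv1_output_0";  // hypothetical dump_path + kernel name
  std::vector<int> host_shape = {32, 3, 224, 224};
  std::string shape = "shape";
  if (!host_shape.empty()) {
    for (auto &value : host_shape) {
      shape += '_' + std::to_string(value);
    }
  } else {
    shape += "_0";
  }
  std::string path = filepath + '_' + shape + "_Float32_NCHW.bin";
  std::cout << path << '\n';
  // prints: /tmp/net/1/conv1_output_0_shape_32_3_224_224_Float32_NCHW.bin
  return 0;
}
```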
@@ -275,7 +275,7 @@ void DumpParameters(mindspore::session::KernelGraph *graph, const string &dump_p
 }  // namespace
 #endif
-bool AscendKernelRuntime::DumpData(mindspore::session::KernelGraph *graph) {
+bool AscendKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(graph);
 #ifdef ENABLE_DUMP_E2E
   MS_LOG(INFO) << "Start dump step";
@@ -38,7 +38,7 @@ class AscendKernelRuntime : public KernelRuntime {
   AscendKernelRuntime() = default;
   ~AscendKernelRuntime() override;
   bool Init() override;
-  bool DumpData(session::KernelGraph *graph) override;
+  bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
   bool LoadData(session::KernelGraph *graph, Debugger *debugger) override;
   bool GenTask(const session::KernelGraph *graph) override;
   bool RunTask(const session::KernelGraph *graph) override;
@@ -270,7 +270,7 @@ void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutput
   resource_manager_.DecreaseSummaryRefCount(summary_outputs);
 }
-bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) {
+bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(kernel_graph);
   resource_manager_.IncreaseAddressRefCount(kernel_graph);
@@ -36,7 +36,7 @@ class CPUKernelRuntime : public KernelRuntime {
   ~CPUKernelRuntime() override = default;
   bool Init() override { return true; }
-  bool Run(session::KernelGraph *graph) override;
+  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
   void AssignKernelAddress(session::KernelGraph *kernel_graph);
   void BindInputOutput(const session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
                        VectorRef *outputs, std::vector<tensor::TensorPtr> *need_sync_outputs);
@@ -16,9 +16,16 @@
 #include "runtime/device/gpu/gpu_device_address.h"
 #include <vector>
 #include <memory>
 #include "runtime/device/gpu/gpu_device_manager.h"
 #include "utils/log_adapter.h"
 #include "runtime/device/gpu/gpu_memory_allocator.h"
+#include "ir/tensor.h"
+#ifdef ENABLE_DEBUGGER
+#include "debug/debug_services.h"
+#include "debug/tensor_load.h"
+#include "debug/debugger/debugger.h"
+#endif
 namespace mindspore {
 namespace device {
@@ -59,6 +66,36 @@ GPUDeviceAddress::~GPUDeviceAddress() {
     ptr_ = nullptr;
   }
 }
+#ifdef ENABLE_DEBUGGER
+bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
+                                     const std::vector<int> &host_shape, TypeId host_type, size_t slot,
+                                     Debugger *debugger, bool keep_prev) const {
+  bool ret = false;
+  if (size_ == 0) {
+    return true;
+  }
+  DebugServices *debug_services = debugger->debug_services();
+  TensorLoader *tensor_loader = debug_services->tensor_loader();
+  mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
+  size_t host_size = out_tensor->data().nbytes();
+  auto ret_rt_memcpy = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c());
+  if (!ret_rt_memcpy) {
+    MS_LOG(ERROR) << "Copy device mem to host failed";
+    return ret;
+  }
+  auto tensor_data = std::make_shared<mindspore::TensorData>();
+  tensor_data->SetName(tensor_name);
+  tensor_data->SetExecutionOrder(execution_order);
+  tensor_data->SetTensor(out_tensor);
+  tensor_data->SetSlot(slot);
+  ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
+  MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
+  return ret;
+}
+#endif
 }  // namespace gpu
 }  // namespace device
 }  // namespace mindspore
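LoadMemToHost is the device-to-host bridge: it allocates a host tensor, syncs device memory into it, and registers the result with the TensorLoader under a "name:slot" key. A reduced mock of that flow (no CUDA involved; memcpy stands in for SyncDeviceToHost and a plain map stands in for the loader):

```cpp
#include <cstring>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Mock of the LoadMemToHost flow: copy device bytes into a host buffer,
// then register the buffer in a loader keyed by "tensor_name:slot".
struct MockTensorData {
  std::string name;
  int execution_order = 0;
  std::vector<float> host_data;
};

int main() {
  std::map<std::string, std::shared_ptr<MockTensorData>> loader;
  float device_buf[4] = {1.f, 2.f, 3.f, 4.f};  // pretend this lives on the GPU

  auto data = std::make_shared<MockTensorData>();
  data->name = "conv1:0";  // kernel fullname_with_scope + output slot, as in the diff
  data->execution_order = 1;
  data->host_data.resize(4);
  std::memcpy(data->host_data.data(), device_buf, sizeof(device_buf));  // SyncDeviceToHost stand-in

  loader[data->name] = data;  // LoadNewTensor stand-in
  std::cout << "loaded " << loader.size() << " tensor(s), first value " << data->host_data[0] << '\n';
  return 0;
}
```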
@@ -22,6 +22,9 @@
 #include "runtime/device/device_address.h"
 namespace mindspore {
+#ifdef ENABLE_DEBUGGER
+class Debugger;
+#endif
 namespace device {
 namespace gpu {
 class GPUDeviceAddress : public DeviceAddress {
@@ -37,6 +40,11 @@ class GPUDeviceAddress : public DeviceAddress {
   DeviceAddressStatus status() const { return status_; }
   DeviceAddressType DeviceType() const override { return DeviceAddressType::kGPU; }
+#ifdef ENABLE_DEBUGGER
+  bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
+                     const std::vector<int> &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
+                     bool keep_prev) const;
+#endif
  private:
   DeviceAddressStatus status_{DeviceAddressStatus::kInDevice};
 };
@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 #include "runtime/device/gpu/gpu_kernel_runtime.h"
 #include <algorithm>
 #include "runtime/device/gpu/gpu_device_address.h"
 #include "runtime/device/gpu/cuda_driver.h"
 #include "runtime/device/gpu/gpu_buffer_mgr.h"
@@ -29,6 +29,8 @@
 #include "runtime/device/gpu/gpu_memory_manager.h"
 #include "backend/kernel_compiler/common_utils.h"
 #include "runtime/device/gpu/gpu_memory_copy_manager.h"
+#include "common/trans.h"
+#include "ir/dtype.h"
 namespace mindspore {
 namespace device {
@@ -36,6 +38,7 @@ namespace gpu {
 using mindspore::device::memswap::MemSwapInfoSet;
 using mindspore::device::memswap::MemSwapManager;
 using mindspore::device::memswap::SwapKind;
+static const size_t PARAMETER_OUTPUT_INDEX = 0;
 bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }
 bool GPUKernelRuntime::Init() {
@@ -43,7 +46,15 @@ bool GPUKernelRuntime::Init() {
     GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory();
     return true;
   }
-  auto ret = InitDevice();
+  bool ret = false;
+#ifdef ENABLE_DUMP_E2E
+  ret = SetDumpConf();
+  if (!ret) {
+    MS_LOG(INFO) << "No dump conf to set!";
+  }
+#endif
+  ret = InitDevice();
   if (!ret) {
     MS_LOG(ERROR) << "InitDevice error.";
     return ret;
@@ -63,6 +74,216 @@ bool GPUKernelRuntime::Init() {
   return ret;
 }
+#ifdef ENABLE_DUMP_E2E
+namespace {
+void DumpOutput(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
+                Debugger *debugger) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_EXCEPTION_IF_NULL(dump_conf);
+  bool trans_flag = dump_conf->trans_flag();
+  const auto &apply_kernels = graph->execution_order();
+  for (const auto &node : apply_kernels) {
+    MS_EXCEPTION_IF_NULL(node);
+    auto node_name = AnfAlgo::GetCNodeName(node);
+    std::string kernel_name = node->fullname_with_scope();
+    if (!dump_conf->IsKernelNeedDump(kernel_name)) {
+      continue;
+    }
+    const std::string strsrc = "/";
+    const std::string strdst = "--";
+    std::string::size_type pos = 0;
+    std::string::size_type srclen = strsrc.size();
+    std::string::size_type dstlen = strdst.size();
+    while ((pos = kernel_name.find(strsrc, pos)) != std::string::npos) {
+      kernel_name.replace(pos, srclen, strdst);
+      pos += dstlen;
+    }
+    auto output_size = AnfAlgo::GetOutputTensorNum(node);
+    for (size_t j = 0; j < output_size; ++j) {
+      auto addr = AnfAlgo::GetOutputAddr(node, j);
+      TypeId addr_type_id = addr->type_id();
+      std::string addr_format = addr->format();
+      std::vector<int> int_shapes;
+      if (trans_flag) {
+        int_shapes = trans::GetRuntimePaddingShape(node, j);
+      } else {
+        auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
+        (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                             [](size_t inner_item) { return SizeToInt(inner_item); });
+      }
+      auto type = AnfAlgo::GetOutputInferDataType(node, j);
+      auto format = kOpFormat_DEFAULT;
+      string filepath = dump_path + '/' + kernel_name + '_' + "output_" + std::to_string(j);
+      DebugServices *debug_services = debugger->debug_services();
+      TensorLoader *tensor_loader = debug_services->tensor_loader();
+      std::string original_kernel_name = node->fullname_with_scope();
+      size_t slot = j;
+      auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
+                                                 addr_type_id, addr_format, slot);
+      if (!ret) {
+        std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
+                            ", host_format:" + format + ".!";
+      }
+    }
+  }
+}
+void DumpParameters(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
+                    Debugger *debugger) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_EXCEPTION_IF_NULL(dump_conf);
+  bool trans_flag = dump_conf->trans_flag();
+  const auto &parameters = graph->inputs();
+  for (auto &item : parameters) {
+    if (!item->isa<Parameter>()) {
+      continue;
+    }
+    std::string parameter_name = item->fullname_with_scope();
+    if (!dump_conf->IsKernelNeedDump(parameter_name)) {
+      continue;
+    }
+    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
+    TypeId addr_type_id = addr->type_id();
+    std::string addr_format = addr->format();
+    std::vector<int> int_shapes;
+    if (trans_flag) {
+      int_shapes = trans::GetRuntimePaddingShape(item, PARAMETER_OUTPUT_INDEX);
+    } else {
+      auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
+      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                           [](size_t inner_item) { return SizeToInt(inner_item); });
+    }
+    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
+    auto format = kOpFormat_DEFAULT;
+    string filepath = dump_path + '/' + parameter_name + '_' + "output_0";
+    DebugServices *debug_services = debugger->debug_services();
+    TensorLoader *tensor_loader = debug_services->tensor_loader();
+    std::string original_kernel_name = parameter_name;
+    size_t slot = 0;
+    auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
+                                               addr_type_id, addr_format, slot);
+    if (!ret) {
+      std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
+                          ", host_format:" + format + ".!";
+    }
+  }
+}
+}  // namespace
+bool GPUKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_LOG(INFO) << "Start dump step";
+  DumpConfPtr dump_conf = GetDumpConf();
+  MS_EXCEPTION_IF_NULL(dump_conf);
+  dump_conf->UpdataCurIter();
+  bool dump_flag = dump_conf->dump_enable();
+  if (!dump_flag) {
+    MS_LOG(INFO) << "Dump flag is disable, pass dump step";
+    return true;
+  }
+  uint32_t cur_iter = dump_conf->cur_iter();
+  if (dump_conf->dump_iter() != 0) {
+    if (cur_iter != dump_conf->dump_iter()) {
+      return true;
+    }
+  }
+  MS_LOG(INFO) << "Cur iter is " << cur_iter;
+  std::string net_name = dump_conf->dump_net_name();
+  std::string iterator = std::to_string(cur_iter);
+  std::string dump_path = dump_conf->dump_path();
+  if (dump_path.back() == '/') {
+    dump_path = dump_path + net_name + '/' + iterator;
+  } else {
+    dump_path = dump_path + '/' + net_name + '/' + iterator;
+  }
+  // dump output
+  DumpOutput(graph, dump_path, dump_conf, debugger);
+  // dump parameters
+  DumpParameters(graph, dump_path, dump_conf, debugger);
+  return true;
+}
+#endif
+#ifdef ENABLE_DEBUGGER
+namespace {
+void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
+                    const std::vector<mindspore::kernel::AddressPtr> &kernel_inputs,
+                    const std::vector<mindspore::kernel::AddressPtr> &kernel_workspaces,
+                    const std::vector<mindspore::kernel::AddressPtr> &kernel_outputs, int exec_order, void *stream_ptr,
+                    bool dump_enabled) {
+  if (!(debugger && (debugger->debugger_enabled() || dump_enabled))) {
+    return;
+  }
+  std::string kernel_name = kernel->fullname_with_scope();
+  auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
+  for (size_t j = 0; j < output_size; ++j) {
+    auto addr = kernel_outputs[j];
+    auto type = AnfAlgo::GetOutputInferDataType(kernel, j);
+    auto format = kOpFormat_DEFAULT;
+    auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
+    string tensor_name = kernel_name + ':' + std::to_string(j);
+    std::vector<int> int_shapes;
+    auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j);
+    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                         [](size_t inner_item) { return SizeToInt(inner_item); });
+    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
+    if (!ret) {
+      MS_LOG(ERROR) << "LoadMemToHost:"
+                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
+    }
+  }
+}
+void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
+  MS_EXCEPTION_IF_NULL(graph);
+  if (!(debugger && (debugger->debugger_enabled() || dump_enabled))) {
+    return;
+  }
+  const auto &parameters = graph->inputs();
+  // for parameters, set its execution order to be 0;
+  int exec_order = 0;
+  for (auto &item : parameters) {
+    if (!item->isa<Parameter>()) {
+      continue;
+    }
+    std::string parameter_name = item->fullname_with_scope();
+    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
+    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
+    auto format = kOpFormat_DEFAULT;
+    string tensor_name = parameter_name + ':' + "0";
+    auto gpu_addr = dynamic_cast<const mindspore::device::gpu::GPUDeviceAddress *>(addr);
+    std::vector<int> int_shapes;
+    auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
+    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                         [](size_t inner_item) { return SizeToInt(inner_item); });
+    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
+    if (!ret) {
+      MS_LOG(ERROR) << "LoadMemToHost:"
+                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
+    }
+  }
+}
+void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
+  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
+    DebugServices *debug_services = debugger->debug_services();
+    TensorLoader *tensor_loader = debug_services->tensor_loader();
+    tensor_loader->EmptyCurrentTensor();
+  }
+}
+}  // namespace
+#endif
 DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                                        TypeId type_id) {
   return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
@@ -147,7 +368,7 @@ void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
   }
 }
-bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
+bool GPUKernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) {
   struct timeval start_time, end_time;
   (void)gettimeofday(&start_time, nullptr);
   bool ret = true;
@@ -170,7 +391,7 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
     mem_reuse_util_ = mem_reuse_iter->second;
     MS_EXCEPTION_IF_NULL(mem_reuse_util_);
-    ret = RunOneStep(graph);
+    ret = RunOneStep(graph, debugger);
   } else {
     ret = LaunchKernel(graph);
   }
@@ -182,28 +403,28 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
   return ret;
 }
-bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph) {
+bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph, Debugger *debugger) {
   bool ret = true;
   auto graph_id = graph->graph_id();
   if (!is_first_step_map_[graph_id]) {
     // Normally run graph
-    ret = LaunchKernelDynamic(graph);
+    ret = LaunchKernelDynamic(graph, debugger);
   } else {
     // Mock run first step
-    ret = LaunchKernelDynamic(graph, true, false);
+    ret = LaunchKernelDynamic(graph, debugger, true, false);
     if (ret) {
       // Normally run graph
-      ret = LaunchKernelDynamic(graph);
+      ret = LaunchKernelDynamic(graph, debugger);
     } else {
       // Trigger memory swap
-      ret = SearchMemSwapScheme(graph);
+      ret = SearchMemSwapScheme(graph, debugger);
     }
     is_first_step_map_[graph_id] = false;
   }
   return ret;
 }
-bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
+bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
   MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
   bool ret = false;
   ClearKernelOldOutputAndWorkspace(graph);
@@ -217,7 +438,7 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
     if (!mem_swap_manager_->RetreatSwapInfo()) {
       return false;
     }
-    ret = LaunchKernelDynamic(graph, true, false);
+    ret = LaunchKernelDynamic(graph, debugger, true, false);
     if (!ret) {
       ClearKernelOldOutputAndWorkspace(graph);
     }
@@ -225,14 +446,14 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
   mem_swap_manager_->AssignHostMemory();
   // Time profiling
-  ret = LaunchKernelDynamic(graph, false, true);
+  ret = LaunchKernelDynamic(graph, debugger, false, true);
   if (!ret) {
     return ret;
   }
-  return RefineMemSwapScheme(graph);
+  return RefineMemSwapScheme(graph, debugger);
 }
-bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph) {
+bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
   MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment.";
   auto &kernels = graph->execution_order();
   for (const auto &kernel : kernels) {
@@ -245,7 +466,7 @@ bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph) {
       bool ret = false;
       while (!ret) {
         mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx);
-        ret = LaunchKernelDynamic(graph, true, false);
+        ret = LaunchKernelDynamic(graph, debugger, true, false);
         if (!ret) {
           ClearKernelOldOutputAndWorkspace(graph);
           ClearSwapInfo(true);
@@ -384,14 +605,24 @@ void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *g
   }
 }
-bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bool mock, bool profiling) {
+bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger, bool mock,
+                                           bool profiling) {
   MS_EXCEPTION_IF_NULL(graph);
   MS_EXCEPTION_IF_NULL(mem_reuse_util_);
   // Reset the reference count.
   mem_reuse_util_->ResetDynamicUsedRefCount();
   // The inputs and outputs memory of communication kernel need be continuous, so separate processing.
   AllocCommunicationOpDynamicRes(graph);
+#ifdef ENABLE_DEBUGGER
+  bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
+  if (!mock) {
+    // collect weights and bias
+    LoadParameters(graph, debugger, dump_enabled);
+  }
+#endif
   auto &kernels = graph->execution_order();
+  int exec_order = 1;
   for (const auto &kernel : kernels) {
     auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
     MS_EXCEPTION_IF_NULL(kernel_mod);
@@ -400,6 +631,12 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
     AddressPtrList kernel_outputs;
     auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs, mock);
     if (!ret) {
+#ifdef ENABLE_DEBUGGER
+      if (!mock) {
+        // invalidate current data collected by the debugger
+        ClearCurrentData(debugger, dump_enabled);
+      }
+#endif
       return false;
     }
     if (!mock) {
@@ -409,9 +646,21 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
       } else {
         LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs);
       }
+#ifdef ENABLE_DEBUGGER
+      // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost)
+      LoadKernelData(debugger, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_,
+                     dump_enabled);
+#endif
     }
+    exec_order = exec_order + 1;
     FreeKernelDynamicRes(kernel);
     if (!UpdateMemorySwapTask(kernel, mock, profiling)) {
+#ifdef ENABLE_DEBUGGER
+      if (!mock) {
+        // invalidate current data collected by the debugger
+        ClearCurrentData(debugger, dump_enabled);
+      }
+#endif
       return false;
     }
   }
@@ -38,7 +38,10 @@ class GPUKernelRuntime : public KernelRuntime {
   bool Init() override;
   void ReleaseDeviceRes() override;
   void AssignMemory(session::KernelGraph *graph) override;
-  bool Run(session::KernelGraph *graph) override;
+  bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
+#ifdef ENABLE_DUMP_E2E
+  bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
+#endif
  protected:
   DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
@@ -61,10 +64,11 @@ class GPUKernelRuntime : public KernelRuntime {
   void ClearKernelOutputAddress(const session::KernelGraph *graph);
   void ClearKernelWorkspaceAddress(const session::KernelGraph *graph);
   void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph);
-  bool RunOneStep(const session::KernelGraph *graph);
-  bool SearchMemSwapScheme(const session::KernelGraph *graph);
-  bool RefineMemSwapScheme(const session::KernelGraph *graph);
-  bool LaunchKernelDynamic(const session::KernelGraph *graph, bool mock = false, bool profiling = false);
+  bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr);
+  bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
+  bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
+  bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false,
+                           bool profiling = false);
   void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                      const AddressPtrList &workspace, const AddressPtrList &outputs);
   bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock);
@@ -41,7 +41,7 @@ KernelRuntime::~KernelRuntime() {
 #endif
 }
-bool KernelRuntime::Run(session::KernelGraph *graph) {
+bool KernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) {
   bool ret = false;
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
@@ -72,7 +72,7 @@ bool KernelRuntime::Run(session::KernelGraph *graph) {
 }
 // for D to impl
-bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph) {
+bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   if (graph != nullptr) {
     return true;
   }
@@ -190,6 +190,39 @@ void KernelRuntime::RunOpClearMemory(const session::KernelGraph *graph) {
   }
 }
+bool KernelRuntime::DumpDataEnabled() {
+  bool ret = false;
+#ifdef ENABLE_DUMP_E2E
+  DumpConfPtr dump_conf = GetDumpConf();
+  MS_EXCEPTION_IF_NULL(dump_conf);
+  bool dump_flag = dump_conf->dump_enable();
+  if (!dump_flag) {
+    return ret;
+  }
+  ret = true;
+#endif
+  return ret;
+}
+bool KernelRuntime::DumpDataEnabledIteration() {
+  bool ret = false;
+#ifdef ENABLE_DUMP_E2E
+  if (!DumpDataEnabled()) {
+    return ret;
+  }
+  DumpConfPtr dump_conf = GetDumpConf();
+  MS_EXCEPTION_IF_NULL(dump_conf);
+  uint32_t cur_iter = dump_conf->cur_iter() + 1;
+  if (dump_conf->dump_iter() != 0) {
+    if (cur_iter != dump_conf->dump_iter()) {
+      return ret;
+    }
+  }
+  ret = true;
+#endif
+  return ret;
+}
 void KernelRuntime::AssignStaticMemory(session::KernelGraph *graph) {
   AssignStaticMemoryInput(graph);
   AssignStaticMemoryValueNode(graph);
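DumpDataEnabledIteration peeks one step ahead (cur_iter() + 1) because DumpData itself advances the counter via UpdataCurIter before checking, and dump_iter == 0 means every iteration dumps. A standalone restatement of the gate:

```cpp
#include <cstdint>
#include <iostream>

// Restates the gate in KernelRuntime::DumpDataEnabledIteration:
// dump is configured, and either dump_iter == 0 (every iteration)
// or the upcoming iteration matches dump_iter.
bool DumpThisIteration(bool dump_enabled, uint32_t cur_iter, uint32_t dump_iter) {
  if (!dump_enabled) {
    return false;
  }
  uint32_t next_iter = cur_iter + 1;  // counter advances inside DumpData
  return dump_iter == 0 || next_iter == dump_iter;
}

int main() {
  std::cout << DumpThisIteration(true, 4, 5) << '\n';  // 1: iteration 5 will dump
  std::cout << DumpThisIteration(true, 5, 5) << '\n';  // 0: already past it
  std::cout << DumpThisIteration(true, 7, 0) << '\n';  // 1: dump every iteration
  return 0;
}
```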
@@ -55,8 +55,10 @@ class KernelRuntime {
   virtual void AssignMemory(session::KernelGraph *graph);
   void RunOpAssignMemory(const std::vector<tensor::TensorPtr> &input_tensors, session::KernelGraph *graph);
   void RunOpClearMemory(const session::KernelGraph *graph);
-  virtual bool Run(session::KernelGraph *graph);
-  virtual bool DumpData(session::KernelGraph *graph);
+  bool DumpDataEnabled();
+  bool DumpDataEnabledIteration();
+  virtual bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr);
+  virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr);
   virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger);
   virtual bool RunTask(const session::KernelGraph *graph);
   virtual bool GenTask(const session::KernelGraph *graph);