[bug_fix] GPU distributed training core dumps when memory is not enough

5 years ago · 6f6a0dfd7a
--- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc
+++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc
@@ -346,7 +346,6 @@ bool MemSwapManager::RetreatSwapInfo() {
    ResetSwapInfo();
    RetreatSwapThreshold();
    if (tensor_size_threshold_idx_ == ordered_tensors_.size() - 1 && distance_threshold_ < kDistanceLowerBound) {
      MS_LOG(ERROR) << "Retreat swap info failed";
      return false;
    }
  } else {
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
@@ -310,6 +310,13 @@ void GPUKernelRuntime::AllocInplaceNodeMemory(const session::KernelGraph *graph)
  }
 }

 // Reports whether the graph's execution order contains at least one
 // communication op. `graph` is checked with MS_EXCEPTION_IF_NULL first.
 bool GPUKernelRuntime::IsDistributedTraining(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  for (const auto &node : graph->execution_order()) {
    // Stop at the first communication op; one is enough to answer "yes".
    if (AnfAlgo::IsCommunicationOp(node)) {
      return true;
    }
  }
  return false;
 }

 void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
@@ -367,28 +374,28 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink) {
 }

 // Runs one step of `graph`.
 //
 // When this is not the first step recorded for the graph id, or the graph is
 // dynamic-shape, the graph is launched normally and that result is returned.
 // Otherwise the first-step flag is cleared and a mock run is attempted:
 //   - mock run succeeds                -> launch the graph normally;
 //   - mock run fails, graph contains a
 //     communication op                 -> log an error and return false
 //                                         (memory swap is not attempted);
 //   - mock run fails otherwise         -> fall back to SearchMemSwapScheme().
 //
 // Returns true on success, false when the graph could not be run.
 bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  if (!is_first_step_map_[graph_id] || graph->is_dynamic_shape()) {
    // Normally run graph
    return LaunchKernelDynamic(graph);
  }
  // Clear the flag up front so every exit path below leaves it reset.
  is_first_step_map_[graph_id] = false;
  // Mock run first step
  const bool mock_run_ok = LaunchKernelDynamic(graph, true, false);
  if (mock_run_ok) {
    // Normally run graph
    return LaunchKernelDynamic(graph);
  }
  // Graphs with communication ops fail fast instead of searching for a swap scheme.
  if (IsDistributedTraining(graph)) {
    MS_LOG(ERROR) << "Device memory is not enough, run graph failed!";
    return false;
  }
  // Trigger memory swap
  return SearchMemSwapScheme(graph);
 }

 bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
  MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
  MS_LOG(INFO) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
  bool ret = false;
  ClearKernelOldOutputAndWorkspace(graph);
  if (!mem_swap_manager_->mem_swap_init()) {
@@ -399,6 +406,7 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {

  while (!ret) {
    if (!mem_swap_manager_->RetreatSwapInfo()) {
      MS_LOG(ERROR) << "Device memory is not enough, run graph failed!";
      return false;
    }
    ret = LaunchKernelDynamic(graph, true, false);
@@ -417,7 +425,7 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
 }

 bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph) {
  MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment.";
  MS_LOG(INFO) << "Refine memory swap scheme, it may take some time, please wait a moment.";
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (!mem_swap_manager_->QueryKernelTriggerSwapIn(kernel)) {
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
@@ -98,6 +98,7 @@ class GPUKernelRuntime : public KernelRuntime {
  void UpdateHostSwapOutQueue(bool mock);
  void ClearSwapInfo(bool mock);
  void AllocInplaceNodeMemory(const session::KernelGraph *graph);
  // Returns true when any kernel in the graph's execution order is a communication op.
  bool IsDistributedTraining(const session::KernelGraph *graph);

  DeviceAddressPtr GetPrevNodeMutableOutputAddr(const AnfNodePtr &node, size_t i, bool visit_nop_node);
  DeviceAddressPtr GetMutableOutputAddr(const AnfNodePtr &node, size_t i, bool visit_nop_node);