
[bug_fix] Fix core dump in GPU distributed training when device memory is not enough

tags/v1.1.0
lizhenyu committed 5 years ago
commit 6f6a0dfd7a
3 changed files with 25 additions and 17 deletions
  1. mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc (+0 -1)
  2. mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc (+24 -16)
  3. mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h (+1 -0)

mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc (+0 -1)

@@ -346,7 +346,6 @@ bool MemSwapManager::RetreatSwapInfo() {
     ResetSwapInfo();
     RetreatSwapThreshold();
     if (tensor_size_threshold_idx_ == ordered_tensors_.size() - 1 && distance_threshold_ < kDistanceLowerBound) {
-      MS_LOG(ERROR) << "Retreat swap info failed";
       return false;
     }
   } else {

mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc (+24 -16)

@@ -310,6 +310,13 @@ void GPUKernelRuntime::AllocInplaceNodeMemory(const session::KernelGraph *graph)
   }
 }
 
+bool GPUKernelRuntime::IsDistributedTraining(const session::KernelGraph *graph) {
+  MS_EXCEPTION_IF_NULL(graph);
+  const auto &kernels = graph->execution_order();
+  return std::any_of(kernels.begin(), kernels.end(),
+                     [](const AnfNodePtr &kernel) { return AnfAlgo::IsCommunicationOp(kernel); });
+}
+
 void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
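The new IsDistributedTraining helper only scans the graph's execution order for a communication kernel. The standalone sketch below shows the same std::any_of pattern on simplified stand-in types; Kernel and its is_communication_op flag are illustrative placeholders, not MindSpore types:

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Simplified stand-in for a kernel node in the graph's execution order.
struct Kernel {
  std::string name;
  bool is_communication_op;  // true for ops such as AllReduce / AllGather
};

// Mirrors the new check: the graph counts as distributed training if any
// kernel in the execution order is a communication op.
bool IsDistributedTraining(const std::vector<Kernel> &execution_order) {
  return std::any_of(execution_order.begin(), execution_order.end(),
                     [](const Kernel &k) { return k.is_communication_op; });
}

int main() {
  std::vector<Kernel> kernels = {{"Conv2D", false}, {"AllReduce", true}, {"BiasAdd", false}};
  std::cout << std::boolalpha << IsDistributedTraining(kernels) << std::endl;  // prints: true
  return 0;
}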
@@ -367,28 +374,28 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink) {
 }
 
 bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph) {
-  bool ret = true;
   auto graph_id = graph->graph_id();
   if (!is_first_step_map_[graph_id] || graph->is_dynamic_shape()) {
     // Normally run graph
-    ret = LaunchKernelDynamic(graph);
-  } else {
-    // Mock run first step
-    ret = LaunchKernelDynamic(graph, true, false);
-    if (ret) {
-      // Normally run graph
-      ret = LaunchKernelDynamic(graph);
-    } else {
-      // Trigger memory swap
-      ret = SearchMemSwapScheme(graph);
-    }
-    is_first_step_map_[graph_id] = false;
+    return LaunchKernelDynamic(graph);
   }
-  return ret;
+  is_first_step_map_[graph_id] = false;
+  // Mock run first step
+  bool ret = LaunchKernelDynamic(graph, true, false);
+  if (ret) {
+    // Normally run graph
+    return LaunchKernelDynamic(graph);
+  }
+  if (IsDistributedTraining(graph)) {
+    MS_LOG(ERROR) << "Device memory is not enough, run graph failed!";
+    return false;
+  }
+  // Trigger memory swap
+  return SearchMemSwapScheme(graph);
 }
 
 bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
-  MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
+  MS_LOG(INFO) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
   bool ret = false;
   ClearKernelOldOutputAndWorkspace(graph);
   if (!mem_swap_manager_->mem_swap_init()) {
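The reworked RunOneStep above boils down to: run the first step as a mock launch to probe device memory; if the probe succeeds, launch normally; if it fails on a distributed-training graph (one containing communication ops), report an error and stop instead of entering the memory-swap search that previously ended in a core dump. A minimal control-flow sketch, with hypothetical stubs standing in for the runtime calls (not the real GPUKernelRuntime API):

#include <iostream>

// Hypothetical stubs standing in for the GPU runtime; only the first-step
// control flow of the reworked RunOneStep is modelled here.
bool LaunchKernelDynamic(bool mock) { return !mock; }  // demo: mock probe fails, normal launch succeeds
bool IsDistributedTraining() { return true; }          // demo: graph contains communication ops
bool SearchMemSwapScheme() { return true; }

bool RunFirstStep() {
  // Mock run first step: probe whether device memory is sufficient.
  if (LaunchKernelDynamic(/*mock=*/true)) {
    // Enough memory: run the graph normally.
    return LaunchKernelDynamic(/*mock=*/false);
  }
  // Not enough memory on a distributed-training graph: fail fast with an error
  // instead of attempting memory swap.
  if (IsDistributedTraining()) {
    std::cerr << "Device memory is not enough, run graph failed!" << std::endl;
    return false;
  }
  // Single-device training can still fall back to the memory-swap search.
  return SearchMemSwapScheme();
}

int main() { return RunFirstStep() ? 0 : 1; }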
@@ -399,6 +406,7 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
 
   while (!ret) {
     if (!mem_swap_manager_->RetreatSwapInfo()) {
+      MS_LOG(ERROR) << "Device memory is not enough, run graph failed!";
       return false;
     }
     ret = LaunchKernelDynamic(graph, true, false);
@@ -417,7 +425,7 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
 }
 
 bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph) {
-  MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment.";
+  MS_LOG(INFO) << "Refine memory swap scheme, it may take some time, please wait a moment.";
   auto &kernels = graph->execution_order();
   for (const auto &kernel : kernels) {
     if (!mem_swap_manager_->QueryKernelTriggerSwapIn(kernel)) {


mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h (+1 -0)

@@ -98,6 +98,7 @@ class GPUKernelRuntime : public KernelRuntime {
   void UpdateHostSwapOutQueue(bool mock);
   void ClearSwapInfo(bool mock);
   void AllocInplaceNodeMemory(const session::KernelGraph *graph);
+  bool IsDistributedTraining(const session::KernelGraph *graph);
 
   DeviceAddressPtr GetPrevNodeMutableOutputAddr(const AnfNodePtr &node, size_t i, bool visit_nop_node);
   DeviceAddressPtr GetMutableOutputAddr(const AnfNodePtr &node, size_t i, bool visit_nop_node);

