Browse Source

[bugfix]core dump when allocating memory fail

tags/v1.5.0-rc1
lizhenyu 4 years ago
parent
commit
f3d8172cd0
2 changed files with 34 additions and 6 deletions
  1. +23
    -6
      mindspore/ccsrc/runtime/framework/actor/memory_manager_actor.cc
  2. +11
    -0
      mindspore/ccsrc/runtime/framework/actor/memory_manager_actor.h

+ 23
- 6
mindspore/ccsrc/runtime/framework/actor/memory_manager_actor.cc View File

@@ -36,8 +36,8 @@ void MemoryManagerActor::AllocateMemory(const std::vector<DeviceTensor *> *alloc
}
// Allocate memory through the device context.
if (!device_context->AllocateMemory(device_tensor, device_tensor->GetSize())) {
SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(GraphExecutionStrategy::kPipeline, (*op_context), device_context,
from_aid.Name(), device_tensor->GetSize());
SetOpContextMemoryAllocFail(from_aid.Name(), device_context, device_tensor->GetSize(), op_context);
return;
}
}

@@ -69,8 +69,8 @@ void MemoryManagerActor::AllocateContinuousMemory(const std::vector<std::vector<
auto &device_context = (*device_contexts)[i];
// Allocate memory through the device context.
if (!device_context->AllocateContinuousMemory(alloc_list, total_size, size_list)) {
SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(GraphExecutionStrategy::kPipeline, (*op_context), device_context,
from_aid.Name(), total_size);
SetOpContextMemoryAllocFail(from_aid.Name(), device_context, total_size, op_context);
return;
}
}

@@ -100,8 +100,8 @@ void MemoryManagerActor::AllocateBatchMemory(const std::vector<DeviceTensor *> *

// Allocate memory through the device context.
if (!device_context->AllocateMemory(device_tensor, device_tensor->GetSize())) {
SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(GraphExecutionStrategy::kPipeline, (*op_context), device_context,
from_aid.Name(), device_tensor->GetSize());
SetOpContextMemoryAllocFail(from_aid.Name(), device_context, device_tensor->GetSize(), op_context);
return;
}
}

@@ -165,5 +165,22 @@ void MemoryManagerActor::Wait(OpContext<DeviceTensor> *const op_context, const A
// Call back to the from actor to process.
Async(from_aid, &MemoryAwareActor::OnMemoryAllocFinish, op_context);
}

// Reports a device memory allocation failure on the op context, at most once per step.
//
// kernel_name:    name forwarded to the failure macro (callers in this file pass the requesting actor's name).
// device_context: device context on which the allocation failed; must not be null.
// alloc_size:     requested size, forwarded to the failure macro.
// op_context:     op context of the current step; it and its sequential_num_ must not be null.
void MemoryManagerActor::SetOpContextMemoryAllocFail(const std::string &kernel_name,
                                                     const DeviceContext *device_context, size_t alloc_size,
                                                     OpContext<DeviceTensor> *const op_context) {
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(op_context);
  MS_EXCEPTION_IF_NULL(op_context->sequential_num_);

  // The step is identified by the byte form of the op context's sequential number (a uuid).
  const auto failed_step_id = uuids::uuid::ToBytes(*(op_context->sequential_num_));

  // A failure was already recorded for this step: do not touch the op context again.
  if (mem_alloc_failed_step_ids_.count(failed_step_id) > 0) {
    return;
  }

  // First failure seen for this step: drop any previously recorded step id and remember only this one,
  // then set the failure on the op context.
  mem_alloc_failed_step_ids_.clear();
  (void)mem_alloc_failed_step_ids_.insert(failed_step_id);
  SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(GraphExecutionStrategy::kPipeline, (*op_context), device_context,
                                              kernel_name, alloc_size);
}
} // namespace runtime
} // namespace mindspore

+ 11
- 0
mindspore/ccsrc/runtime/framework/actor/memory_manager_actor.h View File

@@ -20,6 +20,7 @@
#include <vector>
#include <memory>
#include <string>
#include <set>
#include <unordered_map>
#include "runtime/framework/actor/actor_common.h"
#include "runtime/framework/device_tensor_store.h"
@@ -60,6 +61,16 @@ class MemoryManagerActor : public ActorBase {

// Wait the MemoryManagerActor to finish running all current messages.
void Wait(OpContext<DeviceTensor> *const op_context, const AID from_aid);

private:
// When allocating device memory fails, print an error log and set the failed status on the op context.
void SetOpContextMemoryAllocFail(const std::string &kernel_name, const DeviceContext *device_context,
size_t alloc_size, OpContext<DeviceTensor> *const op_context);

// The MemoryManagerActor object is used like a single instance. If one actor fails to allocate memory in a batch,
// the failure message is set into the OpContext and the major thread will destroy the OpContext object, so
// subsequent actors cannot set the failure message again. We therefore record the memory allocation failure event
// by the uuid of the batch, which is the key of this set.
std::set<std::string> mem_alloc_failed_step_ids_;
};
} // namespace runtime
} // namespace mindspore


Loading…
Cancel
Save