
fix device memory leak

tags/v1.2.0-rc1
chujinjin · 4 years ago · commit ade9a82c2b
12 changed files with 75 additions and 12 deletions
 1. +0  -2   mindspore/ccsrc/backend/session/ascend_session.cc
 2. +0  -4   mindspore/ccsrc/backend/session/gpu_session.cc
 3. +6  -1   mindspore/ccsrc/backend/session/session_basic.cc
 4. +52 -1   mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
 5. +2  -0   mindspore/ccsrc/pipeline/pynative/pynative_execute.h
 6. +3  -1   mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
 7. +1  -0   mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h
 8. +1  -0   mindspore/ccsrc/runtime/device/cpu/cpu_device_address.h
 9. +3  -1   mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
10. +1  -0   mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h
11. +5  -2   mindspore/ccsrc/runtime/device/kernel_runtime.cc
12. +1  -0   mindspore/core/ir/device_sync.h
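
Taken together, the hunks below attack the leak from two directions. First, device memory cached on value-node tensors is released explicitly instead of waiting for destructors: DeviceSync gains a pure virtual ClearDeviceMemory(), the Ascend and GPU device addresses implement it (their destructors now just delegate to it), and PynativeExecutor records every value-node tensor of a dynamic cell's grad graph so that the memory can be freed as soon as the next top cell starts. Second, the PyNative op path stops mutating single-op graphs with opt::HideNopNode/opt::RemoveNopNode; address lookups instead take a visit_nop_node flag derived from the execution mode. Short illustrative sketches follow the relevant hunks.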

+0 -2   mindspore/ccsrc/backend/session/ascend_session.cc

@@ -1018,7 +1018,6 @@ void AscendSession::AdjustKernel(const std::shared_ptr<KernelGraph> &kernel_grap
 void AscendSession::RunOpAdjustKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   MS_LOG(INFO) << "Start!";
-  opt::HideNopNode(kernel_graph.get());
   // Insert CLearZero op
   // prepare for next step from json get atomic info
   BuildKernel(kernel_graph);
@@ -1079,7 +1078,6 @@ void AscendSession::RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input
                                      KernelGraph *kernel_graph) const {
   MS_LOG(INFO) << "Start memory alloc!";
   MS_EXCEPTION_IF_NULL(kernel_graph);
-  opt::RemoveNopNode(kernel_graph);
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
   runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph);


+0 -4   mindspore/ccsrc/backend/session/gpu_session.cc

@@ -418,8 +418,6 @@ void GPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &grap
   SelectKernel(kernel_graph);
   RunOpHardwareOptimize(kernel_graph);
   StartKernelRT();
-  // Hide NopOp from execution graph
-  opt::HideNopNode(kernel_graph.get());
   BuildKernel(kernel_graph);
   run_op_graphs_[graph_info] = kernel_graph;
 }
@@ -434,8 +432,6 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
   // run op
   auto kernel_graph = run_op_graphs_[graph_info];
   MS_EXCEPTION_IF_NULL(kernel_graph);
-  // Remove NopOp from execution graph
-  opt::RemoveNopNode(kernel_graph.get());
   RunOpAllocateMemory(*input_tensors, kernel_graph.get());
   // Execute the computation
   LoadInputData(kernel_graph, *input_tensors);


+6 -1   mindspore/ccsrc/backend/session/session_basic.cc

@@ -1173,7 +1173,12 @@ void SessionBasic::UpdateOutputs(const std::shared_ptr<KernelGraph> &kernel_grap
     auto &tensor = item.first;
     auto &node = item.second.first;
     auto &output_index = item.second.second;
-    auto address = AnfAlgo::GetMutableOutputAddr(node, output_index);
+    DeviceAddressPtr address = nullptr;
+    if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
+      address = AnfAlgo::GetMutableOutputAddr(node, output_index, false);
+    } else {
+      address = AnfAlgo::GetMutableOutputAddr(node, output_index);
+    }
     MS_EXCEPTION_IF_NULL(tensor);
     tensor->set_device_address(address);
     tensor->SetNeedWait(false);
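
Why the extra argument: with the HideNopNode/RemoveNopNode calls above removed, a PyNative single-op graph keeps its nop nodes, so UpdateOutputs must bind the output tensor to the nop node's own address rather than skipping through to its real input. Passing false as the third argument of GetMutableOutputAddr appears to select exactly that behavior (see the toy model after the kernel_runtime.cc hunk below).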


+52 -1   mindspore/ccsrc/pipeline/pynative/pynative_execute.cc

@@ -988,7 +988,7 @@ AnfNodePtr PynativeExecutor::GetInput(const py::object &obj, bool op_mask) {
     // out = op(cell1(x, y))
     // out = op(cell1(x, y)[0])
     node = GetObjNode(obj, obj_id);
-  } else if (py::isinstance<py::tuple>(obj)) {
+  } else if (py::isinstance<py::tuple>(obj) || py::isinstance<py::list>(obj)) {
     // out = op((x, y))
     // out = cell((x, y))
     auto tuple = obj.cast<py::tuple>();
@@ -1100,6 +1100,23 @@ void PynativeExecutor::CleanPreMemoryInValueNode(const std::string &cell_id) {
     top_cell_id_ = cell_id;
     return;
   }
+  if (dynamic_cell_) {
+    std::set<std::string> forward_op_tensor_id;
+    for (const auto &elem : cell_op_index_with_tensor_id_[top_cell_id_]) {
+      const auto &tensor_id_list = elem.second;
+      for (const auto &tensor_id : tensor_id_list) {
+        forward_op_tensor_id.emplace(tensor_id);
+      }
+    }
+    for (auto &tensor : all_value_node_tensors_) {
+      if (tensor->device_address() != nullptr &&
+          forward_op_tensor_id.find(tensor->id()) != forward_op_tensor_id.end()) {
+        tensor->device_address()->ClearDeviceMemory();
+        tensor->set_device_address(nullptr);
+      }
+    }
+    all_value_node_tensors_.clear();
+  }
   const auto &tensor_id_with_tensor = cell_tensor_id_with_tensor_[top_cell_id_];
   for (const auto &elem : tensor_id_with_tensor) {
     const auto &tensors_in_value_node = elem.second;
@@ -2111,6 +2128,37 @@ std::string PynativeExecutor::GetGradCellId(bool has_sens, const py::object &cel
   return cell_id;
 }
 
+void PynativeExecutor::SaveAllValueNodeTensors(const FuncGraphPtr &graph) {
+  std::unordered_set<tensor::TensorPtr> all_value_node_tensors;
+  auto trace_function = [&all_value_node_tensors](const AnfNodePtr &anf_node) {
+    auto value = GetValueNode(anf_node);
+    if (value) {
+      if (value->isa<tensor::Tensor>()) {
+        auto tensor = value->cast<tensor::TensorPtr>();
+        MS_EXCEPTION_IF_NULL(tensor);
+        if (tensor->device_address()) {
+          all_value_node_tensors.emplace(tensor);
+        }
+      } else if (value->isa<ValueTuple>()) {
+        auto tuple = value->cast<ValueTuplePtr>();
+        MS_EXCEPTION_IF_NULL(tuple);
+        for (size_t i = 0; i < tuple->size(); i++) {
+          if ((*tuple)[i]->isa<tensor::Tensor>()) {
+            auto tensor = (*tuple)[i]->cast<tensor::TensorPtr>();
+            MS_EXCEPTION_IF_NULL(tensor);
+            if (tensor->device_address()) {
+              all_value_node_tensors.emplace(tensor);
+            }
+          }
+        }
+      }
+    }
+    return FOLLOW;
+  };
+  (void)TopoSort(graph->get_return(), SuccDeeperSimple, trace_function);
+  all_value_node_tensors_ = all_value_node_tensors;
+}
+
 void PynativeExecutor::GradNetInner(const GradOperationPtr &grad, const py::object &cell, const py::object &weights,
                                     const py::args &args) {
   auto size = args.size();
@@ -2152,6 +2200,9 @@ void PynativeExecutor::GradNetInner(const GradOperationPtr &grad, const py::obje
   resource->results()[pipeline::kBackend] = compile::CreateBackend();
 
   MS_LOG(INFO) << "Start opt";
+  if (dynamic_cell_) {
+    SaveAllValueNodeTensors(resource->func_graph());
+  }
   PynativeOptimizeAction(resource);
   SaveTensorsInValueNode(resource);
   TaskEmitAction(resource);
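
The two additions above work as a pair: SaveAllValueNodeTensors snapshots, just before graph optimization, every value-node tensor that still owns a device address, and CleanPreMemoryInValueNode later frees exactly those whose ids were produced by the previous top cell's forward ops. A condensed, self-contained sketch of that id-set intersection, with Tensor and DeviceAddress as simplified stand-ins and container names shortened from the diff:

#include <iostream>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <vector>

// Minimal stand-ins for the real tensor::Tensor / DeviceAddress types.
struct DeviceAddress {
  void ClearDeviceMemory() { std::cout << "freed device memory\n"; }
};
struct Tensor {
  std::string id;
  std::shared_ptr<DeviceAddress> device_address;
};

int main() {
  // cell_op_index_with_tensor_id_[top_cell_id_] in the diff: op -> tensor ids.
  std::map<std::string, std::vector<std::string>> op_index_with_tensor_id = {
      {"op0", {"t1"}}, {"op1", {"t2", "t3"}}};

  // all_value_node_tensors_ in the diff: tensors cached in value nodes.
  std::vector<std::shared_ptr<Tensor>> all_value_node_tensors = {
      std::make_shared<Tensor>(Tensor{"t2", std::make_shared<DeviceAddress>()}),
      std::make_shared<Tensor>(Tensor{"t9", std::make_shared<DeviceAddress>()})};

  // 1) Collect the ids of all tensors produced by forward ops.
  std::set<std::string> forward_op_tensor_id;
  for (const auto &elem : op_index_with_tensor_id) {
    for (const auto &id : elem.second) forward_op_tensor_id.emplace(id);
  }

  // 2) Free device memory only for cached tensors in that set ("t2" here);
  //    unrelated tensors ("t9") keep their memory.
  for (auto &tensor : all_value_node_tensors) {
    if (tensor->device_address != nullptr &&
        forward_op_tensor_id.count(tensor->id) > 0) {
      tensor->device_address->ClearDeviceMemory();
      tensor->device_address = nullptr;
    }
  }
}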


+2 -0   mindspore/ccsrc/pipeline/pynative/pynative_execute.h

@@ -200,6 +200,7 @@ class PynativeExecutor : public std::enable_shared_from_this<PynativeExecutor> {
   // Update the abstract and device address info of value node and tensors in bprop graph
   void UpdateAbstractAndDeviceAddress(const OpExecInfoPtr &op_exec_info, const py::object &out_real);
   void SaveTensorsInValueNode(const ResourcePtr &resource);
+  void SaveAllValueNodeTensors(const FuncGraphPtr &graph);
   void CleanPreMemoryInValueNode(const std::string &cell_id);
 
   // Construct grad graph
@@ -306,6 +307,7 @@ class PynativeExecutor : public std::enable_shared_from_this<PynativeExecutor> {
   std::unordered_map<std::string, TensorIdWithTensor> cell_tensor_id_with_tensor_;
   std::unordered_map<std::string, abstract::AbstractBasePtr> node_abs_map_;
   std::unordered_map<std::string, AbstractListMap> prim_abs_list_;
+  std::unordered_set<tensor::TensorPtr> all_value_node_tensors_;
 };
 
 using PynativeExecutorPtr = std::shared_ptr<PynativeExecutor>;


+3 -1   mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc

@@ -612,7 +612,7 @@ bool AscendDeviceAddress::ConvertFormatAndSyncHostToDevice(const ShapeVector &sh
   return sync_ok;
 }
 
-AscendDeviceAddress::~AscendDeviceAddress() {
+void AscendDeviceAddress::ClearDeviceMemory() {
   if (ptr_ == nullptr) {
     return;
   }
@@ -627,6 +627,8 @@ AscendDeviceAddress::~AscendDeviceAddress() {
   }
 }
 
+AscendDeviceAddress::~AscendDeviceAddress() { ClearDeviceMemory(); }
+
 bool AscendDeviceAddress::DumpMemToFile(bool trans_flag, const std::string &filepath, const std::string &host_fmt,
                                         const ShapeVector &host_shape, TypeId host_type) const {
   bool ret = false;
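
The reshuffle above is the classic "extract cleanup into an idempotent method" refactor: the old destructor body becomes ClearDeviceMemory(), guarded by the ptr_ == nullptr early return, and the destructor simply calls it. That lets code such as CleanPreMemoryInValueNode free device memory early without risking a double free when the address object is destroyed later. A minimal sketch of the pattern (FakeDeviceAddress and FreeDeviceMem are hypothetical stand-ins, not MindSpore APIs):

#include <memory>

// Simplified stand-in for mindspore::DeviceSync after this commit.
class DeviceSync {
 public:
  virtual ~DeviceSync() = default;
  virtual void ClearDeviceMemory() = 0;  // the virtual added by this commit
};

class FakeDeviceAddress : public DeviceSync {
 public:
  void ClearDeviceMemory() override {
    if (ptr_ == nullptr) {
      return;  // idempotent: a second call (or the destructor) is a no-op
    }
    FreeDeviceMem(ptr_);  // placeholder for the backend free (rtFree/cudaFree)
    ptr_ = nullptr;
  }
  ~FakeDeviceAddress() override { ClearDeviceMemory(); }

 private:
  static void FreeDeviceMem(void *) { /* backend-specific free goes here */ }
  void *ptr_ = nullptr;
};

int main() {
  auto address = std::make_shared<FakeDeviceAddress>();
  address->ClearDeviceMemory();  // release early while references still exist
}  // destructor runs here; the nullptr guard makes it safe

The GPU address below applies the identical split.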


+1 -0   mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h

@@ -41,6 +41,7 @@ class AscendDeviceAddress : public DeviceAddress {
   ~AscendDeviceAddress() override;
   bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const override;
   bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const override;
+  void ClearDeviceMemory() override;
   DeviceAddressType DeviceType() const override { return DeviceAddressType::kAscend; }
   bool DumpMemToFile(bool dump_mode, const std::string &filepath, const std::string &host_fmt,
                      const ShapeVector &host_shape, TypeId host_type) const override;


+1 -0   mindspore/ccsrc/runtime/device/cpu/cpu_device_address.h

@@ -35,6 +35,7 @@ class CPUDeviceAddress : public DeviceAddress {
   bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const override;
   bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const override;
+  void ClearDeviceMemory() override {}
   DeviceAddressType DeviceType() const override { return DeviceAddressType::kCPU; }
 };
 }  // namespace cpu


+3 -1   mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc

@@ -69,7 +69,7 @@ bool GPUDeviceAddress::SyncHostToDevice(const ShapeVector &, size_t size, TypeId
   return GPUDeviceManager::GetInstance().SyncStream(stream);
 }
 
-GPUDeviceAddress::~GPUDeviceAddress() {
+void GPUDeviceAddress::ClearDeviceMemory() {
   if (ptr_ == nullptr) {
     return;
   }
@@ -78,6 +78,8 @@ GPUDeviceAddress::~GPUDeviceAddress() {
     ptr_ = nullptr;
   }
 }
 
+GPUDeviceAddress::~GPUDeviceAddress() { ClearDeviceMemory(); }
+
 #ifdef ENABLE_DEBUGGER
 bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
                                      const ShapeVector &host_shape, TypeId host_type, size_t slot,


+1 -0   mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h

@@ -38,6 +38,7 @@ class GPUDeviceAddress : public DeviceAddress {
   bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const override;
   bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const override;
+  void ClearDeviceMemory() override;
   void set_status(DeviceAddressStatus status) { status_ = status; }
   DeviceAddressStatus status() const { return status_; }
   DeviceAddressType DeviceType() const override { return DeviceAddressType::kGPU; }


+5 -2   mindspore/ccsrc/runtime/device/kernel_runtime.cc

@@ -819,6 +819,9 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
   if (AnfAlgo::GetCNodeName(cnode) == kAtomicAddrCleanOpName) {
     return GenAddrCleanLaunchArgs(cnode, kernel_inputs);
   }
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto visit_nop_node = (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode);
   for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
     auto op_name = AnfAlgo::GetCNodeName(cnode);
     constexpr auto none_placeholder_index = 3;
@@ -833,7 +836,7 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
       }
     }
     auto real_input = AnfAlgo::GetRealInputIndex(kernel, i);
-    auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, real_input);
+    auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, real_input, visit_nop_node);
     MS_EXCEPTION_IF_NULL(device_address);
     kernel::AddressPtr input = std::make_shared<kernel::Address>();
     MS_EXCEPTION_IF_NULL(input);
@@ -844,7 +847,7 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
   }
 
   for (size_t i = 0; i < kernel_mod.GetOutputSizeList().size(); ++i) {
-    auto device_address = AnfAlgo::GetOutputAddr(kernel, i);
+    auto device_address = AnfAlgo::GetOutputAddr(kernel, i, visit_nop_node);
     kernel::AddressPtr output = std::make_shared<kernel::Address>();
     MS_EXCEPTION_IF_NULL(output);
     output->addr = device_address->ptr_;
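
A toy model of what the new visit_nop_node flag controls (all types here are hypothetical stand-ins): a nop node forwards its input unchanged, so in graph mode, where nop nodes are hidden from the execution graph, address lookups should walk through them to the real producer, while in PyNative mode, where this commit keeps nop nodes in place, lookups must stop at the node itself, presumably so each tensor holds a distinct, individually freeable address.

#include <cassert>
#include <memory>

// Single-input toy graph node; "address" stands in for a DeviceAddress.
struct Node {
  bool is_nop = false;
  std::shared_ptr<int> address;
  Node *input = nullptr;
};

// Mirrors the flag added in kernel_runtime.cc: when visit_nop_node is true,
// walk through nop nodes to the real producer before reading the address.
std::shared_ptr<int> GetOutputAddr(Node *node, bool visit_nop_node) {
  while (visit_nop_node && node->is_nop && node->input != nullptr) {
    node = node->input;
  }
  return node->address;
}

int main() {
  Node producer{false, std::make_shared<int>(42), nullptr};
  Node nop{true, std::make_shared<int>(7), &producer};
  assert(*GetOutputAddr(&nop, true) == 42);  // graph mode: skip the nop node
  assert(*GetOutputAddr(&nop, false) == 7);  // pynative: nop node's own address
  return 0;
}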


+1 -0   mindspore/core/ir/device_sync.h

@@ -33,6 +33,7 @@ class DeviceSync {
   virtual bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const = 0;
   virtual bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const = 0;
   virtual void *GetMutablePtr() const = 0;
+  virtual void ClearDeviceMemory() = 0;
 };
 using DeviceSyncPtr = std::shared_ptr<DeviceSync>;
 }  // namespace mindspore

