
send info when training is done

tags/v1.0.0
lichen_101010 (John Tzanakakis), 5 years ago
commit dffa61b228
6 changed files with 46 additions and 11 deletions

  1. mindspore/ccsrc/debug/debugger/debug_grpc.proto (+2, -0)
  2. mindspore/ccsrc/debug/debugger/debugger.cc (+17, -7)
  3. mindspore/ccsrc/debug/debugger/debugger.h (+5, -0)
  4. mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc (+10, -3)
  5. mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc (+8, -1)
  6. mindspore/ccsrc/runtime/device/kernel_runtime.h (+4, -0)
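In short: this commit adds a training_done flag to the debugger's Metadata protocol, sends it with every metadata message, and hooks both the Ascend and GPU kernel runtimes so that releasing device resources pushes one final metadata message telling the MindInsight server that training has finished.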

mindspore/ccsrc/debug/debugger/debug_grpc.proto (+2, -0)

@@ -35,6 +35,8 @@ message Metadata {
   string backend = 3;
   // the full name of current node
   string cur_node = 4;
+  // check if training is done.
+  bool training_done = 5;
 }
 
 message Chunk {
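Since training_done is a plain proto3 bool, the C++ classes protoc generates from this file expose the usual training_done() / set_training_done() accessors, which the debugger.cc changes below rely on. A minimal receiver-side sketch (OnMetadata is an illustrative name, and the debugger namespace is assumed to match this file's package):

// Illustrative only: OnMetadata is not a MindSpore function, and the
// debugger namespace is assumed from the .proto package declaration.
void OnMetadata(const debugger::Metadata &metadata) {
  if (metadata.training_done()) {
    // training has finished; the server can finalize this session
  }
}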


mindspore/ccsrc/debug/debugger/debugger.cc (+17, -7)

@@ -57,6 +57,7 @@ Debugger::Debugger()
       run_level_(""),
       node_name_(""),
       cur_name_(""),
+      training_done_(false),
       is_dataset_graph_(false),
       partial_memory_(false),
       last_overflow_bin_(0),
@@ -336,6 +337,17 @@ GraphProto Debugger::GetGraphProto() const {
 }
 
 void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
+  SendMetadata();
+  // send graph to MindInsight server
+  EventReply reply = grpc_client_->SendGraph(graph_proto);
+  if (reply.status() != reply.OK) {
+    MS_LOG(ERROR) << "Error: SendGraph failed";
+  }
+  // enter command loop, wait and process commands
+  CommandLoop();
+}
+
+void Debugger::SendMetadata() {
   // prepare metadata
   std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
   Metadata metadata;
@@ -343,17 +355,12 @@ void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
   metadata.set_cur_step(num_step_);
   metadata.set_backend(device_target_);
   metadata.set_cur_node(cur_name_);
+  metadata.set_training_done(training_done_);
+  MS_LOG(INFO) << "Is training done?" << training_done_;
   EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
   if (reply_metadata.status() != reply_metadata.OK) {
     MS_LOG(ERROR) << "Error: SendMetadata failed";
   }
-  // send graph to MindInsight server
-  EventReply reply = grpc_client_->SendGraph(graph_proto);
-  if (reply.status() != reply.OK) {
-    MS_LOG(ERROR) << "Error: SendGraph failed";
-  }
-  // enter command loop, wait and process commands
-  CommandLoop();
 }
 
 void Debugger::CommandLoop() {
@@ -365,6 +372,7 @@ void Debugger::CommandLoop() {
   metadata.set_cur_step(num_step_);
   metadata.set_backend(device_target_);
   metadata.set_cur_node(cur_name_);
+  metadata.set_training_done(training_done_);
 
   // loop exit flag
   bool run = false;
@@ -787,4 +795,6 @@ std::vector<std::string> Debugger::CheckOpOverflow() {
   return op_names;
 }
 
+void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; }
+
 }  // namespace mindspore
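Why extract SendMetadata() out of SendGraphAndSuspend()? At end of training the runtimes need to push one last Metadata message carrying training_done = true without re-sending a graph and, crucially, without falling into the blocking CommandLoop(). A sketch of that shutdown-side usage (NotifyTrainingDone is an illustrative helper; the commit itself inlines this logic in the two ReleaseDeviceRes() methods shown below):

// Illustrative helper, not part of the commit; it mirrors the inlined
// ReleaseDeviceRes() changes in the Ascend and GPU runtimes below.
void NotifyTrainingDone(Debugger *debugger) {
  if (debugger && debugger->debugger_enabled()) {
    debugger->SetTrainingDone(true);  // flip training_done_
    debugger->SendMetadata();         // one-shot push: no graph, no CommandLoop()
  }
}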

mindspore/ccsrc/debug/debugger/debugger.h (+5, -0)

@@ -99,6 +99,10 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   // check if any feature that uses the debugger backend is enabled
   bool DebuggerBackendEnabled();
 
+  void SetTrainingDone(bool training_done);
+
+  void SendMetadata();
+
  private:
   // private constructor for singleton
   Debugger();
@@ -164,6 +168,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   std::string run_level_;
   std::string node_name_;
   std::string cur_name_;
+  bool training_done_;
   bool is_dataset_graph_;
   bool partial_memory_;
   std::mutex access_lock_;


mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc (+10, -3)

@@ -170,6 +170,12 @@ bool AscendKernelRuntime::NeedDestroyHccl() {
 
 void AscendKernelRuntime::ReleaseDeviceRes() {
   MS_LOG(INFO) << "Ascend finalize start";
+#ifdef ENABLE_DEBUGGER
+  if (debugger_ && debugger_->debugger_enabled()) {
+    debugger_->SetTrainingDone(true);
+    debugger_->SendMetadata();
+  }
+#endif
   if (!initialized_) {
     return;
   }
@@ -354,15 +360,15 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   const auto &apply_kernels = graph->execution_order();
   // for kernels, execution order starts from 1
   int exec_order = 1;
-  auto debugger_ = mindspore::Debugger::GetInstance();
-  DebugServices *debug_services = debugger_->debug_services();
+  auto debugger_i = mindspore::Debugger::GetInstance();
+  DebugServices *debug_services = debugger_i->debug_services();
   auto watchpoint_table = debug_services->GetWatchpointTable();
   for (const auto &node : apply_kernels) {
     MS_EXCEPTION_IF_NULL(node);
     auto node_name = AnfAlgo::GetCNodeName(node);
     std::string kernel_name = node->fullname_with_scope();
     auto output_size = AnfAlgo::GetOutputTensorNum(node);
-    if (debugger_->partial_memory()) {
+    if (debugger_i->partial_memory()) {
       if (!debug_services->IsWatchPoint(kernel_name, watchpoint_table)) {
         continue;
       }
@@ -431,6 +437,7 @@ void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger)
 bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(graph);
 #ifdef ENABLE_DEBUGGER
+  debugger_ = debugger;
   MS_LOG(INFO) << "Start load step";
   uint32_t cur_iter = 0;
   MS_LOG(INFO) << "Cur iter is " << cur_iter;


mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc (+8, -1)

@@ -256,7 +256,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
   auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
   (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                        [](size_t inner_item) { return SizeToInt(inner_item); });
-  auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false);
+  auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
   if (!ret) {
     MS_LOG(ERROR) << "LoadMemToHost:"
                   << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
@@ -368,6 +368,12 @@ bool GPUKernelRuntime::InitDevice() {
 
 void GPUKernelRuntime::ReleaseDeviceRes() {
   // For dataset mode.
+#ifdef ENABLE_DEBUGGER
+  if (debugger_ && debugger_->debugger_enabled()) {
+    debugger_->SetTrainingDone(true);
+    debugger_->SendMetadata();
+  }
+#endif
   if (GpuBufferMgr::GetInstance().IsInit()) {
     if (!GpuBufferMgr::GetInstance().IsClosed()) {
       if (!GpuBufferMgr::GetInstance().CloseNotify()) {
@@ -684,6 +690,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
   AllocCommunicationOpDynamicRes(graph);
 
 #ifdef ENABLE_DEBUGGER
+  debugger_ = debugger;
   bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
   if (!mock) {
     UpdateStepNum(debugger, dump_enabled);


mindspore/ccsrc/runtime/device/kernel_runtime.h (+4, -0)

@@ -124,6 +124,10 @@ class KernelRuntime {
 #ifdef ENABLE_DUMP_E2E
   DumpConfPtr dump_conf_ptr_;
 #endif
+
+#ifdef ENABLE_DEBUGGER
+  Debugger *debugger_;
+#endif
   void *stream_ = nullptr;
   std::shared_ptr<MemoryManager> mem_manager_{nullptr};
 };
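One caveat worth flagging: debugger_ is declared without an initializer, yet both ReleaseDeviceRes() overrides test it before LoadData() or LaunchKernelDynamic() has necessarily assigned it, so the check could read an indeterminate pointer. A defensive alternative (a suggestion, not part of this commit) would be to null-initialize it in the header, matching the style of the stream_ member:

#ifdef ENABLE_DEBUGGER
  // Null-initializing avoids reading an indeterminate pointer in
  // ReleaseDeviceRes() when LoadData()/LaunchKernelDynamic() never ran.
  Debugger *debugger_ = nullptr;
#endif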

