Browse Source

Send metadata (including the training-done flag) to the debugger when training is done

tags/v1.0.0
lichen_101010 John Tzanakakis 5 years ago
parent
commit
dffa61b228
6 changed files with 46 additions and 11 deletions
  1. +2
    -0
      mindspore/ccsrc/debug/debugger/debug_grpc.proto
  2. +17
    -7
      mindspore/ccsrc/debug/debugger/debugger.cc
  3. +5
    -0
      mindspore/ccsrc/debug/debugger/debugger.h
  4. +10
    -3
      mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
  5. +8
    -1
      mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
  6. +4
    -0
      mindspore/ccsrc/runtime/device/kernel_runtime.h

+ 2
- 0
mindspore/ccsrc/debug/debugger/debug_grpc.proto View File

@@ -35,6 +35,8 @@ message Metadata {
string backend = 3;
// the full name of current node
string cur_node = 4;
// flag indicating whether training is done.
bool training_done = 5;
}

message Chunk {


+ 17
- 7
mindspore/ccsrc/debug/debugger/debugger.cc View File

@@ -57,6 +57,7 @@ Debugger::Debugger()
run_level_(""),
node_name_(""),
cur_name_(""),
training_done_(false),
is_dataset_graph_(false),
partial_memory_(false),
last_overflow_bin_(0),
@@ -336,6 +337,17 @@ GraphProto Debugger::GetGraphProto() const {
}

// Sends the current metadata and the given graph proto to the debugger
// server, then suspends execution by entering the command loop to wait for
// and process debugger commands.
void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
SendMetadata();
// send graph to the MindInsight server; a failure is logged but not fatal
EventReply reply = grpc_client_->SendGraph(graph_proto);
if (reply.status() != reply.OK) {
MS_LOG(ERROR) << "Error: SendGraph failed";
}
// enter command loop, wait and process commands
CommandLoop();
}

void Debugger::SendMetadata() {
// prepare metadata
std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
Metadata metadata;
@@ -343,17 +355,12 @@ void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
metadata.set_cur_step(num_step_);
metadata.set_backend(device_target_);
metadata.set_cur_node(cur_name_);
metadata.set_training_done(training_done_);
MS_LOG(INFO) << "Is training done?" << training_done_;
EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
if (reply_metadata.status() != reply_metadata.OK) {
MS_LOG(ERROR) << "Error: SendMetadata failed";
}
// send graph to the MindInsight server
EventReply reply = grpc_client_->SendGraph(graph_proto);
if (reply.status() != reply.OK) {
MS_LOG(ERROR) << "Error: SendGraph failed";
}
// enter command loop, wait and process commands
CommandLoop();
}

void Debugger::CommandLoop() {
@@ -365,6 +372,7 @@ void Debugger::CommandLoop() {
metadata.set_cur_step(num_step_);
metadata.set_backend(device_target_);
metadata.set_cur_node(cur_name_);
metadata.set_training_done(training_done_);

// loop exit flag
bool run = false;
@@ -787,4 +795,6 @@ std::vector<std::string> Debugger::CheckOpOverflow() {
return op_names;
}

// Records whether training has finished; the flag is included in the
// metadata sent to the debugger client (see SendMetadata / CommandLoop).
void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; }

} // namespace mindspore

+ 5
- 0
mindspore/ccsrc/debug/debugger/debugger.h View File

@@ -99,6 +99,10 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// check if any feature that uses the debugger backend is enabled
bool DebuggerBackendEnabled();

void SetTrainingDone(bool training_done);

void SendMetadata();

private:
// private constructor for singleton
Debugger();
@@ -164,6 +168,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
std::string run_level_;
std::string node_name_;
std::string cur_name_;
bool training_done_;
bool is_dataset_graph_;
bool partial_memory_;
std::mutex access_lock_;


+ 10
- 3
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc View File

@@ -170,6 +170,12 @@ bool AscendKernelRuntime::NeedDestroyHccl() {

void AscendKernelRuntime::ReleaseDeviceRes() {
MS_LOG(INFO) << "Ascend finalize start";
#ifdef ENABLE_DEBUGGER
if (debugger_ && debugger_->debugger_enabled()) {
debugger_->SetTrainingDone(true);
debugger_->SendMetadata();
}
#endif
if (!initialized_) {
return;
}
@@ -354,15 +360,15 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
const auto &apply_kernels = graph->execution_order();
// for kernels, execution order starts from 1
int exec_order = 1;
auto debugger_ = mindspore::Debugger::GetInstance();
DebugServices *debug_services = debugger_->debug_services();
auto debugger_i = mindspore::Debugger::GetInstance();
DebugServices *debug_services = debugger_i->debug_services();
auto watchpoint_table = debug_services->GetWatchpointTable();
for (const auto &node : apply_kernels) {
MS_EXCEPTION_IF_NULL(node);
auto node_name = AnfAlgo::GetCNodeName(node);
std::string kernel_name = node->fullname_with_scope();
auto output_size = AnfAlgo::GetOutputTensorNum(node);
if (debugger_->partial_memory()) {
if (debugger_i->partial_memory()) {
if (!debug_services->IsWatchPoint(kernel_name, watchpoint_table)) {
continue;
}
@@ -431,6 +437,7 @@ void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger)
bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
MS_EXCEPTION_IF_NULL(graph);
#ifdef ENABLE_DEBUGGER
debugger_ = debugger;
MS_LOG(INFO) << "Start load step";
uint32_t cur_iter = 0;
MS_LOG(INFO) << "Cur iter is " << cur_iter;


+ 8
- 1
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc View File

@@ -256,7 +256,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false);
auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
@@ -368,6 +368,12 @@ bool GPUKernelRuntime::InitDevice() {

void GPUKernelRuntime::ReleaseDeviceRes() {
// For dataset mode.
#ifdef ENABLE_DEBUGGER
if (debugger_ && debugger_->debugger_enabled()) {
debugger_->SetTrainingDone(true);
debugger_->SendMetadata();
}
#endif
if (GpuBufferMgr::GetInstance().IsInit()) {
if (!GpuBufferMgr::GetInstance().IsClosed()) {
if (!GpuBufferMgr::GetInstance().CloseNotify()) {
@@ -684,6 +690,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
AllocCommunicationOpDynamicRes(graph);

#ifdef ENABLE_DEBUGGER
debugger_ = debugger;
bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
if (!mock) {
UpdateStepNum(debugger, dump_enabled);


+ 4
- 0
mindspore/ccsrc/runtime/device/kernel_runtime.h View File

@@ -124,6 +124,10 @@ class KernelRuntime {
#ifdef ENABLE_DUMP_E2E
DumpConfPtr dump_conf_ptr_;
#endif

#ifdef ENABLE_DEBUGGER
Debugger *debugger_;
#endif
void *stream_ = nullptr;
std::shared_ptr<MemoryManager> mem_manager_{nullptr};
};


Loading…
Cancel
Save