Browse Source

!16321 Fix online debugger issue for multi-graph GPU where nodes were skipped when the user clicks "next node"

From: @parastooashtari
Reviewed-by: @john_tzanakakis,@robingrosman
Signed-off-by: @john_tzanakakis
tags/v1.3.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
ef29ce7481
4 changed files with 26 additions and 14 deletions
  1. +1
    -1
      mindspore/ccsrc/backend/session/gpu_session.cc
  2. +2
    -2
      mindspore/ccsrc/debug/debug_services.cc
  3. +21
    -10
      mindspore/ccsrc/debug/debugger/debugger.cc
  4. +2
    -1
      mindspore/ccsrc/debug/debugger/debugger.h

+ 1
- 1
mindspore/ccsrc/backend/session/gpu_session.cc View File

@@ -452,7 +452,7 @@ void GPUSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_gra
DumpJsonParser::GetInstance().UpdateDumpIter();
}
if (debugger_) {
debugger_->PostExecute(kernel_graph);
debugger_->PostExecute();
}
}



+ 2
- 2
mindspore/ccsrc/debug/debug_services.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -258,7 +258,7 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::
AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
if (is_hit || error_code) {
std::vector<int>::iterator iter;
// if the execution order is repeated,inserts the new one before the others with same execution order.
// if the execution order is repeated, inserts the new one before the others with same execution order.
iter = std::lower_bound(exec_order.begin(), exec_order.end(), tensor->GetExecutionOrder());
int position = iter - exec_order.begin();
exec_order.insert(iter, tensor->GetExecutionOrder());


+ 21
- 10
mindspore/ccsrc/debug/debugger/debugger.cc View File

@@ -329,6 +329,13 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
graph_ptr_ = dbg_graph_ptr;
SendMultiGraphsAndSuspend(graph_proto_list_);
graph_proto_list_.clear();
} else if (graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice) {
// stop only when receive the first sub run graph for each step
// if we have stopped for the last kernel before, no need to stop again
if (!(run_level_ == "node" && suspended_at_last_kernel_)) {
CommandLoop();
}
debug_services_->ResetLoadedTensors();
}
}
}
@@ -339,9 +346,11 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
}
CheckGraphPtr(graph_ptr);
}
// resets for the new graph
suspended_at_last_kernel_ = 0;
}

void Debugger::PostExecute(const KernelGraphPtr &graph_ptr) {
void Debugger::PostExecute() {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
if (pipeline::ExecutorPy::GetDebugTerminate()) {
@@ -354,17 +363,16 @@ void Debugger::PostExecute(const KernelGraphPtr &graph_ptr) {
num_step_++;
}
SendWatchpoints(CheckWatchpoints());
if (graph_ptr != nullptr && device_target_ == kGPUDevice) {
auto graph_id = graph_ptr->graph_id();
if (graph_id == rungraph_id_list_.front()) {
CommandLoop();
}
} else {
// no need to suspend at each graph for GPU, suspension happens in preExecute
if (device_target_ != kGPUDevice) {
CommandLoop();
}
}
// Only keep parameters in the current map
debug_services_->ResetLoadedTensors();
// GPU ResetLoadedTensors happens in preExecute
if (device_target_ != kGPUDevice) {
debug_services_->ResetLoadedTensors();
}
}
}

@@ -398,9 +406,12 @@ void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
hit_empty_flag = false;
}
}
if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_) && !last_kernel) {
if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) {
// if kernel is not watchpoint and is next_to or continue_to node, suspend
// No need to suspend if this is the last node in graph since PostExecute suspends at the end of graph
// sets a bool to be checked in preExecute to avoid double stopping at last kernel in the last graph
if (last_kernel) {
suspended_at_last_kernel_ = 1;
}
CommandLoop();
}
return;


+ 2
- 1
mindspore/ccsrc/debug/debugger/debugger.h View File

@@ -80,7 +80,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

// analyze tensors and wait for command
// don't need a graph_ptr because it is saved during pre_execute
void PostExecute(const KernelGraphPtr &graph_ptr = nullptr);
void PostExecute();

bool ReadNodeDataRequired(const CNodePtr &kernel) const;

@@ -235,6 +235,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
std::string device_target_;
int32_t num_step_;
bool debugger_enabled_;
bool suspended_at_last_kernel_;
std::string run_level_;
std::string node_name_;
std::string cur_name_;


Loading…
Cancel
Save