Browse Source

!9036 Support debugger version check with Mindinsight

From: @adelshafiei
Reviewed-by: 
Signed-off-by:
tags/v1.1.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
77ba75ba75
5 changed files with 93 additions and 41 deletions
  1. +4
    -1
      mindspore/ccsrc/debug/debugger/debug_grpc.proto
  2. +74
    -36
      mindspore/ccsrc/debug/debugger/debugger.cc
  3. +13
    -2
      mindspore/ccsrc/debug/debugger/debugger.h
  4. +1
    -1
      mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
  5. +1
    -1
      mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc

+ 4
- 1
mindspore/ccsrc/debug/debugger/debug_grpc.proto View File

@@ -37,9 +37,11 @@ message Metadata {
// the full name of current node
string cur_node = 4;
// check if training is done.
bool training_done = 5;
bool training_done = 5;
// the number of total graphs
int32 graph_num = 6;
// mindspore version
string ms_version = 7;
}

message Chunk {
@@ -61,6 +63,7 @@ message EventReply {
RunCMD run_cmd = 3;
SetCMD set_cmd = 4;
ViewCMD view_cmd = 5;
bool version_matched = 6;
}
}



+ 74
- 36
mindspore/ccsrc/debug/debugger/debugger.cc View File

@@ -71,7 +71,8 @@ Debugger::Debugger()
last_overflow_bin_(0),
overflow_bin_path_(""),
initial_suspend_(true),
not_dataset_graph_sum_(0) {
not_dataset_graph_sum_(0),
version_("") {
if (CheckDebuggerEnabled()) {
// configure partial memory reuse
partial_memory_ = CheckDebuggerPartialMemoryEnabled();
@@ -100,6 +101,7 @@ void Debugger::Init(const uint32_t device_id, const std::string device_target) {
device_id_ = device_id;
MS_LOG(INFO) << "Debugger got device_target: " << device_target;
device_target_ = device_target;
version_ = "1.1.0";
}

void Debugger::EnableDebugger() {
@@ -413,7 +415,7 @@ void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
EnableDebugger();
if (debugger_enabled_) {
LoadParametersAndConst();
// get graph proto and send to mindinsight
// get graph proto and send to Mindinsight
auto graph_proto = graph_proto_list_.front();
SendGraphAndSuspend(graph_proto);
}
@@ -449,17 +451,18 @@ GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const {
}

void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
SendMetadata();
// send graph to mindinght server
EventReply reply = grpc_client_->SendGraph(graph_proto);
if (reply.status() != reply.OK) {
MS_LOG(ERROR) << "Error: SendGraph failed";
if (SendMetadata(true)) {
// send graph to Mindinsight server
EventReply reply = grpc_client_->SendGraph(graph_proto);
if (reply.status() != reply.OK) {
MS_LOG(ERROR) << "Error: SendGraph failed";
}
// enter command loop, wait and process commands
CommandLoop();
}
// enter command loop, wait and process commands
CommandLoop();
}

void Debugger::SendMetadata() {
bool Debugger::SendMetadata(bool version_check) {
// prepare metadata
std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
Metadata metadata;
@@ -468,17 +471,43 @@ void Debugger::SendMetadata() {
metadata.set_backend(device_target_);
metadata.set_cur_node(cur_name_);
metadata.set_training_done(training_done_);
metadata.set_ms_version(version_);
MS_LOG(INFO) << "Is training done?" << training_done_;
// set graph number to not_dataset_graph_sum_
metadata.set_graph_num(not_dataset_graph_sum_);
EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
if (reply_metadata.status() != reply_metadata.OK) {
bool ret = false;
if (reply_metadata.status() == reply_metadata.OK) {
if (version_check) {
// get type of the command in meta data reply, it should be version matched
DebuggerCommand cmd = GetCommand(reply_metadata);
if (cmd != DebuggerCommand::kVersionMatchedCMD) {
MS_LOG(ERROR) << "MindInsight version is too old, Mindspore version is " << version_;
Exit();
} else {
if (GetMiVersionMatched(reply_metadata)) {
MS_LOG(INFO) << "MindSpore version is " << version_ << " matches MindInsight version.";
ret = true;
} else {
MS_LOG(ERROR) << "MindSpore version " << version_ << ", did not match MindInsight version.";
CommandLoop();
}
}
} else {
// version check is done before so we can just return true here
ret = true;
}
} else {
MS_LOG(ERROR) << "Error: SendMetadata failed";
}

return ret;
}

void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_proto_list, uint32_t graph_sum) {
SendMetadata();
if (!SendMetadata(true)) {
return;
}
// send multiple graphs to MindInsight server
// split graph into chunks if one graph is larger than chunk size
std::list<Chunk> chunked_graph_proto_list;
@@ -610,40 +639,44 @@ void Debugger::CommandLoop() {
SetWatchpoint(GetWatchnodes(reply), GetWatchcondition(reply), GetWatchpointID(reply), GetParameters(reply));
}
break;
case DebuggerCommand::kViewCMD:
case DebuggerCommand::kViewCMD: {
MS_LOG(INFO) << "ViewCMD";
{
// print view cmd content
ProtoVector<TensorProto> received_tensors = GetTensors(reply);
for (auto tensor : received_tensors) {
MS_LOG(INFO) << "tensor node name: " << tensor.node_name();
MS_LOG(INFO) << "tensor slot: " << tensor.slot();
MS_LOG(INFO) << "tensor finished: " << std::boolalpha << tensor.finished() << std::noboolalpha;
MS_LOG(INFO) << "tensor iter: " << tensor.iter();
MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << tensor.truncate() << std::noboolalpha;
}
// print view cmd content
ProtoVector<TensorProto> received_tensors = GetTensors(reply);
for (auto received_tensor : received_tensors) {
MS_LOG(INFO) << "tensor node name: " << received_tensor.node_name();
MS_LOG(INFO) << "tensor slot: " << received_tensor.slot();
MS_LOG(INFO) << "tensor finished: " << std::boolalpha << received_tensor.finished() << std::noboolalpha;
MS_LOG(INFO) << "tensor iter: " << received_tensor.iter();
MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << received_tensor.truncate() << std::noboolalpha;
}
MS_LOG(INFO) << "Sending tensors";
std::list<TensorProto> tensors = LoadTensors(GetTensors(reply));
{
// print view cmd reply
for (auto tensor : tensors) {
MS_LOG(INFO) << "tensor node name: " << tensor.node_name();
MS_LOG(INFO) << "tensor slot: " << tensor.slot();
MS_LOG(INFO) << "tensor finished: " << std::boolalpha << tensor.finished() << std::noboolalpha;
MS_LOG(INFO) << "tensor iter: " << tensor.iter();
MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << tensor.truncate() << std::noboolalpha;
MS_LOG(INFO) << "tensor dims: ";
for (auto dim : tensor.dims()) {
MS_LOG(INFO) << dim << ",";
}
MS_LOG(INFO) << "tensor dtype: " << tensor.data_type();
// print view cmd reply
for (auto tensor : tensors) {
MS_LOG(INFO) << "tensor node name: " << tensor.node_name();
MS_LOG(INFO) << "tensor slot: " << tensor.slot();
MS_LOG(INFO) << "tensor finished: " << std::boolalpha << tensor.finished() << std::noboolalpha;
MS_LOG(INFO) << "tensor iter: " << tensor.iter();
MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << tensor.truncate() << std::noboolalpha;
MS_LOG(INFO) << "tensor dims: ";
for (auto dim : tensor.dims()) {
MS_LOG(INFO) << dim << ",";
}
MS_LOG(INFO) << "tensor dtype: " << tensor.data_type();
}
EventReply send_tensors_reply = grpc_client_->SendTensors(tensors);
if (send_tensors_reply.status() != send_tensors_reply.OK) {
MS_LOG(ERROR) << "Error: SendTensors failed";
}
} break;
case DebuggerCommand::kVersionMatchedCMD:
MS_LOG(ERROR) << "Received unexpected Version Matched CMD from Mindinsight.";
Exit();
break;
default:
MS_LOG(ERROR) << "Received unknown CMD from Mindinsight";
Exit();
break;
}
}
@@ -825,6 +858,9 @@ DebuggerCommand GetCommand(const EventReply &reply) {
case debugger::EventReply::CmdCase::kViewCmd:
cmd = DebuggerCommand::kViewCMD;
break;
case debugger::EventReply::CmdCase::kVersionMatched:
cmd = DebuggerCommand::kVersionMatchedCMD;
break;
default:
MS_LOG(DEBUG) << "Debug: UnknownCMD";
break;
@@ -909,6 +945,8 @@ std::string GetTensorFullName(const TensorProto &tensor) {
return node_name + ":" + tensor.slot() + (tensor.iter() == "" ? "" : ":" + tensor.iter());
}

bool GetMiVersionMatched(const EventReply &reply) { return reply.version_matched(); }

bool Debugger::partial_memory() { return partial_memory_; }

void Debugger::SetCurNode(std::string cur_name) {


+ 13
- 2
mindspore/ccsrc/debug/debugger/debugger.h View File

@@ -43,7 +43,14 @@ using ProtoVector = google::protobuf::RepeatedPtrField<T>;
namespace mindspore {
// different types of command received by debugger
// need to keep sync with client-side proto and server-side proto
enum class DebuggerCommand { kExitCMD = 2, kRunCMD = 3, kSetCMD = 4, kViewCMD = 5, kUnknownCMD = -1 };
enum class DebuggerCommand {
kExitCMD = 2,
kRunCMD = 3,
kSetCMD = 4,
kViewCMD = 5,
kVersionMatchedCMD = 6,
kUnknownCMD = -1
};

class Debugger : public std::enable_shared_from_this<Debugger> {
public:
@@ -102,7 +109,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

void SetTrainingDone(bool training_done);

void SendMetadata();
// returns true if reply received and MindSpore version matched with MindInsight version
// version_check should be true if you want the function to do backend compatibility check with MindInsight
bool SendMetadata(bool version_check);

void LoadParametersAndConst();

@@ -215,6 +224,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
static std::shared_ptr<Debugger> debugger_;
uint32_t not_dataset_graph_sum_;
std::list<uint32_t> rungraph_id_list_;
std::string version_;
};

using DebuggerPtr = std::shared_ptr<Debugger>;
@@ -238,6 +248,7 @@ WatchCondition GetWatchcondition(const EventReply &reply);
int32_t GetWatchpointID(const EventReply &reply);
bool GetWatchpointDelete(const EventReply &reply);
ProtoVector<TensorProto> GetTensors(const EventReply &reply);
bool GetMiVersionMatched(const EventReply &reply);

// get the full name of a tensor, which is the name used in TensorLoader
std::string GetTensorFullName(const TensorProto &tensor);


+ 1
- 1
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc View File

@@ -267,7 +267,7 @@ void AscendKernelRuntime::ReleaseDeviceRes() {
#ifdef ENABLE_DEBUGGER
if (debugger_ && debugger_->debugger_enabled()) {
debugger_->SetTrainingDone(true);
debugger_->SendMetadata();
debugger_->SendMetadata(false);
}
#endif
if (!initialized_) {


+ 1
- 1
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc View File

@@ -208,7 +208,7 @@ void GPUKernelRuntime::ReleaseDeviceRes() {
#ifdef ENABLE_DEBUGGER
if (debugger_ && debugger_->debugger_enabled()) {
debugger_->SetTrainingDone(true);
debugger_->SendMetadata();
debugger_->SendMetadata(false);
}
#endif
if (GpuBufferMgr::GetInstance().IsInit()) {


Loading…
Cancel
Save