Browse Source

Debugger multi-graph support implementation

Other Contributors: Adel Shafiei, John Tzanakakis
tags/v1.1.0
lichen_101010 John Tzanakakis 5 years ago
parent
commit
1b6265fa43
10 changed files with 206 additions and 23 deletions
  1. +25
    -3
      mindspore/ccsrc/backend/session/ascend_session.cc
  2. +1
    -0
      mindspore/ccsrc/backend/session/ascend_session.h
  3. +10
    -1
      mindspore/ccsrc/backend/session/gpu_session.cc
  4. +6
    -2
      mindspore/ccsrc/debug/debugger/debug_grpc.proto
  5. +117
    -11
      mindspore/ccsrc/debug/debugger/debugger.cc
  6. +13
    -3
      mindspore/ccsrc/debug/debugger/debugger.h
  7. +23
    -1
      mindspore/ccsrc/debug/debugger/grpc_client.cc
  8. +6
    -0
      mindspore/ccsrc/debug/debugger/grpc_client.h
  9. +3
    -0
      mindspore/ccsrc/debug/debugger/proto_exporter.cc
  10. +2
    -2
      mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc

+ 25
- 3
mindspore/ccsrc/backend/session/ascend_session.cc View File

@@ -160,6 +160,11 @@ GraphId AscendSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {

HardwareOptimize(NOT_NULL(root_graph), NOT_NULL(&memo));
memo.clear();
// load graphs to debugger.
if (debugger_) {
LoadGraphsToDbg(NOT_NULL(root_graph), NOT_NULL(&memo));
}
memo.clear();

UpdateRefOutputMap(NOT_NULL(root_graph), NOT_NULL(&memo));
memo.clear();
@@ -191,7 +196,7 @@ GraphId AscendSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {
// build kernel
BuildKernel(root_graph);
if (debugger_ && debugger_->partial_memory()) {
debugger_->PreExecute(root_graph);
debugger_->PreExecute(root_graph, graph_sum_);
}
SetSummaryNodes(root_graph.get());
// Alloc memory for child graph's inputs
@@ -271,7 +276,7 @@ void AscendSession::BuildGraphImpl(GraphId graph_id) {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (debugger_ && debugger_->partial_memory()) {
debugger_->PreExecute(graph);
debugger_->PreExecute(graph, graph_sum_);
}
if (ms_context->get_param<bool>(MS_CTX_PRECOMPILE_ONLY)) {
MS_LOG(INFO) << "Precompile only, stop in build kernel step";
@@ -329,7 +334,7 @@ void AscendSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tens
// load input data from user input
LoadInputData(kernel_graph, inputs);
if (debugger_) {
debugger_->PreExecute(kernel_graph);
debugger_->PreExecute(kernel_graph, graph_sum_);
}
#if ENABLE_CPU && ENABLE_D
// Initialize parameter server
@@ -962,6 +967,23 @@ void AscendSession::HardwareOptimize(NotNull<KernelGraphPtr> graph,
MS_LOG(INFO) << "Finish doing HardwareOptimize in graph: " << graph->graph_id();
}

// Recursively register a kernel graph and all of its child graphs with the
// debugger. `memo` tracks already-visited graphs so shared subgraphs are
// registered only once.
void AscendSession::LoadGraphsToDbg(NotNull<KernelGraphPtr> graph,
                                    NotNull<std::set<KernelGraphPtr> *> const memo) const {
  // insert() reports prior membership, so a single call both checks and marks.
  if (!memo->insert(graph.get()).second) {
    return;
  }

  MS_LOG(INFO) << "Start to do LoadGraphsToDbg in graph: " << graph->graph_id();

  debugger_->LoadGraphs(graph);
  MS_LOG(INFO) << "graph_sum_: " << graph_sum_;
  // Walk every child graph so the debugger sees the whole graph hierarchy.
  for (auto &child : graph->child_graph_order()) {
    LoadGraphsToDbg(NOT_NULL(child.lock()), memo);
  }
  MS_LOG(INFO) << "Finish doing LoadGraphsToDbg in graph: " << graph->graph_id();
}

void AscendSession::AssignStaticMemory(NotNull<KernelGraphPtr> graph,
NotNull<std::set<KernelGraphPtr> *> const memo) const {
if (memo->find(graph) != memo->end()) {


+ 1
- 0
mindspore/ccsrc/backend/session/ascend_session.h View File

@@ -125,6 +125,7 @@ class AscendSession : public SessionBasic {
size_t *const raise_precision_count, size_t *const reduce_precision_count) const;
void IrFusionPass(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo);
void HardwareOptimize(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
void LoadGraphsToDbg(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
void AssignStaticMemory(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
void UpdateRefOutputMap(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;



+ 10
- 1
mindspore/ccsrc/backend/session/gpu_session.cc View File

@@ -333,12 +333,21 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr
}
// Alloc memory, including static memory and dynamic memory
AllocateMemory(graph.get());

#ifdef ENABLE_DEBUGGER
if (debugger_) {
debugger_->LoadGraphs(graph);
}
#endif
MS_LOG(INFO) << "CompileGraph graph_id: " << graph_id;

return graph_id;
}

void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *outputs) {
auto &kernel_graph = graphs_[graph_id];
MS_LOG(INFO) << "RunGraph graph_id: " << graph_id;
// Load input data from user input
LoadInputData(kernel_graph, inputs);
PreIterationDbg(kernel_graph);
@@ -414,7 +423,7 @@ bool GPUSession::DumpDataEnabledIteration() const {

void GPUSession::PreIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
if (debugger_) {
debugger_->PreExecute(kernel_graph);
debugger_->PreExecute(kernel_graph, graph_sum_);
}
PreLoadTensor(kernel_graph);
}


+ 6
- 2
mindspore/ccsrc/debug/debugger/debug_grpc.proto View File

@@ -26,6 +26,7 @@ service EventListener {
rpc SendGraph (stream Chunk) returns (EventReply) {};
rpc SendTensors (stream TensorProto) returns (EventReply) {};
rpc SendWatchpointHits (stream WatchpointHit) returns (EventReply) {};
rpc SendMultiGraphs (stream Chunk) returns (EventReply) {};
}

message Metadata {
@@ -36,11 +37,14 @@ message Metadata {
// the full name of current node
string cur_node = 4;
// check if training is done.
bool training_done = 5;
bool training_done = 5;
// the number of total graphs
int32 graph_num = 6;
}

message Chunk {
bytes buffer = 1;
bytes buffer = 1;
bool finished = 2;
}

message EventReply {


+ 117
- 11
mindspore/ccsrc/debug/debugger/debugger.cc View File

@@ -34,6 +34,7 @@
#include "debug/data_dump/e2e_dump_util.h"
#include "utils/config_manager.h"

using debugger::Chunk;
using debugger::EventReply;
using debugger::GraphProto;
using debugger::ModelProto;
@@ -69,7 +70,8 @@ Debugger::Debugger()
partial_memory_(false),
last_overflow_bin_(0),
overflow_bin_path_(""),
initial_suspend_(true) {
initial_suspend_(true),
not_dataset_graph_sum_(0) {
if (CheckDebuggerEnabled()) {
// configure partial memory reuse
partial_memory_ = CheckDebuggerPartialMemoryEnabled();
@@ -259,12 +261,47 @@ void Debugger::Reset() {
stream_task_to_opname_.clear();
}

void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
CheckDatasetSinkMode();
if (debugger_->DebuggerBackendEnabled()) {
// check and save graph_ptr, suspend if graph is new
auto graph_id = graph_ptr->graph_id();
// collect rungraph_ids to update the step number in the multigraph case
if (!rungraph_id_list_.size()) {
rungraph_id_list_.push_back(graph_id);

} else {
if (std::find(rungraph_id_list_.begin(), rungraph_id_list_.end(), graph_id) == rungraph_id_list_.end()) {
rungraph_id_list_.push_back(graph_id);
}
}
// check and save graph_ptr, suspend if graph is new
MS_LOG(INFO) << "total number graph: " << graph_sum;
// multiple graphs
if (graph_sum > 1) {
// there are more than one graphs are not dataset_graph
if (not_dataset_graph_sum_ > 0) {
// only try to enable debugger if they are not all dataset graphs
if (!debugger_enabled_) {
EnableDebugger();
}

if (debugger_enabled_) {
if (graph_proto_list_.size()) {
// only send compiled graphs once.
SendMultiGraphsAndSuspend(graph_proto_list_, graph_sum);
graph_proto_list_.clear();
} else if (graph_id == rungraph_id_list_.front()) {
// stop only when receive the first sub run graph for each step
CommandLoop();
}
}
}
} else if (graph_proto_list_.size() == 1) {
// In single graph case, reset graph_ptr_ to be nullptr for the initial step
if (num_step_ == 0) {
graph_ptr_ = nullptr;
}
CheckGraphPtr(graph_ptr);
}
}
@@ -346,20 +383,38 @@ void Debugger::SetStreamTaskToOpnameMap(const std::map<std::pair<uint32_t, uint3
stream_task_to_opname_ = mapping;
}

void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) {
if (graph_ptr_ != graph_ptr) {
MS_LOG(INFO) << "Debugger got new graph: " << graph_ptr->graph_id();
MS_LOG(INFO) << "LoadGraphs Debugger got new graph: " << graph_ptr->graph_id();
// save new graph_ptr
graph_ptr_ = graph_ptr;
// check if it is dataset graph
CheckDatasetGraph();
if (!is_dataset_graph_) {
// get proto for new graph_ptr
auto graph_proto = GetGraphProto(graph_ptr);
// add new graph proto to graph_proto_list_
graph_proto_list_.push_back(graph_proto);
not_dataset_graph_sum_++;
}
// reset is_dataset_graph to be false
is_dataset_graph_ = false;
}
}

// In single graph cases, check single graph ptr
void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
if (graph_ptr_ != graph_ptr) {
MS_LOG(INFO) << "CheckGraphPtr Debugger got new graph: " << graph_ptr->graph_id();
// save new graph_ptr
graph_ptr_ = graph_ptr;
if (!is_dataset_graph_) {
// only try to enable debugger if it is not a dataset graph
EnableDebugger();
if (debugger_enabled_) {
LoadParametersAndConst();
// get graph proto and send to mindinsight
SendGraphAndSuspend(GetGraphProto());
auto graph_proto = graph_proto_list_.front();
SendGraphAndSuspend(graph_proto);
}
}
}
@@ -386,7 +441,7 @@ void Debugger::CheckDatasetGraph() {
is_dataset_graph_ = false;
}

GraphProto Debugger::GetGraphProto() const {
GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const {
// convert kernel graph to debugger modelproto
ModelProto model = GetDebuggerFuncGraphProto(graph_ptr_);
return model.graph();
@@ -413,12 +468,49 @@ void Debugger::SendMetadata() {
metadata.set_cur_node(cur_name_);
metadata.set_training_done(training_done_);
MS_LOG(INFO) << "Is training done?" << training_done_;
// set graph number to not_dataset_graph_sum_
metadata.set_graph_num(not_dataset_graph_sum_);
EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
if (reply_metadata.status() != reply_metadata.OK) {
MS_LOG(ERROR) << "Error: SendMetadata failed";
}
}

void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_proto_list, uint32_t graph_sum) {
SendMetadata();
// send multiple graphs to mindinght server
// split graph into chunks if one graph is larger than chunk size
std::list<Chunk> chunked_graph_proto_list;
Chunk chunk;
for (auto graph : graph_proto_list) {
std::string str = graph.SerializeAsString();
auto graph_size = graph.ByteSize();
if (graph_size > CHUNK_SIZE) {
auto sub_graph_str = grpc_client_->ChunkString(str, graph_size);
for (unsigned int i = 0; i < sub_graph_str.size(); i++) {
chunk.set_buffer(sub_graph_str[i]);
chunked_graph_proto_list.push_back(chunk);
if (i < sub_graph_str.size() - 1) {
chunk.set_finished(false);
} else {
chunk.set_finished(true);
chunked_graph_proto_list.push_back(chunk);
}
}
} else {
chunk.set_buffer(str);
chunk.set_finished(true);
chunked_graph_proto_list.push_back(chunk);
}
}
EventReply reply = grpc_client_->SendMultiGraphs(chunked_graph_proto_list);
if (reply.status() != reply.OK) {
MS_LOG(ERROR) << "Error: SendGraph failed";
}
// enter command loop, wait and process commands
CommandLoop();
}

void Debugger::CommandLoop() {
// prepare metadata
std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
@@ -923,6 +1015,8 @@ bool Debugger::CheckPort(const char *port) {
return true;
}

uint32_t Debugger::GetFirstRunGraphId() { return rungraph_id_list_.front(); }

void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index) {
MS_EXCEPTION_IF_NULL(anf_node);
if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) {
@@ -996,6 +1090,13 @@ void Debugger::LoadGraphOutputs() {
}
}
for (size_t j = 0; j < output_size; ++j) {
auto kernel_info = static_cast<device::KernelInfo *>(node->kernel_info());
MS_EXCEPTION_IF_NULL(kernel_info);
auto addr_test = kernel_info->GetOutputAddr(j);
if (addr_test == nullptr) {
MS_LOG(INFO) << "Cannot find output addr for slot " << j << " for " << kernel_name;
continue;
}
auto addr = AnfAlgo::GetOutputAddr(node, j);
MS_EXCEPTION_IF_NULL(addr);
auto type = AnfAlgo::GetOutputInferDataType(node, j);
@@ -1015,9 +1116,14 @@ void Debugger::LoadGraphOutputs() {
}
}

void Debugger::UpdateStepNum() {
if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()))
// Increment the debugger step counter on GPU when the debugger or e2e dump is
// active. In the multigraph case only the FIRST run graph of an iteration
// bumps the counter, so one training step is counted once.
void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  // Guard against an empty rungraph_id_list_ (e.g. dump enabled while the
  // debugger backend never ran PreExecute): front() on an empty list is UB.
  // An empty list falls back to counting every call, the single-graph behavior.
  if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()) &&
      (rungraph_id_list_.empty() || graph->graph_id() == rungraph_id_list_.front())) {
    // access lock for public method
    std::lock_guard<std::mutex> a_lock(access_lock_);
    ++num_step_;
  }
}

void Debugger::ClearCurrentData() {


+ 13
- 3
mindspore/ccsrc/debug/debugger/debugger.h View File

@@ -68,7 +68,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// enable debugger
// send graph and wait for command
// do nothing if graph is set already
void PreExecute(const KernelGraphPtr &graph_ptr);
void PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum = 1);

// analyze tensors and wait for command
// don't need a graph_ptr because it is saved during pre_execute
@@ -106,7 +106,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

void LoadParametersAndConst();

void UpdateStepNum();
void UpdateStepNum(const session::KernelGraph *graph);

void ClearCurrentData();

@@ -114,6 +114,10 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

void CheckDatasetSinkMode();

void LoadGraphs(const KernelGraphPtr &graph_ptr);

uint32_t GetFirstRunGraphId();

private:
// private constructor for singleton
Debugger();
@@ -138,11 +142,13 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
void CheckDatasetGraph();

// serialize graph and get proto
GraphProto GetGraphProto() const;
GraphProto GetGraphProto(const KernelGraphPtr &graph_ptr) const;

// send graph and enter command wait loop
void SendGraphAndSuspend(const GraphProto &graph_proto);

void SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_proto_list, uint32_t graph_sum);

// wait for command and process command
// send command request and process reply in a loop
// break if RunCMD
@@ -197,9 +203,13 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
std::string overflow_bin_path_;
// flag to keep track of the very first suspension of debugger
bool initial_suspend_;
std::list<GraphProto> graph_proto_list_;

// singleton
static std::mutex instance_lock_;
static std::shared_ptr<Debugger> debugger_;
uint32_t not_dataset_graph_sum_;
std::list<uint32_t> rungraph_id_list_;
};

using DebuggerPtr = std::shared_ptr<Debugger>;


+ 23
- 1
mindspore/ccsrc/debug/debugger/grpc_client.cc View File

@@ -69,7 +69,7 @@ EventReply GrpcClient::SendMetadata(const Metadata &metadata) {
return reply;
}

std::vector<std::string> ChunkString(std::string str, int graph_size) {
std::vector<std::string> GrpcClient::ChunkString(std::string str, int graph_size) {
std::vector<std::string> buf;
int size_iter = 0;
while (size_iter < graph_size) {
@@ -118,6 +118,28 @@ EventReply GrpcClient::SendGraph(const GraphProto &graph) {
return reply;
}

// Stream a list of graph chunks to the EventListener service and return the
// server's EventReply. On RPC failure the reply status is forced to FAILED.
EventReply GrpcClient::SendMultiGraphs(const std::list<Chunk> &chunks) {
  EventReply reply;
  grpc::ClientContext context;

  // Client-streaming call: one Write per chunk, then WritesDone + Finish.
  std::unique_ptr<grpc::ClientWriter<Chunk> > stream(stub_->SendMultiGraphs(&context, &reply));
  for (const auto &piece : chunks) {
    // Stop early if the server closed the stream.
    if (!stream->Write(piece)) break;
    // Brief pause between chunks to avoid flooding the server.
    std::this_thread::sleep_for(std::chrono::milliseconds(1));
  }
  stream->WritesDone();
  const grpc::Status status = stream->Finish();

  if (status.ok()) {
    return reply;
  }
  MS_LOG(ERROR) << "RPC failed: SendMultigraphs";
  MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
  reply.set_status(EventReply_Status_FAILED);
  return reply;
}

EventReply GrpcClient::SendTensors(const std::list<TensorProto> &tensors) {
EventReply reply;
grpc::ClientContext context;


+ 6
- 0
mindspore/ccsrc/debug/debugger/grpc_client.h View File

@@ -19,9 +19,11 @@
#include <grpcpp/grpcpp.h>
#include <string>
#include <list>
#include <vector>
#include <memory>
#include "proto/debug_grpc.grpc.pb.h"

using debugger::Chunk;
using debugger::EventListener;
using debugger::EventReply;
using debugger::GraphProto;
@@ -52,8 +54,12 @@ class GrpcClient {

EventReply SendTensors(const std::list<TensorProto> &tensors);

EventReply SendMultiGraphs(const std::list<Chunk> &chunks);

EventReply SendWatchpointHits(const std::list<WatchpointHit> &watchpoints);

std::vector<std::string> ChunkString(std::string str, int graph_size);

private:
std::unique_ptr<EventListener::Stub> stub_;
};


+ 3
- 0
mindspore/ccsrc/debug/debugger/proto_exporter.cc View File

@@ -354,6 +354,8 @@ void DebuggerProtoExporter::ExportFuncGraph(const FuncGraphPtr &func_graph, debu
// set graph name
graph_proto->set_name(func_graph->ToString());

MS_LOG(INFO) << "graph names: " << func_graph->ToString();

ExportParameters(func_graph, graph_proto);

ExportCNodes(func_graph, graph_proto, &const_map);
@@ -433,6 +435,7 @@ void DebuggerProtoExporter::ExportCNode(const FuncGraphPtr &func_graph, const CN

// add full_name for debugger
node_proto->set_full_name(node->fullname_with_scope());
MS_LOG(INFO) << "full_name: " << node->fullname_with_scope();

// process OP inputs
for (size_t i = 1; i < inputs.size(); ++i) {


+ 2
- 2
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc View File

@@ -577,8 +577,8 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
AllocInplaceNodeMemory(graph);

bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
if (!mock) {
debugger_->UpdateStepNum();
if (!mock && debugger_) {
debugger_->UpdateStepNum(graph);
}
auto &kernels = graph->execution_order();
int exec_order = 1;


Loading…
Cancel
Save