From 5713ea38b81549015c84d231de9bdd6a9c86bbb3 Mon Sep 17 00:00:00 2001
From: lichen_101010
Date: Mon, 14 Sep 2020 19:37:03 -0400
Subject: [PATCH] resolve terminate issue
terminate issue part 2
clean up code
first draft of deadlock issue
test
resolve terminate deadlock issue
add node level condition
resolve terminate issue
terminate issue part 2
clean up code
first draft of deadlock issue
test
resolve terminate deadlock issue
add node level condition
cleanup code and CI checks
rebase to master and address conflicts
refactor code
fix a bug
---
mindspore/ccsrc/backend/session/executor.cc | 3 ++-
mindspore/ccsrc/debug/debugger/debugger.cc | 16 ++++++++++++++--
mindspore/ccsrc/pipeline/jit/pipeline.cc | 11 +++++++++++
mindspore/ccsrc/pipeline/jit/pipeline.h | 5 ++++-
4 files changed, 31 insertions(+), 4 deletions(-)
diff --git a/mindspore/ccsrc/backend/session/executor.cc b/mindspore/ccsrc/backend/session/executor.cc
index ce2dd16b62..855d8b53a0 100644
--- a/mindspore/ccsrc/backend/session/executor.cc
+++ b/mindspore/ccsrc/backend/session/executor.cc
@@ -102,7 +102,8 @@ void Executor::CheckException() {
}
void Executor::WorkerJoin() {
- if (worker_->joinable()) {
+ // Avoid the worker thread joining itself, which would cause a deadlock
+ if (worker_->joinable() && worker_->get_id() != std::this_thread::get_id()) {
{
std::unique_lock lock(task_mutex_);
auto task = std::make_shared();
diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc
index 5802e9cf4f..5df3b5a61a 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -428,6 +428,8 @@ void Debugger::CommandLoop() {
case DebuggerCommand::kExitCMD:
MS_LOG(INFO) << "ExitCMD";
Exit();
+ // Used for debugger termination
+ run = true;
break;
case DebuggerCommand::kRunCMD:
MS_LOG(INFO) << "RunCMD";
@@ -578,8 +580,18 @@ std::list Debugger::LoadTensors(const ProtoVector &ten
void Debugger::Exit() {
// clear resource before exit
- pipeline::ClearResAtexit();
- std::exit(EXIT_FAILURE);
+ // At node level, the debugger has to exit by itself because the main thread can only exit at a step boundary.
+ // At step level, the debugger notifies the main thread to exit.
+ if (run_level_ == "node") {
+ pipeline::ClearResAtexit();
+ exit(1);
+ } else if (run_level_ == "step") {
+ // Notify main thread to terminate
+ pipeline::ExecutorPy::DebugTerminate(true);
+ } else {
+ pipeline::ClearResAtexit();
+ exit(1);
+ }
}
std::list Debugger::CheckWatchpoints(const std::string &watchnode) {
diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.cc b/mindspore/ccsrc/pipeline/jit/pipeline.cc
index 4fcfdd9245..8b0a002eba 100644
--- a/mindspore/ccsrc/pipeline/jit/pipeline.cc
+++ b/mindspore/ccsrc/pipeline/jit/pipeline.cc
@@ -76,6 +76,7 @@ const char IR_TYPE_MINDIR[] = "mind_ir";
ExecutorPyPtr ExecutorPy::executor_ = nullptr;
std::mutex ExecutorPy::instance_lock_;
+bool ExecutorPy::debugger_terminate_ = false;
std::unordered_map
@@ -748,7 +749,17 @@ void ExecutorPy::ProcessVmArg(const py::tuple &args, const std::string &phase, V
ProcessVmArgInner(args, GetResource(phase), arg_list);
}
+void ExecutorPy::TerminateDebugger() {
+ if (debugger_terminate_) {
+ MS_LOG(INFO) << "Terminate debugger and clear resources!";
+ ClearResAtexit();
+ exit(1);
+ }
+}
+
py::object ExecutorPy::Run(const py::tuple &args, const py::object &phase) {
+ // The MindSpore debugger notifies the main thread to exit after one step, so the next step is not run
+ TerminateDebugger();
std::size_t size = args.size();
if (!py::isinstance(phase)) {
MS_LOG(EXCEPTION) << "Run failed, phase input is not a str";
diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.h b/mindspore/ccsrc/pipeline/jit/pipeline.h
index 23f7dbf220..53adefd0d8 100644
--- a/mindspore/ccsrc/pipeline/jit/pipeline.h
+++ b/mindspore/ccsrc/pipeline/jit/pipeline.h
@@ -97,6 +97,9 @@ class ExecutorPy : public std::enable_shared_from_this {
void DelNetRes(const std::string &id);
void ReleaseResource(const py::object &phase);
static void ClearRes();
+ static bool GetDebugTerminate() { return debugger_terminate_; }
+ static void DebugTerminate(bool val) { debugger_terminate_ = val; }
+ void TerminateDebugger();
std::map> FetchInfoForQuantExport(const std::string &phase_s);
@@ -111,6 +114,7 @@ class ExecutorPy : public std::enable_shared_from_this {
std::map info_;
static std::shared_ptr executor_;
static std::mutex instance_lock_;
+ static bool debugger_terminate_;
};
using ExecutorPyPtr = std::shared_ptr;
@@ -125,7 +129,6 @@ void InitHccl();
void FinalizeHccl();
void InitBackend();
void FinalizeBackend();
-
void ClearResAtexit();
void ReleaseGeTsd();