From 5713ea38b81549015c84d231de9bdd6a9c86bbb3 Mon Sep 17 00:00:00 2001 From: lichen_101010 Date: Mon, 14 Sep 2020 19:37:03 -0400 Subject: [PATCH] resolve terminate issue terminate issue part 2 clean up code first draft of deadlock issue test resolve terminate deadlock issue add node level condition resolve terminate issue terminate issue part 2 clean up code first draft of deadlock issue test resolve terminate deadlock issue add node level condition cleanup code and CI checks rebase to master and address conflicts refactor code fix a bug --- mindspore/ccsrc/backend/session/executor.cc | 3 ++- mindspore/ccsrc/debug/debugger/debugger.cc | 16 ++++++++++++++-- mindspore/ccsrc/pipeline/jit/pipeline.cc | 11 +++++++++++ mindspore/ccsrc/pipeline/jit/pipeline.h | 5 ++++- 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/mindspore/ccsrc/backend/session/executor.cc b/mindspore/ccsrc/backend/session/executor.cc index ce2dd16b62..855d8b53a0 100644 --- a/mindspore/ccsrc/backend/session/executor.cc +++ b/mindspore/ccsrc/backend/session/executor.cc @@ -102,7 +102,8 @@ void Executor::CheckException() { } void Executor::WorkerJoin() { - if (worker_->joinable()) { + // Avoid worker thread join itself which will cause deadlock + if (worker_->joinable() && worker_->get_id() != std::this_thread::get_id()) { { std::unique_lock lock(task_mutex_); auto task = std::make_shared(); diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 5802e9cf4f..5df3b5a61a 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -428,6 +428,8 @@ void Debugger::CommandLoop() { case DebuggerCommand::kExitCMD: MS_LOG(INFO) << "ExitCMD"; Exit(); + // Used for debugger termination + run = true; break; case DebuggerCommand::kRunCMD: MS_LOG(INFO) << "RunCMD"; @@ -578,8 +580,18 @@ std::list Debugger::LoadTensors(const ProtoVector &ten void Debugger::Exit() { // clear resource before exit - 
pipeline::ClearResAtexit(); - std::exit(EXIT_FAILURE); + // For node level, debugger has to exit itself because main thread can only exit in step boundary; + // For step level, debugger will notify main thread to exit; + if (run_level_ == "node") { + pipeline::ClearResAtexit(); + exit(1); + } else if (run_level_ == "step") { + // Notify main thread to terminate + pipeline::ExecutorPy::DebugTerminate(true); + } else { + pipeline::ClearResAtexit(); + exit(1); + } } std::list Debugger::CheckWatchpoints(const std::string &watchnode) { diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.cc b/mindspore/ccsrc/pipeline/jit/pipeline.cc index 4fcfdd9245..8b0a002eba 100644 --- a/mindspore/ccsrc/pipeline/jit/pipeline.cc +++ b/mindspore/ccsrc/pipeline/jit/pipeline.cc @@ -76,6 +76,7 @@ const char IR_TYPE_MINDIR[] = "mind_ir"; ExecutorPyPtr ExecutorPy::executor_ = nullptr; std::mutex ExecutorPy::instance_lock_; +bool ExecutorPy::debugger_terminate_ = false; std::unordered_map @@ -748,7 +749,17 @@ void ExecutorPy::ProcessVmArg(const py::tuple &args, const std::string &phase, V ProcessVmArgInner(args, GetResource(phase), arg_list); } +void ExecutorPy::TerminateDebugger() { + if (debugger_terminate_) { + MS_LOG(INFO) << "Terminate debugger and clear resources!"; + ClearResAtexit(); + exit(1); + } +} + py::object ExecutorPy::Run(const py::tuple &args, const py::object &phase) { + // MindSpore debugger notifies the main thread to exit after one step, and the next step will not run + TerminateDebugger(); std::size_t size = args.size(); if (!py::isinstance(phase)) { MS_LOG(EXCEPTION) << "Run failed, phase input is not a str"; diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.h b/mindspore/ccsrc/pipeline/jit/pipeline.h index 23f7dbf220..53adefd0d8 100644 --- a/mindspore/ccsrc/pipeline/jit/pipeline.h +++ b/mindspore/ccsrc/pipeline/jit/pipeline.h @@ -97,6 +97,9 @@ class ExecutorPy : public std::enable_shared_from_this { void DelNetRes(const std::string &id); void ReleaseResource(const py::object 
&phase); static void ClearRes(); + static bool GetDebugTerminate() { return debugger_terminate_; } + static void DebugTerminate(bool val) { debugger_terminate_ = val; } + void TerminateDebugger(); std::map> FetchInfoForQuantExport(const std::string &phase_s); @@ -111,6 +114,7 @@ class ExecutorPy : public std::enable_shared_from_this { std::map info_; static std::shared_ptr executor_; static std::mutex instance_lock_; + static bool debugger_terminate_; }; using ExecutorPyPtr = std::shared_ptr; @@ -125,7 +129,6 @@ void InitHccl(); void FinalizeHccl(); void InitBackend(); void FinalizeBackend(); - void ClearResAtexit(); void ReleaseGeTsd();