diff --git a/mindspore/ccsrc/backend/session/executor.cc b/mindspore/ccsrc/backend/session/executor.cc index ce2dd16b62..855d8b53a0 100644 --- a/mindspore/ccsrc/backend/session/executor.cc +++ b/mindspore/ccsrc/backend/session/executor.cc @@ -102,7 +102,8 @@ void Executor::CheckException() { } void Executor::WorkerJoin() { - if (worker_->joinable()) { + // Avoid the worker thread joining itself, which would cause a deadlock + if (worker_->joinable() && worker_->get_id() != std::this_thread::get_id()) { { std::unique_lock lock(task_mutex_); auto task = std::make_shared(); diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 4d09df8f84..f9410a132c 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -444,6 +444,8 @@ void Debugger::CommandLoop() { case DebuggerCommand::kExitCMD: MS_LOG(INFO) << "ExitCMD"; Exit(); + // Used for debugger termination + run = true; break; case DebuggerCommand::kRunCMD: MS_LOG(INFO) << "RunCMD"; @@ -594,8 +596,18 @@ std::list Debugger::LoadTensors(const ProtoVector &ten void Debugger::Exit() { // clear resource before exit - pipeline::ClearResAtexit(); - std::exit(EXIT_FAILURE); + // For node level, the debugger has to exit by itself because the main thread can only exit at a step boundary; + // For step level, the debugger notifies the main thread to exit; + if (run_level_ == "node") { + pipeline::ClearResAtexit(); + exit(1); + } else if (run_level_ == "step") { + // Notify the main thread to terminate + pipeline::ExecutorPy::DebugTerminate(true); + } else { + pipeline::ClearResAtexit(); + exit(1); + } } std::list Debugger::CheckWatchpoints(const std::string &watchnode) { diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.cc b/mindspore/ccsrc/pipeline/jit/pipeline.cc index 4fcfdd9245..8b0a002eba 100644 --- a/mindspore/ccsrc/pipeline/jit/pipeline.cc +++ b/mindspore/ccsrc/pipeline/jit/pipeline.cc @@ -76,6 +76,7 @@ const char IR_TYPE_MINDIR[] = "mind_ir"; 
ExecutorPyPtr ExecutorPy::executor_ = nullptr; std::mutex ExecutorPy::instance_lock_; +bool ExecutorPy::debugger_terminate_ = false; std::unordered_map @@ -748,7 +749,17 @@ void ExecutorPy::ProcessVmArg(const py::tuple &args, const std::string &phase, V ProcessVmArgInner(args, GetResource(phase), arg_list); } +void ExecutorPy::TerminateDebugger() { + if (debugger_terminate_) { + MS_LOG(INFO) << "Terminate debugger and clear resources!"; + ClearResAtexit(); + exit(1); + } +} + py::object ExecutorPy::Run(const py::tuple &args, const py::object &phase) { + // The MindSpore debugger notifies the main thread to exit after one step, so the next step will not run + TerminateDebugger(); std::size_t size = args.size(); if (!py::isinstance(phase)) { MS_LOG(EXCEPTION) << "Run failed, phase input is not a str"; diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.h b/mindspore/ccsrc/pipeline/jit/pipeline.h index 23f7dbf220..53adefd0d8 100644 --- a/mindspore/ccsrc/pipeline/jit/pipeline.h +++ b/mindspore/ccsrc/pipeline/jit/pipeline.h @@ -97,6 +97,9 @@ class ExecutorPy : public std::enable_shared_from_this { void DelNetRes(const std::string &id); void ReleaseResource(const py::object &phase); static void ClearRes(); + static bool GetDebugTerminate() { return debugger_terminate_; } + static void DebugTerminate(bool val) { debugger_terminate_ = val; } + void TerminateDebugger(); std::map> FetchInfoForQuantExport(const std::string &phase_s); @@ -111,6 +114,7 @@ class ExecutorPy : public std::enable_shared_from_this { std::map info_; static std::shared_ptr executor_; static std::mutex instance_lock_; + static bool debugger_terminate_; }; using ExecutorPyPtr = std::shared_ptr; @@ -125,7 +129,6 @@ void InitHccl(); void FinalizeHccl(); void InitBackend(); void FinalizeBackend(); - void ClearResAtexit(); void ReleaseGeTsd();