Merge pull request !6515 from lichen_101010/terminate_deadlock_issue tags/v1.1.0
| @@ -102,7 +102,8 @@ void Executor::CheckException() { | |||||
| } | } | ||||
| void Executor::WorkerJoin() { | void Executor::WorkerJoin() { | ||||
| if (worker_->joinable()) { | |||||
| // Prevent the worker thread from joining itself, which would cause a deadlock | |||||
| if (worker_->joinable() && worker_->get_id() != std::this_thread::get_id()) { | |||||
| { | { | ||||
| std::unique_lock<std::mutex> lock(task_mutex_); | std::unique_lock<std::mutex> lock(task_mutex_); | ||||
| auto task = std::make_shared<ExitTask>(); | auto task = std::make_shared<ExitTask>(); | ||||
| @@ -444,6 +444,8 @@ void Debugger::CommandLoop() { | |||||
| case DebuggerCommand::kExitCMD: | case DebuggerCommand::kExitCMD: | ||||
| MS_LOG(INFO) << "ExitCMD"; | MS_LOG(INFO) << "ExitCMD"; | ||||
| Exit(); | Exit(); | ||||
| // Used for debugger termination | |||||
| run = true; | |||||
| break; | break; | ||||
| case DebuggerCommand::kRunCMD: | case DebuggerCommand::kRunCMD: | ||||
| MS_LOG(INFO) << "RunCMD"; | MS_LOG(INFO) << "RunCMD"; | ||||
| @@ -594,8 +596,18 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten | |||||
| void Debugger::Exit() { | void Debugger::Exit() { | ||||
| // clear resource before exit | // clear resource before exit | ||||
| pipeline::ClearResAtexit(); | |||||
| std::exit(EXIT_FAILURE); | |||||
| // For node level, the debugger has to exit by itself because the main thread can only exit at a step boundary; | |||||
| // For step level, debugger will notify main thread to exit; | |||||
| if (run_level_ == "node") { | |||||
| pipeline::ClearResAtexit(); | |||||
| exit(1); | |||||
| } else if (run_level_ == "step") { | |||||
| // Notify main thread to terminate | |||||
| pipeline::ExecutorPy::DebugTerminate(true); | |||||
| } else { | |||||
| pipeline::ClearResAtexit(); | |||||
| exit(1); | |||||
| } | |||||
| } | } | ||||
| std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode) { | std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode) { | ||||
| @@ -76,6 +76,7 @@ const char IR_TYPE_MINDIR[] = "mind_ir"; | |||||
| ExecutorPyPtr ExecutorPy::executor_ = nullptr; | ExecutorPyPtr ExecutorPy::executor_ = nullptr; | ||||
| std::mutex ExecutorPy::instance_lock_; | std::mutex ExecutorPy::instance_lock_; | ||||
| bool ExecutorPy::debugger_terminate_ = false; | |||||
| std::unordered_map<abstract::AbstractBasePtrList, int, abstract::AbstractBasePtrListHasher, | std::unordered_map<abstract::AbstractBasePtrList, int, abstract::AbstractBasePtrListHasher, | ||||
| abstract::AbstractBasePtrListEqual> | abstract::AbstractBasePtrListEqual> | ||||
| @@ -748,7 +749,17 @@ void ExecutorPy::ProcessVmArg(const py::tuple &args, const std::string &phase, V | |||||
| ProcessVmArgInner(args, GetResource(phase), arg_list); | ProcessVmArgInner(args, GetResource(phase), arg_list); | ||||
| } | } | ||||
| void ExecutorPy::TerminateDebugger() { | |||||
| if (debugger_terminate_) { | |||||
| MS_LOG(INFO) << "Terminate debugger and clear resources!"; | |||||
| ClearResAtexit(); | |||||
| exit(1); | |||||
| } | |||||
| } | |||||
| py::object ExecutorPy::Run(const py::tuple &args, const py::object &phase) { | py::object ExecutorPy::Run(const py::tuple &args, const py::object &phase) { | ||||
| // The MindSpore debugger notifies the main thread to exit after one step, so the next step will not run | |||||
| TerminateDebugger(); | |||||
| std::size_t size = args.size(); | std::size_t size = args.size(); | ||||
| if (!py::isinstance<py::str>(phase)) { | if (!py::isinstance<py::str>(phase)) { | ||||
| MS_LOG(EXCEPTION) << "Run failed, phase input is not a str"; | MS_LOG(EXCEPTION) << "Run failed, phase input is not a str"; | ||||
| @@ -97,6 +97,9 @@ class ExecutorPy : public std::enable_shared_from_this<ExecutorPy> { | |||||
| void DelNetRes(const std::string &id); | void DelNetRes(const std::string &id); | ||||
| void ReleaseResource(const py::object &phase); | void ReleaseResource(const py::object &phase); | ||||
| static void ClearRes(); | static void ClearRes(); | ||||
| static bool GetDebugTerminate() { return debugger_terminate_; } | |||||
| static void DebugTerminate(bool val) { debugger_terminate_ = val; } | |||||
| void TerminateDebugger(); | |||||
| std::map<std::string, std::pair<PrimitivePyPtr, std::string>> FetchInfoForQuantExport(const std::string &phase_s); | std::map<std::string, std::pair<PrimitivePyPtr, std::string>> FetchInfoForQuantExport(const std::string &phase_s); | ||||
| @@ -111,6 +114,7 @@ class ExecutorPy : public std::enable_shared_from_this<ExecutorPy> { | |||||
| std::map<std::string, ExecutorInfoPtr> info_; | std::map<std::string, ExecutorInfoPtr> info_; | ||||
| static std::shared_ptr<ExecutorPy> executor_; | static std::shared_ptr<ExecutorPy> executor_; | ||||
| static std::mutex instance_lock_; | static std::mutex instance_lock_; | ||||
| static bool debugger_terminate_; | |||||
| }; | }; | ||||
| using ExecutorPyPtr = std::shared_ptr<ExecutorPy>; | using ExecutorPyPtr = std::shared_ptr<ExecutorPy>; | ||||
| @@ -125,7 +129,6 @@ void InitHccl(); | |||||
| void FinalizeHccl(); | void FinalizeHccl(); | ||||
| void InitBackend(); | void InitBackend(); | ||||
| void FinalizeBackend(); | void FinalizeBackend(); | ||||
| void ClearResAtexit(); | void ClearResAtexit(); | ||||
| void ReleaseGeTsd(); | void ReleaseGeTsd(); | ||||