Browse Source

resolve terminate issue

terminate issue part 2

clean up code

first draft of deadlock issue

test

resolve terminate deadlock issue

add node level condition

resolve terminate issue

terminate issue part 2

clean up code

first draft of deadlock issue

test

resolve terminate deadlock issue

add node level condition

cleanup code and CI checks

rebase to master and address conflicts

refactor code

fix a bug
tags/v1.1.0
lichen_101010 5 years ago
parent
commit
5713ea38b8
4 changed files with 31 additions and 4 deletions
  1. +2
    -1
      mindspore/ccsrc/backend/session/executor.cc
  2. +14
    -2
      mindspore/ccsrc/debug/debugger/debugger.cc
  3. +11
    -0
      mindspore/ccsrc/pipeline/jit/pipeline.cc
  4. +4
    -1
      mindspore/ccsrc/pipeline/jit/pipeline.h

+ 2
- 1
mindspore/ccsrc/backend/session/executor.cc View File

@@ -102,7 +102,8 @@ void Executor::CheckException() {
}

void Executor::WorkerJoin() {
if (worker_->joinable()) {
// Avoid worker thread join itself which will cause deadlock
if (worker_->joinable() && worker_->get_id() != std::this_thread::get_id()) {
{
std::unique_lock<std::mutex> lock(task_mutex_);
auto task = std::make_shared<ExitTask>();


+ 14
- 2
mindspore/ccsrc/debug/debugger/debugger.cc View File

@@ -428,6 +428,8 @@ void Debugger::CommandLoop() {
case DebuggerCommand::kExitCMD:
MS_LOG(INFO) << "ExitCMD";
Exit();
// Used for debugger termination
run = true;
break;
case DebuggerCommand::kRunCMD:
MS_LOG(INFO) << "RunCMD";
@@ -578,8 +580,18 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten

void Debugger::Exit() {
// clear resource before exit
pipeline::ClearResAtexit();
std::exit(EXIT_FAILURE);
// For node level, the debugger has to exit by itself because the main thread can only exit at a step boundary.
// For step level, the debugger notifies the main thread to exit.
if (run_level_ == "node") {
pipeline::ClearResAtexit();
exit(1);
} else if (run_level_ == "step") {
// Notify main thread to terminate
pipeline::ExecutorPy::DebugTerminate(true);
} else {
pipeline::ClearResAtexit();
exit(1);
}
}

std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode) {


+ 11
- 0
mindspore/ccsrc/pipeline/jit/pipeline.cc View File

@@ -76,6 +76,7 @@ const char IR_TYPE_MINDIR[] = "mind_ir";

ExecutorPyPtr ExecutorPy::executor_ = nullptr;
std::mutex ExecutorPy::instance_lock_;
bool ExecutorPy::debugger_terminate_ = false;

std::unordered_map<abstract::AbstractBasePtrList, int, abstract::AbstractBasePtrListHasher,
abstract::AbstractBasePtrListEqual>
@@ -748,7 +749,17 @@ void ExecutorPy::ProcessVmArg(const py::tuple &args, const std::string &phase, V
ProcessVmArgInner(args, GetResource(phase), arg_list);
}

// Terminates the process if the debugger has requested termination.
// When the static flag debugger_terminate_ is set, this logs a message,
// releases resources through ClearResAtexit() and exits with status 1.
// When the flag is not set, the call returns without doing anything.
void ExecutorPy::TerminateDebugger() {
  // Fast path: no termination request is pending.
  if (!debugger_terminate_) {
    return;
  }

  MS_LOG(INFO) << "Terminate debugger and clear resources!";
  // Release resources before leaving the process; exit(1) does not return.
  ClearResAtexit();
  exit(1);
}

py::object ExecutorPy::Run(const py::tuple &args, const py::object &phase) {
// The MindSpore debugger notifies the main thread to exit after one step, so the next step is not run.
TerminateDebugger();
std::size_t size = args.size();
if (!py::isinstance<py::str>(phase)) {
MS_LOG(EXCEPTION) << "Run failed, phase input is not a str";


+ 4
- 1
mindspore/ccsrc/pipeline/jit/pipeline.h View File

@@ -97,6 +97,9 @@ class ExecutorPy : public std::enable_shared_from_this<ExecutorPy> {
void DelNetRes(const std::string &id);
void ReleaseResource(const py::object &phase);
static void ClearRes();
static bool GetDebugTerminate() { return debugger_terminate_; }
static void DebugTerminate(bool val) { debugger_terminate_ = val; }
void TerminateDebugger();

std::map<std::string, std::pair<PrimitivePyPtr, std::string>> FetchInfoForQuantExport(const std::string &phase_s);

@@ -111,6 +114,7 @@ class ExecutorPy : public std::enable_shared_from_this<ExecutorPy> {
std::map<std::string, ExecutorInfoPtr> info_;
static std::shared_ptr<ExecutorPy> executor_;
static std::mutex instance_lock_;
static bool debugger_terminate_;
};
using ExecutorPyPtr = std::shared_ptr<ExecutorPy>;

@@ -125,7 +129,6 @@ void InitHccl();
void FinalizeHccl();
void InitBackend();
void FinalizeBackend();

void ClearResAtexit();
void ReleaseGeTsd();



Loading…
Cancel
Save