From 93b3055a47a00e083ffc93670863af3a8fb7a283 Mon Sep 17 00:00:00 2001 From: kswang Date: Thu, 7 Jan 2021 16:26:05 +0800 Subject: [PATCH] fix longrunning segment fault --- mindspore/ccsrc/backend/session/executor.cc | 44 ++++++++++----------- mindspore/ccsrc/backend/session/executor.h | 3 +- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/mindspore/ccsrc/backend/session/executor.cc b/mindspore/ccsrc/backend/session/executor.cc index db250503b4..b0eec577d4 100644 --- a/mindspore/ccsrc/backend/session/executor.cc +++ b/mindspore/ccsrc/backend/session/executor.cc @@ -277,34 +277,39 @@ void Executor::ClearDoneTasks() { done_tasks_.clear(); } -void Executor::RunTask(const std::shared_ptr &task, bool sync) { +void Executor::RunTask(const std::shared_ptr &task, bool sync, bool long_run) { { std::lock_guard lock(task_mutex_); ready_tasks_.push(task); } sync_run_task_finished_ = false; task_cond_var_.notify_all(); - ClearDoneTasks(); if (sync && !sync_run_task_finished_) { std::unique_lock lock(task_mutex_); - sync_cond_var_.wait(lock, [this] { - bool finished = sync_run_task_finished_; - return finished; - }); + if (long_run) { + mindspore::ScopedLongRunning long_running; + sync_cond_var_.wait(lock, [this] { + bool finished = sync_run_task_finished_; + return finished; + }); + } else { + sync_cond_var_.wait(lock, [this] { + bool finished = sync_run_task_finished_; + return finished; + }); + } } ClearDoneTasks(); MsException::Instance().CheckException(); } -void Executor::SyncRunTask(const std::shared_ptr &task) { RunTask(task, true); } - GraphId Executor::CompileGraph(const SessionPtr &session, const GraphSegmentPtr &segment, const AnfNodePtrList &outputs) { auto task = std::make_shared(); task->session_ = session; task->segment_ = segment; task->output_nodes_ = outputs; - SyncRunTask(task); + RunTask(task, true); return task->graph_id_; } @@ -312,7 +317,7 @@ GraphId Executor::CompileGraph(const SessionPtr &session, NotNull auto task = std::make_shared(); task->session_ = session; task->func_graph_ = func_graph.get(); - SyncRunTask(task); + RunTask(task, true); return task->graph_id_; } @@ -320,7 +325,7 @@ void Executor::BuildGraph(const SessionPtr &session, GraphId graphId) { auto task = std::make_shared(); task->session_ = session; task->graph_id_ = graphId; - SyncRunTask(task); + RunTask(task, true); } void Executor::RunGraph(const SessionPtr &session, const GraphId &graph_id, @@ -334,8 +339,7 @@ void Executor::RunGraph(const SessionPtr &session, const GraphId &graph_id, session->CreateOutputTensors(graph_id, inputs, outputs, &task->tensor_to_node_); task->outputs_ = *outputs; task->sync_run_ = true; - mindspore::ScopedLongRunning long_running; - SyncRunTask(task); + RunTask(task, true, true); } void Executor::WaitTaskGraphAvailable(const SessionPtr &session, const std::shared_ptr &task) { @@ -350,7 +354,6 @@ void Executor::WaitTaskGraphAvailable(const SessionPtr &session, const std::shar } } if (need_lock) { - ClearDoneTasks(); mindspore::ScopedLongRunning long_running; for (auto &tensor : task->input_tensors_) { if (tensor->NeedWait() && !tensor->IsGraphOutput()) { @@ -365,7 +368,6 @@ void Executor::WaitTaskGraphAvailable(const SessionPtr &session, const std::shar } auto graph = session->GetGraph(task->graph_id_); if (graph != nullptr && !graph->IsPostGraphFinished()) { - ClearDoneTasks(); mindspore::ScopedLongRunning long_running; std::unique_lock lock(reenter_mutex_); reenter_cond_var_.wait(lock, [&graph] { return graph->IsPostGraphFinished(); }); @@ -388,8 +390,7 @@ void Executor::RunGraphAsync(const SessionPtr &session, const GraphId &graph_id, // sync run graph without output tensor(int dataset graph) if (!TensorInVector(outputs)) { task->sync_run_ = true; - mindspore::ScopedLongRunning long_running; - SyncRunTask(task); + RunTask(task, true, true); return; } WaitTaskGraphAvailable(session, task); @@ -415,8 +416,7 @@ void Executor::RunOp(const SessionPtr &session, OpRunInfo *op_run_info, const Gr tensor->Wait(); } } - mindspore::ScopedLongRunning long_running; - SyncRunTask(task); + RunTask(task, true, true); *outputs = task->outputs_; } @@ -428,7 +428,7 @@ void Executor::RunOpsInGraph(const SessionPtr &session, const GraphId &graph_id, task->session_ = session; task->graph_id_ = graph_id; task->input_tensors_ = inputs; - SyncRunTask(task); + RunTask(task, true); *outputs = task->outputs_; } @@ -436,14 +436,14 @@ bool Executor::CreateCommGroup(const std::string &group_name, std::vector(); task->group_name_ = group_name; task->ranks_ = ranks; - SyncRunTask(task); + RunTask(task, true); return task->result_; } bool Executor::DestroyCommGroup(const std::string &group_name) { auto task = std::make_shared(); task->group_name_ = group_name; - SyncRunTask(task); + RunTask(task, true); return task->result_; } diff --git a/mindspore/ccsrc/backend/session/executor.h b/mindspore/ccsrc/backend/session/executor.h index 8b1d0c150b..f4e543e19b 100644 --- a/mindspore/ccsrc/backend/session/executor.h +++ b/mindspore/ccsrc/backend/session/executor.h @@ -172,8 +172,7 @@ class Executor { void OnEvent(const ExecutorEvent &event); private: - void RunTask(const std::shared_ptr &task, bool sync); - void SyncRunTask(const std::shared_ptr &task); + void RunTask(const std::shared_ptr &task, bool sync, bool long_run = false); void UpdateOutputTensors(VectorRef *outputs, const std::map &tensor_to_node); std::vector> GetNewReadyTasks();