From f2f35d217663b02f3bc426b5f6bd42d7e2a059ac Mon Sep 17 00:00:00 2001 From: yelihua Date: Sat, 19 Sep 2020 16:28:00 +0800 Subject: [PATCH] fix the debugger skipping suspend on a watched node when no watchpoint is hit --- mindspore/ccsrc/backend/session/gpu_session.cc | 7 +++++++ mindspore/ccsrc/debug/debugger/debugger.cc | 13 +++++++++---- mindspore/ccsrc/profiler/device/gpu/data_saver.cc | 15 ++++++++++++++- mindspore/ccsrc/profiler/device/gpu/data_saver.h | 2 ++ 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index ce8a6eed0f..73b3156ada 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -371,7 +371,14 @@ void GPUSession::PostIterationDbg(const std::shared_ptr &kernel_gra } void GPUSession::PreLoadTensor(const std::shared_ptr &kernel_graph) const { + // check the dump_enabled and dataset_sink_mode bool dump_enabled = DumpDataEnabledIteration(); + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (dump_enabled && context_ptr->get_param(MS_CTX_ENABLE_TASK_SINK)) { + MS_EXCEPTION(NotSupportError) << "Don't support set dataset_sink_mode to True when using e2e_dump"; + } + if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) { return; } diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 3bcf073229..c29a742f9c 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -283,12 +283,15 @@ void Debugger::PostExecuteNode() { auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table); // if kernel is watchpoint,and get hit. suspend. 
+ bool hit_empty_flag = true; if (is_watchpoint) { auto hits = CheckWatchpoints(cur_name_); if (!hits.empty()) { SendWatchpointsAndSuspend(hits); + hit_empty_flag = false; } - } else if (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) { + } + if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) { // if kernel is not watchpoint and is next_to or continue_to node, suspend CommandLoop(); } @@ -405,7 +408,9 @@ void Debugger::CommandLoop() { MS_LOG(ERROR) << "Error: WaitForCommand failed"; num_wait_fail++; if (num_wait_fail > max_num_wait_fail) { - MS_LOG(ERROR) << "Maximum number of WaitForCommand retry reached: exiting training session"; + MS_LOG(ERROR) << "Maximum number of WaitForCommand retry reached: exiting training session."; + MS_LOG(ERROR) << "Failed to connect to MindInsight debugger server. Please check the config " + "of debugger host and port."; Exit(); } MS_LOG(ERROR) << "Number of consecutive WaitForCommand fail:" << num_wait_fail << "; Retry after " @@ -417,11 +422,11 @@ void Debugger::CommandLoop() { // get type of the command in reply DebuggerCommand cmd = GetCommand(reply); if (cmd == DebuggerCommand::kUnknownCMD) { - MS_LOG(DEBUG) << "Debug: debugger recieved unknown command"; + MS_LOG(DEBUG) << "Debug: debugger received unknown command"; continue; } - MS_LOG(INFO) << "recieved command: "; + MS_LOG(INFO) << "received command: "; switch (cmd) { case DebuggerCommand::kUnknownCMD: MS_LOG(INFO) << "UnknownCMD"; diff --git a/mindspore/ccsrc/profiler/device/gpu/data_saver.cc b/mindspore/ccsrc/profiler/device/gpu/data_saver.cc index 30e9e75196..8f54518747 100644 --- a/mindspore/ccsrc/profiler/device/gpu/data_saver.cc +++ b/mindspore/ccsrc/profiler/device/gpu/data_saver.cc @@ -13,11 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #include "profiler/device/gpu/data_saver.h" #include #include +#include "sys/stat.h" #include "utils/log_adapter.h" +#include "utils/ms_utils.h" namespace mindspore { namespace profiler { @@ -183,6 +184,7 @@ void DataSaver::WriteOpType(const std::string &saver_base_dir) { ofs << op_type_info.second << std::endl; } ofs.close(); + ChangeFileMode(file_path); MS_LOG(INFO) << "Write " << op_type_infos_.size() << " op type infos into file: " << file_path; } @@ -199,6 +201,7 @@ void DataSaver::WriteOpDetail(const std::string &saver_base_dir) { ofs << op_detail << std::endl; } ofs.close(); + ChangeFileMode(file_path); MS_LOG(INFO) << "Write " << op_detail_infos_.size() << " op detail infos into file: " << file_path; } @@ -232,7 +235,9 @@ void DataSaver::WriteActivity(const std::string &saver_base_dir) { } } ofs.close(); + ChangeFileMode(file_path); activity_timestamp_ofs.close(); + ChangeFileMode(timestamp_file_path); MS_LOG(INFO) << "Write " << device_info.second.size() << " activity infos into file: " << file_path; } } @@ -254,6 +259,14 @@ void DataSaver::WriteOpTimestamp(const std::string &saver_base_dir) { ofs << std::endl; } ofs.close(); + ChangeFileMode(file_path); +} + +void DataSaver::ChangeFileMode(const std::string &file_path) { + if (chmod(common::SafeCStr(file_path), S_IRUSR | S_IWUSR) == -1) { + MS_LOG(INFO) << "Modify file:" << file_path << " to rw fail."; + return; + } } } // namespace gpu } // namespace profiler diff --git a/mindspore/ccsrc/profiler/device/gpu/data_saver.h b/mindspore/ccsrc/profiler/device/gpu/data_saver.h index b215238abc..2f3ba54ddb 100644 --- a/mindspore/ccsrc/profiler/device/gpu/data_saver.h +++ b/mindspore/ccsrc/profiler/device/gpu/data_saver.h @@ -145,6 +145,8 @@ class DataSaver { void WriteOpTimestamp(const std::string &saver_base_dir); + void ChangeFileMode(const std::string &file_path); + std::string device_id_; AllActivityInfos activity_infos_; OpTypeInfos op_type_infos_;