| @@ -371,7 +371,14 @@ void GPUSession::PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_gra | |||
| } | |||
| void GPUSession::PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const { | |||
| // check the dump_enabled and dataset_sink_mode | |||
| bool dump_enabled = DumpDataEnabledIteration(); | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| if (dump_enabled && context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) { | |||
| MS_EXCEPTION(NotSupportError) << "Don't support set dataset_sink_mode to True when using e2e_dump"; | |||
| } | |||
| if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) { | |||
| return; | |||
| } | |||
| @@ -283,12 +283,15 @@ void Debugger::PostExecuteNode() { | |||
| auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table); | |||
| // if kernel is watchpoint,and get hit. suspend. | |||
| bool hit_empty_flag = true; | |||
| if (is_watchpoint) { | |||
| auto hits = CheckWatchpoints(cur_name_); | |||
| if (!hits.empty()) { | |||
| SendWatchpointsAndSuspend(hits); | |||
| hit_empty_flag = false; | |||
| } | |||
| } else if (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) { | |||
| } | |||
| if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) { | |||
| // if kernel is not watchpoint and is next_to or continue_to node, suspend | |||
| CommandLoop(); | |||
| } | |||
| @@ -405,7 +408,9 @@ void Debugger::CommandLoop() { | |||
| MS_LOG(ERROR) << "Error: WaitForCommand failed"; | |||
| num_wait_fail++; | |||
| if (num_wait_fail > max_num_wait_fail) { | |||
| MS_LOG(ERROR) << "Maximum number of WaitForCommand retry reached: exiting training session"; | |||
| MS_LOG(ERROR) << "Maximum number of WaitForCommand retry reached: exiting training session."; | |||
| MS_LOG(ERROR) << "Failed to connect to MindInsight debugger server. Please check the config " | |||
| "of debugger host and port."; | |||
| Exit(); | |||
| } | |||
| MS_LOG(ERROR) << "Number of consecutive WaitForCommand fail:" << num_wait_fail << "; Retry after " | |||
| @@ -417,11 +422,11 @@ void Debugger::CommandLoop() { | |||
| // get type of the command in reply | |||
| DebuggerCommand cmd = GetCommand(reply); | |||
| if (cmd == DebuggerCommand::kUnknownCMD) { | |||
| MS_LOG(DEBUG) << "Debug: debugger recieved unknown command"; | |||
| MS_LOG(DEBUG) << "Debug: debugger received unknown command"; | |||
| continue; | |||
| } | |||
| MS_LOG(INFO) << "recieved command: "; | |||
| MS_LOG(INFO) << "received command: "; | |||
| switch (cmd) { | |||
| case DebuggerCommand::kUnknownCMD: | |||
| MS_LOG(INFO) << "UnknownCMD"; | |||
| @@ -13,11 +13,12 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "profiler/device/gpu/data_saver.h" | |||
| #include <fstream> | |||
| #include <numeric> | |||
| #include "sys/stat.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace profiler { | |||
| @@ -183,6 +184,7 @@ void DataSaver::WriteOpType(const std::string &saver_base_dir) { | |||
| ofs << op_type_info.second << std::endl; | |||
| } | |||
| ofs.close(); | |||
| ChangeFileMode(file_path); | |||
| MS_LOG(INFO) << "Write " << op_type_infos_.size() << " op type infos into file: " << file_path; | |||
| } | |||
| @@ -199,6 +201,7 @@ void DataSaver::WriteOpDetail(const std::string &saver_base_dir) { | |||
| ofs << op_detail << std::endl; | |||
| } | |||
| ofs.close(); | |||
| ChangeFileMode(file_path); | |||
| MS_LOG(INFO) << "Write " << op_detail_infos_.size() << " op detail infos into file: " << file_path; | |||
| } | |||
| @@ -232,7 +235,9 @@ void DataSaver::WriteActivity(const std::string &saver_base_dir) { | |||
| } | |||
| } | |||
| ofs.close(); | |||
| ChangeFileMode(file_path); | |||
| activity_timestamp_ofs.close(); | |||
| ChangeFileMode(timestamp_file_path); | |||
| MS_LOG(INFO) << "Write " << device_info.second.size() << " activity infos into file: " << file_path; | |||
| } | |||
| } | |||
| @@ -254,6 +259,14 @@ void DataSaver::WriteOpTimestamp(const std::string &saver_base_dir) { | |||
| ofs << std::endl; | |||
| } | |||
| ofs.close(); | |||
| ChangeFileMode(file_path); | |||
| } | |||
| void DataSaver::ChangeFileMode(const std::string &file_path) { | |||
| if (chmod(common::SafeCStr(file_path), S_IRUSR | S_IWUSR) == -1) { | |||
| MS_LOG(INFO) << "Modify file:" << file_path << " to rw fail."; | |||
| return; | |||
| } | |||
| } | |||
| } // namespace gpu | |||
| } // namespace profiler | |||
| @@ -145,6 +145,8 @@ class DataSaver { | |||
| void WriteOpTimestamp(const std::string &saver_base_dir); | |||
| void ChangeFileMode(const std::string &file_path); | |||
| std::string device_id_; | |||
| AllActivityInfos activity_infos_; | |||
| OpTypeInfos op_type_infos_; | |||