| @@ -371,7 +371,14 @@ void GPUSession::PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_gra | |||||
| } | } | ||||
| void GPUSession::PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const { | void GPUSession::PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const { | ||||
| // check the dump_enabled and dataset_sink_mode | |||||
| bool dump_enabled = DumpDataEnabledIteration(); | bool dump_enabled = DumpDataEnabledIteration(); | ||||
| auto context_ptr = MsContext::GetInstance(); | |||||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||||
| if (dump_enabled && context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) { | |||||
| MS_EXCEPTION(NotSupportError) << "Don't support set dataset_sink_mode to True when using e2e_dump"; | |||||
| } | |||||
| if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) { | if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) { | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -283,12 +283,15 @@ void Debugger::PostExecuteNode() { | |||||
| auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table); | auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table); | ||||
| // if kernel is watchpoint,and get hit. suspend. | // if kernel is watchpoint,and get hit. suspend. | ||||
| bool hit_empty_flag = true; | |||||
| if (is_watchpoint) { | if (is_watchpoint) { | ||||
| auto hits = CheckWatchpoints(cur_name_); | auto hits = CheckWatchpoints(cur_name_); | ||||
| if (!hits.empty()) { | if (!hits.empty()) { | ||||
| SendWatchpointsAndSuspend(hits); | SendWatchpointsAndSuspend(hits); | ||||
| hit_empty_flag = false; | |||||
| } | } | ||||
| } else if (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) { | |||||
| } | |||||
| if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) { | |||||
| // if kernel is not watchpoint and is next_to or continue_to node, suspend | // if kernel is not watchpoint and is next_to or continue_to node, suspend | ||||
| CommandLoop(); | CommandLoop(); | ||||
| } | } | ||||
| @@ -405,7 +408,9 @@ void Debugger::CommandLoop() { | |||||
| MS_LOG(ERROR) << "Error: WaitForCommand failed"; | MS_LOG(ERROR) << "Error: WaitForCommand failed"; | ||||
| num_wait_fail++; | num_wait_fail++; | ||||
| if (num_wait_fail > max_num_wait_fail) { | if (num_wait_fail > max_num_wait_fail) { | ||||
| MS_LOG(ERROR) << "Maximum number of WaitForCommand retry reached: exiting training session"; | |||||
| MS_LOG(ERROR) << "Maximum number of WaitForCommand retry reached: exiting training session."; | |||||
| MS_LOG(ERROR) << "Failed to connect to MindInsight debugger server. Please check the config " | |||||
| "of debugger host and port."; | |||||
| Exit(); | Exit(); | ||||
| } | } | ||||
| MS_LOG(ERROR) << "Number of consecutive WaitForCommand fail:" << num_wait_fail << "; Retry after " | MS_LOG(ERROR) << "Number of consecutive WaitForCommand fail:" << num_wait_fail << "; Retry after " | ||||
| @@ -417,11 +422,11 @@ void Debugger::CommandLoop() { | |||||
| // get type of the command in reply | // get type of the command in reply | ||||
| DebuggerCommand cmd = GetCommand(reply); | DebuggerCommand cmd = GetCommand(reply); | ||||
| if (cmd == DebuggerCommand::kUnknownCMD) { | if (cmd == DebuggerCommand::kUnknownCMD) { | ||||
| MS_LOG(DEBUG) << "Debug: debugger recieved unknown command"; | |||||
| MS_LOG(DEBUG) << "Debug: debugger received unknown command"; | |||||
| continue; | continue; | ||||
| } | } | ||||
| MS_LOG(INFO) << "recieved command: "; | |||||
| MS_LOG(INFO) << "received command: "; | |||||
| switch (cmd) { | switch (cmd) { | ||||
| case DebuggerCommand::kUnknownCMD: | case DebuggerCommand::kUnknownCMD: | ||||
| MS_LOG(INFO) << "UnknownCMD"; | MS_LOG(INFO) << "UnknownCMD"; | ||||
| @@ -13,11 +13,12 @@ | |||||
| * See the License for the specific language governing permissions and | * See the License for the specific language governing permissions and | ||||
| * limitations under the License. | * limitations under the License. | ||||
| */ | */ | ||||
| #include "profiler/device/gpu/data_saver.h" | #include "profiler/device/gpu/data_saver.h" | ||||
| #include <fstream> | #include <fstream> | ||||
| #include <numeric> | #include <numeric> | ||||
| #include "sys/stat.h" | |||||
| #include "utils/log_adapter.h" | #include "utils/log_adapter.h" | ||||
| #include "utils/ms_utils.h" | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace profiler { | namespace profiler { | ||||
| @@ -183,6 +184,7 @@ void DataSaver::WriteOpType(const std::string &saver_base_dir) { | |||||
| ofs << op_type_info.second << std::endl; | ofs << op_type_info.second << std::endl; | ||||
| } | } | ||||
| ofs.close(); | ofs.close(); | ||||
| ChangeFileMode(file_path); | |||||
| MS_LOG(INFO) << "Write " << op_type_infos_.size() << " op type infos into file: " << file_path; | MS_LOG(INFO) << "Write " << op_type_infos_.size() << " op type infos into file: " << file_path; | ||||
| } | } | ||||
| @@ -199,6 +201,7 @@ void DataSaver::WriteOpDetail(const std::string &saver_base_dir) { | |||||
| ofs << op_detail << std::endl; | ofs << op_detail << std::endl; | ||||
| } | } | ||||
| ofs.close(); | ofs.close(); | ||||
| ChangeFileMode(file_path); | |||||
| MS_LOG(INFO) << "Write " << op_detail_infos_.size() << " op detail infos into file: " << file_path; | MS_LOG(INFO) << "Write " << op_detail_infos_.size() << " op detail infos into file: " << file_path; | ||||
| } | } | ||||
| @@ -232,7 +235,9 @@ void DataSaver::WriteActivity(const std::string &saver_base_dir) { | |||||
| } | } | ||||
| } | } | ||||
| ofs.close(); | ofs.close(); | ||||
| ChangeFileMode(file_path); | |||||
| activity_timestamp_ofs.close(); | activity_timestamp_ofs.close(); | ||||
| ChangeFileMode(timestamp_file_path); | |||||
| MS_LOG(INFO) << "Write " << device_info.second.size() << " activity infos into file: " << file_path; | MS_LOG(INFO) << "Write " << device_info.second.size() << " activity infos into file: " << file_path; | ||||
| } | } | ||||
| } | } | ||||
| @@ -254,6 +259,14 @@ void DataSaver::WriteOpTimestamp(const std::string &saver_base_dir) { | |||||
| ofs << std::endl; | ofs << std::endl; | ||||
| } | } | ||||
| ofs.close(); | ofs.close(); | ||||
| ChangeFileMode(file_path); | |||||
| } | |||||
| void DataSaver::ChangeFileMode(const std::string &file_path) { | |||||
| if (chmod(common::SafeCStr(file_path), S_IRUSR | S_IWUSR) == -1) { | |||||
| MS_LOG(INFO) << "Modify file:" << file_path << " to rw fail."; | |||||
| return; | |||||
| } | |||||
| } | } | ||||
| } // namespace gpu | } // namespace gpu | ||||
| } // namespace profiler | } // namespace profiler | ||||
| @@ -145,6 +145,8 @@ class DataSaver { | |||||
| void WriteOpTimestamp(const std::string &saver_base_dir); | void WriteOpTimestamp(const std::string &saver_base_dir); | ||||
| void ChangeFileMode(const std::string &file_path); | |||||
| std::string device_id_; | std::string device_id_; | ||||
| AllActivityInfos activity_infos_; | AllActivityInfos activity_infos_; | ||||
| OpTypeInfos op_type_infos_; | OpTypeInfos op_type_infos_; | ||||