diff --git a/mindspore/profiler/parser/step_trace_parser.py b/mindspore/profiler/parser/step_trace_parser.py index 9e3e005b2b..3429a59a7a 100644 --- a/mindspore/profiler/parser/step_trace_parser.py +++ b/mindspore/profiler/parser/step_trace_parser.py @@ -22,7 +22,7 @@ from collections import namedtuple from decimal import Decimal from mindspore.profiler.common.exceptions.exceptions import ProfilerPathErrorException, \ - JobIdMismatchException, ProfilerIOException + JobIdMismatchException, ProfilerIOException, ProfilerRawFileException from mindspore import log from mindspore.profiler.common.util import get_summary_for_step_trace from mindspore.profiler.common.validator.validate_path import \ @@ -400,13 +400,25 @@ class GpuStepTraceParser(BaseStepTraceParser): fp_start, bp_end, iter_end, iter_start = 0, 1, 2, 3 reduce_start = 4 start_time, end_time = 0, 1 + STEP_TRACE_POINT_COUNT = 3 source_file = validate_and_normalize_path(source_file) try: with open(source_file, 'r') as f: lines = f.readlines() + if len(lines) < STEP_TRACE_POINT_COUNT: + raise ProfilerRawFileException( + f"Failed to parse {source_file} file. The FP_POINT/BP_POINT/ITER_END_POINT " + f"do not recognized correctly. Try to set the environment variable'PROFILING_FP_START' " + f"and 'PROFILING_BP_END' to solve this problem. For example, " + f"'export PROFILING_FP_START=Defualt/xxx/Conv2d-op1' ") step_trace_info_all = [line.strip().split()[1:] for line in lines] num_of_step = len(step_trace_info_all[0]) + for step_trace_point in step_trace_info_all: + if len(step_trace_point) != num_of_step: + raise ProfilerRawFileException( + f"Failed to parse {source_file} file. Due to the profiled " + f"step_num of FP/BP/ITER_END Point are not equal") iter_start_info = [step_trace_info_all[fp_start][0]] + \ step_trace_info_all[iter_end][:num_of_step] step_trace_info_all.insert(iter_start, iter_start_info) diff --git a/mindspore/profiler/profiling.py b/mindspore/profiler/profiling.py index abda9890f8..1f3fdd15be 100644 --- a/mindspore/profiler/profiling.py +++ b/mindspore/profiler/profiling.py @@ -130,7 +130,7 @@ class Profiler: profiling_options = json.dumps(profiling_options) # Characters longer than 2048 are ignored, resulting in profiling option resolution errors if len(profiling_options) > 2048: - raise ValueError("The parameter length exceeds the limit (2048)") + raise ValueError("The parameter length exceeds the limit (2048), please input valid parameters.") # use context interface to open profiling, for the new mindspore version(after 2020.5.21) context.set_context(enable_profiling=True, profiling_options=profiling_options) @@ -369,36 +369,30 @@ class Profiler: return self._profiling_job_id job_id = "" - cmd = "ls -t " + self._output_path + "|grep JOB|awk '{print $1}'" - r = os.popen(cmd) - profiling_job_dirs = r.readlines() - r.close() - for item in profiling_job_dirs: - path = os.path.join(self._output_path, item.strip()) - log_file = get_file_names(path, "host_start.log") - if not log_file: - logger.error("Profiling: job path %s, host_start.log not exist.", path) - continue - - log_file = os.path.join(path, log_file[0]) - item_dict = self._parse_host_start_log(log_file) - - if not item_dict: - logger.error("Profiling: job path %s, fail to get job start info.", path) - continue - - if self._dev_id != item_dict["device_id"]: - logger.info("Profiling: job path %s, dev id %s, training device id %s.", - path, item_dict["device_id"], self._dev_id) - continue - - if self._start_time > int(item_dict["start_time"]): - logger.info("Profiling: job path %s, start_time %s, training start_time %d.", - path, item_dict["start_time"], self._start_time) - break + for item in os.listdir(self._output_path): + if item.startswith('JOB'): + path = os.path.join(self._output_path, item) + job_id = item + + log_file = get_file_names(path, "host_start.log") + if not log_file: + logger.error("Profiling: job path %s, host_start.log not exist.", path) + + log_file = os.path.join(path, log_file[0]) + item_dict = self._parse_host_start_log(log_file) + + if not item_dict: + logger.error("Profiling: job path %s, fail to get job start info.", path) - job_id = item.strip() - break + if self._dev_id != item_dict["device_id"]: + logger.info("Profiling: job path %s, dev id %s, training device id %s.", + path, item_dict["device_id"], self._dev_id) + + if self._start_time > int(item_dict["start_time"]): + logger.info("Profiling: job path %s, start_time %s, training start_time %d.", + path, item_dict["start_time"], self._start_time) + + break if not job_id: msg = "Fail to get profiling job, please check whether job dir was generated"