From: @gzhcv Reviewed-by: Signed-off-by:tags/v1.1.0
| @@ -273,7 +273,7 @@ void DataSaver::WriteStepTrace(const std::string &saver_base_dir) { | |||||
| } | } | ||||
| // write step trace time info into file | // write step trace time info into file | ||||
| uint32_t factor = 10; | |||||
| const uint32_t factor = 10; | |||||
| std::vector<std::string> op_name_arr; | std::vector<std::string> op_name_arr; | ||||
| op_name_arr.push_back(step_trace_op_name.trace_fp_start); | op_name_arr.push_back(step_trace_op_name.trace_fp_start); | ||||
| op_name_arr.push_back(step_trace_op_name.trace_bp_end); | op_name_arr.push_back(step_trace_op_name.trace_bp_end); | ||||
| @@ -170,7 +170,6 @@ std::string ProfilingUtils::GetGraphSecondLastKernelName(const std::vector<CNode | |||||
| return second_last_kernel_name; | return second_last_kernel_name; | ||||
| } | } | ||||
| } // namespace gpu | } // namespace gpu | ||||
| } // namespace profiler | } // namespace profiler | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -597,7 +597,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo | |||||
| auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); | auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); | ||||
| MS_EXCEPTION_IF_NULL(profiler_inst); | MS_EXCEPTION_IF_NULL(profiler_inst); | ||||
| if (is_first_step_map_[graph->graph_id()]) { | |||||
| if (profiler_inst->GetEnableFlag() && is_first_step_map_[graph->graph_id()]) { | |||||
| profiler::gpu::ProfilingTraceInfo profiling_trace = | profiler::gpu::ProfilingTraceInfo profiling_trace = | ||||
| profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph)); | profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph)); | ||||
| profiler_inst->SetStepTraceOpName(profiling_trace); | profiler_inst->SetStepTraceOpName(profiling_trace); | ||||
| @@ -233,24 +233,34 @@ def query_step_trace_file(profiler_dir): | |||||
| return None | return None | ||||
| def get_summary_for_step_trace(average_info, header): | |||||
| def get_summary_for_step_trace(average_info, header, is_training_mode=True): | |||||
| """The property of summary info.""" | """The property of summary info.""" | ||||
| if not average_info or not header: | if not average_info or not header: | ||||
| return {} | return {} | ||||
| total_time = get_field_value(average_info, 'total', header) | total_time = get_field_value(average_info, 'total', header) | ||||
| iteration_interval = get_field_value(average_info, 'iteration_interval', | iteration_interval = get_field_value(average_info, 'iteration_interval', | ||||
| header) | header) | ||||
| fp_and_bp = get_field_value(average_info, 'fp_and_bp', header) | |||||
| tail = get_field_value(average_info, 'tail', header) | |||||
| summary = { | |||||
| summary_part = { | |||||
| 'total_time': total_time, | 'total_time': total_time, | ||||
| 'iteration_interval': iteration_interval, | 'iteration_interval': iteration_interval, | ||||
| 'iteration_interval_percent': calculate_percent(iteration_interval, total_time), | 'iteration_interval_percent': calculate_percent(iteration_interval, total_time), | ||||
| 'fp_and_bp': fp_and_bp, | |||||
| 'fp_and_bp_percent': calculate_percent(fp_and_bp, total_time), | |||||
| 'tail': tail, | |||||
| 'tail_percent': calculate_percent(tail, total_time) | |||||
| } | } | ||||
| if is_training_mode: | |||||
| fp_and_bp = get_field_value(average_info, 'fp_and_bp', header) | |||||
| tail = get_field_value(average_info, 'tail', header) | |||||
| summary = { | |||||
| 'fp_and_bp': fp_and_bp, | |||||
| 'fp_and_bp_percent': calculate_percent(fp_and_bp, total_time), | |||||
| 'tail': tail, | |||||
| 'tail_percent': calculate_percent(tail, total_time) | |||||
| } | |||||
| else: | |||||
| fp = get_field_value(average_info, 'fp', header) | |||||
| summary = { | |||||
| 'fp': fp, | |||||
| 'fp_percent': calculate_percent(fp, total_time) | |||||
| } | |||||
| summary.update(summary_part) | |||||
| return summary | return summary | ||||
| @@ -21,7 +21,7 @@ from decimal import Decimal | |||||
| from mindspore import log as logger | from mindspore import log as logger | ||||
| from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException, \ | from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException, \ | ||||
| ProfilerFileNotFoundException, ProfilerRawFileException | |||||
| ProfilerFileNotFoundException, ProfilerRawFileException, ProfilerParamValueErrorException | |||||
| from mindspore.profiler.common.util import query_latest_trace_time_file, to_int, to_millisecond | from mindspore.profiler.common.util import query_latest_trace_time_file, to_int, to_millisecond | ||||
| from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path | from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path | ||||
| from mindspore.profiler.parser.container import TimelineContainer | from mindspore.profiler.parser.container import TimelineContainer | ||||
| @@ -776,6 +776,24 @@ class GpuTimelineGenerator(BaseTimelineGenerator): | |||||
| # Update timeline summary info | # Update timeline summary info | ||||
| self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys()) | self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys()) | ||||
| def check_op_name(self, op_name): | |||||
| """ | |||||
| Check whether the operator name exists. | |||||
| Args: | |||||
| op_name (str): The operator name or operator name prefix. | |||||
| Returns: | |||||
| bool, `True` if the operator name does exist, else `False`. | |||||
| """ | |||||
| if not op_name: | |||||
| raise ProfilerParamValueErrorException('The op_name should exist.') | |||||
| for op_time_info in self._timeline_meta: | |||||
| full_op_name = op_time_info['name'] | |||||
| if full_op_name and full_op_name.startswith(op_name): | |||||
| return True | |||||
| return False | |||||
| class AscendTimelineGenerator(BaseTimelineGenerator): | class AscendTimelineGenerator(BaseTimelineGenerator): | ||||
| """Generate ascend Timeline data from file.""" | """Generate ascend Timeline data from file.""" | ||||
| _display_filename = 'ascend_timeline_display_{}.json' | _display_filename = 'ascend_timeline_display_{}.json' | ||||
| @@ -42,9 +42,10 @@ class BaseStepTraceParser: | |||||
| output_file_path (str): The output file path. | output_file_path (str): The output file path. | ||||
| job_id (int): The job id used to define the start of new step. Default: 0. | job_id (int): The job id used to define the start of new step. Default: 0. | ||||
| skip_first_step (bool): Whether skip the first step or not. | skip_first_step (bool): Whether skip the first step or not. | ||||
| is_training_mode (bool): Whether in training mode or not. | |||||
| """ | """ | ||||
| def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False): | |||||
| def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False, is_training_mode=True): | |||||
| self._input_dir = input_dir | self._input_dir = input_dir | ||||
| self._output_path = output_file_path | self._output_path = output_file_path | ||||
| self._job_id = job_id | self._job_id = job_id | ||||
| @@ -53,6 +54,7 @@ class BaseStepTraceParser: | |||||
| self._header = [] | self._header = [] | ||||
| self._step_num = 0 | self._step_num = 0 | ||||
| self._tag_map = {} | self._tag_map = {} | ||||
| self._is_training_mode = is_training_mode | |||||
| @property | @property | ||||
| def output_file(self): | def output_file(self): | ||||
| @@ -64,7 +66,7 @@ class BaseStepTraceParser: | |||||
| """The property of step trace info.""" | """The property of step trace info.""" | ||||
| summary_info = {} | summary_info = {} | ||||
| if self._result: | if self._result: | ||||
| summary_info = get_summary_for_step_trace(self._result[-1], self._header) | |||||
| summary_info = get_summary_for_step_trace(self._result[-1], self._header, self._is_training_mode) | |||||
| summary_info['total_steps'] = len(self._result) - 1 | summary_info['total_steps'] = len(self._result) - 1 | ||||
| print('\nStep trace summary info (unit: syscnt):') | print('\nStep trace summary info (unit: syscnt):') | ||||
| print(summary_info) | print(summary_info) | ||||
| @@ -321,15 +323,27 @@ class BaseStepTraceParser: | |||||
| log.info("Finish add average info for step trace.") | log.info("Finish add average info for step trace.") | ||||
| def _save(self): | def _save(self): | ||||
| """save step trace file.""" | |||||
| BP_POINT, TAIL, FP_DURATION = 5, -1, -2 | |||||
| log.info("Start to save step trace file.") | log.info("Start to save step trace file.") | ||||
| if not self._header: | if not self._header: | ||||
| return | return | ||||
| with open(self._output_path, 'w') as file_handle: | |||||
| csv_writer = csv.writer(file_handle) | |||||
| csv_writer.writerow(self._header) | |||||
| for row_data in self._result: | |||||
| csv_writer.writerow(row_data) | |||||
| os.chmod(self._output_path, stat.S_IRUSR) | |||||
| try: | |||||
| with open(self._output_path, 'w') as file_handle: | |||||
| csv_writer = csv.writer(file_handle) | |||||
| if not self._is_training_mode: | |||||
| self._header[FP_DURATION] = 'fp' | |||||
| self._header = self._header[:BP_POINT] + self._header[BP_POINT+1:TAIL] | |||||
| csv_writer.writerow(self._header) | |||||
| for row_data in self._result: | |||||
| if not self._is_training_mode: | |||||
| row_data[FP_DURATION] += row_data[TAIL] | |||||
| row_data = row_data[:BP_POINT] + row_data[BP_POINT+1:TAIL] | |||||
| csv_writer.writerow(row_data) | |||||
| os.chmod(self._output_path, stat.S_IRUSR) | |||||
| except (IOError, OSError) as err: | |||||
| log.warning('Failed to save step trace raw info. %s', err) | |||||
| raise ProfilerIOException | |||||
| class GpuStepTraceParser(BaseStepTraceParser): | class GpuStepTraceParser(BaseStepTraceParser): | ||||
| @@ -356,10 +370,16 @@ class GpuStepTraceParser(BaseStepTraceParser): | |||||
| log.warning(f'Failed to read {source_file}', err) | log.warning(f'Failed to read {source_file}', err) | ||||
| raise ProfilerIOException | raise ProfilerIOException | ||||
| points = { | |||||
| 'fp_start': fp_start_name, | |||||
| 'bp_end': bp_end_name | |||||
| } | |||||
| if self._is_training_mode: | |||||
| points = { | |||||
| 'fp_start': fp_start_name, | |||||
| 'bp_end': bp_end_name | |||||
| } | |||||
| else: | |||||
| points = { | |||||
| 'fp_start': fp_start_name, | |||||
| } | |||||
| try: | try: | ||||
| with open(output_path, 'w') as json_file: | with open(output_path, 'w') as json_file: | ||||
| json.dump(points, json_file) | json.dump(points, json_file) | ||||
| @@ -456,10 +476,16 @@ class AscendStepTraceParser(BaseStepTraceParser): | |||||
| Returns: | Returns: | ||||
| dict, parsed point info. | dict, parsed point info. | ||||
| """ | """ | ||||
| points = { | |||||
| 'fp_start': point_info.get(self._fp_tag, ''), | |||||
| 'bp_end': point_info.get(self._bp_tag, '') | |||||
| } | |||||
| if self._is_training_mode: | |||||
| points = { | |||||
| 'fp_start': point_info.get(self._fp_tag, ''), | |||||
| 'bp_end': point_info.get(self._bp_tag, '') | |||||
| } | |||||
| else: | |||||
| points = { | |||||
| 'fp_start': point_info.get(self._fp_tag, ''), | |||||
| } | |||||
| try: | try: | ||||
| with open(output_path, 'w') as json_file: | with open(output_path, 'w') as json_file: | ||||
| json.dump(points, json_file) | json.dump(points, json_file) | ||||
| @@ -151,7 +151,7 @@ class Profiler: | |||||
| logger.error('Please check the Profiler object initialized after set_auto_parallel_context() ' | logger.error('Please check the Profiler object initialized after set_auto_parallel_context() ' | ||||
| 'and init(). Profiler should be initialized after these code. ') | 'and init(). Profiler should be initialized after these code. ') | ||||
| self._gpu_profiler.stop() | self._gpu_profiler.stop() | ||||
| self._generate_timeline() | |||||
| timeline_generator = self._generate_timeline() | |||||
| # parse minddata pipeline operator and queue for GPU | # parse minddata pipeline operator and queue for GPU | ||||
| try: | try: | ||||
| @@ -162,7 +162,7 @@ class Profiler: | |||||
| # analyse step trace info | # analyse step trace info | ||||
| try: | try: | ||||
| self._analyse_step_trace() | |||||
| self._analyse_step_trace(is_training_mode_flag=timeline_generator.check_op_name('Gradients')) | |||||
| except ProfilerException as err: | except ProfilerException as err: | ||||
| logger.warning(err.message) | logger.warning(err.message) | ||||
| @@ -239,13 +239,14 @@ class Profiler: | |||||
| os.environ['PROFILING_MODE'] = str("false") | os.environ['PROFILING_MODE'] = str("false") | ||||
| context.set_context(enable_profiling=False) | context.set_context(enable_profiling=False) | ||||
| def _analyse_step_trace(self, source_path=None, framework_parser=None): | |||||
| def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True): | |||||
| """ | """ | ||||
| Analyse step trace data and save the result. | Analyse step trace data and save the result. | ||||
| Args: | Args: | ||||
| source_path (str): The directory that contains the step trace original data. | source_path (str): The directory that contains the step trace original data. | ||||
| framework_parser (FrameworkParser): The framework parse instance. | framework_parser (FrameworkParser): The framework parse instance. | ||||
| is_training_mode_flag (bool): Whether in training mode or not. | |||||
| """ | """ | ||||
| logger.info("Begin to parse step trace.") | logger.info("Begin to parse step trace.") | ||||
| # construct output path | # construct output path | ||||
| @@ -266,19 +267,23 @@ class Profiler: | |||||
| f'step_trace_profiling_{self._dev_id}.txt' | f'step_trace_profiling_{self._dev_id}.txt' | ||||
| ) | ) | ||||
| parser = GpuStepTraceParser(input_dir=input_file_path, | parser = GpuStepTraceParser(input_dir=input_file_path, | ||||
| output_file_path=step_trace_intermediate_file_path) | |||||
| output_file_path=step_trace_intermediate_file_path, | |||||
| is_training_mode=is_training_mode_flag) | |||||
| parser.parse_and_save() | parser.parse_and_save() | ||||
| point_info = parser.record_point_info(input_file_path, point_info_file_path) | point_info = parser.record_point_info(input_file_path, point_info_file_path) | ||||
| else: | else: | ||||
| # whether keep the first step | # whether keep the first step | ||||
| skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME) | skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME) | ||||
| point_info = framework_parser.point_info | point_info = framework_parser.point_info | ||||
| # recognize inference or training mode | |||||
| is_traning_mode_flag = framework_parser.check_op_name("Gradients") | |||||
| # parse the step trace files and save the result to disk | # parse the step trace files and save the result to disk | ||||
| source_path = validate_and_normalize_path(source_path) | source_path = validate_and_normalize_path(source_path) | ||||
| parser = AscendStepTraceParser(input_dir=source_path, | parser = AscendStepTraceParser(input_dir=source_path, | ||||
| output_file_path=step_trace_intermediate_file_path, | output_file_path=step_trace_intermediate_file_path, | ||||
| job_id=self._job_id_env, | job_id=self._job_id_env, | ||||
| skip_first_step=skip_first_step_flag) | |||||
| skip_first_step=skip_first_step_flag, | |||||
| is_training_mode=is_traning_mode_flag) | |||||
| parser.update_tag_op_type_map(point_info) | parser.update_tag_op_type_map(point_info) | ||||
| parser.parse_and_save() | parser.parse_and_save() | ||||
| point_info = parser.record_point_info(point_info, point_info_file_path) | point_info = parser.record_point_info(point_info, point_info_file_path) | ||||
| @@ -332,6 +337,7 @@ class Profiler: | |||||
| timeline_generator.init_timeline() | timeline_generator.init_timeline() | ||||
| timeline_generator.write_timeline(size_limit) | timeline_generator.write_timeline(size_limit) | ||||
| timeline_generator.write_timeline_summary() | timeline_generator.write_timeline_summary() | ||||
| return timeline_generator | |||||
| except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err: | except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err: | ||||
| logger.warning('Fail to write timeline data: %s', err) | logger.warning('Fail to write timeline data: %s', err) | ||||
| raise RuntimeError('Fail to write timeline data.') | raise RuntimeError('Fail to write timeline data.') | ||||