From: @gzhcv Reviewed-by: Signed-off-by: tags/v1.1.0
| @@ -273,7 +273,7 @@ void DataSaver::WriteStepTrace(const std::string &saver_base_dir) { | |||
| } | |||
| // write step trace time info into file | |||
| uint32_t factor = 10; | |||
| const uint32_t factor = 10; | |||
| std::vector<std::string> op_name_arr; | |||
| op_name_arr.push_back(step_trace_op_name.trace_fp_start); | |||
| op_name_arr.push_back(step_trace_op_name.trace_bp_end); | |||
| @@ -170,7 +170,6 @@ std::string ProfilingUtils::GetGraphSecondLastKernelName(const std::vector<CNode | |||
| return second_last_kernel_name; | |||
| } | |||
| } // namespace gpu | |||
| } // namespace profiler | |||
| } // namespace mindspore | |||
| @@ -597,7 +597,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo | |||
| auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(profiler_inst); | |||
| if (is_first_step_map_[graph->graph_id()]) { | |||
| if (profiler_inst->GetEnableFlag() && is_first_step_map_[graph->graph_id()]) { | |||
| profiler::gpu::ProfilingTraceInfo profiling_trace = | |||
| profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph)); | |||
| profiler_inst->SetStepTraceOpName(profiling_trace); | |||
| @@ -233,24 +233,34 @@ def query_step_trace_file(profiler_dir): | |||
| return None | |||
| def get_summary_for_step_trace(average_info, header): | |||
| def get_summary_for_step_trace(average_info, header, is_training_mode=True): | |||
| """The property of summary info.""" | |||
| if not average_info or not header: | |||
| return {} | |||
| total_time = get_field_value(average_info, 'total', header) | |||
| iteration_interval = get_field_value(average_info, 'iteration_interval', | |||
| header) | |||
| fp_and_bp = get_field_value(average_info, 'fp_and_bp', header) | |||
| tail = get_field_value(average_info, 'tail', header) | |||
| summary = { | |||
| summary_part = { | |||
| 'total_time': total_time, | |||
| 'iteration_interval': iteration_interval, | |||
| 'iteration_interval_percent': calculate_percent(iteration_interval, total_time), | |||
| 'fp_and_bp': fp_and_bp, | |||
| 'fp_and_bp_percent': calculate_percent(fp_and_bp, total_time), | |||
| 'tail': tail, | |||
| 'tail_percent': calculate_percent(tail, total_time) | |||
| } | |||
| if is_training_mode: | |||
| fp_and_bp = get_field_value(average_info, 'fp_and_bp', header) | |||
| tail = get_field_value(average_info, 'tail', header) | |||
| summary = { | |||
| 'fp_and_bp': fp_and_bp, | |||
| 'fp_and_bp_percent': calculate_percent(fp_and_bp, total_time), | |||
| 'tail': tail, | |||
| 'tail_percent': calculate_percent(tail, total_time) | |||
| } | |||
| else: | |||
| fp = get_field_value(average_info, 'fp', header) | |||
| summary = { | |||
| 'fp': fp, | |||
| 'fp_percent': calculate_percent(fp, total_time) | |||
| } | |||
| summary.update(summary_part) | |||
| return summary | |||
| @@ -21,7 +21,7 @@ from decimal import Decimal | |||
| from mindspore import log as logger | |||
| from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException, \ | |||
| ProfilerFileNotFoundException, ProfilerRawFileException | |||
| ProfilerFileNotFoundException, ProfilerRawFileException, ProfilerParamValueErrorException | |||
| from mindspore.profiler.common.util import query_latest_trace_time_file, to_int, to_millisecond | |||
| from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path | |||
| from mindspore.profiler.parser.container import TimelineContainer | |||
| @@ -776,6 +776,24 @@ class GpuTimelineGenerator(BaseTimelineGenerator): | |||
| # Update timeline summary info | |||
| self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys()) | |||
| def check_op_name(self, op_name): | |||
| """ | |||
| Check whether the operator name exists. | |||
| Args: | |||
| op_name (str): The operator name or operator name prefix. | |||
| Returns: | |||
| bool, `True` if the operator name does exist, else `False`. | |||
| """ | |||
| if not op_name: | |||
| raise ProfilerParamValueErrorException('The op_name should exist.') | |||
| for op_time_info in self._timeline_meta: | |||
| full_op_name = op_time_info['name'] | |||
| if full_op_name and full_op_name.startswith(op_name): | |||
| return True | |||
| return False | |||
| class AscendTimelineGenerator(BaseTimelineGenerator): | |||
| """Generate ascend Timeline data from file.""" | |||
| _display_filename = 'ascend_timeline_display_{}.json' | |||
| @@ -42,9 +42,10 @@ class BaseStepTraceParser: | |||
| output_file_path (str): The output file path. | |||
| job_id (int): The job id used to define the start of new step. Default: 0. | |||
| skip_first_step (bool): Whether skip the first step or not. | |||
| is_training_mode (bool): Whether in training mode or not. | |||
| """ | |||
| def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False): | |||
| def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False, is_training_mode=True): | |||
| self._input_dir = input_dir | |||
| self._output_path = output_file_path | |||
| self._job_id = job_id | |||
| @@ -53,6 +54,7 @@ class BaseStepTraceParser: | |||
| self._header = [] | |||
| self._step_num = 0 | |||
| self._tag_map = {} | |||
| self._is_training_mode = is_training_mode | |||
| @property | |||
| def output_file(self): | |||
| @@ -64,7 +66,7 @@ class BaseStepTraceParser: | |||
| """The property of step trace info.""" | |||
| summary_info = {} | |||
| if self._result: | |||
| summary_info = get_summary_for_step_trace(self._result[-1], self._header) | |||
| summary_info = get_summary_for_step_trace(self._result[-1], self._header, self._is_training_mode) | |||
| summary_info['total_steps'] = len(self._result) - 1 | |||
| print('\nStep trace summary info (unit: syscnt):') | |||
| print(summary_info) | |||
| @@ -321,15 +323,27 @@ class BaseStepTraceParser: | |||
| log.info("Finish add average info for step trace.") | |||
| def _save(self): | |||
| """save step trace file.""" | |||
| BP_POINT, TAIL, FP_DURATION = 5, -1, -2 | |||
| log.info("Start to save step trace file.") | |||
| if not self._header: | |||
| return | |||
| with open(self._output_path, 'w') as file_handle: | |||
| csv_writer = csv.writer(file_handle) | |||
| csv_writer.writerow(self._header) | |||
| for row_data in self._result: | |||
| csv_writer.writerow(row_data) | |||
| os.chmod(self._output_path, stat.S_IRUSR) | |||
| try: | |||
| with open(self._output_path, 'w') as file_handle: | |||
| csv_writer = csv.writer(file_handle) | |||
| if not self._is_training_mode: | |||
| self._header[FP_DURATION] = 'fp' | |||
| self._header = self._header[:BP_POINT] + self._header[BP_POINT+1:TAIL] | |||
| csv_writer.writerow(self._header) | |||
| for row_data in self._result: | |||
| if not self._is_training_mode: | |||
| row_data[FP_DURATION] += row_data[TAIL] | |||
| row_data = row_data[:BP_POINT] + row_data[BP_POINT+1:TAIL] | |||
| csv_writer.writerow(row_data) | |||
| os.chmod(self._output_path, stat.S_IRUSR) | |||
| except (IOError, OSError) as err: | |||
| log.warning('Failed to save step trace raw info. %s', err) | |||
| raise ProfilerIOException | |||
| class GpuStepTraceParser(BaseStepTraceParser): | |||
| @@ -356,10 +370,16 @@ class GpuStepTraceParser(BaseStepTraceParser): | |||
| log.warning(f'Failed to read {source_file}', err) | |||
| raise ProfilerIOException | |||
| points = { | |||
| 'fp_start': fp_start_name, | |||
| 'bp_end': bp_end_name | |||
| } | |||
| if self._is_training_mode: | |||
| points = { | |||
| 'fp_start': fp_start_name, | |||
| 'bp_end': bp_end_name | |||
| } | |||
| else: | |||
| points = { | |||
| 'fp_start': fp_start_name, | |||
| } | |||
| try: | |||
| with open(output_path, 'w') as json_file: | |||
| json.dump(points, json_file) | |||
| @@ -456,10 +476,16 @@ class AscendStepTraceParser(BaseStepTraceParser): | |||
| Returns: | |||
| dict, parsed point info. | |||
| """ | |||
| points = { | |||
| 'fp_start': point_info.get(self._fp_tag, ''), | |||
| 'bp_end': point_info.get(self._bp_tag, '') | |||
| } | |||
| if self._is_training_mode: | |||
| points = { | |||
| 'fp_start': point_info.get(self._fp_tag, ''), | |||
| 'bp_end': point_info.get(self._bp_tag, '') | |||
| } | |||
| else: | |||
| points = { | |||
| 'fp_start': point_info.get(self._fp_tag, ''), | |||
| } | |||
| try: | |||
| with open(output_path, 'w') as json_file: | |||
| json.dump(points, json_file) | |||
| @@ -151,7 +151,7 @@ class Profiler: | |||
| logger.error('Please check the Profiler object initialized after set_auto_parallel_context() ' | |||
| 'and init(). Profiler should be initialized after these code. ') | |||
| self._gpu_profiler.stop() | |||
| self._generate_timeline() | |||
| timeline_generator = self._generate_timeline() | |||
| # parse minddata pipeline operator and queue for GPU | |||
| try: | |||
| @@ -162,7 +162,7 @@ class Profiler: | |||
| # analyse step trace info | |||
| try: | |||
| self._analyse_step_trace() | |||
| self._analyse_step_trace(is_training_mode_flag=timeline_generator.check_op_name('Gradients')) | |||
| except ProfilerException as err: | |||
| logger.warning(err.message) | |||
| @@ -239,13 +239,14 @@ class Profiler: | |||
| os.environ['PROFILING_MODE'] = str("false") | |||
| context.set_context(enable_profiling=False) | |||
| def _analyse_step_trace(self, source_path=None, framework_parser=None): | |||
| def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True): | |||
| """ | |||
| Analyse step trace data and save the result. | |||
| Args: | |||
| source_path (str): The directory that contains the step trace original data. | |||
| framework_parser (FrameworkParser): The framework parse instance. | |||
| is_training_mode_flag (bool): Whether in training mode or not. | |||
| """ | |||
| logger.info("Begin to parse step trace.") | |||
| # construct output path | |||
| @@ -266,19 +267,23 @@ class Profiler: | |||
| f'step_trace_profiling_{self._dev_id}.txt' | |||
| ) | |||
| parser = GpuStepTraceParser(input_dir=input_file_path, | |||
| output_file_path=step_trace_intermediate_file_path) | |||
| output_file_path=step_trace_intermediate_file_path, | |||
| is_training_mode=is_training_mode_flag) | |||
| parser.parse_and_save() | |||
| point_info = parser.record_point_info(input_file_path, point_info_file_path) | |||
| else: | |||
| # whether keep the first step | |||
| skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME) | |||
| point_info = framework_parser.point_info | |||
| # recognize inference or training mode | |||
| is_training_mode_flag = framework_parser.check_op_name("Gradients") | |||
| # parse the step trace files and save the result to disk | |||
| source_path = validate_and_normalize_path(source_path) | |||
| parser = AscendStepTraceParser(input_dir=source_path, | |||
| output_file_path=step_trace_intermediate_file_path, | |||
| job_id=self._job_id_env, | |||
| skip_first_step=skip_first_step_flag) | |||
| skip_first_step=skip_first_step_flag, | |||
| is_training_mode=is_training_mode_flag) | |||
| parser.update_tag_op_type_map(point_info) | |||
| parser.parse_and_save() | |||
| point_info = parser.record_point_info(point_info, point_info_file_path) | |||
| @@ -332,6 +337,7 @@ class Profiler: | |||
| timeline_generator.init_timeline() | |||
| timeline_generator.write_timeline(size_limit) | |||
| timeline_generator.write_timeline_summary() | |||
| return timeline_generator | |||
| except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err: | |||
| logger.warning('Fail to write timeline data: %s', err) | |||
| raise RuntimeError('Fail to write timeline data.') | |||