|
- # Copyright 2020 Huawei Technologies Co., Ltd
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ============================================================================
- """The parser for step trace data."""
- import csv
- import json
- import os
- import stat
- import struct
- from collections import namedtuple
- from decimal import Decimal
- from abc import abstractmethod
-
- from mindspore.profiler.common.exceptions.exceptions import ProfilerPathErrorException, \
- ProfilerIOException, ProfilerRawFileException
- from mindspore import log
- from mindspore.profiler.common.util import get_summary_for_step_trace
- from mindspore.profiler.common.validator.validate_path import \
- validate_and_normalize_path
-
# Header of one record in a raw step trace file. The parser fills it from
# struct.unpack('BBH', ...), so the field order must match that format.
ProfilingHeadStruct = namedtuple(
    'ProfilingHeadStruct',
    ('mode', 'rptType', 'bufSize'),
)

# Payload of one step trace record. The parser fills it from
# struct.unpack('QQQHHH', ...), so the field order must match that format.
StepTraceStruct = namedtuple(
    'StepTraceStruct',
    ('timeStamp', 'index_id', 'model_id', 'stream_id', 'task_id', 'tag_id'),
)
-
-
class BaseStepTraceParser:
    """
    The parser for step trace data.

    This base class holds the shared bookkeeping (header, result rows, tag ids)
    and the logic that turns one step's time points into a result row. Device
    specific subclasses provide `_parse` and, where needed, override
    `record_point_info`, `_get_step_trace_files` and
    `_get_single_reduce_event_info`.

    Args:
        input_dir (str): The directory that contains original step trace data.
        output_file_path (str): The output file path.
        job_id (int): The job id used to define the start of new step. Default: 0.
        skip_first_step (bool): Whether skip the first step or not.
        is_training_mode (bool): Whether in training mode or not.
        is_gpu_kernel_async_launch (bool): Whether is gpu kernel async launch or not.
    """

    def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False,
                 is_training_mode=True, is_gpu_kernel_async_launch=False):
        self._input_dir = input_dir
        self._output_path = output_file_path
        self._job_id = job_id
        self._skip_first_step = skip_first_step
        # Parsed rows; each row is a list ordered like `self._header`.
        self._result = []
        # Column names; filled from the first recorded step.
        self._header = []
        self._step_num = 0
        # Map from tag id to op type, filled by `update_tag_op_type_map`.
        self._tag_map = {}
        self._is_training_mode = is_training_mode
        self._step_end_tag_id = 4
        self._is_gpu_kernel_async_launch = is_gpu_kernel_async_launch
        self._model_start_tag_id = 0
        self._model_end_tag_id = 1
        self._fp_tag_id = 2
        self._bp_tag_id = 3
        # Tags in [reduce_min, reduce_max) are treated as reduce events.
        self._reduce_min_tag_id = 10000
        self._reduce_max_tag_id = 20000
        # Byte layout of one binary record: head ('BBH'), padding, payload ('QQQHHH').
        self._profiling_head_len = 4
        self._profiling_head_pad_len = 4
        self._st_data_len = 8 + 8 + 8 + 2 + 2 + 2

    @property
    def output_file(self):
        """
        The file name of the output path.

        Returns:
            str, the last component of the output path when the path splits into
            exactly three parts on its last two '/' separators, otherwise ''.
        """
        file_name = self._output_path.rsplit('/', 2)
        return file_name[-1] if len(file_name) == 3 else ''

    def show(self):
        """Print the summary of the parsed step trace and where the result is saved."""
        summary_info = {}
        if self._result:
            # The last row is used as the summary source; `_record_average_info`
            # appends the average row there.
            summary_info = get_summary_for_step_trace(self._result[-1], self._header, self._is_training_mode)
            summary_info['total_steps'] = len(self._result) - 1
        print('\nStep trace summary info (unit: syscnt):')
        print(summary_info)
        print('\nThe step trace parse result saves under ${summary_dir}/profiler/%s'
              % self.output_file)

    def parse_and_save(self):
        """
        Parse step trace files and save the result.

        Raises:
            ProfilerIOException: If an IOError occurs while parsing or saving.
        """
        try:
            source_files = self._get_step_trace_files()
            if self._is_gpu_kernel_async_launch:
                self._parse_async_launch(source_files)
            else:
                self._parse(source_files)
            self._save()
        except IOError as err:
            log.warning(err)
            # Chain the cause so the original I/O error stays in the traceback.
            raise ProfilerIOException() from err
        else:
            log.info("Finish to save intermediate result for step trace file.")

    def record_point_info(self, point_info, output_path):
        """
        Record point info into json.

        The base implementation does nothing and returns None; subclasses
        override it to write the point info.

        Args:
            point_info (dict): The point info about tag id and relative op name.
            output_path (str): The output path for saving point info.

        Returns:
            dict, parsed point info (in overriding subclasses).
        """

    def update_tag_op_type_map(self, point_info):
        """
        update the map from tag id to op type.

        Args:
            point_info (dict): The point info about tag id and relative op name.
        """
        # Called for its side effects only (subclasses may validate or cache
        # the trace files); the return value is not needed here.
        self._get_step_trace_files()
        tag_map = {}
        for tag, op_name in point_info.items():
            op_type = self._get_op_type(tag, op_name)
            tag_map[tag] = op_type
        log.info("Get tag types for step trace analysis: %s", tag_map)
        self._tag_map = tag_map

    def _get_op_type(self, tag, name):
        """
        Get op type from tag and name.

        Args:
            tag (int): The tag id.
            name (str): The op name.

        Returns:
            str, the op type or communication op name.
        """
        # `_fp_tag`/`_bp_tag` are only declared by some subclasses; fall back to
        # the ids set in `__init__` so this also works on the base parser.
        fp_tag = getattr(self, '_fp_tag', self._fp_tag_id)
        bp_tag = getattr(self, '_bp_tag', self._bp_tag_id)
        tag_map = {fp_tag: 'fp', bp_tag: 'bp', self._step_end_tag_id: 'end'}
        # get solid tag type
        op_type = tag_map.get(tag, '')
        if op_type:
            return op_type
        # check if the tag is step tag.
        if tag == 0:
            return 'start'
        # analyze the reduce tag: keep only the part after the last '/'
        op_name = name.rsplit('/', 1)[-1]
        if not op_name:
            log.warning("Unexpected op name:%s", name)

        return op_name

    def _get_step_trace_files(self):
        """Get step trace files. The base implementation returns the input dir as is."""
        return self._input_dir

    @staticmethod
    def _search_file(input_dir):
        """
        Search step trace file under specific input directory.

        Args:
            input_dir (str): The directory to search.

        Returns:
            list[str], paths of the files whose name starts with 'ts_track.data'
            and does not end with '.done'. Empty if none is found or if the
            file names cannot be sorted.

        Raises:
            ProfilerPathErrorException: If `input_dir` is not an existing directory.
        """
        # validate input_dir
        if not os.path.isdir(input_dir):
            raise ProfilerPathErrorException(
                '{} does not exist or is not a dir'.format(input_dir)
            )
        # get step trace files
        files = os.listdir(input_dir)
        step_trace_files = [
            file for file in files
            if file.startswith('ts_track.data') and not file.endswith('.done')
        ]
        # validate result
        if len(step_trace_files) > 1:
            # the format of file name is like
            # `training_trace.46.dev.profiler_default_tag.$id.slice_$number`
            # use the $number as the sorted key
            try:
                step_trace_files.sort(key=lambda path: int(path.rsplit('_', 1)[-1]))
            except ValueError as err:
                log.warning("Unable to parse file names: %s. %s", step_trace_files, err)
                step_trace_files = []
        else:
            # Files named 'training_trace*' are not parsed; only warn about them.
            training_trace_files = [
                file for file in files
                if file.startswith('training_trace') and not file.endswith('.done')
            ]
            if len(training_trace_files) >= 1:
                log.warning("The training_trace file structure is changed, please upgrade "
                            "mindspore and regenerate profiling data")

        file_paths = [os.path.join(input_dir, file) for file in step_trace_files]
        log.info("Find %d step trace files.", len(file_paths))
        return file_paths

    @abstractmethod
    def _parse(self, source_files):
        """Parse source step trace files."""

    def _get_next_step_trace(self, content, event_info):
        """
        Get next step trace info.

        The same `event_info` dict is yielded for every step and is cleared
        right after the consumer resumes the generator, so the consumer must
        use (or copy) it before asking for the next item.

        Args:
            content (bytes): The input step trace info.
            event_info (dict): The event info.

        Returns:
            Generator, return the step trace one by one.
        """
        # The end of the previous step (if any) is the start of the next one.
        start_time = event_info.get('end', '-')
        event_info['start'] = start_time
        if 'reduce' not in event_info.keys():
            event_info['reduce'] = {}

        i = 0
        while i < len(content):
            profiling_head_data = content[i:i + self._profiling_head_len]
            parsed_head = struct.unpack('BBH', profiling_head_data)
            profiling_head = ProfilingHeadStruct(*parsed_head)
            if profiling_head.bufSize == 0:
                # bufSize is the stride to the next record; zero would never
                # advance and would loop forever (e.g. on zero padding).
                log.warning("Get a record with zero buffer size at offset %d, stop parsing.", i)
                break
            # Only records whose rptType is 10 are decoded as step trace events.
            if profiling_head.rptType == 10:
                data_begin = i + self._profiling_head_len + self._profiling_head_pad_len
                st_data = content[data_begin:data_begin + self._st_data_len]
                parsed_data = struct.unpack('QQQHHH', st_data)
                next_event = StepTraceStruct(*parsed_data)
                self._construct_event_info(next_event, event_info)

                if event_info.get('end'):
                    yield event_info
                    start_time = event_info.get('end', '-')
                    event_info.clear()
                    event_info['start'] = start_time
                    event_info['reduce'] = {}
            i = i + profiling_head.bufSize

    def _construct_event_info(self, next_event, event_info):
        """
        Construct event info according to next_event.

        Args:
            next_event (StepTraceStruct): The event to record; its `tag_id`,
                `timeStamp` and `stream_id` are read.
            event_info (dict): The event info of the current step, updated in
                place. It must already contain the 'reduce' dict.
        """
        tag_id = next_event.tag_id
        time_stamp = next_event.timeStamp
        if tag_id == self._step_end_tag_id:
            event_info['end'] = time_stamp
        elif tag_id == self._fp_tag_id:
            event_info['fp'] = time_stamp
        elif tag_id == self._bp_tag_id:
            event_info['bp'] = time_stamp
        elif self._reduce_min_tag_id <= tag_id < self._reduce_max_tag_id:
            # Reduce time points are grouped by stream id, in arrival order.
            stream_points = event_info['reduce'].setdefault(next_event.stream_id, [])
            stream_points.append((tag_id, time_stamp))

    def _record_trace_event(self, step_trace):
        """
        Record trace event.

        Args:
            step_trace (dict): The time points of one step with the keys
                'start', 'end', 'fp', 'bp' and optionally 'reduce'. The step is
                skipped with a warning if any of the four basic points is falsy.
        """
        self._step_num += 1
        start_time = step_trace.get('start')
        end_time = step_trace.get('end')
        fp_time = step_trace.get('fp')
        bp_time = step_trace.get('bp')
        if not (start_time and end_time and fp_time and bp_time):
            log.warning("The step %d lacks basic time.", self._step_num)
            return
        # '-' marks an unknown start (no previous step end); use fp as the start.
        if start_time == '-':
            start_time = fp_time
        row_data = {
            'step_num': self._step_num,
            'start_point': start_time,
            'end_point': end_time,
            'total': end_time - start_time,
            'fp_point': fp_time,
            'bp_point': bp_time,
            'iteration_interval': fp_time - start_time,
            'fp_and_bp': bp_time - fp_time,
            'tail': end_time - bp_time
        }
        # update reduce info
        self._update_reduce_info(step_trace, row_data)
        # save the row data; the first recorded step defines the columns
        if not self._header:
            self._header = list(row_data.keys())
        row_data_list = [row_data.get(header_name, 0) for header_name in self._header]
        self._result.append(row_data_list)

    def _update_reduce_info(self, step_trace, row_data):
        """
        Extract reduce info.

        Args:
            step_trace (dict): The step trace; its 'reduce' entry maps a stream
                key to a flat list of time points (start, end, start, end, ...).
            row_data (dict): The row of the current step, updated in place.
        """
        reduce_time = step_trace.get('reduce', {})
        for stream_id, time_points in reduce_time.items():
            time_point_num = len(time_points)
            if time_point_num % 2:
                # The stream key is not always an int (the GPU parser uses
                # 'ops'), so it is formatted with %s.
                log.warning("Stream %s has %d reduce time points.", stream_id, time_point_num)
                continue
            for index, point_id in enumerate(range(0, time_point_num, 2)):
                field_name = f'stream_{stream_id}_{index}'
                reduce_info = self._get_single_reduce_event_info(
                    field_name, time_points[point_id], time_points[point_id + 1])
                row_data.update(reduce_info)

    def _get_single_reduce_event_info(self, field_name, start_point, end_point):
        """
        Get single reduce info.

        The base implementation records nothing; subclasses override it.

        Args:
            field_name (str): The field name.
            start_point (Tuple[int, int]): Start point time info, including (tag_id, sys_count).
            end_point (Tuple[int, int]): End point time info, including (tag_id, sys_count).

        Returns:
            dict, reduce info.
        """
        ret_dict = {}
        return ret_dict

    def _record_average_info(self):
        """
        Calculate average info and append it as the last row of the result.

        The first recorded row is excluded from the average. Nothing is
        appended when no step has been recorded.
        """
        if not self._header:
            # No step was recorded, so there is no 'step_num' column to look
            # up; `_save` skips writing in the same situation.
            log.warning("No valid step trace is recorded, skip calculating average info.")
            return
        result_size = len(self._result)
        # calculate average data for each column in result data
        average_data = [0] * len(self._header)
        if result_size >= 2:
            for row_info in self._result[1:]:
                average_data = [
                    Decimal(i) + Decimal(j) for i, j in zip(row_info, average_data)
                ]
            average_data = [
                round((item / (result_size - 1))) for item in average_data
            ]
        # the step num has no meaningful average, mark it with '-'
        step_num_index = self._header.index('step_num')
        average_data[step_num_index] = '-'
        self._result.append(average_data)
        log.info("Finish add average info for step trace.")

    def _save(self):
        """
        save step trace file.

        In non-training mode the last column is added into the second to last
        one (renamed to 'fp'), and both the column at index 5 and the last
        column are dropped. With the base header these are 'tail',
        'fp_and_bp' and 'bp_point'. `self._header` and the rows in
        `self._result` are modified in place by this.

        Raises:
            ProfilerIOException: If the output file cannot be written.
        """
        bp_point, tail, fp_duration = 5, -1, -2
        log.info("Start to save step trace file.")
        if not self._header:
            return
        try:
            # newline='' lets the csv module control line endings itself;
            # otherwise rows end with '\r\r\n' on Windows.
            with open(self._output_path, 'w', newline='') as file_handle:
                csv_writer = csv.writer(file_handle)
                if not self._is_training_mode:
                    self._header[fp_duration] = 'fp'
                    self._header = self._header[:bp_point] + self._header[bp_point + 1:tail]
                csv_writer.writerow(self._header)
                for row_data in self._result:
                    if not self._is_training_mode:
                        row_data[fp_duration] += row_data[tail]
                        row_data = row_data[:bp_point] + row_data[bp_point + 1:tail]
                    csv_writer.writerow(row_data)
            os.chmod(self._output_path, stat.S_IREAD | stat.S_IWRITE)
        except (IOError, OSError) as err:
            log.warning('Failed to save step trace raw info. %s', err)
            raise ProfilerIOException from err
-
-
class GpuStepTraceParser(BaseStepTraceParser):
    """The parser for gpu step trace data."""

    def get_fp_bp(self, f_obj, all_step_fp, all_step_bp):
        """
        Parser the fp and bp.

        Args:
            f_obj (file): The opened step trace text file.
            all_step_fp (list): Output list, the fp op names are appended to it.
            all_step_bp (list): Output list, the bp op names are appended to it.
        """
        fp_start, bp_end = 0, 1
        if self._is_gpu_kernel_async_launch:
            # One line per step; the second and third fields are
            # 'op_name,time' items for fp and bp.
            for line in f_obj:
                line = line.strip().split()
                all_step_fp.append(line[1].split(',')[0])
                all_step_bp.append(line[2].split(',')[0])
        else:
            # The first line is the fp point and the second one the bp point;
            # the leading field of each line is the op name.
            lines = f_obj.readlines()
            all_step_fp.append(lines[fp_start].split()[0])
            all_step_bp.append(lines[bp_end].split()[0])

    def record_point_info(self, source_file, output_path):
        """
        Record point info into json.

        Args:
            source_file (str): The file path of step trace original data.
            output_path (str): The output path for saving point info.

        Returns:
            dict, parsed point info.

        Raises:
            ProfilerIOException: If the source file cannot be read or the
                output file cannot be written.
        """
        all_step_points = []
        all_step_fp = []
        all_step_bp = []
        try:
            with open(source_file, 'r') as f_obj:
                self.get_fp_bp(f_obj, all_step_fp, all_step_bp)
        except (IOError, OSError) as err:
            # Pass the values as logging arguments with matching placeholders;
            # an argument without a placeholder makes logging fail to format.
            log.warning('Failed to read %s. %s', source_file, err)
            raise ProfilerIOException from err

        for fp_name, bp_name in zip(all_step_fp, all_step_bp):
            if self._is_training_mode:
                points = {
                    'fp_start': fp_name,
                    'bp_end': bp_name
                }
            else:
                points = {
                    'fp_start': fp_name,
                }
            all_step_points.append(points)

        try:
            with open(output_path, 'w') as json_file:
                # Async launch saves the points of every step, otherwise only
                # the single parsed point pair is saved.
                if self._is_gpu_kernel_async_launch:
                    json.dump(all_step_points, json_file)
                else:
                    json.dump(all_step_points[0], json_file)
            os.chmod(output_path, stat.S_IREAD | stat.S_IWRITE)
        except (IOError, OSError) as err:
            log.warning('Failed to save point info. %s', err)
            raise ProfilerIOException from err

        return all_step_points[0]

    def _get_step_trace_files(self):
        """Get step trace files."""
        return self._input_dir

    def _parse(self, source_file):
        """
        Parse source step trace files.

        Each line of the file holds one kind of point (fp, bp, iteration end,
        then optional reduce ops): a leading name followed by one
        comma separated item per step.

        Args:
            source_file (str): The file path of step trace original data.

        Raises:
            ProfilerRawFileException: If the file has fewer than three lines or
                the lines do not hold the same number of steps.
            ProfilerIOException: If the file cannot be read.
        """
        log.info("Start to parse step trace file.")
        fp_start, bp_end, iter_end, iter_start = 0, 1, 2, 3
        reduce_start = 4
        start_time, end_time = 0, 1
        step_trace_point_count = 3

        source_file = validate_and_normalize_path(source_file)
        try:
            with open(source_file, 'r') as f:
                lines = f.readlines()
                if len(lines) < step_trace_point_count:
                    raise ProfilerRawFileException(
                        f"Failed to parse {source_file} file. The FP_POINT/BP_POINT/ITER_END_POINT "
                        f"do not recognized correctly. Try to set the environment variable'PROFILING_FP_START' "
                        f"and 'PROFILING_BP_END' to solve this problem. For example, "
                        f"'export PROFILING_FP_START=Default/xxx/Conv2d-op1' ")
                # Drop the leading name of each line, keep the per-step items.
                step_trace_info_all = [line.strip().split()[1:] for line in lines]
                num_of_step = len(step_trace_info_all[0])
                for step_trace_point in step_trace_info_all:
                    if len(step_trace_point) != num_of_step:
                        raise ProfilerRawFileException(
                            f"Failed to parse {source_file} file. Due to the profiled "
                            f"step_num of FP/BP/ITER_END Point are not equal")
                # The start of step k is the iteration end item of step k - 1;
                # the first step starts at its own fp item.
                iter_start_info = [step_trace_info_all[fp_start][0]] + \
                    step_trace_info_all[iter_end][:num_of_step]
                step_trace_info_all.insert(iter_start, iter_start_info)
        except (IOError, OSError) as err:
            log.warning('Failed to read %s. %s', source_file, err)
            raise ProfilerIOException from err

        for step_num in range(num_of_step):
            step_trace = {
                'start': int(step_trace_info_all[iter_start][step_num].split(',')[start_time]),
                'fp': int(step_trace_info_all[fp_start][step_num].split(',')[start_time]),
                'bp': int(step_trace_info_all[bp_end][step_num].split(',')[end_time]),
                'end': int(step_trace_info_all[iter_end][step_num].split(',')[end_time]),
                'reduce': {}
            }
            num_of_step_point = len(step_trace_info_all)
            if num_of_step_point > reduce_start:
                # Every line from `reduce_start` on is a reduce op; its times
                # are collected into one flat list under the 'ops' key.
                reduce_info = {}
                reduce_time_info = []
                for reduce_idx in range(reduce_start, num_of_step_point):
                    cur_reduce_time = step_trace_info_all[reduce_idx][step_num]
                    reduce_time_info += cur_reduce_time.split(',')
                reduce_info['ops'] = reduce_time_info
                step_trace['reduce'] = reduce_info
            self._record_trace_event(step_trace)
        self._record_average_info()
        log.info("Finish to parse step trace file.")

    def _parse_one_step(self, line):
        """
        Parse step text line to dict obj.

        Args:
            line (str): The step trace line text, it contains five parts, each part is separated by a space.
                part 1: start_op_name,start_op_time
                part 2: fp_op_name,fp_time
                part 3: bp_op_name,bp_time
                part 4: end_op_name,end_time
                part 5: [reduce_op_name,reduce1_start],it contains multiple reduce, each reduce is separated by a space.
        """

        line = line.strip().split()
        # `[:-1]` drops the last digit of each time value.
        start_time = int(line[0].split(',')[1][:-1])
        fp_time = int(line[1].split(',')[1][:-1])
        bp_time = int(line[2].split(',')[1][:-1])
        end_time = int(line[3].split(',')[1][:-1])
        reduce_info = {}
        reduce_time_info = []

        for reduce_item in line[4:]:
            # add communication op start and end time, time unit from ns to 10ns.
            reduce_fields = reduce_item.split(',')
            reduce_time_info.append(reduce_fields[1][:-1])
            reduce_time_info.append(reduce_fields[2][:-1])
        step_trace = {
            'start': start_time,
            'fp': fp_time,
            'bp': bp_time,
            'end': end_time
        }
        if reduce_time_info:
            reduce_info['ops'] = reduce_time_info
            step_trace['reduce'] = reduce_info
        self._record_trace_event(step_trace)

    def _parse_async_launch(self, source_file):
        """
        Parse source step trace files generated from async launch kernel.

        Args:
            source_file (str): The file path of step trace original data, one
                step per line.

        Raises:
            ProfilerIOException: If the file cannot be read.
        """
        log.info("Start to parse step trace file.")
        source_file = validate_and_normalize_path(source_file)

        try:
            with open(source_file, 'r') as f_obj:
                for line in f_obj:
                    self._parse_one_step(line)

        except (IOError, OSError) as err:
            log.warning('Failed to read %s. %s', source_file, err)
            raise ProfilerIOException from err

        self._record_average_info()
        log.info("Finish to parse step trace file.")

    def _get_single_reduce_event_info(self, field_name, start_point, end_point):
        """
        Get single reduce info.

        Args:
            field_name (str): The field name.
            start_point (str): Start point time.
            end_point (str): End point time.

        Returns:
            dict, reduce info.
        """
        reduce_info = {}

        op_type = 'AllReduce'
        # append field name with op type.
        field_name += '_' + op_type
        reduce_info[field_name] = int(end_point) - int(start_point)
        # The start and end points are stored as received (not converted to int).
        reduce_info[field_name + '_start_point'] = start_point
        reduce_info[field_name + '_end_point'] = end_point

        return reduce_info
-
-
class AscendStepTraceParser(BaseStepTraceParser):
    """The parser for ascend step trace data."""
    _event_size = 20
    _fp_tag = 2
    _bp_tag = 3
    # Cache of the found step trace files. It is only ever rebound on the
    # instance (never mutated in place), so the shared class-level list stays empty.
    _step_trace_files = []

    def record_point_info(self, point_info, output_path):
        """
        Record point info into json.

        The file is only written when `output_path` does not exist yet.

        Args:
            point_info (dict): The point info about tag id and relative op name.
            output_path (str): The output path for saving point info.

        Returns:
            dict, parsed point info.

        Raises:
            ProfilerIOException: If the output file cannot be written.
        """
        if self._is_training_mode:
            points = {
                'fp_start': point_info.get(self._fp_tag, ''),
                'bp_end': point_info.get(self._bp_tag, '')
            }
        else:
            points = {
                'fp_start': point_info.get(self._fp_tag, ''),
            }
        if os.path.exists(output_path):
            return points
        try:
            with open(output_path, 'w') as json_file:
                json.dump(points, json_file)
            os.chmod(output_path, stat.S_IREAD | stat.S_IWRITE)
        except (IOError, OSError) as err:
            log.warning('Failed to save point info. %s', err)
            raise ProfilerIOException from err
        return points

    def _get_step_trace_files(self):
        """
        Get step trace files.

        The result of the first successful search is cached on the instance.

        Returns:
            list[str], the paths of the step trace files.

        Raises:
            ProfilerPathErrorException: If no step trace file is found.
        """
        # step trace files may under $profiler_dir or $profiler_dir/data
        if self._step_trace_files:
            return self._step_trace_files

        profiler_dir = self._input_dir
        step_trace_files = self._search_file(profiler_dir)
        if not step_trace_files:
            # try to find step trace files under $profiler_dir/data
            profiler_dir = os.path.join(profiler_dir, 'data')
            step_trace_files = self._search_file(profiler_dir)
        if not step_trace_files:
            raise ProfilerPathErrorException('Training trace file does not exist.')
        self._step_trace_files = step_trace_files

        return step_trace_files

    def _parse(self, source_files):
        """
        Parse source step trace files.

        Args:
            source_files (list[str]): The paths of the binary step trace files,
                parsed in the given order.

        Raises:
            ProfilerIOException: If a file cannot be read.
        """
        log.info("Start to parse step trace file.")
        # Shared across files so that a step spanning two files is kept.
        event_info = {}

        for source_file in source_files:
            source_file = validate_and_normalize_path(source_file)
            try:
                with open(source_file, 'rb') as handler:
                    content = handler.read()
                    for step_trace in self._get_next_step_trace(content, event_info):
                        if self._skip_first_step:
                            # Only the very first yielded step is dropped.
                            self._skip_first_step = False
                            continue
                        self._record_trace_event(step_trace)
            except (IOError, OSError) as err:
                # Pass the values as logging arguments with matching
                # placeholders; an argument without a placeholder makes
                # logging fail to format the message.
                log.warning('Failed to read %s. %s', source_file, err)
                raise ProfilerIOException from err

        self._record_average_info()
        log.info("Finish to parse step trace file.")

    def _get_single_reduce_event_info(self, field_name, start_point, end_point):
        """
        Get single reduce info.

        Args:
            field_name (str): The field name.
            start_point (Tuple[int, int]): Start point time info, including (tag_id, sys_count).
            end_point (Tuple[int, int]): End point time info, including (tag_id, sys_count).

        Returns:
            dict, reduce info. Empty if the two points do not form a pair.
        """
        reduce_info = {}
        # A valid pair has an even start tag that is followed directly by the end tag.
        if end_point[0] - start_point[0] != 1 or start_point[0] % 2:
            log.warning("Unmatched reduce event <%s, %s>.", start_point, end_point)
            return reduce_info
        op_type = self._tag_map.get(start_point[0])
        # append field name with op type.
        if not op_type:
            log.warning("Can't recognize the inner type for point tag: %d.", start_point[0])
            field_name += '_parallel'
        else:
            field_name += '_' + op_type
        reduce_info[field_name] = end_point[1] - start_point[1]
        reduce_info[field_name + '_start_point'] = start_point[1]
        reduce_info[field_name + '_end_point'] = end_point[1]

        return reduce_info
|