| @@ -21,17 +21,17 @@ import json | |||
| import os | |||
| from flask import Blueprint | |||
| from flask import request | |||
| from flask import jsonify | |||
| from flask import request | |||
| from marshmallow import ValidationError | |||
| from mindinsight.conf import settings | |||
| from mindinsight.datavisual.utils.tools import get_train_id, get_profiler_dir | |||
| from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory | |||
| from mindinsight.datavisual.utils.tools import get_train_id, get_profiler_dir, to_int | |||
| from mindinsight.lineagemgr.common.validator.validate_path import validate_and_normalize_path | |||
| from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory | |||
| from mindinsight.profiler.common.util import analyse_device_list_from_profiler_dir | |||
| from mindinsight.profiler.common.validator.validate import validate_condition | |||
| from mindinsight.profiler.common.validator.validate import validate_condition, validate_ui_proc | |||
| from mindinsight.profiler.common.validator.validate_path import validate_and_normalize_profiler_path | |||
| from mindinsight.utils.exceptions import ParamValueError | |||
| BLUEPRINT = Blueprint("profile", __name__, url_prefix=settings.URL_PREFIX) | |||
| @@ -109,6 +109,64 @@ def get_profile_device_list(): | |||
| return jsonify(device_list) | |||
| @BLUEPRINT.route("/profile/training-trace/graph", methods=["GET"]) | |||
| def get_training_trace_graph(): | |||
| """ | |||
| Get training trace info of one step. | |||
| Returns: | |||
| Response, the training trace info of one step. | |||
| Examples: | |||
| >>> GET http://xxxx/v1/mindinsight/profile/training-trace/graph | |||
| """ | |||
| summary_dir = request.args.get("dir") | |||
| profiler_dir = validate_and_normalize_profiler_path(summary_dir) | |||
| graph_type = request.args.get("type", default='0') | |||
| graph_type = to_int(graph_type, 'graph_type') | |||
| device_id = request.args.get("device_id", default='0') | |||
| _ = to_int(device_id, 'device_id') | |||
| analyser = AnalyserFactory.instance().get_analyser( | |||
| 'step_trace', profiler_dir, device_id) | |||
| graph_info = analyser.query({ | |||
| 'filter_condition': { | |||
| 'mode': 'step', | |||
| 'step_id': graph_type | |||
| }}) | |||
| return jsonify(graph_info) | |||
| @BLUEPRINT.route("/profile/training-trace/target-time-info", methods=["GET"]) | |||
| def get_target_time_info(): | |||
| """ | |||
| Get all the time information of the specified column. | |||
| Returns: | |||
| Response, all the time information of the specified column. | |||
| Examples: | |||
| >>> GET http://xxxx/v1/mindinsight/profile/training-trace/target-time-info | |||
| """ | |||
| summary_dir = request.args.get("dir") | |||
| profiler_dir = validate_and_normalize_profiler_path(summary_dir) | |||
| proc_name = request.args.get("type") | |||
| validate_ui_proc(proc_name) | |||
| device_id = request.args.get("device_id", default='0') | |||
| _ = to_int(device_id, 'device_id') | |||
| analyser = AnalyserFactory.instance().get_analyser( | |||
| 'step_trace', profiler_dir, device_id) | |||
| target_time_info = analyser.query({ | |||
| 'filter_condition': { | |||
| 'mode': 'proc', | |||
| 'proc_name': proc_name | |||
| }}) | |||
| target_time_info['summary'] = analyser.summary | |||
| return jsonify(target_time_info) | |||
| def init_module(app): | |||
| """ | |||
| Init module entry. | |||
| @@ -12,3 +12,6 @@ | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Import analyser.""" | |||
| from .analyser import * | |||
| from .step_trace_analyser import StepTraceAnalyser | |||
| @@ -15,7 +15,7 @@ | |||
| """The analyser factory.""" | |||
| import threading | |||
| import mindinsight.profiler.analyser.analyser as analyser_module | |||
| import mindinsight.profiler.analyser as analyser_module | |||
| from mindinsight.profiler.common.exceptions.exceptions import \ | |||
| ProfilerAnalyserNotExistException | |||
| @@ -25,11 +25,12 @@ class AnalyserFactory: | |||
| The analyser factory is used to create analyser special instance. | |||
| Currently the factory supports creating `AicoreTypeAnalyser`, | |||
| `AicoreDetailAnalyser` and `AicpuAnalyser`. The `AicoreTypeAnalyser` is used | |||
| to analyze execution time according to AICORE operator type. | |||
| `AicoreDetailAnalyser`, `AicpuAnalyser` and `StepTraceAnalyser`. | |||
| The `AicoreTypeAnalyser` is used to analyze execution time according to AICORE operator type. | |||
| The `AicoreDetailAnalyser` is used to analyze execution time according to | |||
| all specific AICORE operator. The `AicpuAnalyser` is used to analyze | |||
| execution time according to all specific AICPU operator. | |||
| The `StepTraceAnalyser` is used to analyze the execution time according to different process. | |||
| Examples: | |||
| >>> analyser = AnalyserFactory.instance().get_analyser( | |||
| @@ -0,0 +1,266 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """The StepTraceAnalyser analyser class.""" | |||
| import csv | |||
| from mindinsight.profiler.analyser.base_analyser import BaseAnalyser | |||
| from mindinsight.profiler.common.exceptions.exceptions import ProfilerParamValueErrorException, \ | |||
| ProfilerFileNotFoundException, StepNumNotSupportedException | |||
| from mindinsight.profiler.common.log import logger as log | |||
| from mindinsight.profiler.common.util import query_latest_trace_time_file, get_field_value, \ | |||
| get_summary_for_step_trace | |||
| class StepTraceAnalyser(BaseAnalyser): | |||
| """The analyser for analyzing training steps.""" | |||
| __col_names__ = [] | |||
| _attr_ui_name = 'name' | |||
| _attr_ui_start = 'start' | |||
| _attr_ui_duration = 'duration' | |||
| @property | |||
| def summary(self): | |||
| """The property of summary info.""" | |||
| summary = get_summary_for_step_trace(self._data[-1], self.__column__) | |||
| summary['total_steps'] = self._size | |||
| return summary | |||
| def query(self, condition=None): | |||
| """ | |||
| Query data according to the condition. | |||
| Args: | |||
| condition (dict): The search condition, only contains `filter_condition` parameter. | |||
| Default: None. | |||
| Returns: | |||
| dict, the result after filtered, sorted and grouped. | |||
| """ | |||
| if condition is None: | |||
| condition = {} | |||
| filter_condition = condition.get('filter_condition', {}) | |||
| self._validate_filter_condition(filter_condition) | |||
| self._result = {'size': self._size} | |||
| self._filter(filter_condition) | |||
| return self._result | |||
| def query_for_all_reduce(self): | |||
| """ | |||
| Query for all reduce info. | |||
| Returns: | |||
| list[dict], each item is the reduce info for one step, the reduce info is format like: | |||
| {stream_id: List[Tuple(start_point, end_point, duration, field_name)]}. | |||
| """ | |||
| reduce_infos = [] | |||
| for row_info in self._data[:-1]: | |||
| reduce_info = self._get_reduce_time_in_order(row_info, 'systime') | |||
| reduce_infos.append(reduce_info) | |||
| return reduce_infos | |||
| def _load(self): | |||
| """Load data according to the parsed AICORE operator types file.""" | |||
| file_path = query_latest_trace_time_file(self._profiling_dir, self._device_id) | |||
| if not file_path: | |||
| log.error("Failed to find parsed trace time file.") | |||
| raise ProfilerFileNotFoundException('parsed step trace time file') | |||
| with open(file_path, 'r') as handle: | |||
| csv_reader = csv.reader(handle) | |||
| self.__column__ = next(csv_reader) | |||
| self._data = list(csv_reader) | |||
| self._size = len(self._data) - 1 | |||
| self._display_col_names = self.__col_names__[:] | |||
| def _filter(self, filter_condition): | |||
| """ | |||
| Filter the profiling data according to the filter condition. | |||
| Args: | |||
| filter_condition (dict): The filter condition. | |||
| - mode (str): The kind of information. `step` return the info about specific | |||
| step. `proc` return the info about specific field in parsed trace file. | |||
| - step_id (int): The selected step_id. If not given, it means all steps is required. | |||
| If the value is 0, it means average info for all steps except the first is | |||
| required. | |||
| - proc_name (str): The selected field name. | |||
| - time_type (str): The value type. `systime` keeps the original value. | |||
| `realtime` transforms the value in millisecond. Default: `realtime`. | |||
| """ | |||
| mode = filter_condition.get('mode', 'step') | |||
| if mode == 'step': | |||
| self._get_step_details(step_id=filter_condition.get('step_id'), | |||
| time_type=filter_condition.get('time_type', 'realtime')) | |||
| else: | |||
| self._get_proc_details(step_id=filter_condition.get('step_id'), | |||
| proc_name=filter_condition.get('proc_name'), | |||
| time_type=filter_condition.get('time_type', 'realtime')) | |||
| def _construct_time_point(self, name, start, duration): | |||
| """Construct time point.""" | |||
| point = { | |||
| self._attr_ui_name: name, | |||
| self._attr_ui_start: round(start, 4), | |||
| self._attr_ui_duration: round(duration, 4) | |||
| } | |||
| return point | |||
| def _get_step_details(self, step_id, time_type='realtime'): | |||
| """ | |||
| Get step trace info for selected step and save the result. | |||
| Args: | |||
| step_id (int): The selected step_id. If the value is 0, it means average info | |||
| for all steps except the first is required. | |||
| time_type (str): The value type. `systime` keeps the original value. | |||
| `realtime` transforms the value in millisecond. Default: `realtime`. | |||
| """ | |||
| if step_id is None: | |||
| step_id = 0 | |||
| row_info = self._data[step_id - 1] | |||
| start_point = get_field_value(row_info, 'start_point', self.__column__, time_type) | |||
| total = get_field_value(row_info, 'total', self.__column__, time_type) | |||
| iteration_interval = get_field_value(row_info, 'iteration_interval', self.__column__, | |||
| time_type) | |||
| fp_point = get_field_value(row_info, 'fp_point', self.__column__, time_type) | |||
| fp_and_bp = get_field_value(row_info, 'fp_and_bp', self.__column__, time_type) | |||
| bp_point = get_field_value(row_info, 'bp_point', self.__column__, time_type) | |||
| tail = get_field_value(row_info, 'tail', self.__column__, time_type) | |||
| # first line only contains total time | |||
| first_line = [self._construct_time_point('', 0, total)] | |||
| # second line contains iteration_interval, fp_and_bp and tail | |||
| second_line = [ | |||
| self._construct_time_point('', 0, iteration_interval), | |||
| self._construct_time_point('fp_and_bp', fp_point - start_point, fp_and_bp), | |||
| self._construct_time_point('', bp_point - start_point, tail), | |||
| ] | |||
| # construct reduces lines | |||
| reduce_lines = self._construct_reduce_lines(row_info, time_type) | |||
| graph = [first_line, second_line] | |||
| graph.extend(reduce_lines) | |||
| self._result['training_trace_graph'] = graph | |||
| def _get_reduce_time_in_order(self, row_info, time_type): | |||
| """Get reduce time in order.""" | |||
| reduce_info = {} | |||
| reduce_fields = [field_name for field_name in self.__column__ | |||
| if field_name.startswith('stream_') and not field_name.endswith('point')] | |||
| for reduce_field in reduce_fields: | |||
| cur_stream_id = reduce_field.split('_', 2)[1] | |||
| cur_stream = reduce_info.get(cur_stream_id) | |||
| if not cur_stream: | |||
| cur_stream = [] | |||
| reduce_info[cur_stream_id] = cur_stream | |||
| reduce_start = get_field_value( | |||
| row_info, reduce_field + '_start_point', self.__column__, time_type) | |||
| reduce_end = get_field_value( | |||
| row_info, reduce_field + '_end_point', self.__column__, time_type) | |||
| reduce_duration = get_field_value( | |||
| row_info, reduce_field, self.__column__, time_type) | |||
| cur_stream.append((reduce_start, reduce_end, reduce_duration, reduce_field)) | |||
| for _, reduce_events in reduce_info.items(): | |||
| reduce_events.sort(key=lambda elem: elem[1]) | |||
| return reduce_info | |||
| def _construct_reduce_lines(self, row_info, time_type): | |||
| """Contruct first line in detailed graph.""" | |||
| reduce_lines = [] | |||
| start_point = get_field_value(row_info, 'start_point', self.__column__, time_type) | |||
| fp_point = get_field_value(row_info, 'fp_point', self.__column__, time_type) | |||
| end_point = get_field_value(row_info, 'end_point', self.__column__, time_type) | |||
| reduce_info = self._get_reduce_time_in_order(row_info, time_type) | |||
| # construct time point for each line | |||
| for _, reduce_events in reduce_info.items(): | |||
| current_line = self._construct_reduce_line( | |||
| start_point, end_point, fp_point, reduce_events) | |||
| reduce_lines.append(current_line) | |||
| return reduce_lines | |||
| def _construct_reduce_line(self, start_point, end_point, fp_point, reduce_events): | |||
| """Construct list of time points for reduce line.""" | |||
| current_line = [] | |||
| previous_start = fp_point | |||
| for start, end, duration, field_name in reduce_events: | |||
| current_line.extend([ | |||
| self._construct_time_point( | |||
| '', previous_start - start_point, start - previous_start), | |||
| self._construct_time_point( | |||
| field_name, start - start_point, duration) | |||
| ]) | |||
| previous_start = end | |||
| current_line.append(self._construct_time_point( | |||
| '', previous_start - start_point, end_point - previous_start)) | |||
| return current_line | |||
| def _get_proc_details(self, proc_name, step_id=None, time_type='realtime'): | |||
| """ | |||
| Get step trace info for selected step and save the result. | |||
| Args: | |||
| proc_name (str): The selected field name. | |||
| step_id (int): The selected step_id. If not given, it means all steps is required. | |||
| If the value is 0, it means average info for all steps except the first is | |||
| required. Default: None. | |||
| time_type (str): The value type. `systime` keeps the original value. | |||
| `realtime` transforms the value in millisecond. Default: `realtime`. | |||
| """ | |||
| if step_id is None: | |||
| rows_info = self._data[:-1] | |||
| else: | |||
| rows_info = [self._data[step_id - 1]] | |||
| proc_info = [get_field_value(row_info, proc_name, self.__column__, time_type) | |||
| for row_info in rows_info] | |||
| self._result['info'] = {proc_name: proc_info} | |||
| def _validate_filter_condition(self, filter_condition): | |||
| """Validate step trace filter_condition.""" | |||
| mode = filter_condition.get('mode', 'step') | |||
| self._validate_str_param(mode, ['step', 'proc'], 'mode') | |||
| step_id = filter_condition.get('step_id', 0) | |||
| self._validate_step_id(step_id) | |||
| proc_name = filter_condition.get('proc_name') | |||
| self._validate_str_param(proc_name, self.__column__, 'proc_name') | |||
| time_type = filter_condition.get('time_type', 'realtime') | |||
| self._validate_str_param(time_type, ['realtime', 'systime'], 'time_type') | |||
| def _validate_step_id(self, step_id): | |||
| """Validate step_id.""" | |||
| if isinstance(step_id, int) and 0 <= step_id <= self._size: | |||
| return | |||
| log.error("Invalid step_id in request. step_id should be in [0, %d].", self._size) | |||
| raise StepNumNotSupportedException([0, self._size]) | |||
| @staticmethod | |||
| def _validate_str_param(proc_name, accept_param, error_name=''): | |||
| """Validate proc_name.""" | |||
| if proc_name is None or isinstance(proc_name, str) and proc_name in accept_param: | |||
| return | |||
| log.error("Invalid param %s in request. Acceptable value is %s.", error_name, accept_param) | |||
| raise ProfilerParamValueErrorException("Invalid proc_name.") | |||
| @@ -16,15 +16,16 @@ | |||
| import os | |||
| import re | |||
| def fwrite_format(output_data_path, data_source=None, is_print=False, is_start=False): | |||
| """ | |||
| Write data to the output file. | |||
| Args: | |||
| output_data_path(str): the output file path of the data. | |||
| data_source(list): the data to write. | |||
| is_print(bool): whether to print the data to stdout. | |||
| is_start(bool): Whether is the first line of the output file, will remove the old file if True." | |||
| output_data_path (str): The output file path of the data. | |||
| data_source (list): The data to write. | |||
| is_print (bool): whether to print the data to stdout. | |||
| is_start (bool): Whether is the first line of the output file, will remove the old file if True." | |||
| """ | |||
| if is_start is True and os.path.exists(output_data_path): | |||
| @@ -48,16 +49,17 @@ def get_log_slice_id(file_name): | |||
| index = re.findall(r'\d+', slice_list[0]) | |||
| return int(index[0]) | |||
| def get_file_join_name(input_path, file_name): | |||
| """ | |||
| Search files under the special path, and will join all the files to one file. | |||
| Args: | |||
| input_path(str): the source path, will search files under it. | |||
| file_name(str): the target of the filename, such as 'hwts.log.data.45.dev'. | |||
| input_path (str): The source path, will search files under it. | |||
| file_name (str): The target of the filename, such as 'hwts.log.data.45.dev'. | |||
| Returns: | |||
| str: the join file name. | |||
| str, the join file name. | |||
| """ | |||
| name_list = [] | |||
| file_join_name = '' | |||
| @@ -90,11 +92,11 @@ def get_file_names(input_path, file_name): | |||
| Search files under the special path. | |||
| Args: | |||
| input_path(str): the souce path, will serch files under it. | |||
| file_name(str): the target of the filename, such as 'host_start_log'. | |||
| input_path (str): The source path, will search files under it. | |||
| file_name (str): The target of the filename, such as 'host_start_log'. | |||
| Returns: | |||
| list: file name list. | |||
| list, file name list. | |||
| """ | |||
| input_path = os.path.realpath(input_path) | |||
| @@ -37,6 +37,8 @@ class ProfilerErrors(ProfilerMgrErrors): | |||
| # parser error code | |||
| DEVICE_ID_MISMATCH_ERROR = 0 | _PARSER_MASK | |||
| RAW_FILE_ERROR = 1 | _PARSER_MASK | |||
| STEP_NUM_NOT_SUPPORTED_ERROR = 2 | _PARSER_MASK | |||
| JOB_ID_MISMATCH_ERROR = 3 | _PARSER_MASK | |||
| # analyser error code | |||
| COLUMN_NOT_EXIST_ERROR = 0 | _ANALYSER_MASK | |||
| @@ -49,6 +51,8 @@ class ProfilerErrors(ProfilerMgrErrors): | |||
| COLUMN_NOT_SUPPORT_SORT_ERROR = 7 | _ANALYSER_MASK | |||
| @unique | |||
| class ProfilerErrorMsg(Enum): | |||
| """Profiler error messages.""" | |||
| @@ -63,6 +67,9 @@ class ProfilerErrorMsg(Enum): | |||
| # parser error msg | |||
| DEVICE_ID_MISMATCH_ERROR = 'The device ID mismatch.' | |||
| RAW_FILE_ERROR = 'Raw file error. {}' | |||
| STEP_NUM_NOT_SUPPORTED_ERROR = 'The step num must be in {}' | |||
| JOB_ID_MISMATCH_ERROR = 'The job id in the parameter is not the same as ' \ | |||
| 'in the training trace file. ' | |||
| # analyser error msg | |||
| COLUMN_NOT_EXIST_ERROR = 'The column {} does not exist.' | |||
| @@ -192,3 +192,25 @@ class ProfilerColumnNotSupportSortException(MindInsightException): | |||
| message=ProfilerErrorMsg.COLUMN_NOT_SUPPORT_SORT_ERROR.value.format(msg), | |||
| http_code=400 | |||
| ) | |||
| class StepNumNotSupportedException(MindInsightException): | |||
| """The step number error in profiler module.""" | |||
| def __init__(self, msg): | |||
| super(StepNumNotSupportedException, self).__init__( | |||
| error=ProfilerErrors.STEP_NUM_NOT_SUPPORTED_ERROR, | |||
| message=ProfilerErrorMsg.STEP_NUM_NOT_SUPPORTED_ERROR.value.format(msg), | |||
| http_code=400 | |||
| ) | |||
| class JobIdMismatchException(MindInsightException): | |||
| """The Job ID mismatch error in profiler module.""" | |||
| def __init__(self): | |||
| super(JobIdMismatchException, self).__init__( | |||
| error=ProfilerErrors.JOB_ID_MISMATCH_ERROR, | |||
| message=ProfilerErrorMsg.JOB_ID_MISMATCH_ERROR.value, | |||
| http_code=400 | |||
| ) | |||
| @@ -19,6 +19,8 @@ This module provides the utils. | |||
| """ | |||
| import os | |||
| from mindinsight.datavisual.utils.tools import to_int | |||
| def analyse_device_list_from_profiler_dir(profiler_dir): | |||
| """ | |||
| @@ -40,3 +42,99 @@ def analyse_device_list_from_profiler_dir(profiler_dir): | |||
| device_id_list.add(device_num) | |||
| return list(device_id_list) | |||
| def query_latest_trace_time_file(profiler_dir, device_id=0): | |||
| """ | |||
| Query the latest trace time file. | |||
| Args: | |||
| profiler_dir (str): The profiler directory. | |||
| device_id (int): The id of device. | |||
| Returns: | |||
| str, the latest trace time file path. | |||
| """ | |||
| files = os.listdir(profiler_dir) | |||
| target_file = f'step_trace_raw_{device_id}_detail_time.csv' | |||
| try: | |||
| latest_file = max( | |||
| filter( | |||
| lambda file: file == target_file, | |||
| files | |||
| ), | |||
| key=lambda file: os.stat(os.path.join(profiler_dir, file)).st_mtime | |||
| ) | |||
| except ValueError: | |||
| return None | |||
| return os.path.join(profiler_dir, latest_file) | |||
| def query_step_trace_file(profiler_dir): | |||
| """ | |||
| Query for all step trace file. | |||
| Args: | |||
| profiler_dir (str): The directory that contains all step trace files. | |||
| Returns: | |||
| str, the file path of step trace time. | |||
| """ | |||
| files = os.listdir(profiler_dir) | |||
| training_trace_file = list( | |||
| filter( | |||
| lambda file: file.startswith('training_trace') and not file.endswith('.done'), | |||
| files | |||
| ) | |||
| ) | |||
| if training_trace_file: | |||
| return os.path.join(profiler_dir, training_trace_file[0]) | |||
| return None | |||
| def get_summary_for_step_trace(average_info, header): | |||
| """The property of summary info.""" | |||
| if not average_info or not header: | |||
| return {} | |||
| total_time = get_field_value(average_info, 'total', header) | |||
| iteration_interval = get_field_value(average_info, 'iteration_interval', | |||
| header) | |||
| fp_and_bp = get_field_value(average_info, 'fp_and_bp', header) | |||
| tail = get_field_value(average_info, 'tail', header) | |||
| summary = { | |||
| 'total_time': total_time, | |||
| 'iteration_interval': calculate_percent(iteration_interval, total_time), | |||
| 'fp_and_bp': calculate_percent(fp_and_bp, total_time), | |||
| 'tail': calculate_percent(tail, total_time) | |||
| } | |||
| return summary | |||
| def calculate_percent(partial, total): | |||
| """Calculate percent value.""" | |||
| percent = round(partial / total * 100, 2) | |||
| return f'{percent}%' | |||
| def get_field_value(row_info, field_name, header, time_type='realtime'): | |||
| """ | |||
| Extract basic info through row_info. | |||
| Args: | |||
| row_info (list): The list of data info in one row. | |||
| header (list[str]): The list of field names. | |||
| field_name (str): The name in header. | |||
| time_type (str): The type of value, `realtime` or `systime`. Default: `realtime`. | |||
| Returns: | |||
| dict, step trace info in dict format. | |||
| """ | |||
| # one sys count takes 10 ns, 1 ms has 100000 syscnt | |||
| per_ms_syscnt = 100000 | |||
| field_index = header.index(field_name) | |||
| value = row_info[field_index] | |||
| value = to_int(value, field_name) | |||
| if time_type == 'realtime': | |||
| value = value / per_ms_syscnt | |||
| return value | |||
| @@ -13,9 +13,14 @@ | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Validate the profiler parameters.""" | |||
| import os | |||
| import sys | |||
| from mindinsight.datavisual.utils.tools import to_int | |||
| from mindinsight.profiler.common.exceptions.exceptions import ProfilerParamTypeErrorException, \ | |||
| ProfilerDeviceIdException, ProfilerOpTypeException, \ | |||
| ProfilerSortConditionException, ProfilerFilterConditionException, ProfilerGroupConditionException | |||
| ProfilerSortConditionException, ProfilerFilterConditionException, \ | |||
| ProfilerGroupConditionException, ProfilerParamValueErrorException | |||
| from mindinsight.profiler.common.log import logger as log | |||
| AICORE_TYPE_COL = ["op_type", "execution_time", "execution_frequency", "precent"] | |||
| @@ -71,6 +76,7 @@ def validate_condition(search_condition): | |||
| if "filter_condition" in search_condition: | |||
| validate_filter_condition(search_condition) | |||
| def validata_group_condition(search_condition): | |||
| """ | |||
| Verify the group_condition in search_condition is valid or not. | |||
| @@ -185,3 +191,44 @@ def validate_filter_condition(search_condition): | |||
| validate_op_filter_condition(op_name_condition) | |||
| if "op_type" not in filter_condition and "op_name" not in filter_condition: | |||
| raise ProfilerFilterConditionException("The key of filter_condition is not support") | |||
| def validate_and_set_job_id_env(job_id_env): | |||
| """ | |||
| Validate the job id and set it in environment. | |||
| Args: | |||
| job_id_env (str): The id that to be set in environment parameter `JOB_ID`. | |||
| Returns: | |||
| int, the valid job id env. | |||
| """ | |||
| if job_id_env is None: | |||
| return job_id_env | |||
| # get job_id_env in int type | |||
| valid_id = to_int(job_id_env, 'job_id_env') | |||
| # check the range of valid_id | |||
| if valid_id and 255 < valid_id < sys.maxsize: | |||
| os.environ['JOB_ID'] = job_id_env | |||
| else: | |||
| log.warning("Invalid job_id_env %s. The value should be int and between 255 and %s. Use" | |||
| "default job id env instead.", | |||
| job_id_env, sys.maxsize) | |||
| return valid_id | |||
| def validate_ui_proc(proc_name): | |||
| """ | |||
| Validate proc name in restful request. | |||
| Args: | |||
| proc_name (str): The proc name to query. Acceptable value is in | |||
| [`iteration_interval`, `fp_and_bp`, `tail`]. | |||
| Raises: | |||
| ProfilerParamValueErrorException: If the proc_name is invalid. | |||
| """ | |||
| accept_names = ['iteration_interval', 'fp_and_bp', 'tail'] | |||
| if proc_name not in accept_names: | |||
| log.error("Invalid proc_name. The proc_name for restful api is in %s", accept_names) | |||
| raise ProfilerParamValueErrorException(f'proc_name should be in {accept_names}.') | |||
| @@ -15,8 +15,14 @@ | |||
| """Validate the input path.""" | |||
| import os | |||
| from typing import Union, List | |||
| from urllib.parse import unquote | |||
| from marshmallow import ValidationError | |||
| from mindinsight.profiler.common.exceptions.exceptions import \ | |||
| ProfilerParamValueErrorException | |||
| from mindinsight.profiler.common.log import logger as log | |||
| def safe_normalize_path( | |||
| path, | |||
| @@ -117,3 +123,30 @@ def validate_and_normalize_path( | |||
| raise ValidationError({raise_key: {"The path is invalid!"}}) | |||
| return normalized_path | |||
| def validate_and_normalize_profiler_path(path): | |||
| """ | |||
| Validate and normalize profiler path. | |||
| Args: | |||
| path (str): The path of summary directory. | |||
| Returns: | |||
| str, normalized path of profiler directory. | |||
| """ | |||
| if not path: | |||
| raise ProfilerParamValueErrorException('The file dir does not exist.') | |||
| try: | |||
| unquote_path = unquote(path, errors='strict') | |||
| except UnicodeDecodeError: | |||
| raise ProfilerParamValueErrorException('Unquote error with strict mode') | |||
| profiler_dir = os.path.join(unquote_path, 'profiler') | |||
| try: | |||
| profiler_dir = validate_and_normalize_path(profiler_dir, 'profiler') | |||
| except ValidationError: | |||
| log.error('profiler dir <%s> is invalid', unquote_path) | |||
| raise ProfilerParamValueErrorException('Profiler dir is invalid.') | |||
| return profiler_dir | |||
| @@ -0,0 +1,264 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """The parser for step trace data.""" | |||
| import csv | |||
| import os | |||
| import stat | |||
| import struct | |||
| from collections import namedtuple | |||
| from decimal import Decimal | |||
| from mindinsight.profiler.common.exceptions.exceptions import ProfilerPathErrorException, \ | |||
| JobIdMismatchException | |||
| from mindinsight.profiler.common.log import logger as log | |||
| from mindinsight.profiler.common.util import get_summary_for_step_trace | |||
| from mindinsight.utils.exceptions import MindInsightException | |||
| StepTraceStruct = namedtuple( | |||
| 'TrainingTraceStruct', ['tag_id', 'task_id', 'stream_id', 'sys_count'] | |||
| ) | |||
class StepTraceParser:
    """
    The parser for step trace data.

    Parses the binary `training_trace` file produced during profiling and
    saves the per-step timing result as a csv file.

    Args:
        input_dir (str): The directory that contains original step trace data.
        output_file_path (str): The output file path.
        job_id (int): The expected job id; validated against the parsed data.
        skip_first_step (bool): Whether skip the first step or not.
    """
    # Size in bytes of one serialized trace event ('=QHHQ' -> 8 + 2 + 2 + 8).
    _event_size = 20

    def __init__(self, input_dir, output_file_path, job_id, skip_first_step=False):
        self._input_dir = input_dir
        self._output_path = output_file_path
        self._job_id = job_id
        self._skip_first_step = skip_first_step
        self._result = []
        self._header = []
        self._step_num = 0

    @property
    def output_file(self):
        """str, the file name of the step trace parse result."""
        # BUGFIX: return the file name itself instead of the list produced by
        # rsplit, so that `show()` prints a readable path rather than a list.
        file_name = self._output_path.rsplit('/', 2)
        return file_name[-1] if len(file_name) == 3 else ''

    def show(self):
        """Print a summary of the parsed step trace info."""
        summary_info = {}
        if self._result:
            # The last row holds the average info appended by
            # `_record_average_info`.
            summary_info = get_summary_for_step_trace(self._result[-1], self._header)
            summary_info['total_steps'] = len(self._result) - 1
        print('\nStep trace summary info (unit: syscnt):')
        print(summary_info)
        print('\nThe step trace parse result saves under {summary_dir}/%s' % self.output_file)

    def parse_and_save(self):
        """Parse step trace files and save the result."""
        try:
            source_file = self._get_step_trace_file()
            self._parse(source_file)
            self._save()
        except MindInsightException as err:
            # Best effort: a parse failure is logged, not propagated.
            log.error("Failed to parse and save step trace files.")
            log.exception(err)
        log.info("Finish to save intermediate result for step trace file.")

    def _get_step_trace_file(self):
        """
        Get the step trace file to parse.

        Returns:
            str, the full path of the step trace file.

        Raises:
            ProfilerPathErrorException: If the input dir is invalid or no step
                trace file exists under it.
        """
        profiling_path = self._input_dir
        # validate input_dir
        if not os.path.isdir(profiling_path):
            raise ProfilerPathErrorException(
                '{} does not exist or is not a dir'.format(profiling_path)
            )
        # get step trace files; '.done' files are completion markers, not data
        files = os.listdir(profiling_path)
        step_trace_files = list(
            filter(
                lambda file: file.startswith('training_trace') and not file.endswith('.done'),
                files
            )
        )
        # validate result
        if not step_trace_files:
            raise ProfilerPathErrorException('training trace file does not exist')
        if len(step_trace_files) > 1:
            log.warning("Not enable to parse multiple step trace files yet.")
        step_trace_file = os.path.join(profiling_path, step_trace_files[0])
        return step_trace_file

    def _parse(self, source_file):
        """Parse the source step trace file into `self._result`."""
        log.info("Start to parse step trace file.")
        with open(source_file, 'rb') as handler:
            content = handler.read()
            for step_trace in self._get_next_step_trace(content):
                if self._skip_first_step:
                    # Drop exactly one (warm-up) step, then record the rest.
                    self._skip_first_step = False
                else:
                    self._record_trace_event(step_trace)
        self._record_average_info()
        log.info("Finish to parse step trace file.")

    def _get_next_step_trace(self, content):
        """
        Get next step trace info.

        Args:
            content (bytes): The raw step trace data.

        Returns:
            Generator, yields one event-info dict per step.
        """
        event_info = {}
        # BUGFIX: step through the buffer by `_event_size` instead of the
        # duplicated magic constant 20.
        for pos in range(0, len(content), self._event_size):
            next_event = self._get_trace_struct(content[pos:pos + self._event_size])
            if next_event is None:
                # Trailing partial event (truncated file): ignore it rather
                # than crash on attribute access below.
                continue
            self._construct_event_info(next_event, event_info)
            if event_info.get('end'):
                yield event_info
                # The end of one step is the start of the next.
                event_info = {
                    'start': event_info.get('end'),
                    'reduce': {}
                }

    def _get_trace_struct(self, bin_info):
        """Translate one raw event to StepTraceStruct, or None if truncated."""
        if len(bin_info) == self._event_size:
            parsed_info = struct.unpack('=QHHQ', bin_info)
            return StepTraceStruct(*parsed_info)
        return None

    def _construct_event_info(self, next_event, event_info):
        """Update `event_info` in place according to `next_event`."""
        # Tags above this value carry the job id (step boundary events);
        # the value itself marks the end of a step.
        min_job_id = 255
        tag_id = next_event.tag_id
        sys_count = next_event.sys_count

        def _on_step_event():
            """Handle step event: open a new step or close the current one."""
            self._validate_tag_id(tag_id)
            if event_info.get('start'):
                event_info['end'] = sys_count
            else:
                event_info['start'] = sys_count
                event_info['reduce'] = {}

        def _on_reduce_event():
            """Handle reduce event: collect time points per stream."""
            stream_id = next_event.stream_id
            if event_info['reduce'].get(stream_id):
                event_info['reduce'][stream_id].append(sys_count)
            else:
                event_info['reduce'][stream_id] = [sys_count]

        # FIX: replaced lambdas that were wrongly annotated as `bool` with
        # direct comparisons; dispatch order is unchanged.
        if tag_id == min_job_id:  # end-of-step tag
            event_info['end'] = sys_count
        elif tag_id > min_job_id:  # step tag (carries the job id)
            _on_step_event()
        elif tag_id == 1:  # forward-propagation tag
            event_info['fp'] = sys_count
        elif tag_id == 2:  # backward-propagation tag
            event_info['bp'] = sys_count
        else:
            _on_reduce_event()

    def _validate_tag_id(self, job_id):
        """Check the job id in source step trace file is same as user set."""
        if not self._job_id:
            # No expected job id yet: adopt the one found in the data.
            self._job_id = job_id
        elif self._job_id != job_id:
            raise JobIdMismatchException()

    def _record_trace_event(self, step_trace):
        """Compute per-step timing columns and append them to the result."""
        self._step_num += 1
        start_time = step_trace.get('start')
        end_time = step_trace.get('end')
        fp_time = step_trace.get('fp')
        bp_time = step_trace.get('bp')
        if not (start_time and end_time and fp_time and bp_time):
            # A step without all four anchor points cannot be analysed.
            log.warning("The step %d is missing basic time.", self._step_num)
            return
        row_data = {
            'step_num': self._step_num,
            'start_point': start_time,
            'end_point': end_time,
            'total': end_time - start_time,
            'fp_point': fp_time,
            'bp_point': bp_time,
            'iteration_interval': fp_time - start_time,
            'fp_and_bp': bp_time - fp_time,
            'tail': end_time - bp_time
        }
        # update reduce info
        self._update_reduce_info(step_trace, row_data)
        # save the row data; the first recorded row fixes the column order
        if not self._header:
            self._header = list(row_data.keys())
        row_data_list = [row_data[header_name] for header_name in self._header]
        self._result.append(row_data_list)

    @staticmethod
    def _update_reduce_info(step_trace, row_data):
        """Extract reduce info: each (start, end) pair becomes three columns."""
        reduce_time = step_trace.get('reduce', {})
        for stream_id, time_points in reduce_time.items():
            time_point_num = len(time_points)
            if time_point_num % 2:
                # Time points must pair up as (start, end); skip odd streams.
                log.warning("Stream %d has %d reduce time points.", stream_id, time_point_num)
                continue
            for index, point_id in enumerate(range(0, time_point_num, 2)):
                field_name = f'stream_{stream_id}_parallel_{index}'
                row_data[field_name + '_start_point'] = time_points[point_id]
                row_data[field_name + '_end_point'] = time_points[point_id + 1]
                row_data[field_name] = time_points[point_id + 1] - time_points[point_id]

    def _record_average_info(self):
        """Append a row with the average of each column over all steps."""
        result_size = len(self._result)
        if result_size < 2:
            # Need at least two rows for a meaningful average.
            return
        # calculate average data for each column in result data
        average_data = [0] * len(self._header)
        for row_info in self._result[1:]:
            average_data = [
                Decimal(i) + Decimal(j) for i, j in zip(row_info, average_data)
            ]
        average_data = [
            round((item / (result_size - 1))) for item in average_data
        ]
        # change step num info in average_data to '-' (averaging it is meaningless)
        step_num_index = self._header.index('step_num')
        average_data[step_num_index] = '-'
        self._result.append(average_data)
        log.info("Finish add average info for step trace.")

    def _save(self):
        """Write header and result rows to the output csv, owner read/write only."""
        log.info("Start to save step trace file.")
        if not self._header:
            # Nothing was parsed; do not create an empty file.
            return
        with open(self._output_path, 'w') as file_handle:
            csv_writer = csv.writer(file_handle)
            csv_writer.writerow(self._header)
            for row_data in self._result:
                csv_writer.writerow(row_data)
        os.chmod(self._output_path, stat.S_IREAD | stat.S_IWRITE)
| @@ -15,22 +15,26 @@ | |||
| """Profiling api file.""" | |||
| import os | |||
| import time | |||
| from tabulate import tabulate | |||
| from mindinsight.profiler.parser.hwts_log_parser import HWTSLogParser | |||
| from mindinsight.profiler.parser.framework_parser import FrameworkParser | |||
| from mindinsight.profiler.parser.optime_parser import OPComputeTimeParser | |||
| from mindinsight.profiler.parser.aicpu_data_parser import DataPreProcessParser | |||
| from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory | |||
| from mindinsight.profiler.analyser.integrator import Integrator | |||
| from mindinsight.profiler.common._utils import get_file_names, fwrite_format | |||
| from mindinsight.profiler.common.validator.validate_path import \ | |||
| validate_and_normalize_path | |||
| from mindinsight.profiler.common.log import logger | |||
| from mindinsight.profiler.common.validator.checkparam import \ | |||
| check_bool, check_subgraph | |||
| from mindinsight.profiler.common.log import logger | |||
| from mindinsight.profiler.common.validator.validate_path import \ | |||
| validate_and_normalize_path | |||
| from mindinsight.profiler.parser.aicpu_data_parser import DataPreProcessParser | |||
| from mindinsight.profiler.parser.framework_parser import FrameworkParser | |||
| from mindinsight.profiler.parser.hwts_log_parser import HWTSLogParser | |||
| from mindinsight.profiler.parser.optime_parser import OPComputeTimeParser | |||
| from mindinsight.profiler.parser.step_trace_parser import StepTraceParser | |||
| from mindinsight.utils.exceptions import MindInsightException | |||
| PROFILING_LOG_BASE_PATH = "/var/log/npu/profiling" | |||
| INIT_OP_NAME = 'Default/InitDataSetQueue' | |||
| class Profiler: | |||
| @@ -87,7 +91,7 @@ class Profiler: | |||
| if device_target and device_target != "Davinci" \ | |||
| and device_target != "Ascend": | |||
| msg = ("Profiling: unsupport backend: %s" \ | |||
| % device_target) | |||
| % device_target) | |||
| raise RuntimeError(msg) | |||
| self._dev_id = dev_id | |||
| @@ -120,6 +124,8 @@ class Profiler: | |||
| self._detail = check_bool(is_detail, 'is_detail') | |||
| self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path') | |||
| self._profiling_job_id = job_id | |||
| # add job id env through user input later | |||
| self._job_id_env = None | |||
| self._start_time = int(time.time() * 10000000) | |||
| logger.info("Profiling: profiling start time: %d", self._start_time) | |||
| @@ -193,6 +199,35 @@ class Profiler: | |||
| except MindInsightException as err: | |||
| logger.error(err.message) | |||
| # analyse step trace info | |||
| self._analyse_step_trace(source_path, framework_parser) | |||
def _analyse_step_trace(self, source_path, framework_parser):
    """
    Analyse step trace data and save the result.

    Args:
        source_path (str): The directory that contains the step trace original data.
        framework_parser (FrameworkParser): The framework parse instance, used to
            decide whether the first step should be skipped.
    """
    logger.info("Begin to parse step trace.")
    # Build the intermediate csv path under the profiler output directory.
    intermediate_csv = os.path.join(
        self._output_path,
        f'step_trace_raw_{self._dev_id}_detail_time.csv')
    # Skip the first step when the dataset-init op is present, since that
    # step carries one-off initialization cost.
    first_step_has_init = framework_parser.check_op_name(INIT_OP_NAME)
    step_trace_parser = StepTraceParser(
        input_dir=source_path,
        output_file_path=intermediate_csv,
        job_id=self._job_id_env,
        skip_first_step=first_step_has_init)
    # Parse the raw files, persist the intermediate result, then echo a summary.
    step_trace_parser.parse_and_save()
    step_trace_parser.show()
    logger.info("Finish save the intermediate result %s", intermediate_csv)
| def __del__(self): | |||
| """Disable the profiling collection service, called after training.""" | |||