Merge pull request !30003 from fangzehua/add_profifeature/build-system-rewrite
| @@ -138,7 +138,7 @@ MindSpore context,用于配置当前执行环境,包括执行模式、执行 | |||
| - **grad_for_scalar** (bool): 表示是否获取标量梯度。默认值:False。当 `grad_for_scalar` 设置为True时,则可以导出函数的标量输入。由于后端目前不支持伸缩操作,所以该接口只支持在前端可推演的简单操作。 | |||
| - **enable_compile_cache** (bool) - 表示是否加载或者保存前端编译的图。当 `enable_compile_cache` 被设置为True时,在第一次执行的过程中,一个硬件无关的编译缓存会被生成并且导出为一个MINDIR文件。当该网络被再次执行时,如果 `enable_compile_cache` 仍然为True并且网络脚本没有被更改,那么这个编译缓存会被加载。注意目前只支持有限的Python脚本更改的自动检测,这意味着可能有正确性风险。默认值:False。这是一个实验特性,可能会被更改或者删除。 | |||
| - **compile_cache_path** (str) - 保存前端图编译缓存的路径。默认值:"."。如果目录不存在,系统会自动创建这个目录。缓存会被保存到如下目录: `compile_cache_path/rank_${rank_id}/` 。 `rank_id` 是集群上当前设备的ID。 | |||
| - **runtime_num_threads** (int) - 运行时线程池的线程数控制。 默认值为系统线程数的0.6倍。 | |||
| - **runtime_num_threads** (int) - 运行时线程池的线程数控制。 默认值为30。 | |||
| **异常:** | |||
| @@ -30,7 +30,7 @@ void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread | |||
| const size_t cpu_core_num = std::thread::hardware_concurrency() - 1; | |||
| auto runtime_num_threads = static_cast<size_t>(context_ptr->get_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS)); | |||
| size_t runtime_num_threads_min = std::min(runtime_num_threads, cpu_core_num); | |||
| const float kActorUsage = 0.2; | |||
| const float kActorUsage = 0.18; | |||
| const size_t kActorThreadMinNum = 2; | |||
| size_t actor_thread_max_num = | |||
| std::max(static_cast<size_t>(std::floor(runtime_num_threads_min * kActorUsage)), kActorThreadMinNum); | |||
| @@ -101,10 +101,8 @@ MsContext::MsContext(const std::string &policy, const std::string &target) { | |||
| set_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE, true); | |||
| set_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER, false); | |||
| size_t cpu_core_num = std::thread::hardware_concurrency() - 1; | |||
| constexpr float kCpuUsage = 0.6; | |||
| uint32_t runtime_num_threads = std::max(static_cast<int>(std::floor(cpu_core_num * kCpuUsage)), 1); | |||
| set_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS, runtime_num_threads); | |||
| uint32_t kDefaultRuntimeNumThreads = 30; | |||
| set_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS, kDefaultRuntimeNumThreads); | |||
| backend_policy_ = policy_map_[policy]; | |||
| } | |||
| @@ -830,7 +830,7 @@ def set_context(**kwargs): | |||
| The cache will be saved to the directory of `compile_cache_path/rank_${rank_id}/`. The `rank_id` is | |||
| the ID of the current device in the cluster. | |||
| runtime_num_threads(int): The thread pool number of cpu kernel and actor used in runtime, | |||
| which must be bigger than 0. Default value is 0.6 times of the machine threads; if you run many processes at | |||
| which must be bigger than 0. Default value is 30; if you run many processes at | |||
| the same time, you should set the value smaller to avoid thread contention. | |||
| Raises: | |||
| ValueError: If input key is not an attribute in context. | |||
| @@ -1539,6 +1539,8 @@ class AscendTimelineGenerator(BaseTimelineGenerator): | |||
| class CpuTimelineGenerator(GpuTimelineGenerator): | |||
| """Generate cpu Timeline data from file.""" | |||
| _output_op_execute_time_file_path = "cpu_op_execute_timestamp_{}.txt" | |||
| _display_filename = 'cpu_timeline_display_{}.json' | |||
| _timeline_summary_filename = 'cpu_timeline_summary_{}.json' | |||
| def _get_and_validate_path(self, file_name): | |||
| """Generate op or activity file path from file name, and validate this path.""" | |||
| @@ -1575,3 +1577,53 @@ class CpuTimelineGenerator(GpuTimelineGenerator): | |||
| time_item[self._duration_idx] = float(time_item[self._duration_idx]) / factor_us_to_ms | |||
| return timeline_list | |||
def _load_timeline_data(self):
    """Load CPU op timeline rows, append scope-name and step rows, and return them sorted."""
    rows = self.load_cpu_op_data()
    # Order by raw start timestamp (column 2) before scanning for step boundaries.
    rows.sort(key=lambda row: float(row[2]))

    self._max_scope_name_num = self._get_max_scope_name_num(rows)
    self._timeline_summary['max_scope_name_num'] = self._max_scope_name_num

    # Generate step time. Start-time unit is converted to the duration unit via this factor.
    factor_start_time_uint_to_duration = 1e-3
    self._set_step_start_and_end_op_name(rows)
    step_rows = self._get_step_time_list(rows, factor_start_time_uint_to_duration)

    # Build all scope-name row lists from the ORIGINAL rows before extending,
    # so one scope's synthetic rows never feed into another scope's computation.
    scope_rows = [
        self._get_scope_name_time_list(rows, scope, factor_start_time_uint_to_duration)
        for scope in ("Default", "Gradients", "recompute_Default")
    ]
    for extra in scope_rows:
        rows.extend(extra)
    rows.extend(step_rows)

    # Stable two-pass sort: (start, tid) first fixes tie order, then re-sort by
    # the raw start column exactly as the original implementation did.
    rows.sort(key=lambda row: (float(row[self._start_time_idx]), row[self._tid_idx]))
    rows.sort(key=lambda row: float(row[2]))
    return rows
def init_timeline(self):
    """Init timeline metadata, adding all collected info."""
    rows = self._load_timeline_data()

    # Dict used purely to count the distinct streams encountered.
    streams = {}
    for row in rows:
        self._parse_timeline_data(row, 0)
        # Only 4-field rows carry stream information.
        if len(row) == 4:
            self._update_num_of_streams(row, streams)

    # Prepend the format/thread meta entries to the parsed timeline.
    self._format_meta_data_list.extend(self._timeline_meta)
    self._timeline_meta = self._format_meta_data_list

    # Update timeline summary info with the number of distinct streams seen.
    self._timeline_summary['num_of_streams'] += len(streams)
| @@ -34,7 +34,7 @@ from mindspore.profiler.parser.aicpu_data_parser import DataPreProcessParser | |||
| from mindspore.profiler.parser.framework_parser import FrameworkParser | |||
| from mindspore.profiler.parser.hwts_log_parser import HWTSLogParser | |||
| from mindspore.profiler.parser.integrator import Integrator | |||
| from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator | |||
| from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator, CpuTimelineGenerator | |||
| from mindspore.profiler.parser.memory_usage_parser import MemoryUsageParser | |||
| from mindspore.profiler.parser.minddata_parser import MinddataParser | |||
| from mindspore.profiler.parser.minddata_analyzer import MinddataProfilingAnalyzer | |||
| @@ -164,6 +164,11 @@ class Profiler: | |||
| self._cpu_profiler = cpu_profiler.get_instance() | |||
| self._cpu_profiler.init(self._output_path) | |||
| if self._device_target and self._device_target == "CPU": | |||
| self.start_profile = kwargs.pop("start_profile", True) | |||
| if not isinstance(self.start_profile, bool): | |||
| raise TypeError("The parameter start_profile must be bool.") | |||
| if self._device_target and self._device_target == "GPU": | |||
| gpu_profiler = c_expression.GPUProfiler | |||
| self._gpu_profiler = gpu_profiler.get_instance() | |||
| @@ -296,6 +301,9 @@ class Profiler: | |||
| self._cpu_profiler.stop() | |||
| if self._device_target and self._device_target == "CPU": | |||
| self._cpu_analyse() | |||
| if self._device_target and self._device_target == "GPU": | |||
| self._gpu_analyse() | |||
| @@ -590,6 +598,21 @@ class Profiler: | |||
| 'otherwise, this warning can be ignored.' | |||
| ) | |||
def _cpu_analyse(self):
    """Collect and analyse cpu performance data.

    Returns:
        CpuTimelineGenerator, the generator that wrote the timeline files.

    Raises:
        RuntimeError: If the timeline data cannot be written.
    """
    try:
        size_limit = 100 * 1024 * 1024  # 100MB cap on the timeline display file.
        timeline_generator = CpuTimelineGenerator(self._output_path, 0)
        timeline_generator.init_timeline()
        timeline_generator.write_timeline(size_limit)
        timeline_generator.write_timeline_summary()
        return timeline_generator
    except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
        logger.warning('Fail to write timeline data: %s', err)
        # Chain the original exception so the root cause is not lost
        # (the original code raised a bare RuntimeError, discarding `err`).
        raise RuntimeError('Fail to write timeline data.') from err
| def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True, | |||
| is_gpu_kernel_async_launch_flag=False): | |||
| """ | |||