Merge pull request !30003 from fangzehua/add_profifeature/build-system-rewrite
| @@ -138,7 +138,7 @@ MindSpore context,用于配置当前执行环境,包括执行模式、执行 | |||
| - **grad_for_scalar** (bool): 表示是否获取标量梯度。默认值:False。当 `grad_for_scalar` 设置为True时,则可以导出函数的标量输入。由于后端目前不支持伸缩操作,所以该接口只支持在前端可推演的简单操作。 | |||
| - **enable_compile_cache** (bool) - 表示是否加载或者保存前端编译的图。当 `enable_compile_cache` 被设置为True时,在第一次执行的过程中,一个硬件无关的编译缓存会被生成并且导出为一个MINDIR文件。当该网络被再次执行时,如果 `enable_compile_cache` 仍然为True并且网络脚本没有被更改,那么这个编译缓存会被加载。注意目前只支持有限的Python脚本更改的自动检测,这意味着可能有正确性风险。默认值:False。这是一个实验特性,可能会被更改或者删除。 | |||
| - **compile_cache_path** (str) - 保存前端图编译缓存的路径。默认值:"."。如果目录不存在,系统会自动创建这个目录。缓存会被保存到如下目录: `compile_cache_path/rank_${rank_id}/` 。 `rank_id` 是集群上当前设备的ID。 | |||
| - **runtime_num_threads** (int) - 运行时线程池的线程数控制。 默认值为系统线程数的0.6倍。 | |||
| - **runtime_num_threads** (int) - 运行时线程池的线程数控制。 默认值为30。 | |||
| **异常:** | |||
| @@ -30,7 +30,7 @@ void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread | |||
| const size_t cpu_core_num = std::thread::hardware_concurrency() - 1; | |||
| auto runtime_num_threads = static_cast<size_t>(context_ptr->get_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS)); | |||
| size_t runtime_num_threads_min = std::min(runtime_num_threads, cpu_core_num); | |||
| const float kActorUsage = 0.2; | |||
| const float kActorUsage = 0.18; | |||
| const size_t kActorThreadMinNum = 2; | |||
| size_t actor_thread_max_num = | |||
| std::max(static_cast<size_t>(std::floor(runtime_num_threads_min * kActorUsage)), kActorThreadMinNum); | |||
| @@ -101,10 +101,8 @@ MsContext::MsContext(const std::string &policy, const std::string &target) { | |||
| set_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE, true); | |||
| set_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER, false); | |||
| size_t cpu_core_num = std::thread::hardware_concurrency() - 1; | |||
| constexpr float kCpuUsage = 0.6; | |||
| uint32_t runtime_num_threads = std::max(static_cast<int>(std::floor(cpu_core_num * kCpuUsage)), 1); | |||
| set_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS, runtime_num_threads); | |||
| uint32_t kDefaultRuntimeNumThreads = 30; | |||
| set_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS, kDefaultRuntimeNumThreads); | |||
| backend_policy_ = policy_map_[policy]; | |||
| } | |||
| @@ -830,7 +830,7 @@ def set_context(**kwargs): | |||
| The cache will be saved to the directory of `compile_cache_path/rank_${rank_id}/`. The `rank_id` is | |||
| the ID of the current device in the cluster. | |||
| runtime_num_threads(int): The thread pool number of cpu kernel and actor used in runtime, | |||
| which must be bigger than 0. Default value is 0.6 times of the machine threads; if you run many processes at | |||
| which must be bigger than 0. Default value is 30; if you run many processes at | |||
| the same time, you should set the value smaller to avoid thread contention. | |||
| Raises: | |||
| ValueError: If input key is not an attribute in context. | |||
| @@ -1539,6 +1539,8 @@ class AscendTimelineGenerator(BaseTimelineGenerator): | |||
| class CpuTimelineGenerator(GpuTimelineGenerator): | |||
| """Generate cpu Timeline data from file.""" | |||
| _output_op_execute_time_file_path = "cpu_op_execute_timestamp_{}.txt" | |||
| _display_filename = 'cpu_timeline_display_{}.json' | |||
| _timeline_summary_filename = 'cpu_timeline_summary_{}.json' | |||
| def _get_and_validate_path(self, file_name): | |||
| """Generate op or activity file path from file name, and validate this path.""" | |||
| @@ -1575,3 +1577,53 @@ class CpuTimelineGenerator(GpuTimelineGenerator): | |||
| time_item[self._duration_idx] = float(time_item[self._duration_idx]) / factor_us_to_ms | |||
| return timeline_list | |||
def _load_timeline_data(self):
    """Load CPU op timeline rows, append scope-name and step rows, and return them sorted."""
    rows = self.load_cpu_op_data()
    # Order by raw start timestamp (column 2) before scanning for step boundaries.
    rows.sort(key=lambda row: float(row[2]))

    self._max_scope_name_num = self._get_max_scope_name_num(rows)
    self._timeline_summary['max_scope_name_num'] = self._max_scope_name_num

    # Generate step time. Start-time unit is converted to the duration unit via this factor.
    factor_start_time_uint_to_duration = 1e-3
    self._set_step_start_and_end_op_name(rows)
    step_rows = self._get_step_time_list(rows, factor_start_time_uint_to_duration)

    # Build all scope-name row lists from the ORIGINAL rows before extending,
    # so one scope's synthetic rows never feed into another scope's computation.
    scope_rows = [
        self._get_scope_name_time_list(rows, scope, factor_start_time_uint_to_duration)
        for scope in ("Default", "Gradients", "recompute_Default")
    ]
    for extra in scope_rows:
        rows.extend(extra)
    rows.extend(step_rows)

    # Stable two-pass sort: (start, tid) first fixes tie order, then re-sort by
    # the raw start column exactly as the original implementation did.
    rows.sort(key=lambda row: (float(row[self._start_time_idx]), row[self._tid_idx]))
    rows.sort(key=lambda row: float(row[2]))
    return rows
def init_timeline(self):
    """Init timeline metadata, adding all collected info."""
    rows = self._load_timeline_data()

    # Dict used purely to count the distinct streams encountered.
    streams = {}
    for row in rows:
        self._parse_timeline_data(row, 0)
        # Only 4-field rows carry stream information.
        if len(row) == 4:
            self._update_num_of_streams(row, streams)

    # Prepend the format/thread meta entries to the parsed timeline.
    self._format_meta_data_list.extend(self._timeline_meta)
    self._timeline_meta = self._format_meta_data_list

    # Update timeline summary info with the number of distinct streams seen.
    self._timeline_summary['num_of_streams'] += len(streams)
| @@ -34,7 +34,7 @@ from mindspore.profiler.parser.aicpu_data_parser import DataPreProcessParser | |||
| from mindspore.profiler.parser.framework_parser import FrameworkParser | |||
| from mindspore.profiler.parser.hwts_log_parser import HWTSLogParser | |||
| from mindspore.profiler.parser.integrator import Integrator | |||
| from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator | |||
| from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator, CpuTimelineGenerator | |||
| from mindspore.profiler.parser.memory_usage_parser import MemoryUsageParser | |||
| from mindspore.profiler.parser.minddata_parser import MinddataParser | |||
| from mindspore.profiler.parser.minddata_analyzer import MinddataProfilingAnalyzer | |||
| @@ -164,6 +164,11 @@ class Profiler: | |||
| self._cpu_profiler = cpu_profiler.get_instance() | |||
| self._cpu_profiler.init(self._output_path) | |||
| if self._device_target and self._device_target == "CPU": | |||
| self.start_profile = kwargs.pop("start_profile", True) | |||
| if not isinstance(self.start_profile, bool): | |||
| raise TypeError("The parameter start_profile must be bool.") | |||
| if self._device_target and self._device_target == "GPU": | |||
| gpu_profiler = c_expression.GPUProfiler | |||
| self._gpu_profiler = gpu_profiler.get_instance() | |||
| @@ -296,6 +301,9 @@ class Profiler: | |||
| self._cpu_profiler.stop() | |||
| if self._device_target and self._device_target == "CPU": | |||
| self._cpu_analyse() | |||
| if self._device_target and self._device_target == "GPU": | |||
| self._gpu_analyse() | |||
| @@ -590,6 +598,21 @@ class Profiler: | |||
| 'otherwise, this warning can be ignored.' | |||
| ) | |||
def _cpu_analyse(self):
    """Collect and analyse cpu performance data.

    Returns:
        CpuTimelineGenerator, the generator that wrote the timeline files.

    Raises:
        RuntimeError: If the timeline data cannot be written.
    """
    try:
        size_limit = 100 * 1024 * 1024  # 100MB cap on the timeline display file.
        timeline_generator = CpuTimelineGenerator(self._output_path, 0)
        timeline_generator.init_timeline()
        timeline_generator.write_timeline(size_limit)
        timeline_generator.write_timeline_summary()
        return timeline_generator
    except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
        logger.warning('Fail to write timeline data: %s', err)
        # Chain the original exception so the root cause is not lost
        # (the original code raised a bare RuntimeError, discarding `err`).
        raise RuntimeError('Fail to write timeline data.') from err
| def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True, | |||
| is_gpu_kernel_async_launch_flag=False): | |||
| """ | |||