Merge pull request !525 from yuximiao/yuximiao_gpu_profiler (tags/0.7.0-beta)
| @@ -114,7 +114,7 @@ def get_profile_device_list(): | |||||
| except ValidationError: | except ValidationError: | ||||
| raise ParamValueError("Invalid profiler dir") | raise ParamValueError("Invalid profiler dir") | ||||
| device_list = analyse_device_list_from_profiler_dir(profiler_dir_abs) | |||||
| device_list, _ = analyse_device_list_from_profiler_dir(profiler_dir_abs) | |||||
| return jsonify(device_list) | return jsonify(device_list) | ||||
| @@ -59,14 +59,17 @@ class _BasicTrainJob: | |||||
| create_time (DateTime): The create time of summary directory. | create_time (DateTime): The create time of summary directory. | ||||
| update_time (DateTime): The latest modify time of summary files directly in the summary directory. | update_time (DateTime): The latest modify time of summary files directly in the summary directory. | ||||
| profiler_dir (str): The relative path of profiler directory. | profiler_dir (str): The relative path of profiler directory. | ||||
| profiler_type (str): The profiler device type. | |||||
| """ | """ | ||||
| def __init__(self, train_id, abs_summary_base_dir, abs_summary_dir, create_time, update_time, profiler_dir): | |||||
| def __init__(self, train_id, abs_summary_base_dir, abs_summary_dir, create_time, update_time, profiler_dir, | |||||
| profiler_type=""): | |||||
| self._train_id = train_id | self._train_id = train_id | ||||
| self._abs_summary_base_dir = abs_summary_base_dir | self._abs_summary_base_dir = abs_summary_base_dir | ||||
| self._abs_summary_dir = abs_summary_dir | self._abs_summary_dir = abs_summary_dir | ||||
| self._create_time = create_time | self._create_time = create_time | ||||
| self._update_time = update_time | self._update_time = update_time | ||||
| self._profiler_dir = profiler_dir | self._profiler_dir = profiler_dir | ||||
| self._profiler_type = profiler_type | |||||
| @property | @property | ||||
| def abs_summary_dir(self): | def abs_summary_dir(self): | ||||
| @@ -98,6 +101,11 @@ class _BasicTrainJob: | |||||
| """Get update time.""" | """Get update time.""" | ||||
| return self._update_time | return self._update_time | ||||
| @property | |||||
| def profiler_type(self): | |||||
| """Get profiler type.""" | |||||
| return self._profiler_type | |||||
| class CachedTrainJob: | class CachedTrainJob: | ||||
| """ | """ | ||||
| @@ -952,6 +960,7 @@ class DataManager: | |||||
| create_time=info['create_time'], | create_time=info['create_time'], | ||||
| update_time=info['update_time'], | update_time=info['update_time'], | ||||
| profiler_dir=None if profiler is None else profiler['directory'], | profiler_dir=None if profiler is None else profiler['directory'], | ||||
| profiler_type="" if profiler is None else profiler['profiler_type'], | |||||
| )) | )) | ||||
| self._brief_cache.update_cache(basic_train_jobs) | self._brief_cache.update_cache(basic_train_jobs) | ||||
| @@ -109,6 +109,7 @@ class SummaryWatcher: | |||||
| 'directory': profiler['directory'], | 'directory': profiler['directory'], | ||||
| 'create_time': profiler['ctime'], | 'create_time': profiler['ctime'], | ||||
| 'update_time': profiler['mtime'], | 'update_time': profiler['mtime'], | ||||
| 'profiler_type': profiler['profiler_type'] | |||||
| } | } | ||||
| directories.append(directory) | directories.append(directory) | ||||
| @@ -226,13 +227,15 @@ class SummaryWatcher: | |||||
| elif entry.is_dir(): | elif entry.is_dir(): | ||||
| profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name) | profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name) | ||||
| full_dir_path = os.path.join(summary_base_dir, relative_path, entry.name) | full_dir_path = os.path.join(summary_base_dir, relative_path, entry.name) | ||||
| if profiler_pattern is None or not self._is_valid_profiler_directory(full_dir_path): | |||||
| is_valid_profiler_dir, profiler_type = self._is_valid_profiler_directory(full_dir_path) | |||||
| if profiler_pattern is None or not is_valid_profiler_dir: | |||||
| return | return | ||||
| profiler = { | profiler = { | ||||
| 'directory': os.path.join('.', entry.name), | 'directory': os.path.join('.', entry.name), | ||||
| 'ctime': ctime, | 'ctime': ctime, | ||||
| 'mtime': mtime, | 'mtime': mtime, | ||||
| "profiler_type": profiler_type | |||||
| } | } | ||||
| summary_dict[relative_path] = { | summary_dict[relative_path] = { | ||||
| @@ -286,19 +289,20 @@ class SummaryWatcher: | |||||
| profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name) | profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name) | ||||
| if profiler_pattern is not None and entry.is_dir(): | if profiler_pattern is not None and entry.is_dir(): | ||||
| full_path = os.path.realpath(os.path.join(summary_directory, entry.name)) | full_path = os.path.realpath(os.path.join(summary_directory, entry.name)) | ||||
| if self._is_valid_profiler_directory(full_path): | |||||
| if self._is_valid_profiler_directory(full_path)[0]: | |||||
| return True | return True | ||||
| return False | return False | ||||
| def _is_valid_profiler_directory(self, directory): | def _is_valid_profiler_directory(self, directory): | ||||
| profiler_type = "" | |||||
| try: | try: | ||||
| from mindinsight.profiler.common.util import analyse_device_list_from_profiler_dir | from mindinsight.profiler.common.util import analyse_device_list_from_profiler_dir | ||||
| device_list = analyse_device_list_from_profiler_dir(directory) | |||||
| device_list, profiler_type = analyse_device_list_from_profiler_dir(directory) | |||||
| except ImportError: | except ImportError: | ||||
| device_list = [] | device_list = [] | ||||
| return bool(device_list) | |||||
| return bool(device_list), profiler_type | |||||
| def list_summary_directories_by_pagination(self, summary_base_dir, offset=0, limit=10): | def list_summary_directories_by_pagination(self, summary_base_dir, offset=0, limit=10): | ||||
| """ | """ | ||||
| @@ -144,6 +144,7 @@ class TrainTaskManager(BaseProcessor): | |||||
| update_time=basic_info.update_time.strftime('%Y-%m-%d %H:%M:%S'), | update_time=basic_info.update_time.strftime('%Y-%m-%d %H:%M:%S'), | ||||
| profiler_dir=basic_info.profiler_dir, | profiler_dir=basic_info.profiler_dir, | ||||
| cache_status=train_job.cache_status.value, | cache_status=train_job.cache_status.value, | ||||
| profiler_type=basic_info.profiler_type, | |||||
| ) | ) | ||||
| if train_job.cache_status == CacheStatus.CACHED: | if train_job.cache_status == CacheStatus.CACHED: | ||||
| @@ -14,4 +14,4 @@ | |||||
| # ============================================================================ | # ============================================================================ | ||||
| """The analyser module.""" | """The analyser module.""" | ||||
| from . import analyser, minddata_pipeline_analyser, step_trace_analyser, \ | from . import analyser, minddata_pipeline_analyser, step_trace_analyser, \ | ||||
| minddata_analyser, timeline_analyser | |||||
| minddata_analyser, timeline_analyser, gpu_analyser | |||||
| @@ -0,0 +1,129 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """The gpu base analyser.""" | |||||
| import csv | |||||
| import os | |||||
| from mindinsight.profiler.analyser.base_analyser import BaseAnalyser | |||||
| from mindinsight.profiler.common.log import logger | |||||
| class GpuAnalyser(BaseAnalyser): | |||||
| """Gpu base analyser.""" | |||||
| _csv_file_to_analyse = "" | |||||
| def _load(self): | |||||
| """Load data from the parsed GPU profiling CSV file of this device.""" | |||||
| op_type_file_path = os.path.join( | |||||
| self._profiling_dir, | |||||
| self._csv_file_to_analyse.format(self._device_id) | |||||
| ) | |||||
| if not os.path.isfile(op_type_file_path): | |||||
| logger.warning('The file <%s> does not exist.', op_type_file_path) | |||||
| return | |||||
| with open(op_type_file_path, 'r') as file: | |||||
| csv_reader = csv.reader(file) | |||||
| _ = next(csv_reader) | |||||
| for info in csv_reader: | |||||
| self._data.append(self._convert_field_type(info)) | |||||
| @staticmethod | |||||
| def _convert_field_type(row): | |||||
| """ | |||||
| Convert the field type to the specific type. | |||||
| Args: | |||||
| row (list): One row data from parsed data. | |||||
| Returns: | |||||
| list, the converted data. | |||||
| """ | |||||
| return row | |||||
| def _filter(self, filter_condition): | |||||
| """ | |||||
| Filter the profiling data according to the filter condition. | |||||
| Args: | |||||
| filter_condition (dict): The filter condition. | |||||
| """ | |||||
| def _inner_filter(item: list): | |||||
| return self._default_filter(item, filter_condition) | |||||
| self._result = list(filter(_inner_filter, self._data)) | |||||
| class GpuOpTypeAnalyser(GpuAnalyser): | |||||
| """Gpu operation type analyser.""" | |||||
| _col_names = ["op_type", "type_occurrences", "total_time", "proportion", "avg_time"] | |||||
| _csv_file_to_analyse = 'gpu_op_type_info_{}.csv' | |||||
| @staticmethod | |||||
| def _convert_field_type(row): | |||||
| """ | |||||
| Convert the field type to the specific type. | |||||
| Args: | |||||
| row (list): One row data from parsed data. | |||||
| Returns: | |||||
| list, the converted data. | |||||
| """ | |||||
| return [row[0], int(row[1]), float(row[2]), float(row[3]), float(row[4])] | |||||
| class GpuOpInfoAnalyser(GpuAnalyser): | |||||
| """Gpu operation detail info analyser.""" | |||||
| _col_names = ["op_side", "op_type", "op_name", "op_full_name", | |||||
| "op_occurrences", "op_total_time", "op_avg_time", | |||||
| "proportion", "cuda_activity_cost_time", "cuda_activity_call_count"] | |||||
| _csv_file_to_analyse = 'gpu_op_detail_info_{}.csv' | |||||
| @staticmethod | |||||
| def _convert_field_type(row): | |||||
| """ | |||||
| Convert the field type to the specific type. | |||||
| Args: | |||||
| row (list): One row data from parsed data. | |||||
| Returns: | |||||
| list, the converted data. | |||||
| """ | |||||
| return [row[0], row[1], row[2], row[3], int(row[4]), float(row[5]), | |||||
| float(row[6]), float(row[7]), float(row[8]), int(row[9])] | |||||
| class GpuCudaActivityAnalyser(GpuAnalyser): | |||||
| """Gpu activity type analyser.""" | |||||
| _col_names = ["name", "type", "op_full_name", "stream_id", | |||||
| "block_dim", "grid_dim", "occurrences", "total_duration", | |||||
| "avg_duration", "max_duration", "min_duration"] | |||||
| _csv_file_to_analyse = 'gpu_activity_data_{}.csv' | |||||
| @staticmethod | |||||
| def _convert_field_type(row): | |||||
| """ | |||||
| Convert the field type to the specific type. | |||||
| Args: | |||||
| row (list): One row data from parsed data. | |||||
| Returns: | |||||
| list, the converted data. | |||||
| """ | |||||
| return [row[0], row[1], row[2], row[3], row[4], row[5], int(row[6]), | |||||
| float(row[7]), float(row[8]), float(row[9]), float(row[10])] | |||||
| @@ -36,8 +36,10 @@ def analyse_device_list_from_profiler_dir(profiler_dir): | |||||
| list, the device_id list. | list, the device_id list. | ||||
| """ | """ | ||||
| profiler_file_prefix = ["timeline_display", "output_op_compute_time"] | profiler_file_prefix = ["timeline_display", "output_op_compute_time"] | ||||
| gpu_profiler_file_prefix = ["gpu_op_detail_info", "gpu_activity_data", "gpu_op_type_info"] | |||||
| device_id_list = set() | device_id_list = set() | ||||
| gpu_device_id_list = set() | |||||
| for _, _, filenames in os.walk(profiler_dir): | for _, _, filenames in os.walk(profiler_dir): | ||||
| for filename in filenames: | for filename in filenames: | ||||
| if filename.startswith("step_trace_raw"): | if filename.startswith("step_trace_raw"): | ||||
| @@ -51,8 +53,19 @@ def analyse_device_list_from_profiler_dir(profiler_dir): | |||||
| if device_num.isdigit() and '_'.join(items[:-1]) in profiler_file_prefix: | if device_num.isdigit() and '_'.join(items[:-1]) in profiler_file_prefix: | ||||
| device_id_list.add(device_num) | device_id_list.add(device_num) | ||||
| return sorted(list(device_id_list)) | |||||
| elif device_num.isdigit() and '_'.join(items[:-1]) in gpu_profiler_file_prefix: | |||||
| gpu_device_id_list.add(device_num) | |||||
| if device_id_list: | |||||
| result_list = sorted(list(device_id_list)) | |||||
| profiler_type = "ascend" | |||||
| elif gpu_device_id_list: | |||||
| result_list = sorted(list(gpu_device_id_list)) | |||||
| profiler_type = "gpu" | |||||
| else: | |||||
| result_list = [] | |||||
| profiler_type = "" | |||||
| return result_list, profiler_type | |||||
| def query_latest_trace_time_file(profiler_dir, device_id=0): | def query_latest_trace_time_file(profiler_dir, device_id=0): | ||||
| @@ -27,6 +27,13 @@ AICORE_TYPE_COL = ["op_type", "execution_time", "execution_frequency", "precent" | |||||
| AICORE_DETAIL_COL = ["op_name", "op_type", "avg_execution_time", "subgraph", "full_op_name"] | AICORE_DETAIL_COL = ["op_name", "op_type", "avg_execution_time", "subgraph", "full_op_name"] | ||||
| AICPU_COL = ["serial_number", "op_type", "total_time", "dispatch_time", "run_start", | AICPU_COL = ["serial_number", "op_type", "total_time", "dispatch_time", "run_start", | ||||
| "run_end"] | "run_end"] | ||||
| GPU_TYPE_COL = ["op_type", "type_occurrences", "total_time", "proportion", "avg_time"] | |||||
| GPU_ACTIVITY_COL = ["name", "type", "op_full_name", "stream_id", | |||||
| "block_dim", "grid_dim", "occurrences", "total_duration", | |||||
| "avg_duration", "max_duration", "min_duration"] | |||||
| GPU_DETAIL_COL = ["op_side", "op_type", "op_name", "op_full_name", | |||||
| "op_occurrences", "op_total_time", "op_avg_time", | |||||
| "proportion", "cuda_activity_cost_time", "cuda_activity_call_count"] | |||||
| MINDDATA_PIPELINE_COL = [ | MINDDATA_PIPELINE_COL = [ | ||||
| 'op_id', 'op_type', 'num_workers', 'output_queue_average_size', | 'op_id', 'op_type', 'num_workers', 'output_queue_average_size', | ||||
| 'output_queue_length', 'output_queue_usage_rate', 'sample_interval', | 'output_queue_length', 'output_queue_usage_rate', 'sample_interval', | ||||
| @@ -67,10 +74,20 @@ def validate_condition(search_condition): | |||||
| search_scope = AICORE_TYPE_COL | search_scope = AICORE_TYPE_COL | ||||
| elif op_type == "aicore_detail": | elif op_type == "aicore_detail": | ||||
| search_scope = AICORE_DETAIL_COL | search_scope = AICORE_DETAIL_COL | ||||
| elif op_type == "gpu_op_type": | |||||
| search_scope = GPU_TYPE_COL | |||||
| elif op_type == "gpu_op_info": | |||||
| search_scope = GPU_DETAIL_COL | |||||
| elif op_type == "gpu_cuda_activity": | |||||
| search_scope = GPU_ACTIVITY_COL | |||||
| else: | else: | ||||
| raise ProfilerOpTypeException("The op_type must in ['aicpu', 'aicore_type', 'aicore_detail']") | |||||
| raise ProfilerOpTypeException( | |||||
| "The op_type must in ['aicpu', 'aicore_type', 'aicore_detail', " | |||||
| "'gpu_op_type', 'gpu_op_info', 'gpu_cuda_activity']") | |||||
| else: | else: | ||||
| raise ProfilerOpTypeException("The op_type must in ['aicpu', 'aicore_type', 'aicore_detail']") | |||||
| raise ProfilerOpTypeException( | |||||
| "The op_type must in ['aicpu', 'aicore_type', 'aicore_detail', " | |||||
| "'gpu_op_type', 'gpu_op_info', 'gpu_cuda_activity']") | |||||
| if "group_condition" in search_condition: | if "group_condition" in search_condition: | ||||
| validate_group_condition(search_condition) | validate_group_condition(search_condition) | ||||
| @@ -199,8 +216,6 @@ def validate_filter_condition(search_condition): | |||||
| if "op_name" in filter_condition: | if "op_name" in filter_condition: | ||||
| op_name_condition = filter_condition.get("op_name") | op_name_condition = filter_condition.get("op_name") | ||||
| validate_op_filter_condition(op_name_condition) | validate_op_filter_condition(op_name_condition) | ||||
| if "op_type" not in filter_condition and "op_name" not in filter_condition: | |||||
| raise ProfilerFilterConditionException("The key of filter_condition is not support") | |||||
| def validate_and_set_job_id_env(job_id_env): | def validate_and_set_job_id_env(job_id_env): | ||||