Merge pull request !525 from yuximiao/yuximiao_gpu_profilertags/0.7.0-beta
| @@ -114,7 +114,7 @@ def get_profile_device_list(): | |||
| except ValidationError: | |||
| raise ParamValueError("Invalid profiler dir") | |||
| device_list = analyse_device_list_from_profiler_dir(profiler_dir_abs) | |||
| device_list, _ = analyse_device_list_from_profiler_dir(profiler_dir_abs) | |||
| return jsonify(device_list) | |||
| @@ -59,14 +59,17 @@ class _BasicTrainJob: | |||
| create_time (DateTime): The create time of summary directory. | |||
| update_time (DateTime): The latest modify time of summary files directly in the summary directory. | |||
| profiler_dir (str): The relative path of profiler directory. | |||
| profiler_type (str): The profiler device type. | |||
| """ | |||
| def __init__(self, train_id, abs_summary_base_dir, abs_summary_dir, create_time, update_time, profiler_dir): | |||
| def __init__(self, train_id, abs_summary_base_dir, abs_summary_dir, create_time, update_time, profiler_dir, | |||
| profiler_type=""): | |||
| self._train_id = train_id | |||
| self._abs_summary_base_dir = abs_summary_base_dir | |||
| self._abs_summary_dir = abs_summary_dir | |||
| self._create_time = create_time | |||
| self._update_time = update_time | |||
| self._profiler_dir = profiler_dir | |||
| self._profiler_type = profiler_type | |||
| @property | |||
| def abs_summary_dir(self): | |||
| @@ -98,6 +101,11 @@ class _BasicTrainJob: | |||
| """Get update time.""" | |||
| return self._update_time | |||
| @property | |||
| def profiler_type(self): | |||
| """Get profiler type""" | |||
| return self._profiler_type | |||
| class CachedTrainJob: | |||
| """ | |||
| @@ -952,6 +960,7 @@ class DataManager: | |||
| create_time=info['create_time'], | |||
| update_time=info['update_time'], | |||
| profiler_dir=None if profiler is None else profiler['directory'], | |||
| profiler_type="" if profiler is None else profiler['profiler_type'], | |||
| )) | |||
| self._brief_cache.update_cache(basic_train_jobs) | |||
| @@ -109,6 +109,7 @@ class SummaryWatcher: | |||
| 'directory': profiler['directory'], | |||
| 'create_time': profiler['ctime'], | |||
| 'update_time': profiler['mtime'], | |||
| 'profiler_type': profiler['profiler_type'] | |||
| } | |||
| directories.append(directory) | |||
| @@ -226,13 +227,15 @@ class SummaryWatcher: | |||
| elif entry.is_dir(): | |||
| profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name) | |||
| full_dir_path = os.path.join(summary_base_dir, relative_path, entry.name) | |||
| if profiler_pattern is None or not self._is_valid_profiler_directory(full_dir_path): | |||
| is_valid_profiler_dir, profiler_type = self._is_valid_profiler_directory(full_dir_path) | |||
| if profiler_pattern is None or not is_valid_profiler_dir: | |||
| return | |||
| profiler = { | |||
| 'directory': os.path.join('.', entry.name), | |||
| 'ctime': ctime, | |||
| 'mtime': mtime, | |||
| "profiler_type": profiler_type | |||
| } | |||
| summary_dict[relative_path] = { | |||
| @@ -286,19 +289,20 @@ class SummaryWatcher: | |||
| profiler_pattern = re.search(self.PROFILER_DIRECTORY_REGEX, entry.name) | |||
| if profiler_pattern is not None and entry.is_dir(): | |||
| full_path = os.path.realpath(os.path.join(summary_directory, entry.name)) | |||
| if self._is_valid_profiler_directory(full_path): | |||
| if self._is_valid_profiler_directory(full_path)[0]: | |||
| return True | |||
| return False | |||
| def _is_valid_profiler_directory(self, directory): | |||
| profiler_type = "" | |||
| try: | |||
| from mindinsight.profiler.common.util import analyse_device_list_from_profiler_dir | |||
| device_list = analyse_device_list_from_profiler_dir(directory) | |||
| device_list, profiler_type = analyse_device_list_from_profiler_dir(directory) | |||
| except ImportError: | |||
| device_list = [] | |||
| return bool(device_list) | |||
| return bool(device_list), profiler_type | |||
| def list_summary_directories_by_pagination(self, summary_base_dir, offset=0, limit=10): | |||
| """ | |||
| @@ -144,6 +144,7 @@ class TrainTaskManager(BaseProcessor): | |||
| update_time=basic_info.update_time.strftime('%Y-%m-%d %H:%M:%S'), | |||
| profiler_dir=basic_info.profiler_dir, | |||
| cache_status=train_job.cache_status.value, | |||
| profiler_type=basic_info.profiler_type, | |||
| ) | |||
| if train_job.cache_status == CacheStatus.CACHED: | |||
| @@ -14,4 +14,4 @@ | |||
| # ============================================================================ | |||
| """The analyser module.""" | |||
| from . import analyser, minddata_pipeline_analyser, step_trace_analyser, \ | |||
| minddata_analyser, timeline_analyser | |||
| minddata_analyser, timeline_analyser, gpu_analyser | |||
| @@ -0,0 +1,129 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """The gpu base analyser.""" | |||
| import csv | |||
| import os | |||
| from mindinsight.profiler.analyser.base_analyser import BaseAnalyser | |||
| from mindinsight.profiler.common.log import logger | |||
class GpuAnalyser(BaseAnalyser):
    """
    Gpu base analyser.

    Subclasses set `_csv_file_to_analyse` to a file name template whose `{}` placeholder
    is filled with the device id, and may override `_convert_field_type` to convert the
    string fields of each CSV row.
    """
    _csv_file_to_analyse = ""

    def _load(self):
        """
        Load the parsed GPU profiling CSV file for the current device into `self._data`.

        A warning is logged and nothing is loaded when the file does not exist.
        """
        csv_file_path = os.path.join(
            self._profiling_dir,
            self._csv_file_to_analyse.format(self._device_id)
        )
        if not os.path.isfile(csv_file_path):
            logger.warning('The file <%s> does not exist.', csv_file_path)
            return

        # newline='' is what the csv module expects so that it can handle line
        # endings inside quoted fields itself.
        with open(csv_file_path, 'r', newline='') as file:
            csv_reader = csv.reader(file)
            # Discard the first row (presumably the column header). The default
            # keeps an empty file from raising StopIteration.
            next(csv_reader, None)
            for info in csv_reader:
                # csv.reader yields [] for blank lines; they carry no data and
                # would make the positional converters fail.
                if not info:
                    continue
                self._data.append(self._convert_field_type(info))

    @staticmethod
    def _convert_field_type(row):
        """
        Convert the field type to the specific type.

        The base implementation returns the row unchanged.

        Args:
            row (list): One row data from parsed data.

        Returns:
            list, the converted data.
        """
        return row

    def _filter(self, filter_condition):
        """
        Filter the profiling data according to the filter condition.

        The rows of `self._data` accepted by `self._default_filter` are stored
        in `self._result`.

        Args:
            filter_condition (dict): The filter condition.
        """
        def _inner_filter(item: list):
            return self._default_filter(item, filter_condition)

        self._result = list(filter(_inner_filter, self._data))
class GpuOpTypeAnalyser(GpuAnalyser):
    """Analyser for the GPU operator type statistics file."""
    _col_names = ["op_type", "type_occurrences", "total_time", "proportion", "avg_time"]
    _csv_file_to_analyse = 'gpu_op_type_info_{}.csv'

    @staticmethod
    def _convert_field_type(row):
        """
        Cast the fields of one operator type row to their numeric types.

        Args:
            row (list): One row data from parsed data, ordered as `_col_names`.

        Returns:
            list, the op type unchanged, the occurrences as int and the
            remaining three fields as float.
        """
        op_type = row[0]
        type_occurrences = int(row[1])
        total_time = float(row[2])
        proportion = float(row[3])
        avg_time = float(row[4])
        return [op_type, type_occurrences, total_time, proportion, avg_time]
class GpuOpInfoAnalyser(GpuAnalyser):
    """Analyser for the GPU operator detail info file."""
    _col_names = ["op_side", "op_type", "op_name", "op_full_name",
                  "op_occurrences", "op_total_time", "op_avg_time",
                  "proportion", "cuda_activity_cost_time", "cuda_activity_call_count"]
    _csv_file_to_analyse = 'gpu_op_detail_info_{}.csv'

    @staticmethod
    def _convert_field_type(row):
        """
        Cast the fields of one operator detail row to their numeric types.

        Args:
            row (list): One row data from parsed data, ordered as `_col_names`.

        Returns:
            list, the first four fields unchanged, followed by the numeric
            fields converted to int or float.
        """
        # One entry per column, in column order; None keeps the field as it is.
        casts = (None, None, None, None, int, float, float, float, float, int)
        return [
            row[index] if cast is None else cast(row[index])
            for index, cast in enumerate(casts)
        ]
class GpuCudaActivityAnalyser(GpuAnalyser):
    """Analyser for the GPU activity data file."""
    _col_names = ["name", "type", "op_full_name", "stream_id",
                  "block_dim", "grid_dim", "occurrences", "total_duration",
                  "avg_duration", "max_duration", "min_duration"]
    _csv_file_to_analyse = 'gpu_activity_data_{}.csv'

    @staticmethod
    def _convert_field_type(row):
        """
        Cast the fields of one activity row to their numeric types.

        Args:
            row (list): One row data from parsed data, ordered as `_col_names`.

        Returns:
            list, the first six fields unchanged, the occurrences as int and
            the four duration fields as float.
        """
        # One entry per column, in column order; None keeps the field as it is.
        casts = (None, None, None, None, None, None, int, float, float, float, float)
        return [
            row[index] if cast is None else cast(row[index])
            for index, cast in enumerate(casts)
        ]
| @@ -36,8 +36,10 @@ def analyse_device_list_from_profiler_dir(profiler_dir): | |||
| list, the device_id list. | |||
| """ | |||
| profiler_file_prefix = ["timeline_display", "output_op_compute_time"] | |||
| gpu_profiler_file_prefix = ["gpu_op_detail_info", "gpu_activity_data", "gpu_op_type_info"] | |||
| device_id_list = set() | |||
| gpu_device_id_list = set() | |||
| for _, _, filenames in os.walk(profiler_dir): | |||
| for filename in filenames: | |||
| if filename.startswith("step_trace_raw"): | |||
| @@ -51,8 +53,19 @@ def analyse_device_list_from_profiler_dir(profiler_dir): | |||
| if device_num.isdigit() and '_'.join(items[:-1]) in profiler_file_prefix: | |||
| device_id_list.add(device_num) | |||
| return sorted(list(device_id_list)) | |||
| elif device_num.isdigit() and '_'.join(items[:-1]) in gpu_profiler_file_prefix: | |||
| gpu_device_id_list.add(device_num) | |||
| if device_id_list: | |||
| result_list = sorted(list(device_id_list)) | |||
| profiler_type = "ascend" | |||
| elif gpu_device_id_list: | |||
| result_list = sorted(list(gpu_device_id_list)) | |||
| profiler_type = "gpu" | |||
| else: | |||
| result_list = [] | |||
| profiler_type = "" | |||
| return result_list, profiler_type | |||
| def query_latest_trace_time_file(profiler_dir, device_id=0): | |||
| @@ -27,6 +27,13 @@ AICORE_TYPE_COL = ["op_type", "execution_time", "execution_frequency", "precent" | |||
| AICORE_DETAIL_COL = ["op_name", "op_type", "avg_execution_time", "subgraph", "full_op_name"] | |||
| AICPU_COL = ["serial_number", "op_type", "total_time", "dispatch_time", "run_start", | |||
| "run_end"] | |||
| GPU_TYPE_COL = ["op_type", "type_occurrences", "total_time", "proportion", "avg_time"] | |||
| GPU_ACTIVITY_COL = ["name", "type", "op_full_name", "stream_id", | |||
| "block_dim", "grid_dim", "occurrences", "total_duration", | |||
| "avg_duration", "max_duration", "min_duration"] | |||
| GPU_DETAIL_COL = ["op_side", "op_type", "op_name", "op_full_name", | |||
| "op_occurrences", "op_total_time", "op_avg_time", | |||
| "proportion", "cuda_activity_cost_time", "cuda_activity_call_count"] | |||
| MINDDATA_PIPELINE_COL = [ | |||
| 'op_id', 'op_type', 'num_workers', 'output_queue_average_size', | |||
| 'output_queue_length', 'output_queue_usage_rate', 'sample_interval', | |||
| @@ -67,10 +74,20 @@ def validate_condition(search_condition): | |||
| search_scope = AICORE_TYPE_COL | |||
| elif op_type == "aicore_detail": | |||
| search_scope = AICORE_DETAIL_COL | |||
| elif op_type == "gpu_op_type": | |||
| search_scope = GPU_TYPE_COL | |||
| elif op_type == "gpu_op_info": | |||
| search_scope = GPU_DETAIL_COL | |||
| elif op_type == "gpu_cuda_activity": | |||
| search_scope = GPU_ACTIVITY_COL | |||
| else: | |||
| raise ProfilerOpTypeException("The op_type must in ['aicpu', 'aicore_type', 'aicore_detail']") | |||
| raise ProfilerOpTypeException( | |||
| "The op_type must in ['aicpu', 'aicore_type', 'aicore_detail', " | |||
| "'gpu_op_type', 'gpu_op_info', 'gpu_cuda_activity']") | |||
| else: | |||
| raise ProfilerOpTypeException("The op_type must in ['aicpu', 'aicore_type', 'aicore_detail']") | |||
| raise ProfilerOpTypeException( | |||
| "The op_type must in ['aicpu', 'aicore_type', 'aicore_detail', " | |||
| "'gpu_op_type', 'gpu_op_info', 'gpu_cuda_activity']") | |||
| if "group_condition" in search_condition: | |||
| validate_group_condition(search_condition) | |||
| @@ -199,8 +216,6 @@ def validate_filter_condition(search_condition): | |||
| if "op_name" in filter_condition: | |||
| op_name_condition = filter_condition.get("op_name") | |||
| validate_op_filter_condition(op_name_condition) | |||
| if "op_type" not in filter_condition and "op_name" not in filter_condition: | |||
| raise ProfilerFilterConditionException("The key of filter_condition is not support") | |||
| def validate_and_set_job_id_env(job_id_env): | |||