# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The analyzer for MindData profiling files."""
import copy
import csv
import json
import os
import stat

from mindspore.profiler.common.exceptions.exceptions import \
    ProfilerPathErrorException, ProfilerFileNotFoundException, \
    ProfilerDirNotFoundException, ProfilerRawFileException
from mindspore import log as logger
from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path


class MinddataProfilingAnalyzer:
    """
    The analyzer for MindData profiling files.

    Args:
        source_dir (str): The source directory for MindData profiling input files.
        device_id (str): The device ID.
        output_path (str): The target directory for the analyzed summary. Default: `./`.

    Raises:
        ProfilerPathErrorException: If the source directory or the output path is invalid.
        ProfilerDirNotFoundException: If the source directory or the output path does not exist.
        ProfilerFileNotFoundException: If any of the MindData profiling input files do not exist.
    """

    def __init__(self, source_dir, device_id, output_path='./'):
        # Validate and save input parameters
        self._device_id = device_id
        self._source_dir = self._validate_directory(source_dir, 'Source directory')
        self._output_path = self._validate_directory(output_path, 'Output path')

        # Get MindData profiling input filenames
        self._pipeline_path_filename = self._get_pipeline_path_filename(source_dir)
        self._cpu_utilization_path_filename = self._get_cpu_utilization_path_filename(source_dir)
        self._device_trace_path_filename, self._device_queue_file_found = \
            self._get_device_trace_path_filename(source_dir)

        # Save output filename
        self._save_path = self._get_save_path(output_path)

    @property
    def save_path(self):
        """
        The property of save path.

        Returns:
            str, the save path.
        """
        return self._save_path

    def analyze(self):
        """
        Analyze the MindData profiling files, produce summary pipeline information,
        including potential bottleneck operator in the MindData pipeline, and
        save the result to disk.

        Returns:
            dict, Analyzed MindData pipeline summary information, which is also written to disk
            in JSON file 'minddata_pipeline_summary_<device_id>.json' and
            CSV file 'minddata_pipeline_summary_<device_id>.csv'.

        Raises:
            ProfilerRawFileException: If fails to find a MindData profiling file or a file is empty.
        """
        # Open the MindData pipeline file
        with open(self._pipeline_path_filename, 'r') as pipeline_file:
            try:
                pipeline_info = json.load(pipeline_file)
            except (json.JSONDecodeError, TypeError) as path_filename_error:
                logger.warning(path_filename_error)
                raise ProfilerRawFileException(
                    'Failed to find the MindData pipeline profiling file.') from path_filename_error
        if not pipeline_info:
            logger.warning('The MindData pipeline file <%s> is empty.', self._pipeline_path_filename)
            raise ProfilerRawFileException('The MindData pipeline file is empty.')

        # Open the CPU utilization file
        with open(self._cpu_utilization_path_filename, 'r') as cpu_util_file:
            try:
                cpu_util_info = json.load(cpu_util_file)
            except (json.JSONDecodeError, TypeError) as path_filename_error:
                logger.warning(path_filename_error)
                raise ProfilerRawFileException(
                    'Failed to find the MindData CPU utilization file.') from path_filename_error
        if not cpu_util_info:
            logger.warning('The MindData CPU utilization file <%s> is empty.',
                           self._cpu_utilization_path_filename)
            raise ProfilerRawFileException('The MindData CPU utilization file is empty.')

        # Open the device queue or dataset iterator trace profiling file
        with open(self._device_trace_path_filename, 'r') as device_trace_file:
            try:
                device_trace_info = device_trace_file.readlines()
            except TypeError as path_filename_error:
                logger.warning(path_filename_error)
                raise ProfilerRawFileException(
                    'Failed to find the MindData trace profiling file.') from path_filename_error
        if not device_trace_info:
            logger.warning('The MindData trace profiling file <%s> is empty.',
                           self._device_trace_path_filename)
            raise ProfilerRawFileException('The MindData trace profiling file is empty.')

        # Analyze the MindData profiling file information and save the result
        summary_dict = self._analyze_and_save(pipeline_info, cpu_util_info, device_trace_info)
        return summary_dict

    @staticmethod
    def _validate_directory(dir_name, dir_type):
        """
        Validate the input directory.

        Args:
            dir_name (str): The directory name.
            dir_type (str): The type of directory. (Should begin with capital
                since is used for output messages.)

        Returns:
            str, the validated and normalized directory.

        Raises:
            ProfilerPathErrorException: If the directory is invalid.
            ProfilerDirNotFoundException: If the directory does not exist.
        """
        try:
            validated_dir = validate_and_normalize_path(dir_name)
        except RuntimeError as path_error:
            # Bug fix: log the raw input; `validated_dir` is unbound when validation fails
            # (the original referenced it here, raising NameError and masking the real error).
            logger.warning('<%s> <%s> is invalid.', dir_type, dir_name)
            # Bug fix: missing space produced messages like "Source directoryis invalid."
            raise ProfilerPathErrorException(dir_type + ' is invalid.') from path_error

        if not os.path.isdir(validated_dir):
            logger.warning('<%s> <%s> not found.', dir_type, validated_dir)
            raise ProfilerDirNotFoundException(validated_dir)
        return validated_dir

    def _get_pipeline_path_filename(self, source_dir):
        """
        Get the MindData pipeline full path filename.
        The filename is 'pipeline_profiling_<device_id>.json'.

        Args:
            source_dir (str): The source directory for MindData profiling files.

        Returns:
            str, the MindData pipeline full path filename.

        Raises:
            ProfilerPathErrorException: If the pipeline path is invalid.
            ProfilerFileNotFoundException: If the pipeline file does not exist.
        """
        pipeline_profiling_templatename = 'pipeline_profiling_{}.json'
        pipeline_path_filename = os.path.join(
            source_dir, pipeline_profiling_templatename.format(self._device_id))
        try:
            pipeline_path_filename = validate_and_normalize_path(pipeline_path_filename)
        except RuntimeError as path_filename_error:
            logger.warning('The MindData pipeline path %s is invalid.', pipeline_path_filename)
            raise ProfilerPathErrorException('The MindData pipeline path is invalid.') from path_filename_error

        if not os.path.isfile(pipeline_path_filename):
            logger.warning('The MindData pipeline file <%s> is not found.', pipeline_path_filename)
            raise ProfilerFileNotFoundException(pipeline_path_filename)

        return pipeline_path_filename

    def _get_cpu_utilization_path_filename(self, source_dir):
        """
        Get the MindData CPU utilization full path filename.
        The filename is 'minddata_cpu_utilization_<device_id>.json'.

        Args:
            source_dir (str): The source directory for MindData profiling files.

        Returns:
            str, the MindData CPU utilization full path filename.

        Raises:
            ProfilerPathErrorException: If the CPU utilization path is invalid.
            ProfilerFileNotFoundException: If the CPU utilization file does not exist.
        """
        cpu_utilization_templatename = 'minddata_cpu_utilization_{}.json'
        cpu_utilization_path_filename = os.path.join(
            source_dir, cpu_utilization_templatename.format(self._device_id))
        try:
            cpu_utilization_path_filename = validate_and_normalize_path(cpu_utilization_path_filename)
        except RuntimeError as path_filename_error:
            logger.warning('The MindData CPU utilization path <%s> is invalid.',
                           cpu_utilization_path_filename)
            raise ProfilerPathErrorException(
                'The MindData CPU utilization path is invalid.') from path_filename_error

        if not os.path.isfile(cpu_utilization_path_filename):
            logger.warning('The MindData CPU utilization file <%s> is not found.',
                           cpu_utilization_path_filename)
            raise ProfilerFileNotFoundException(cpu_utilization_path_filename)

        return cpu_utilization_path_filename

    def _get_device_trace_path_filename(self, source_dir):
        """
        Get the MindData device trace profiling full path filename.
        File search order:
        1) 'device_queue_profiling_<device_id>.txt' and then
        2) 'dataset_iterator_profiling_<device_id>.txt'.

        Args:
            source_dir (str): The source directory for MindData profiling files.

        Returns:
            str, the MindData device trace profiling full path filename.
            bool, flag which indicates if 'device_queue_profiling_<device_id>.txt'
            has been found or not.

        Raises:
            ProfilerPathErrorException: If neither trace profiling file can be found.
            ProfilerFileNotFoundException: If the chosen trace profiling path is not a file.
        """
        # Initialize variable for MindData device trace profiling filename
        device_trace_path_filename = ''
        # Initialize flag that 'device_queue_profiling_<device_id>.txt' has not yet been found
        device_queue_file_found = False

        txt_names = [os.path.join(source_dir, txt_name.format(self._device_id))
                     for txt_name in ('device_queue_profiling_{}.txt',
                                      'dataset_iterator_profiling_{}.txt')]

        # Search for a device trace profiling file
        if os.path.exists(txt_names[0]):
            device_trace_path_filename = txt_names[0]
            device_queue_file_found = True
        elif os.path.exists(txt_names[1]):
            device_trace_path_filename = txt_names[1]
        else:
            # Message fix: original read "A ... file <%s> nor <%s> cannot be found."
            logger.warning('Neither MindData device trace profiling file <%s> nor <%s> can be found.',
                           txt_names[0], txt_names[1])
            raise ProfilerPathErrorException('A MindData device trace profiling file cannot be found.')

        if not os.path.isfile(device_trace_path_filename):
            logger.warning('The MindData device trace profiling file <%s> is not found.',
                           device_trace_path_filename)
            raise ProfilerFileNotFoundException(device_trace_path_filename)

        return device_trace_path_filename, device_queue_file_found

    def _get_save_path(self, output_path):
        """
        Get the full pathname for the output file to save MindData pipeline summary analyzed
        information. The output filename is 'minddata_pipeline_summary_<device_id>.json'.

        Args:
            output_path (str): The output directory.

        Returns:
            str, the save path.

        Raises:
            ProfilerPathErrorException: If the output path is invalid.
            ProfilerDirNotFoundException: If the output directory does not exist.
        """
        try:
            output_dir = validate_and_normalize_path(output_path)
        except RuntimeError as path_error:
            logger.warning('Output path <%s> is invalid.', output_path)
            raise ProfilerPathErrorException('Output path is invalid.') from path_error

        if not os.path.isdir(output_dir):
            logger.warning('The output directory <%s> not found.', output_dir)
            raise ProfilerDirNotFoundException(output_dir)

        summary_templatename = 'minddata_pipeline_summary_{}.json'
        return os.path.join(output_dir, summary_templatename.format(self._device_id))

    @staticmethod
    def _parse_pipeline_metrics_info(metrics):
        """
        Parse and process the pipeline profiling metrics information for a given op.

        Args:
            metrics (dict): The pipeline profiling metrics information for a given op.

        Returns:
            List with the following analyzed metrics information:
                output queue size
                output queue length
                output queue average size
                output queue utilization percentage
                output queue empty frequency percentage
        """
        # Note: Some ops like DeviceQueue and inline ops do not have metrics information
        queue_size = -1
        queue_length = -1
        queue_average_size = -1
        queue_utilization_pct = -1
        queue_empty_freq_pct = -1
        if metrics and metrics['output_queue']:
            queue_size = metrics['output_queue']['size']
            queue_length = metrics['output_queue']['length']
            queue_average_size = round(sum(queue_size) / len(queue_size), 2) if queue_size else -1
            queue_utilization_pct = round(100 * queue_average_size / queue_length, 2) if queue_length else -1
            # Compute percentage of time queue is empty (list.count replaces a manual counting loop)
            queue_empty_freq_pct = round(100 * queue_size.count(0) / len(queue_size), 2) if queue_size else -1
        return [queue_size, queue_length, queue_average_size, queue_utilization_pct, queue_empty_freq_pct]

    def _parse_pipeline_info(self, pipeline_info):
        """
        Parse and process the pipeline profiling information.

        Args:
            pipeline_info (dict): The pipeline profiling information.

        Returns:
            Dictionary with analyzed summary output information.
            For the following key-value pairs, each value is a list ordered by increasing op id:
                pipeline_ops: operator name and operator id, a string, with example format Batch(id=0)
                op_names: operator name, a string
                op_ids: operator id, an integer
                num_workers: number of workers for the op, an integer
                queue_average_size: average queue size for the op, a float
                queue_utilization_pct: average percentage of time queue is used for op,
                    a float from 0.00 to 1.00
                queue_empty_freq_pct: percentage of time queue is empty for op,
                    a float from 0.00 to 1.00
                children_ids: children op ids of op; list is empty [] if op has no children
                parent_id: parent id of op

        Raises:
            ProfilerRawFileException: If the format of the input is wrong.
        """
        # Perform sanity checks for pipeline information
        pipeline_op_info = pipeline_info.get('op_info')
        for item in pipeline_op_info:
            if not item:
                raise ProfilerRawFileException('The contents of MindData pipeline JSON file is wrong.')

        # Parse and process pipeline information
        # Obtain the following for each op (and build a list), ordered by increasing op id
        # - op id (handy for user output)
        # - op name (needed for basic processing)
        # - op name with op id (handy for user output)
        # - num_workers
        # - various queue information
        # - children op ids
        # - parent op id
        dict_opid_pipeline_ops = {}
        dict_opid_opname = {}
        dict_opid_numworkers = {}
        dict_opid_queue_info = {}
        dict_opid_children_ids = {}
        dict_opid_parent_id = {}

        # Note: Will process the input pipeline ops in "reversed" order since typically they are ordered
        # from largest op id (usually leaf/source op) to smallest op id (usually root).
        # However, since there may be non-linear pipelines, the processed op info needs to be sorted
        # before final output is produced and saved.
        for op_info in reversed(pipeline_info['op_info']):
            op_id = op_info.get('op_id')
            # Drop the last two characters of op_type — presumably a trailing 'Op'
            # suffix in the pipeline JSON; TODO confirm against the pipeline file format.
            op_name = op_info.get('op_type')[0:-2]
            dict_opid_pipeline_ops[op_id] = op_name + '(id=' + str(op_id) + ')'
            dict_opid_opname[op_id] = op_name
            dict_opid_numworkers[op_id] = op_info.get('num_workers')

            # Obtain the output queue metrics information for the current op
            dict_opid_queue_info[op_id] = self._parse_pipeline_metrics_info(op_info.get('metrics'))

            # For current op, initialize parent_id=-1, in case after processing all children in pipeline,
            # it is determined that current op has no parent
            dict_opid_parent_id.setdefault(op_id, -1)

            children_ids = op_info.get('children')
            if children_ids:
                # Set children op ids for current op
                dict_opid_children_ids[op_id] = children_ids
                # For each child op, set parent op to be current op
                for child_op_id in children_ids:
                    dict_opid_parent_id[child_op_id] = op_id
            else:
                dict_opid_children_ids[op_id] = []

        # Build resultant dictionary, sorting each per-op list by increasing op id
        return_dict = {}
        return_dict['pipeline_ops'] = [x[1] for x in sorted(dict_opid_pipeline_ops.items())]
        return_dict['op_names'] = [x[1] for x in sorted(dict_opid_opname.items())]
        return_dict['op_ids'] = sorted(dict_opid_opname.keys())
        return_dict['num_workers'] = [x[1] for x in sorted(dict_opid_numworkers.items())]
        queue_info_items = [x[1] for x in sorted(dict_opid_queue_info.items())]
        # Indices 2, 3, 4 of each queue-info entry are average size, utilization pct, empty freq pct
        # (see _parse_pipeline_metrics_info return value)
        return_dict['queue_average_size'] = [y[2] for y in queue_info_items]
        return_dict['queue_utilization_pct'] = [y[3] for y in queue_info_items]
        return_dict['queue_empty_freq_pct'] = [y[4] for y in queue_info_items]
        return_dict['children_ids'] = [x[1] for x in sorted(dict_opid_children_ids.items())]
        return_dict['parent_id'] = [x[1] for x in sorted(dict_opid_parent_id.items())]
        return return_dict

    @staticmethod
    def _parse_cpu_util_info(cpu_util_info):
        """
        Parse and process the CPU profiling information.

        Args:
            cpu_util_info (dict): The CPU utilization profiling information.

        Returns:
            Dictionary with analyzed summary output information.
            Dictionary consists of:
                avg_cpu_pct: Average CPU utilization percentage for each op,
                    a list ordered by increasing op id.

        Raises:
            ProfilerRawFileException: If the format of the input is wrong.
        """
        # Perform sanity checks for CPU utilization information
        cpu_processor_num = cpu_util_info.get('cpu_processor_num')
        cpu_op_info = cpu_util_info.get('op_info')
        if cpu_processor_num is None or not cpu_op_info:
            raise ProfilerRawFileException('The format of MindData CPU utilization JSON file is wrong.')

        for item in cpu_op_info:
            if not item:
                raise ProfilerRawFileException('The contents of MindData CPU utilization JSON file is wrong.')

        # Parse and process the following CPU utilization information:
        # - average cpu utilization for each op
        dict_opid_cpuutil = {}
        for op in cpu_util_info["op_info"]:
            # Note: The CPU utilization data may have an extra entry with op_id=-1
            # Omit info for op_id=-1
            if op["op_id"] != -1:
                op_sys, op_usr = op["metrics"]["sys_utilization"], op["metrics"]["user_utilization"]
                dict_opid_cpuutil[op["op_id"]] = [sys + usr for sys, usr in zip(op_sys, op_usr)]

        # Initialize oplist_avg_cpu_pct with -1 for each pipeline op, since
        # CPU utilization data may not have information for each pipeline op
        oplist_avg_cpu_pct = [-1] * len(dict_opid_cpuutil)
        for op_id, cpu in dict_opid_cpuutil.items():
            # NOTE(review): indexing assumes op ids are contiguous 0..N-1 — confirm against
            # the pipeline format; a sparse id would raise IndexError here.
            op_avg_cpu_pct = sum(cpu) / len(cpu) if cpu else 0
            oplist_avg_cpu_pct[op_id] = round(op_avg_cpu_pct, 2)

        return_dict = {}
        return_dict['avg_cpu_pct'] = oplist_avg_cpu_pct
        return return_dict

    def _parse_device_trace_info(self, device_trace_info):
        """
        Parse and process the device trace profiling information.

        Args:
            device_trace_info: The device trace profiling information in text format,
                one line per record.

        Returns:
            Dictionary with analyzed summary output information.
            Dictionary consists of:
                per_batch_time: Average per batch time for pipeline in milliseconds
                per_pipeline_time: Average per pipeline time in milliseconds
                per_push_queue_time: Average per queue push time in milliseconds
        """
        # Information on the format of the device tracing profiling information.
        # Format is: type extra-info batch-num value timestamp
        # 0) type: 0: time, 1: connector size
        # 1) extra-info: if type is 0 - 0: pipeline time, 1: push tdt time, 2: batch time
        #                if type is 1 - connector capacity
        # 2) batch-num: batch number
        # 3) value: if type is 0 - value is time(ms)
        #           if type is 1 - value is connector size
        # 4) timestamp
        # Examples:
        # 0 0 20 10 xxx - The 20th batch took 10ms to get data from pipeline.
        # 1 64 20 5 yyy - Connector size is 5 when get the 20th batch.Connector capacity is 64.
        prev_time = 0
        q_time = [[], [], []]  # pipeline time, push TDT time, batch time

        # Parse each record
        for line_data in device_trace_info:
            record = [int(d) for d in line_data.split(" ")][0:5]
            if record[2] < 2:  # skip 1st batch
                prev_time = record[4]
                continue

            if record[0] == 0:  # type 0: time record
                q_time[record[1]].append(record[3])
            elif record[0] == 1:  # type 1: connector size record
                # Check if dataset_iterator trace profiling file was found
                if not self._device_queue_file_found:
                    # Derive per-batch time from consecutive timestamps
                    q_time[2].append(record[4] - prev_time)
                    prev_time = record[4]

        # Compute average queue times; -1 flags "no data"
        avg_pipeline_time = sum(q_time[0]) / len(q_time[0]) if q_time[0] else -1
        avg_push_queue_time = sum(q_time[1]) / len(q_time[1]) if q_time[1] else -1
        avg_batch_time = sum(q_time[2]) / len(q_time[2]) if q_time[2] else -1

        return_dict = {}
        return_dict['per_batch_time'] = [round(avg_batch_time, 3)]
        return_dict['per_pipeline_time'] = [round(avg_pipeline_time, 3)]
        return_dict['per_push_queue_time'] = [round(avg_push_queue_time, 3)]
        return return_dict

    def _compute_composite_info(self, summary_dict):
        """
        Compute composite analysis information from the current summary pipeline data.

        Args:
            summary_dict (dict): Input summary pipeline information.

        Returns:
            Dictionary with composite analysis output information.
            Dictionary consists of:
                avg_cpu_pct_per_worker: Average CPU utilization percentage per worker.
        """
        return_dict = {}

        # Build list: average CPU utilization percentage per worker - for each op
        avg_cpu_pct_per_worker = []
        for c, n in zip(summary_dict.get('avg_cpu_pct'), summary_dict.get('num_workers')):
            # -1 marks "no data" (op without CPU info or zero workers)
            avg_cpu_pct_per_worker.append(round(c / n if (n != 0 and c >= 0) else -1, 2))
        return_dict['avg_cpu_pct_per_worker'] = avg_cpu_pct_per_worker

        return return_dict

    @staticmethod
    def _analyze_for_bottleneck_op(summary_dict):
        """
        Analyze the MindData summary information and identify any potential bottleneck
        operator in the MindData pipeline.

        Args:
            summary_dict (dict): Input summary pipeline information.

        Returns:
            Dictionary with the following information, if applicable:
            - CPU utilization analysis
            - queue utilization analysis
            - bottleneck warning: Information on the bottleneck op
              (This is returned only if a potential bottleneck is identified.)
            - bottleneck suggestion: Reason why the subject op is identified as
              a potential bottleneck, plus suggestion on how to resolve the bottleneck.
              (This is returned only if a potential bottleneck is identified.)
        """
        try:
            bottleneck_analyzer = BottleneckAnalyzer(summary_dict)
            return_dict = bottleneck_analyzer.analyze()
        except IndexError:
            # Best-effort: inconsistent per-op lists make the analysis impossible;
            # return no analysis rather than fail the whole summary
            return_dict = {}

        return return_dict

    def _save_as_csv_file(self, data_dict):
        """
        Save data dictionary information to CSV file.

        Args:
            data_dict (dict): Input data dictionary information.

        Returns:
            Data dictionary information is saved to CSV file named
            'minddata_pipeline_summary_<device_id>.csv'.
        """
        summary_templatename = 'minddata_pipeline_summary_{}.csv'
        output_csv_path_filename = os.path.join(self._output_path,
                                                summary_templatename.format(self._device_id))

        # Create deepcopy of input data_dict so the key-prefix insertion below
        # does NOT change the caller's data_dict
        temp_dict = copy.deepcopy(data_dict)

        # Bug fix: use a context manager so the file is closed even if writing raises
        # (original used bare open()/close() and leaked the handle on error)
        with open(output_csv_path_filename, 'w') as data_file:
            csv_writer = csv.writer(data_file)
            # Write the dictionary information to CSV file
            for data_key, data_value in temp_dict.items():
                # Begin/prefix the data value with the data key
                data_value.insert(0, data_key)
                csv_writer.writerow(data_value)

        # Update file permissions: owner read/write only
        os.chmod(output_csv_path_filename, stat.S_IREAD | stat.S_IWRITE)

    def _analyze_and_save(self, pipeline_info, cpu_util_info, device_trace_info):
        """
        Analyze and save the MindData summary information to file.

        Args:
            pipeline_info (dict): The pipeline information read from the input JSON file.
            cpu_util_info (dict): The CPU utilization information read from the input JSON file.
            device_trace_info (text): The dataset iterator (CPU) or device queue (GPU, Ascend)
                trace profiling text file. Value is None if such file could not be identified.

        Returns:
            summary_dict (dict): Analyzed summary information.
            The summary dictionary information is doubly saved to a JSON file and a CSV file
            (so that these different formats are available to the users).
        """
        # Initialize summary output dictionary
        summary_dict = {}

        # Parse and process pipeline information
        summary_dict.update(self._parse_pipeline_info(pipeline_info))

        # Parse and process CPU utilization information
        summary_dict.update(self._parse_cpu_util_info(cpu_util_info))

        if device_trace_info is not None:
            # Parse and process device queue or dataset iterator trace profiling information
            summary_dict.update(self._parse_device_trace_info(device_trace_info))

        # Check if both pipeline data and CPU utilization data have the same number of ops
        num_pipeline_ops = len(summary_dict.get('pipeline_ops'))
        num_cpu_util_ops = len(summary_dict.get('avg_cpu_pct'))
        if num_pipeline_ops == num_cpu_util_ops:
            # Compute composite analysis information
            summary_dict.update(self._compute_composite_info(summary_dict))

            # Analyze pipeline info for potential bottleneck op
            bottleneck_dict = self._analyze_for_bottleneck_op(summary_dict)
            if bottleneck_dict:
                summary_dict.update(bottleneck_dict)
        else:
            # Produce a warning since the pipeline data and the CPU utilization data do not include
            # information for the same number of ops
            logger.warning('Number of ops for pipeline data: %s does not match '
                           'number of ops for CPU utilization data: %s',
                           num_pipeline_ops, num_cpu_util_ops)

        # Save summary output dictionary to JSON output file (format#1)
        with open(self._save_path, 'w') as save_file:
            json.dump(summary_dict, save_file)
        os.chmod(self._save_path, stat.S_IREAD | stat.S_IWRITE)

        # Save summary output to CSV file (format#2)
        self._save_as_csv_file(summary_dict)

        # Return summary output dictionary (format#3)
        return summary_dict


class BottleneckAnalyzer:
    """Analyzer for potential bottleneck ops in the MindData pipeline summary."""

    def __init__(self, summary_dict):
        """Constructor for BottleneckAnalyzer; unpacks the per-op summary lists."""
        self.pipeline_ops = summary_dict["pipeline_ops"]
        self.op_names = summary_dict["op_names"]
        self.op_ids = summary_dict["op_ids"]
        self.num_workers = summary_dict["num_workers"]
        self.queue_average_size = summary_dict["queue_average_size"]
        self.queue_utilization_pct = summary_dict["queue_utilization_pct"]
        self.queue_empty_freq_pct = summary_dict["queue_empty_freq_pct"]
        self.children_ids = summary_dict["children_ids"]
        self.parent_id = summary_dict["parent_id"]
        self.avg_cpu_pct = summary_dict["avg_cpu_pct"]
        self.avg_cpu_pct_per_worker = summary_dict["avg_cpu_pct_per_worker"]

        # Sentinels for "no such op" and "no queue usage data"
        self.op_id_not_exist = -1
        self.queue_usage_not_exist = -1
        # Ops that run inline/single-threaded; num_parallel_workers advice does not apply
        self.non_multithreaded_ops = {"Barrier", "Concat", "EpochCtrl", "Rename",
                                      "Repeat", "Shuffle", "Skip", "Take", "Zip"}

        # These are the threshold values used in the pipeline bottleneck analyzer algorithm
        self._AVG_CPU_UTIL_PCT_PER_WORKER_MAXIMUM = 75.0
        self._AVG_CPU_UTIL_PCT_PER_WORKER_MINIMUM = 20.0
        self._LEAF_OUTPUT_QUEUE_EMPTY_FREQ_PCT_MAXIMUM = 50
        self._DEVICEQUEUE_INPUT_QUEUE_EMPTY_FREQ_PCT_MAXIMUM = 60
        self._IN_OUT_QUEUE_UTIL_PCT_DIFF_MAXIMUM = 50
        self._IN_QUEUE_UTIL_PCT_MAXIMUM = 10

    def analyze(self):
        """Analyze all ops' usage and return the detailed analysis dictionary."""
        detailed_analysis = {}

        cpu_analysis = self.analyze_cpu_usage()
        queue_analysis = self.analyze_queue_usage()

        if cpu_analysis:
            detailed_analysis["cpu_analysis_details"] = cpu_analysis

        if queue_analysis:
            detailed_analysis["queue_analysis_details"] = queue_analysis

        bottleneck, suggestion = self.analyze_bottleneck()

        if bottleneck[0]:
            detailed_analysis["bottleneck_warning"] = bottleneck
            detailed_analysis["bottleneck_suggestion"] = suggestion

        return detailed_analysis

    def __get_non_inline_child_recur(self, cur_op_id):
        """Get the child id of cur op which isn't an inline op (recursing past inline children)."""
        if cur_op_id == self.op_id_not_exist or not self.children_ids[cur_op_id]:
            return self.op_id_not_exist
        cur_child_id = self.children_ids[cur_op_id][0]
        # Inline ops have no queue metrics (queue_average_size == -1); skip past them
        if self.queue_average_size[cur_child_id] != -1:
            return cur_child_id
        return self.__get_non_inline_child_recur(cur_child_id)

    def analyze_cpu_usage(self):
        """Analyze CPU usage of each op; return a list of advisory message strings."""
        cpu_usage_analysis = []
        for op_id in self.op_ids:
            if op_id == self.op_id_not_exist or self.op_names[op_id] in self.non_multithreaded_ops:
                continue

            if self.avg_cpu_pct_per_worker[op_id] > self._AVG_CPU_UTIL_PCT_PER_WORKER_MAXIMUM and \
                    self.op_names[op_id]:
                cpu_usage_analysis.append(
                    ("{} is using {}% CPU per worker."
                     " Setting num_parallel_workers"
                     ">{} might bring extra performance.").format(self.pipeline_ops[op_id],
                                                                  self.avg_cpu_pct_per_worker[op_id],
                                                                  self.num_workers[op_id]))
            elif self.avg_cpu_pct_per_worker[op_id] < self._AVG_CPU_UTIL_PCT_PER_WORKER_MINIMUM and \
                    self.num_workers[op_id] > 1:
                cpu_usage_analysis.append(
                    ("{} is using {}% CPU per worker. Using num_parallel_workers={} might not bring as much benefit"
                     " due to low CPU usage per worker.").format(self.pipeline_ops[op_id],
                                                                 self.avg_cpu_pct_per_worker[op_id],
                                                                 self.num_workers[op_id]))
        return cpu_usage_analysis

    def analyze_queue_usage(self):
        """Analyze queue usage of each op; return a list of advisory message strings."""
        queue_usage_analysis = []
        for op_id in self.op_ids:
            if op_id == self.op_id_not_exist or self.op_names[op_id] in self.non_multithreaded_ops:
                continue

            if self.op_names[op_id] == "Batch":
                continue

            in_op_id, out_q = self.__get_non_inline_child_recur(op_id), \
                self.queue_utilization_pct[op_id]
            if in_op_id == self.op_id_not_exist and out_q != self.queue_usage_not_exist:
                # This is a leaf node since input queue does not exist and output queue exists
                if out_q < self._LEAF_OUTPUT_QUEUE_EMPTY_FREQ_PCT_MAXIMUM:
                    queue_usage_analysis.append(("Leaf op {} is using {}% of its output queue."
                                                 "Setting num_parallel_workers"
                                                 ">{} might speed up I/O.").format(self.pipeline_ops[op_id],
                                                                                   out_q,
                                                                                   self.num_workers[op_id]))
            elif self.op_names[op_id] == "DeviceQueue" and in_op_id != self.op_id_not_exist:
                # if this is device_queue op, check if the host keeps the device fed
                if self.queue_empty_freq_pct[in_op_id] > self._DEVICEQUEUE_INPUT_QUEUE_EMPTY_FREQ_PCT_MAXIMUM:
                    queue_usage_analysis.append((
                        "{}'s input queue is empty {}% of the time. This might indicate dataset bottlenecks."
                        " Hence host cannot keep up with the device {}% of the time."
                        " Device waits whenever input queue is empty.").format(
                            self.pipeline_ops[op_id],
                            self.queue_empty_freq_pct[in_op_id],
                            self.queue_empty_freq_pct[in_op_id]))
            elif in_op_id != self.op_id_not_exist and out_q != self.queue_usage_not_exist:
                in_q = self.queue_utilization_pct[in_op_id]
                if in_q != self.queue_usage_not_exist and in_q - out_q > self._IN_OUT_QUEUE_UTIL_PCT_DIFF_MAXIMUM:
                    queue_usage_analysis.append((
                        "{}'s input queue usage={}% is greater output queue usage={}%."
                        " This indicates child op {} might be producing faster than its parent {} can consume."
                        " If this op has low CPU utilization, try increasing "
                        "prefetch_size or increasing num_workers.").format(self.pipeline_ops[op_id],
                                                                           in_q,
                                                                           out_q,
                                                                           self.pipeline_ops[in_op_id],
                                                                           self.pipeline_ops[op_id]))
        return queue_usage_analysis

    def analyze_bottleneck(self):
        """Analyze bottleneck by using both CPU and queue usage; return ([bottleneck], [suggestion])."""
        bottleneck, suggestion = "", ""
        for op_id in reversed(self.op_ids):
            if op_id == self.op_id_not_exist or \
                    self.op_names[op_id] in self.non_multithreaded_ops \
                    or self.op_names[op_id] == "DeviceQueue":
                continue

            # Hoisted past the skip check (original recomputed the non-inline child twice)
            in_op_id = self.__get_non_inline_child_recur(op_id)
            out_q = self.queue_utilization_pct[op_id]
            wkr_cpu = self.avg_cpu_pct_per_worker[op_id]

            if wkr_cpu > self._AVG_CPU_UTIL_PCT_PER_WORKER_MAXIMUM:
                bottleneck = self.pipeline_ops[op_id]
                suggestion = "{} has high CPU utilization per worker of {}%".format(
                    self.pipeline_ops[op_id], wkr_cpu)
                suggestion += " Try increasing num_parallel_workers above {}.".format(self.num_workers[op_id])
            elif wkr_cpu < self._AVG_CPU_UTIL_PCT_PER_WORKER_MINIMUM:
                in_q_usage = self.queue_utilization_pct[in_op_id]
                if in_op_id != self.op_id_not_exist and (
                        in_q_usage < self._IN_QUEUE_UTIL_PCT_MAXIMUM or
                        out_q - in_q_usage > self._IN_OUT_QUEUE_UTIL_PCT_DIFF_MAXIMUM):
                    bottleneck = self.pipeline_ops[op_id]
                    suggestion = "{} has low CPU utilization per worker of {}%".format(
                        self.pipeline_ops[op_id], wkr_cpu)
                    suggestion += " and abnormal queue usage. Try increasing prefetch_size."

        return [bottleneck], [suggestion]