
minddata_analyzer.py 38 kB

# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The analyzer for MindData profiling files."""
import copy
import csv
import json
import os
import stat

from mindspore.profiler.common.exceptions.exceptions import \
    ProfilerPathErrorException, ProfilerFileNotFoundException, \
    ProfilerDirNotFoundException, ProfilerRawFileException
from mindspore import log as logger
from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path


class MinddataProfilingAnalyzer:
    """
    The analyzer for MindData profiling files.

    Args:
        source_dir (str): The source directory for MindData profiling input files.
        device_id (str): The device ID.
        output_path (str): The target directory for the analyzed summary. Default: './'.

    Raises:
        ProfilerPathErrorException: If the source directory or the output path is invalid.
        ProfilerDirNotFoundException: If the source directory or the output path does not exist.
        ProfilerFileNotFoundException: If any of the MindData profiling input files does not exist.
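
    Examples:
        >>> # A minimal usage sketch; 'profiler_data' and device id '0' are hypothetical
        >>> # placeholders, assuming the MindData profiling files already exist there.
        >>> analyzer = MinddataProfilingAnalyzer(source_dir='profiler_data', device_id='0', output_path='./')
        >>> summary = analyzer.analyze()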
  37. """
  38. def __init__(self, source_dir, device_id, output_path='./'):
  39. # Validate and save input parameters
  40. self._device_id = device_id
  41. self._source_dir = self._validate_directory(source_dir, 'Source directory')
  42. self._output_path = self._validate_directory(output_path, 'Output path')
  43. # Get MindData profiling input filenames
  44. self._pipeline_path_filename = self._get_pipeline_path_filename(source_dir)
  45. self._cpu_utilization_path_filename = self._get_cpu_utilization_path_filename(source_dir)
  46. self._device_trace_path_filename, self._device_queue_file_found = \
  47. self._get_device_trace_path_filename(source_dir)
  48. # Save output filename
  49. self._save_path = self._get_save_path(output_path)

    @property
    def save_path(self):
        """
        The property of save path.

        Returns:
            str, the save path.
        """
        return self._save_path

    def analyze(self):
        """
        Analyze the MindData profiling files, produce summary pipeline information, including the potential
        bottleneck operator in the MindData pipeline, and save the result to disk.

        Returns:
            dict, the analyzed MindData pipeline summary information, which is also written to disk in
            JSON file 'minddata_pipeline_summary_<device_id>.json' and
            CSV file 'minddata_pipeline_summary_<device_id>.csv'.

        Raises:
            ProfilerRawFileException: If a MindData profiling file cannot be found or is empty.
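
        Examples:
            >>> # Hypothetical continuation of the class-level example above:
            >>> summary = analyzer.analyze()
            >>> # The returned dict includes keys such as 'pipeline_ops' and 'avg_cpu_pct'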
  68. """
  69. # Open the MindData pipeline file
  70. with open(self._pipeline_path_filename, 'r') as pipeline_file:
  71. try:
  72. pipeline_info = json.load(pipeline_file)
  73. except (json.JSONDecodeError, TypeError) as path_filename_error:
  74. logger.warning(path_filename_error)
  75. raise ProfilerRawFileException(
  76. 'Failed to find the MindData pipeline profiling file.') from path_filename_error
  77. if not pipeline_info:
  78. logger.warning('The MindData pipeline file <%s> is empty.', self._pipeline_path_filename)
  79. raise ProfilerRawFileException('The MindData pipeline file is empty.')
  80. # Open the CPU utilization file
  81. with open(self._cpu_utilization_path_filename, 'r') as cpu_util_file:
  82. try:
  83. cpu_util_info = json.load(cpu_util_file)
  84. except (json.JSONDecodeError, TypeError) as path_filename_error:
  85. logger.warning(path_filename_error)
  86. raise ProfilerRawFileException(
  87. 'Failed to find the MindData CPU utilization file.') from path_filename_error
  88. if not cpu_util_info:
  89. logger.warning('The MindData CPU utilization file <%s> is empty.', self._cpu_utilization_path_filename)
  90. raise ProfilerRawFileException('The MindData CPU utilization file is empty.')
  91. # Open the device queue or dataset iterator trace profiling file
  92. with open(self._device_trace_path_filename, 'r') as device_trace_file:
  93. try:
  94. device_trace_info = device_trace_file.readlines()
  95. except (TypeError) as path_filename_error:
  96. logger.warning(path_filename_error)
  97. raise ProfilerRawFileException(
  98. 'Failed to find the MindData trace profiling file.') from path_filename_error
  99. if not device_trace_info:
  100. logger.warning('The MindData trace profiling file <%s> is empty.', self._device_trace_path_filename)
  101. raise ProfilerRawFileException('The MindData trace profiling file is empty.')
  102. # Analyze the MindData profiling file information and save the result
  103. summary_dict = self._analyze_and_save(pipeline_info, cpu_util_info, device_trace_info)
  104. return summary_dict

    @staticmethod
    def _validate_directory(dir_name, dir_type):
        """
        Validate the input directory.

        Args:
            dir_name (str): The directory name.
            dir_type (str): The type of directory, which should begin with a capital letter
                since it is used in output messages.
        """
        try:
            validated_dir = validate_and_normalize_path(dir_name)
        except RuntimeError as path_error:
            logger.warning('<%s> <%s> is invalid.', dir_type, dir_name)
            raise ProfilerPathErrorException(dir_type + ' is invalid.') from path_error

        if not os.path.isdir(validated_dir):
            logger.warning('<%s> <%s> not found.', dir_type, validated_dir)
            raise ProfilerDirNotFoundException(validated_dir)
        return validated_dir

    def _get_pipeline_path_filename(self, source_dir):
        """
        Get the MindData pipeline full path filename.
        The filename is 'pipeline_profiling_<device_id>.json'.

        Args:
            source_dir (str): The source directory for MindData profiling files.

        Returns:
            str, the MindData pipeline full path filename.
        """
        pipeline_profiling_templatename = 'pipeline_profiling_{}.json'
        pipeline_path_filename = os.path.join(
            source_dir,
            pipeline_profiling_templatename.format(self._device_id))

        try:
            pipeline_path_filename = validate_and_normalize_path(pipeline_path_filename)
        except RuntimeError as path_filename_error:
            logger.warning('The MindData pipeline path %s is invalid.', pipeline_path_filename)
            raise ProfilerPathErrorException('The MindData pipeline path is invalid.') from path_filename_error

        if not os.path.isfile(pipeline_path_filename):
            logger.warning('The MindData pipeline file <%s> is not found.', pipeline_path_filename)
            raise ProfilerFileNotFoundException(pipeline_path_filename)
        return pipeline_path_filename

    def _get_cpu_utilization_path_filename(self, source_dir):
        """
        Get the MindData CPU utilization full path filename.
        The filename is 'minddata_cpu_utilization_<device_id>.json'.

        Args:
            source_dir (str): The source directory for MindData profiling files.

        Returns:
            str, the MindData CPU utilization full path filename.
        """
        cpu_utilization_templatename = 'minddata_cpu_utilization_{}.json'
        cpu_utilization_path_filename = os.path.join(
            source_dir,
            cpu_utilization_templatename.format(self._device_id))

        try:
            cpu_utilization_path_filename = validate_and_normalize_path(cpu_utilization_path_filename)
        except RuntimeError as path_filename_error:
            logger.warning('The MindData CPU utilization path <%s> is invalid.', cpu_utilization_path_filename)
            raise ProfilerPathErrorException('The MindData CPU utilization path is invalid.') from path_filename_error

        if not os.path.isfile(cpu_utilization_path_filename):
            logger.warning('The MindData CPU utilization file <%s> is not found.', cpu_utilization_path_filename)
            raise ProfilerFileNotFoundException(cpu_utilization_path_filename)
        return cpu_utilization_path_filename

    def _get_device_trace_path_filename(self, source_dir):
        """
        Get the MindData device trace profiling full path filename.

        File search order:
        1) 'device_queue_profiling_<device_id>.txt' and then
        2) 'dataset_iterator_profiling_<device_id>.txt'.

        Args:
            source_dir (str): The source directory for MindData profiling files.

        Returns:
            str, the MindData device trace profiling full path filename.
            bool, flag which indicates whether 'device_queue_profiling_<device_id>.txt' has been found.
        """
        # Initialize variable for MindData device trace profiling filename
        device_trace_path_filename = ''
        # Initialize flag that 'device_queue_profiling_<device_id>.txt' has not yet been found
        device_queue_file_found = False

        txt_names = [os.path.join(
            source_dir,
            txt_name.format(self._device_id)) for txt_name in
                     ('device_queue_profiling_{}.txt', 'dataset_iterator_profiling_{}.txt')]

        # Search for a device trace profiling file
        if os.path.exists(txt_names[0]):
            device_trace_path_filename = txt_names[0]
            device_queue_file_found = True
        elif os.path.exists(txt_names[1]):
            device_trace_path_filename = txt_names[1]
        else:
            logger.warning('Neither MindData device trace profiling file <%s> nor <%s> can be found.',
                           txt_names[0], txt_names[1])
            raise ProfilerPathErrorException('A MindData device trace profiling file cannot be found.')

        if not os.path.isfile(device_trace_path_filename):
            logger.warning('The MindData device trace profiling file <%s> is not found.', device_trace_path_filename)
            raise ProfilerFileNotFoundException(device_trace_path_filename)
        return device_trace_path_filename, device_queue_file_found

    def _get_save_path(self, output_path):
        """
        Get the full pathname for the output file to save MindData pipeline summary analyzed information.
        The output filename is 'minddata_pipeline_summary_<device_id>.json'.

        Args:
            output_path (str): The output directory.

        Returns:
            str, the save path.
        """
        try:
            output_dir = validate_and_normalize_path(output_path)
        except RuntimeError as path_error:
            logger.warning('Output path <%s> is invalid.', output_path)
            raise ProfilerPathErrorException('Output path is invalid.') from path_error

        if not os.path.isdir(output_dir):
            logger.warning('The output directory <%s> is not found.', output_dir)
            raise ProfilerDirNotFoundException(output_dir)

        summary_templatename = 'minddata_pipeline_summary_{}.json'
        return os.path.join(output_dir, summary_templatename.format(self._device_id))

    @staticmethod
    def _parse_pipeline_metrics_info(metrics):
        """
        Parse and process the pipeline profiling metrics information for a given op.

        Args:
            metrics (dict): The pipeline profiling metrics information for a given op.

        Returns:
            List with the following analyzed metrics information:
                output queue size
                output queue length
                output queue average size
                output queue utilization percentage
                output queue empty frequency percentage
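
            Example (a hypothetical metrics dict with four sampled queue sizes and capacity 64):
                metrics = {'output_queue': {'size': [10, 20, 0, 10], 'length': 64}}
                returns [[10, 20, 0, 10], 64, 10.0, 15.62, 25.0]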
  232. """
  233. # Note: Some ops like DeviceQueue and inline ops do not have metrics information
  234. queue_size = -1
  235. queue_length = -1
  236. queue_average_size = -1
  237. queue_utilization_pct = -1
  238. queue_empty_freq_pct = -1
  239. if metrics and metrics['output_queue']:
  240. queue_size = metrics['output_queue']['size']
  241. queue_length = metrics['output_queue']['length']
  242. queue_average_size = round(sum(queue_size) / len(queue_size), 2) if queue_size else -1
  243. queue_utilization_pct = round(100 * queue_average_size / queue_length, 2) if queue_length else -1
  244. # Compute percentage of time queue is empty
  245. empty_count = 0
  246. for q_size in queue_size:
  247. if q_size == 0:
  248. empty_count += 1
  249. queue_empty_freq_pct = round(100 * empty_count / len(queue_size), 2) if queue_size else -1
  250. return [queue_size, queue_length, queue_average_size, queue_utilization_pct, queue_empty_freq_pct]

    def _parse_pipeline_info(self, pipeline_info):
        """
        Parse and process the pipeline profiling information.

        Args:
            pipeline_info (dict): The pipeline profiling information.

        Returns:
            Dictionary with analyzed summary output information.
            For the following key-value pairs, each value is a list ordered by increasing op id:
                pipeline_ops: operator name and operator id, a string, with example format Batch(id=0)
                op_names: operator name, a string
                op_ids: operator id, an integer
                num_workers: number of workers for the op, an integer
                queue_average_size: average queue size for the op, a float
                queue_utilization_pct: average percentage of time the queue is in use for the op,
                    a float from 0.00 to 100.00
                queue_empty_freq_pct: percentage of time the queue is empty for the op,
                    a float from 0.00 to 100.00
                children_ids: children op ids of the op; an empty list [] if the op has no children
                parent_id: parent op id of the op

        Raises:
            ProfilerRawFileException: If the format of the input is wrong.
        """
        # Perform sanity checks for pipeline information
        pipeline_op_info = pipeline_info.get('op_info')
        for item in pipeline_op_info:
            if not item:
                raise ProfilerRawFileException('The contents of the MindData pipeline JSON file are wrong.')

        # Parse and process pipeline information
        # Obtain the following for each op (and build a list), ordered by increasing op id
        # - op id (handy for user output)
        # - op name (needed for basic processing)
        # - op name with op id (handy for user output)
        # - num_workers
        # - various queue information
        # - children op ids
        # - parent op id
        dict_opid_pipeline_ops = {}
        dict_opid_opname = {}
        dict_opid_numworkers = {}
        dict_opid_queue_info = {}
        dict_opid_children_ids = {}
        dict_opid_parent_id = {}

        # Note: Will process the input pipeline ops in "reversed" order since typically they are ordered
        # from largest op id (usually leaf/source op) to smallest op id (usually root).
        # However, since there may be non-linear pipelines, the processed op info needs to be sorted
        # before final output is produced and saved.
        for op_info in reversed(pipeline_info['op_info']):
            op_id = op_info.get('op_id')
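            # Strip the 2-character suffix from the op type to get the op name
            # (e.g., an op_type of 'BatchOp' becomes 'Batch'); this assumes op_type
            # names in the pipeline JSON end with 'Op'.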
            op_name = op_info.get('op_type')[0:-2]
            dict_opid_pipeline_ops[op_id] = op_name + '(id=' + str(op_id) + ')'
            dict_opid_opname[op_id] = op_name
            dict_opid_numworkers[op_id] = op_info.get('num_workers')

            # Obtain the output queue metrics information for the current op
            dict_opid_queue_info[op_id] = self._parse_pipeline_metrics_info(op_info.get('metrics'))

            # For the current op, initialize parent_id=-1, in case after processing all children in
            # the pipeline, it is determined that the current op has no parent
            if dict_opid_parent_id.get(op_id) is None:
                dict_opid_parent_id[op_id] = -1

            children_ids = op_info.get('children')
            if children_ids:
                # Set children op ids for current op
                dict_opid_children_ids[op_id] = children_ids
                # For each child op, set parent op to be current op
                for child_op_id in children_ids:
                    dict_opid_parent_id[child_op_id] = op_id
            else:
                dict_opid_children_ids[op_id] = []

        # Build resultant dictionary
        return_dict = {}
        return_dict['pipeline_ops'] = [x[1] for x in sorted(dict_opid_pipeline_ops.items())]
        return_dict['op_names'] = [x[1] for x in sorted(dict_opid_opname.items())]
        return_dict['op_ids'] = sorted(dict_opid_opname.keys())
        return_dict['num_workers'] = [x[1] for x in sorted(dict_opid_numworkers.items())]
        queue_info_items = [x[1] for x in sorted(dict_opid_queue_info.items())]
        return_dict['queue_average_size'] = [y[2] for y in queue_info_items]
        return_dict['queue_utilization_pct'] = [y[3] for y in queue_info_items]
        return_dict['queue_empty_freq_pct'] = [y[4] for y in queue_info_items]
        return_dict['children_ids'] = [x[1] for x in sorted(dict_opid_children_ids.items())]
        return_dict['parent_id'] = [x[1] for x in sorted(dict_opid_parent_id.items())]
        return return_dict

    @staticmethod
    def _parse_cpu_util_info(cpu_util_info):
        """
        Parse and process the CPU profiling information.

        Args:
            cpu_util_info (dict): The CPU utilization profiling information.

        Returns:
            Dictionary with analyzed summary output information.
            Dictionary consists of:
                avg_cpu_pct: Average CPU utilization percentage for each op, a list ordered by increasing op id
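
            For example (hypothetical values), an op whose sampled sys_utilization is [1.0, 2.0]
            and user_utilization is [9.0, 10.0] has per-sample totals [10.0, 12.0], so its
            avg_cpu_pct entry is 11.0.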
        Raises:
            ProfilerRawFileException: If the format of the input is wrong.
        """
        # Perform sanity checks for CPU utilization information
        cpu_processor_num = cpu_util_info.get('cpu_processor_num')
        cpu_op_info = cpu_util_info.get('op_info')
        if cpu_processor_num is None or not cpu_op_info:
            raise ProfilerRawFileException('The format of the MindData CPU utilization JSON file is wrong.')
        for item in cpu_op_info:
            if not item:
                raise ProfilerRawFileException('The contents of the MindData CPU utilization JSON file are wrong.')

        # Parse and process the following CPU utilization information:
        # - average CPU utilization for each op
        dict_opid_cpuutil = {}
        for op in cpu_util_info["op_info"]:
            # Note: The CPU utilization data may have an extra entry with op_id=-1
            # Omit info for op_id=-1
            if op["op_id"] != -1:
                op_sys, op_usr = op["metrics"]["sys_utilization"], op["metrics"]["user_utilization"]
                dict_opid_cpuutil[op["op_id"]] = [op_sys[i] + op_usr[i] for i in range(len(op_sys))]

        # Initialize oplist_avg_cpu_pct with -1 for each op, since the CPU utilization data
        # may not have information for each pipeline op. Size the list by the largest op id
        # so that ops missing from the CPU utilization data keep the -1 marker.
        oplist_avg_cpu_pct = [-1] * (max(dict_opid_cpuutil) + 1) if dict_opid_cpuutil else []
        total_cpu = 0
        for op_id, cpu in dict_opid_cpuutil.items():
            op_avg_cpu_pct = sum(cpu) / len(cpu) if cpu else 0
            oplist_avg_cpu_pct[op_id] = round(op_avg_cpu_pct, 2)
            total_cpu += op_avg_cpu_pct

        return_dict = {}
        return_dict['avg_cpu_pct'] = oplist_avg_cpu_pct
        return return_dict

    def _parse_device_trace_info(self, device_trace_info):
        """
        Parse and process the device trace profiling information.

        Args:
            device_trace_info: The device trace profiling information in text format, one line per record.

        Returns:
            Dictionary with analyzed summary output information.
            Dictionary consists of:
                per_batch_time: Average per batch time for the pipeline in milliseconds
                per_pipeline_time: Average per pipeline time in milliseconds
                per_push_queue_time: Average per queue push time in milliseconds
        """
        # Information on the format of the device tracing profiling information.
        # Format is: type extra-info batch-num value timestamp
        # 0) type: 0: time, 1: connector size
        # 1) extra-info: if type is 0 - 0: pipeline time, 1: push tdt time, 2: batch time
        #                if type is 1 - connector capacity
        # 2) batch-num: batch number
        # 3) value: if type is 0 - value is time (ms)
        #           if type is 1 - value is connector size
        # 4) timestamp
        # Examples:
        # 0 0 20 10 xxx - The 20th batch took 10 ms to get data from the pipeline.
        # 1 64 20 5 yyy - Connector size is 5 when getting the 20th batch. Connector capacity is 64.
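        # As a worked (hypothetical) example, a record '0 1 20 7 yyy' parses to
        # [0, 1, 20, 7, yyy]: a time record (type 0) for batch 20, reporting that the
        # push to the device queue (extra-info 1) took 7 ms.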
        prev_time = 0
        q_time = [[], [], []]  # pipeline time, push TDT time, batch time

        # Parse each record
        for line_data in device_trace_info:
            record = [int(d) for d in line_data.split(" ")][0:5]
            if record[2] < 2:  # skip 1st batch
                prev_time = record[4]
                continue
            if record[0] == 0:  # type 0: time record
                q_time[record[1]].append(record[3])
            elif record[0] == 1:  # type 1: connector size record
                # If the dataset_iterator trace profiling file (rather than the device_queue file)
                # was found, derive the batch time from consecutive record timestamps
                if not self._device_queue_file_found:
                    q_time[2].append(record[4] - prev_time)
                    prev_time = record[4]

        # Compute average queue times
        avg_pipeline_time = sum(q_time[0]) / len(q_time[0]) if q_time[0] else -1
        avg_push_queue_time = sum(q_time[1]) / len(q_time[1]) if q_time[1] else -1
        avg_batch_time = sum(q_time[2]) / len(q_time[2]) if q_time[2] else -1

        return_dict = {}
        return_dict['per_batch_time'] = [round(avg_batch_time, 3)]
        return_dict['per_pipeline_time'] = [round(avg_pipeline_time, 3)]
        return_dict['per_push_queue_time'] = [round(avg_push_queue_time, 3)]
        return return_dict

    def _compute_composite_info(self, summary_dict):
        """
        Compute composite analysis information from the current summary pipeline data.

        Args:
            summary_dict (dict): Input summary pipeline information.

        Returns:
            Dictionary with composite analysis output information.
            Dictionary consists of:
                avg_cpu_pct_per_worker: Average CPU utilization percentage per worker
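
            For example (hypothetical values), an op with avg_cpu_pct 100.0 and
            num_workers 4 yields avg_cpu_pct_per_worker 25.0; ops with missing CPU
            data (avg_cpu_pct of -1) are reported as -1.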
  427. """
  428. return_dict = {}
  429. # Build list: average CPU utilization percentage per worker - for each op
  430. avg_cpu_pct_per_worker = []
  431. for c, n in zip(summary_dict.get('avg_cpu_pct'), summary_dict.get('num_workers')):
  432. avg_cpu_pct_per_worker.append(round(c / n if (n != 0 and c >= 0) else -1, 2))
  433. return_dict['avg_cpu_pct_per_worker'] = avg_cpu_pct_per_worker
  434. return return_dict

    @staticmethod
    def _analyze_for_bottleneck_op(summary_dict):
        """
        Analyze the MindData summary information and identify any potential bottleneck operator
        in the MindData pipeline.

        Args:
            summary_dict (dict): Input summary pipeline information.

        Returns:
            Dictionary with the following information, if applicable:
                - CPU utilization analysis
                - queue utilization analysis
                - bottleneck warning: Information on the bottleneck op
                  (This is returned only if a potential bottleneck is identified.)
                - bottleneck suggestion: Reason why the subject op is identified as
                  a potential bottleneck, plus a suggestion on how to resolve the bottleneck.
                  (This is returned only if a potential bottleneck is identified.)
        """
        try:
            bottleneck_analyzer = BottleneckAnalyzer(summary_dict)
            return_dict = bottleneck_analyzer.analyze()
        except IndexError:
            return_dict = {}
        return return_dict

    def _save_as_csv_file(self, data_dict):
        """
        Save data dictionary information to CSV file.

        Args:
            data_dict (dict): Input data dictionary information.

        Returns:
            Data dictionary information is saved to CSV file named 'minddata_pipeline_summary_<device_id>.csv'.
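
            Each dictionary key becomes the first cell of its CSV row, followed by the elements
            of its value list, e.g. a row of the (hypothetical) form: op_names,Generator,Map,Batch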
  465. """
  466. summary_templatename = 'minddata_pipeline_summary_{}.csv'
  467. output_csv_path_filename = os.path.join(self._output_path, summary_templatename.format(self._device_id))
  468. # Open file for writing
  469. data_file = open(output_csv_path_filename, 'w')
  470. # Create CSV writer object
  471. csv_writer = csv.writer(data_file)
  472. # Write the dictionary information to CSV file
  473. # Create deepcopy of input data_dict so zip processing in this function does NOT change the data_dict
  474. temp_dict = copy.deepcopy(data_dict)
  475. for data_key, data_value in zip(temp_dict.keys(), temp_dict.values()):
  476. # Begin/prefix the data value with the data key
  477. data_value.insert(0, data_key)
  478. csv_writer.writerow(data_value)
  479. # Close file for writing
  480. data_file.close()
  481. # Update file permissions
  482. os.chmod(output_csv_path_filename, stat.S_IREAD | stat.S_IWRITE)

    def _analyze_and_save(self, pipeline_info, cpu_util_info, device_trace_info):
        """
        Analyze and save the MindData summary information to file.

        Args:
            pipeline_info (dict): The pipeline information read from the input JSON file.
            cpu_util_info (dict): The CPU utilization information read from the input JSON file.
            device_trace_info (text): The dataset iterator (CPU) or device queue (GPU, Ascend) trace profiling
                text file. Value is None if no such file could be identified.

        Returns:
            summary_dict (dict): Analyzed summary information.
            The summary dictionary information is saved to both a JSON file and a CSV file
            (so that both formats are available to users).
        """
        # Initialize summary output dictionary
        summary_dict = {}

        # Parse and process pipeline information
        summary_dict.update(self._parse_pipeline_info(pipeline_info))

        # Parse and process CPU utilization information
        summary_dict.update(self._parse_cpu_util_info(cpu_util_info))

        if device_trace_info is not None:
            # Parse and process device queue or dataset iterator trace profiling information
            summary_dict.update(self._parse_device_trace_info(device_trace_info))

        # Check if both pipeline data and CPU utilization data have the same number of ops
        num_pipeline_ops = len(summary_dict.get('pipeline_ops'))
        num_cpu_util_ops = len(summary_dict.get('avg_cpu_pct'))
        if num_pipeline_ops == num_cpu_util_ops:
            # Compute composite analysis information
            summary_dict.update(self._compute_composite_info(summary_dict))
            # Analyze pipeline info for potential bottleneck op
            bottleneck_dict = self._analyze_for_bottleneck_op(summary_dict)
            if bottleneck_dict:
                summary_dict.update(bottleneck_dict)
        else:
            # Produce a warning since the pipeline data and the CPU utilization data do not
            # include information for the same number of ops
            warning_msg = 'Number of ops for pipeline data: ' + str(num_pipeline_ops) + \
                          ' does not match number of ops for CPU utilization data: ' + str(num_cpu_util_ops)
            logger.warning(warning_msg)

        # Save summary output dictionary to JSON output file (format #1)
        with open(self._save_path, 'w') as save_file:
            json.dump(summary_dict, save_file)
        os.chmod(self._save_path, stat.S_IREAD | stat.S_IWRITE)

        # Save summary output to CSV file (format #2)
        self._save_as_csv_file(summary_dict)

        # Return summary output dictionary (format #3)
        return summary_dict


class BottleneckAnalyzer:
    """Analyzer that identifies potential bottleneck ops in the MindData pipeline summary."""

    def __init__(self, summary_dict):
        """Constructor for BottleneckAnalyzer."""
        self.pipeline_ops = summary_dict["pipeline_ops"]
        self.op_names = summary_dict["op_names"]
        self.op_ids = summary_dict["op_ids"]
        self.num_workers = summary_dict["num_workers"]
        self.queue_average_size = summary_dict["queue_average_size"]
        self.queue_utilization_pct = summary_dict["queue_utilization_pct"]
        self.queue_empty_freq_pct = summary_dict["queue_empty_freq_pct"]
        self.children_ids = summary_dict["children_ids"]
        self.parent_id = summary_dict["parent_id"]
        self.avg_cpu_pct = summary_dict["avg_cpu_pct"]
        self.avg_cpu_pct_per_worker = summary_dict["avg_cpu_pct_per_worker"]

        self.op_id_not_exist = -1
        self.queue_usage_not_exist = -1
        self.non_multithreaded_ops = {"Barrier",
                                      "Concat",
                                      "EpochCtrl",
                                      "Rename",
                                      "Repeat",
                                      "Shuffle",
                                      "Skip",
                                      "Take",
                                      "Zip"}

        # These are the threshold values used in the pipeline bottleneck analyzer algorithm
        self._AVG_CPU_UTIL_PCT_PER_WORKER_MAXIMUM = 75.0
        self._AVG_CPU_UTIL_PCT_PER_WORKER_MINIMUM = 20.0
        self._LEAF_OUTPUT_QUEUE_EMPTY_FREQ_PCT_MAXIMUM = 50
        self._DEVICEQUEUE_INPUT_QUEUE_EMPTY_FREQ_PCT_MAXIMUM = 60
        self._IN_OUT_QUEUE_UTIL_PCT_DIFF_MAXIMUM = 50
        self._IN_QUEUE_UTIL_PCT_MAXIMUM = 10

    def analyze(self):
        """Analyze all ops' CPU and queue usage, and look for a potential bottleneck op."""
        detailed_analysis = {}

        cpu_analysis = self.analyze_cpu_usage()
        queue_analysis = self.analyze_queue_usage()

        if cpu_analysis:
            detailed_analysis["cpu_analysis_details"] = cpu_analysis
        if queue_analysis:
            detailed_analysis["queue_analysis_details"] = queue_analysis

        bottleneck, suggestion = self.analyze_bottleneck()
        if bottleneck[0]:
            detailed_analysis["bottleneck_warning"] = bottleneck
            detailed_analysis["bottleneck_suggestion"] = suggestion

        return detailed_analysis

    def __get_non_inline_child_recur(self, cur_op_id):
        """Get the first child of the current op which is not an inline op."""
        if cur_op_id == self.op_id_not_exist or not self.children_ids[cur_op_id]:
            return self.op_id_not_exist
        cur_child_id = self.children_ids[cur_op_id][0]
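        # Inline ops have no output queue metrics, so _parse_pipeline_metrics_info records
        # their queue_average_size as -1; skip over them to the first non-inline descendant.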
        if self.queue_average_size[cur_child_id] != -1:
            return cur_child_id
        return self.__get_non_inline_child_recur(cur_child_id)

    def analyze_cpu_usage(self):
        """Analyze the CPU usage of each op."""
        cpu_usage_analysis = []
        for op_id in self.op_ids:
            if op_id == self.op_id_not_exist or self.op_names[op_id] in self.non_multithreaded_ops:
                continue

            if self.avg_cpu_pct_per_worker[op_id] > self._AVG_CPU_UTIL_PCT_PER_WORKER_MAXIMUM and \
                    self.op_names[op_id]:
                cpu_usage_analysis.append(
                    ("{} is using {}% CPU per worker."
                     " Setting num_parallel_workers"
                     ">{} might bring extra performance.").format(self.pipeline_ops[op_id],
                                                                  self.avg_cpu_pct_per_worker[op_id],
                                                                  self.num_workers[op_id]))
            elif self.avg_cpu_pct_per_worker[op_id] < self._AVG_CPU_UTIL_PCT_PER_WORKER_MINIMUM and \
                    self.num_workers[op_id] > 1:
                cpu_usage_analysis.append(
                    ("{} is using {}% CPU per worker. Using num_parallel_workers={} might not bring as much benefit"
                     " due to low CPU usage per worker.").format(self.pipeline_ops[op_id],
                                                                 self.avg_cpu_pct_per_worker[op_id],
                                                                 self.num_workers[op_id]))
        return cpu_usage_analysis

    def analyze_queue_usage(self):
        """Analyze the queue usage of each op."""
        queue_usage_analysis = []
        for op_id in self.op_ids:
            if op_id == self.op_id_not_exist or self.op_names[op_id] in self.non_multithreaded_ops:
                continue
            if self.op_names[op_id] == "Batch":
                continue

            in_op_id, out_q = self.__get_non_inline_child_recur(
                op_id), self.queue_utilization_pct[op_id]
            if in_op_id == self.op_id_not_exist and out_q != self.queue_usage_not_exist:
                # This is a leaf node since its input queue does not exist and its output queue exists
                if out_q < self._LEAF_OUTPUT_QUEUE_EMPTY_FREQ_PCT_MAXIMUM:
                    queue_usage_analysis.append(("Leaf op {} is using {}% of its output queue."
                                                 " Setting num_parallel_workers"
                                                 ">{} might speed up I/O.").format(self.pipeline_ops[op_id],
                                                                                   out_q,
                                                                                   self.num_workers[op_id]))
            elif self.op_names[op_id] == "DeviceQueue" and in_op_id != self.op_id_not_exist:
                # This is the device_queue op
                if self.queue_empty_freq_pct[in_op_id] > self._DEVICEQUEUE_INPUT_QUEUE_EMPTY_FREQ_PCT_MAXIMUM:
                    queue_usage_analysis.append((
                        "{}'s input queue is empty {}% of the time. This might indicate dataset bottlenecks."
                        " Hence the host cannot keep up with the device {}% of the time."
                        " The device waits whenever its input queue is empty.").format(
                            self.pipeline_ops[op_id],
                            self.queue_empty_freq_pct[in_op_id],
                            self.queue_empty_freq_pct[in_op_id]))
            elif in_op_id != self.op_id_not_exist and out_q != self.queue_usage_not_exist:
                in_q = self.queue_utilization_pct[in_op_id]
                if in_q != self.queue_usage_not_exist and in_q - out_q > self._IN_OUT_QUEUE_UTIL_PCT_DIFF_MAXIMUM:
                    queue_usage_analysis.append((
                        "{}'s input queue usage={}% is greater than its output queue usage={}%."
                        " This indicates child op {} might be producing faster than its parent {} can consume."
                        " If this op has low CPU utilization, try increasing "
                        "prefetch_size or increasing num_workers.").format(self.pipeline_ops[op_id],
                                                                           in_q, out_q, self.pipeline_ops[in_op_id],
                                                                           self.pipeline_ops[op_id]))
        return queue_usage_analysis

    def analyze_bottleneck(self):
        """Analyze the bottleneck by using both CPU and queue usage."""
        bottleneck, suggestion = "", ""
        for op_id in reversed(self.op_ids):
            in_op_id, out_q = self.__get_non_inline_child_recur(
                op_id), self.queue_utilization_pct[op_id]
            wkr_cpu = self.avg_cpu_pct_per_worker[op_id]
            if op_id == self.op_id_not_exist or \
                    self.op_names[op_id] in self.non_multithreaded_ops \
                    or self.op_names[op_id] == "DeviceQueue":
                continue

            if wkr_cpu > self._AVG_CPU_UTIL_PCT_PER_WORKER_MAXIMUM:
                bottleneck = self.pipeline_ops[op_id]
                suggestion = "{} has high CPU utilization per worker of {}%.".format(
                    self.pipeline_ops[op_id], wkr_cpu)
                suggestion += " Try increasing num_parallel_workers above {}.".format(self.num_workers[op_id])
            elif wkr_cpu < self._AVG_CPU_UTIL_PCT_PER_WORKER_MINIMUM:
                in_q_usage = self.queue_utilization_pct[in_op_id]
                if in_op_id != self.op_id_not_exist and (
                        in_q_usage < self._IN_QUEUE_UTIL_PCT_MAXIMUM or out_q -
                        in_q_usage > self._IN_OUT_QUEUE_UTIL_PCT_DIFF_MAXIMUM):
                    bottleneck = self.pipeline_ops[op_id]
                    suggestion = "{} has low CPU utilization per worker of {}%".format(
                        self.pipeline_ops[op_id], wkr_cpu)
                    suggestion += " and abnormal queue usage. Try increasing prefetch_size."

        return [bottleneck], [suggestion]
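

if __name__ == '__main__':
    # A minimal usage sketch, assuming MindData profiling files for device '0'
    # already exist under the hypothetical directory 'profiler_data'.
    demo_analyzer = MinddataProfilingAnalyzer(source_dir='profiler_data', device_id='0', output_path='./')
    demo_summary = demo_analyzer.analyze()
    print('Pipeline summary saved to:', demo_analyzer.save_path)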