You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

integrator.py 71 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563
  1. # Copyright 2020-2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """The integrator for integrating parsed profiling files."""
  16. import csv
  17. import json
  18. import os
  19. import stat
  20. from decimal import Decimal
  21. from mindspore import log as logger
  22. from mindspore.context import get_auto_parallel_context
  23. from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException, \
  24. ProfilerFileNotFoundException, ProfilerRawFileException, ProfilerParamValueErrorException
  25. from mindspore.profiler.common.util import query_latest_trace_time_file, to_int, to_millisecond
  26. from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
  27. from mindspore.profiler.parser.container import TimelineContainer
  28. SIZE_LIMIT_DEFAULT = 20 * 1024 * 1024 # 20MB
  29. class Integrator:
  30. """
  31. The integrator for integrating parsed profiling files.
  32. Args:
  33. profiling_dir (str): The directory where the parsed profiling files are
  34. located.
  35. device_id (str): The device ID.
  36. """
  37. _file_name_aicore_detail_time = 'output_op_compute_time_{}.txt'
  38. _file_name_aicpu_time = 'output_data_preprocess_aicpu_{}.txt'
  39. _file_name_framework = 'framework_raw_{}.csv'
  40. _header_aicore_type = ['op_type', 'execution_time', 'execution_frequency',
  41. 'percent']
  42. _header_aicore_detail = ['full_op_name', 'execution_time']
  43. _header_aicpu = ['serial_number', 'op_type', 'total_time', 'dispatch_time',
  44. 'execution_time', 'run_start', 'run_end']
  45. _file_name_aicore_type_time = 'aicore_intermediate_{}_type.csv'
  46. _file_name_aicore_detail_info = 'aicore_intermediate_{}_detail.csv'
  47. _col_names_detail = ['op_name', 'op_type', 'avg_execution_time', 'subgraph', 'full_op_name', 'op_info']
  48. _none_filter_condition_key = ['is_display_detail', 'is_display_full_op_name']
  49. _none_sort_col_names = ['op_info']
  50. _aicore_data = []
  51. _aicore_detail_data = []
  52. _aicore_trace_data = []
  53. def __init__(self, profiling_dir, device_id):
  54. self._profiling_dir = profiling_dir
  55. self._device_id = device_id
  56. self._op_time_cache = {}
  57. self._total_time = Decimal('0.0')
  58. self._column = ""
  59. self._result = []
  60. def integrate(self):
  61. """Integrate the parsed profiling files."""
  62. self._parse_aicore_detail_time()
  63. self._parse_aicore_type_time()
  64. self._parse_aicpu_time()
  65. def get_aicore_data(self):
  66. self._aicore_data_load()
  67. return self._aicore_data
  68. def get_aicore_detail_data(self):
  69. self._aicore_detail_data_load()
  70. return self._aicore_detail_data
  71. def get_aicore_trace_data(self):
  72. self._aicore_trace_data_load()
  73. return self._aicore_trace_data
  74. def query_for_all_reduce(self):
  75. return self._query_for_all_reduce()
  76. def query_and_sort_by_op_type(self, filter_condition, op_type_order):
  77. return self._query_and_sort_by_op_type(filter_condition, op_type_order)
  78. def _parse_aicore_type_time(self):
  79. """Parse the parsed AICORE operator type file."""
  80. framework_file = os.path.join(
  81. self._profiling_dir,
  82. self._file_name_framework.format(self._device_id)
  83. )
  84. framework_file = validate_and_normalize_path(framework_file)
  85. if not os.path.isfile(framework_file):
  86. return
  87. op_name_type_cache = {}
  88. with open(framework_file, 'r') as src_file:
  89. csv_reader = csv.reader(src_file)
  90. _ = next(csv_reader)
  91. for row in csv_reader:
  92. op_name_type_cache[row[3]] = row[5]
  93. op_type_time_cache = {}
  94. for full_op_name, op_time in self._op_time_cache.items():
  95. op_type = op_name_type_cache.get(full_op_name)
  96. if op_type_time_cache.get(op_type) is None:
  97. op_type_time_cache[op_type] = [op_time, 1]
  98. else:
  99. op_type_time_cache[op_type][0] += op_time
  100. op_type_time_cache[op_type][1] += 1
  101. if self._total_time == 0:
  102. raise ValueError("The total time of operations can not be 0.")
  103. op_type_file_name = 'aicore_intermediate_' + self._device_id + '_type.csv'
  104. op_type_file_path = os.path.join(self._profiling_dir, op_type_file_name)
  105. with open(op_type_file_path, 'w') as type_file:
  106. csv_writer = csv.writer(type_file)
  107. csv_writer.writerow(self._header_aicore_type)
  108. for op_type, op_type_time_info in op_type_time_cache.items():
  109. type_info = [
  110. op_type, op_type_time_info[0], op_type_time_info[1],
  111. round((op_type_time_info[0] / self._total_time) * 100, 2)
  112. ]
  113. csv_writer.writerow(type_info)
  114. def _parse_aicore_detail_time(self):
  115. """Parse the parsed AICORE operator time file."""
  116. aicore_detail_file = os.path.join(
  117. self._profiling_dir,
  118. self._file_name_aicore_detail_time.format(self._device_id)
  119. )
  120. aicore_detail_file = validate_and_normalize_path(aicore_detail_file)
  121. if not os.path.isfile(aicore_detail_file):
  122. return
  123. op_detail_file_name = 'aicore_intermediate_' + self._device_id + '_detail.csv'
  124. op_detail_file_path = os.path.join(
  125. self._profiling_dir, op_detail_file_name
  126. )
  127. with open(aicore_detail_file, 'r') as src_file:
  128. row = src_file.readline()
  129. if row.startswith('op_name'):
  130. _ = src_file.readline()
  131. elif row.startswith('====='):
  132. _ = src_file.readline()
  133. _ = src_file.readline()
  134. else:
  135. return
  136. with open(op_detail_file_path, 'w') as detail_file:
  137. csv_writer = csv.writer(detail_file)
  138. csv_writer.writerow(self._header_aicore_detail)
  139. while True:
  140. row = src_file.readline()
  141. if not row:
  142. break
  143. op_infos = row.split()
  144. if op_infos[0] == 'total':
  145. self._total_time = Decimal(op_infos[2])
  146. continue
  147. self._op_time_cache[op_infos[0]] = Decimal(op_infos[1])
  148. csv_writer.writerow([op_infos[0], op_infos[1]])
  149. def _parse_aicpu_time(self):
  150. """Parse the parsed AICPU operator time file."""
  151. aicpu_file = os.path.join(
  152. self._profiling_dir,
  153. self._file_name_aicpu_time.format(self._device_id)
  154. )
  155. aicpu_file = validate_and_normalize_path(aicpu_file)
  156. if not os.path.isfile(aicpu_file):
  157. return
  158. save_file_name = 'aicpu_intermediate_' + self._device_id + '.csv'
  159. save_file_path = os.path.join(self._profiling_dir, save_file_name)
  160. with open(aicpu_file, 'r') as src_file:
  161. row = src_file.readline()
  162. if not row.startswith('serial_number'):
  163. return
  164. with open(save_file_path, 'w') as save_file:
  165. csv_writer = csv.writer(save_file)
  166. csv_writer.writerow(self._header_aicpu)
  167. while True:
  168. row = src_file.readline()
  169. if not row:
  170. break
  171. infos = row.split()
  172. if infos[0] == 'AI':
  173. continue
  174. csv_writer.writerow(infos)
  175. def _aicore_data_load(self):
  176. """Load data according to the parsed AICORE operator types file."""
  177. op_type_file_path = os.path.join(
  178. self._profiling_dir,
  179. self._file_name_aicore_type_time.format(self._device_id)
  180. )
  181. op_type_file_path = validate_and_normalize_path(op_type_file_path)
  182. if not os.path.isfile(op_type_file_path):
  183. logger.warning('The file <%s> does not exist.', op_type_file_path)
  184. return
  185. with open(op_type_file_path, 'r') as file:
  186. csv_reader = csv.reader(file)
  187. _ = next(csv_reader)
  188. for info in csv_reader:
  189. self._aicore_data.append([info[0], float(info[1]), int(info[2]), float(info[3])])
  190. def _aicore_detail_data_load(self):
  191. """Load data according to the parsed AICORE operator file."""
  192. op_detail_file_path = os.path.join(
  193. self._profiling_dir,
  194. self._file_name_aicore_detail_info.format(self._device_id)
  195. )
  196. framework_file_path = os.path.join(
  197. self._profiling_dir,
  198. self._file_name_framework.format(self._device_id)
  199. )
  200. op_detail_file_path = validate_and_normalize_path(op_detail_file_path)
  201. framework_file_path = validate_and_normalize_path(framework_file_path)
  202. if not os.path.isfile(op_detail_file_path):
  203. logger.warning('The file <%s> does not exist.', op_detail_file_path)
  204. return
  205. if not os.path.isfile(framework_file_path):
  206. logger.warning('The file <%s> does not exist.', framework_file_path)
  207. return
  208. framework_infos = dict()
  209. with open(framework_file_path, 'r') as file:
  210. csv_reader = csv.reader(file)
  211. _ = next(csv_reader)
  212. for info in csv_reader:
  213. framework_infos[info[3]] = [
  214. info[3], info[4], info[5], info[6], json.loads(info[7]) if info[7] else None]
  215. with open(op_detail_file_path, 'r') as file:
  216. csv_reader = csv.reader(file)
  217. _ = next(csv_reader)
  218. for info in csv_reader:
  219. framework_info = framework_infos.get(info[0])
  220. self._aicore_detail_data.append(
  221. [
  222. framework_info[1], framework_info[2], float(info[1]),
  223. framework_info[3], framework_info[0], framework_info[4]
  224. ]
  225. )
  226. del framework_infos
  227. def _aicore_trace_data_load(self):
  228. """Load data according to the parsed AICORE operator types file."""
  229. file_path = query_latest_trace_time_file(self._profiling_dir, int(self._device_id))
  230. if not file_path:
  231. logger.critical("Failed to find parsed trace time file.")
  232. raise ProfilerFileNotFoundException('parsed step trace time file')
  233. file_path = validate_and_normalize_path(file_path)
  234. with open(file_path, 'r') as handle:
  235. csv_reader = csv.reader(handle)
  236. self._column = next(csv_reader)
  237. self._aicore_trace_data = list(csv_reader)
  238. self._size = len(self._aicore_trace_data) - 1
  239. self._load_point_info()
  240. def _load_point_info(self):
  241. """Load point info."""
  242. file_path = os.path.join(self._profiling_dir, 'step_trace_point_info.json')
  243. file_path = validate_and_normalize_path(file_path)
  244. if os.path.isfile(file_path):
  245. with open(file_path, 'r', encoding='utf-8') as file:
  246. try:
  247. self._point_info = json.load(file)
  248. except (json.JSONDecodeError, TypeError) as err:
  249. logger.warning(err)
  250. raise ProfilerRawFileException('Fail to parse point info file.')
  251. def _query_for_all_reduce(self):
  252. """
  253. Query for all reduce info.
  254. Returns:
  255. list[dict], reduce information. Each item is the reduce info for one step.
  256. The reduce info is format like:
  257. {stream_id: List[Tuple(start_point, end_point, duration, field_name)]}.
  258. """
  259. self._aicore_trace_data_load()
  260. reduce_infos = []
  261. for row_info in self._aicore_trace_data[:-1]:
  262. row_info_dict = self._get_info_dict_from_row_data(row_info, 'systime')
  263. reduce_info = self._sort_reduce_by_time(row_info_dict)
  264. if reduce_info:
  265. reduce_infos.extend(reduce_info)
  266. reduce_infos.sort(key=lambda x: float(x[2]))
  267. return reduce_infos
  268. def _get_info_dict_from_row_data(self, row_info, time_type):
  269. """
  270. Get step info in dict format.
  271. Args:
  272. row_info (list[str]): Step info, the value is corresponding to `__column__`.
  273. time_type (str): The value type. `systime` keeps the original value.
  274. `realtime` transforms the value in millisecond. Default: `realtime`.
  275. Returns:
  276. dict, step trace information. The key is in `__column__`.
  277. """
  278. row_info_dict = {}
  279. for key, value in zip(self._column, row_info):
  280. if key == 'step_num':
  281. continue
  282. value = to_int(value, key)
  283. row_info_dict[key] = to_millisecond(value) if time_type == 'realtime' else value
  284. return row_info_dict
  285. def _sort_reduce_by_time(self, row_info_dict):
  286. """
  287. Sort reduce info by time.
  288. Args:
  289. row_info_dict (dict): Step trace information.
  290. Returns:
  291. list, including the all reduce info sorted by start time only.
  292. [
  293. [reduce_field, stream_id, reduce_start, reduce_duration],
  294. [...],
  295. [...]
  296. ]
  297. """
  298. factor = 1e5 # convert time unit from 10ns to 1ms
  299. reduce_pid = 10000
  300. reduce_info = []
  301. reduce_fields = [field_name for field_name in self._column
  302. if field_name.startswith('stream_') and not field_name.endswith('point')]
  303. for reduce_field in reduce_fields:
  304. reduce_start = row_info_dict.get(reduce_field + '_start_point')
  305. reduce_start = reduce_start / factor \
  306. if reduce_start else 0
  307. reduce_duration = row_info_dict.get(reduce_field)
  308. reduce_duration = reduce_duration / factor if reduce_duration else 0
  309. if not (reduce_start and reduce_duration):
  310. logger.info("Reduce event missing value.")
  311. continue
  312. cur_stream_id = reduce_field.split('_', 2)[1]
  313. reduce_meta = [reduce_field, int(cur_stream_id), reduce_start,
  314. reduce_duration, reduce_pid]
  315. reduce_info.append(reduce_meta)
  316. return reduce_info
  317. def _query_and_sort_by_op_type(self, filter_condition, op_type_order: list):
  318. """
  319. Query the AICORE operator detail information by `filter_condition`,
  320. and sort by `op_type_order` and execution time.
  321. Args:
  322. filter_condition (dict): The filter condition.
  323. op_type_order (list[str]): The name of the operator type in order.
  324. Returns:
  325. dict, The results are filtered and sorted.
  326. """
  327. self._aicore_detail_data_load()
  328. if filter_condition is None:
  329. filter_condition = {}
  330. self._filter(filter_condition)
  331. type_detail_cache = {}
  332. for detail_info in self._result:
  333. op_type = detail_info[1]
  334. if op_type not in op_type_order:
  335. continue
  336. infos = type_detail_cache.get(op_type)
  337. if infos:
  338. infos.append(detail_info)
  339. else:
  340. type_detail_cache[op_type] = [detail_info]
  341. result = []
  342. for op_type in op_type_order:
  343. detail_infos = type_detail_cache.get(op_type)
  344. if detail_infos is None:
  345. continue
  346. detail_infos.sort(key=lambda item: item[2], reverse=True)
  347. result.extend(detail_infos)
  348. return {
  349. 'col_name_detail': self._display_col_names_detail,
  350. 'object': result
  351. }
  352. def _filter(self, filter_condition):
  353. """
  354. Filter the profiling data according to the filter condition.
  355. Args:
  356. filter_condition (dict): The filter condition.
  357. """
  358. def _inner_filter(item: list):
  359. return self._default_filter(item, filter_condition)
  360. def _inner_map(item: list):
  361. inner_item = item[0:4]
  362. if is_display_full_op_name:
  363. inner_item.append(item[4])
  364. if is_display_detail:
  365. inner_item.append(item[5])
  366. return inner_item
  367. is_display_detail = filter_condition.get('is_display_detail', True)
  368. is_display_full_op_name = filter_condition.get(
  369. 'is_display_full_op_name', True
  370. )
  371. self._set_display_col_name(is_display_detail, is_display_full_op_name)
  372. if is_display_detail and is_display_full_op_name:
  373. self._result = list(filter(_inner_filter, self._aicore_detail_data))
  374. else:
  375. self._result = list(
  376. map(_inner_map, filter(_inner_filter, self._aicore_detail_data))
  377. )
  378. def _default_filter(self, item, condition):
  379. """
  380. The default filter method.
  381. Args:
  382. item (list[Union[str, float, int]]): A piece of data to be filtered.
  383. condition (dict): The filter condition.
  384. Returns:
  385. bool, `True` if the item is satisfied.
  386. """
  387. for condition_key, condition_value in condition.items():
  388. if condition_key in self._none_filter_condition_key:
  389. continue
  390. if condition_key in self._col_names_detail:
  391. index = self._col_names_detail.index(condition_key)
  392. actual_value = item[index]
  393. for exp_key, exp_value in condition_value.items():
  394. if not self._is_match_condition(
  395. exp_key, exp_value, actual_value):
  396. return False
  397. return True
  398. @staticmethod
  399. def _is_match_condition(exp_key, exp_value, actual_value):
  400. """
  401. Check whether the actual value meets the expect condition.
  402. Args:
  403. exp_key (str): Expect key of the condition.
  404. exp_value (str): Expect value.
  405. actual_value (str): Actual value.
  406. Returns:
  407. bool, `True` if the actual meets the expect condition, else `False`.
  408. """
  409. if exp_key == 'in':
  410. if actual_value not in exp_value:
  411. return False
  412. elif exp_key == 'not_in':
  413. if actual_value in exp_value:
  414. return False
  415. elif exp_key == 'partial_match_str_in':
  416. for partial_match_str in exp_value:
  417. if partial_match_str in actual_value:
  418. return True
  419. return False
  420. else:
  421. return False
  422. return True
  423. def _set_display_col_name(self, is_display_detail, is_display_full_op_name):
  424. """
  425. Set the display column name according to the filter condition.
  426. Args:
  427. is_display_detail (bool): Whether to display the detailed operator
  428. information.
  429. is_display_full_op_name (bool): Whether to display the operator full
  430. name.
  431. """
  432. self._display_col_names_detail = self._col_names_detail[0:4]
  433. if is_display_full_op_name:
  434. self._display_col_names_detail.append(self._col_names_detail[4])
  435. if is_display_detail:
  436. self._display_col_names_detail.append(self._col_names_detail[5])
  437. class BaseTimelineGenerator:
  438. """
  439. Analyse timeline data from file.
  440. """
  441. __col_names__ = ['op_name', 'stream_id', 'start_time', 'duration']
  442. _output_timeline_data_file_path = 'output_timeline_data_{}.txt'
  443. _timeline_meta = []
  444. _format_meta_data_list = []
  445. _thread_processed_list = []
  446. # AI Core Op pid is device_id
  447. _AI_CPU_PID = 9000
  448. _COMMUNICATION_OP_PID = 10000
  449. _HOST_CPU_PID = 11000
  450. _OP_OVERLAP_PID = 12000
  451. _MERGED_COMPUTATION_TID = 7999
  452. _PURE_COMMUNICATION_TID = 8000
  453. _MERGED_COMMUNICATION_TID = 8001
  454. _FREE_TIME_TID = 8002
  455. _STEPS_TID = 100000
  456. _SCOPE_NAME_TID = 100001
  457. _GPU_OP_TID = 100002
  458. _HOST_CPU_OP_TID = 100003
  459. _map_tid_name_to_int = {
  460. "Steps": (-4, _STEPS_TID),
  461. "Scope Name": (-3, _SCOPE_NAME_TID),
  462. "GpuOps": (-2, _GPU_OP_TID),
  463. "HostCpuOps": (-1, _HOST_CPU_OP_TID)
  464. }
  465. _timeline_summary = {
  466. 'total_time': 0,
  467. 'num_of_streams': 0,
  468. 'num_of_ops': 0,
  469. 'op_exe_times': 0,
  470. 'max_scope_name_num': 0,
  471. }
  472. _op_name_idx, _tid_idx, _start_time_idx, _duration_idx = 0, 1, 2, 3
  473. _max_scope_name_num = 0
  474. _host_cpu_op_label = 'HostCpuOps'
  475. def get_thread_label_name(self):
  476. """Get process and thread config."""
  477. return [
  478. {"name": "process_labels", "ph": "M", "pid": self._device_id, "args": {"labels": "AI Core Op"}},
  479. {"name": "process_labels", "ph": "M", "pid": self._AI_CPU_PID, "args": {"labels": "AI CPU Op"}},
  480. {"name": "process_labels", "ph": "M", "pid": self._COMMUNICATION_OP_PID,
  481. "args": {"labels": "Communication Op"}},
  482. {"name": "process_labels", "ph": "M", "pid": self._HOST_CPU_PID, "args": {"labels": "Host CPU Op"}},
  483. {"name": "process_labels", "ph": "M", "pid": self._OP_OVERLAP_PID,
  484. "args": {"labels": "Op Overlap Analyse"}},
  485. {"name": "process_sort_index", "ph": "M", "pid": self._device_id, "args": {"sort_index": 0}},
  486. {"name": "process_sort_index", "ph": "M", "pid": self._AI_CPU_PID, "args": {"sort_index": 10}},
  487. {"name": "process_sort_index", "ph": "M", "pid": self._COMMUNICATION_OP_PID, "args": {"sort_index": 20}},
  488. {"name": "process_sort_index", "ph": "M", "pid": self._HOST_CPU_PID, "args": {"sort_index": 30}},
  489. {"name": "process_sort_index", "ph": "M", "pid": self._OP_OVERLAP_PID, "args": {"sort_index": 40}},
  490. {"name": "thread_name", "ph": "M", "pid": self._HOST_CPU_PID, "tid": self._HOST_CPU_OP_TID,
  491. "args": {"name": "Host CPU Op"}},
  492. {"name": "thread_name", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._MERGED_COMPUTATION_TID,
  493. "args": {"name": "Merged Computation Op"}},
  494. {"name": "thread_name", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._PURE_COMMUNICATION_TID,
  495. "args": {"name": "Pure Communication Op"}},
  496. {"name": "thread_name", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._MERGED_COMMUNICATION_TID,
  497. "args": {"name": "Merged Communication Op"}},
  498. {"name": "thread_name", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._FREE_TIME_TID,
  499. "args": {"name": "Free Time"}},
  500. {"name": "thread_sort_index", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._MERGED_COMPUTATION_TID,
  501. "args": {"sort_index": self._MERGED_COMPUTATION_TID}},
  502. {"name": "thread_sort_index", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._PURE_COMMUNICATION_TID,
  503. "args": {"sort_index": self._PURE_COMMUNICATION_TID}},
  504. {"name": "thread_sort_index", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._MERGED_COMMUNICATION_TID,
  505. "args": {"sort_index": self._MERGED_COMMUNICATION_TID}},
  506. {"name": "thread_sort_index", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._FREE_TIME_TID,
  507. "args": {"sort_index": self._FREE_TIME_TID}}
  508. ]
  509. def write_timeline(self, size_limit=SIZE_LIMIT_DEFAULT):
  510. """Load data according to the parsed profiling files."""
  511. # Write timeline to file.
  512. logger.info('Writing timeline file...')
  513. self.write_timeline_to_json_by_limitation(size_limit)
  514. logger.info('Finished file writing!')
  515. def write_timeline_to_json_by_limitation(self, size_limit):
  516. """Write timeline to json by limitation."""
  517. display_file_path = os.path.join(
  518. self._profiling_dir,
  519. self._display_filename
  520. )
  521. display_file_path = validate_and_normalize_path(display_file_path)
  522. try:
  523. with open(display_file_path, 'w') as json_file:
  524. json_file.write('[')
  525. for _, item in enumerate(self._timeline_meta):
  526. json.dump(item, json_file)
  527. if "scope_level" in item.keys():
  528. self._max_scope_name_num = max(
  529. self._max_scope_name_num, item["scope_level"] + 1)
  530. file_size = os.path.getsize(display_file_path)
  531. json_file.write(',')
  532. if file_size > size_limit:
  533. break
  534. label_name_json = json.dumps(self.get_thread_label_name())
  535. label_name_json = label_name_json.lstrip('[')
  536. json_file.write(label_name_json)
  537. os.chmod(display_file_path, stat.S_IREAD | stat.S_IWRITE)
  538. except (IOError, OSError) as err:
  539. logger.critical('Error occurred when write timeline display file: %s', err)
  540. raise ProfilerIOException()
  541. def write_timeline_summary(self):
  542. """Write timeline summary to json."""
  543. timeline_summary_file_path = os.path.join(
  544. self._profiling_dir,
  545. self._timeline_summary_filename
  546. )
  547. timeline_summary_file_path = validate_and_normalize_path(timeline_summary_file_path)
  548. try:
  549. with open(timeline_summary_file_path, 'w') as json_file:
  550. json.dump(self._timeline_summary, json_file)
  551. os.chmod(timeline_summary_file_path, stat.S_IREAD | stat.S_IWRITE)
  552. except (IOError, OSError) as err:
  553. logger.critical('Error occurred when write timeline summary file: %s', err)
  554. raise ProfilerIOException()
  555. @staticmethod
  556. def _update_num_of_streams(timeline, stream_count_dict):
  557. """Update number of streams."""
  558. stream_id = timeline[1]
  559. if stream_id in ["Steps", "Scope Name"]:
  560. return
  561. if stream_id not in stream_count_dict.keys():
  562. stream_count_dict[stream_id] = 1
  563. else:
  564. stream_count_dict[stream_id] += 1
  565. def _update_format_meta_data(self, timeline_dict):
  566. """Update format meta data which control the display arrange and map the thread name."""
  567. thread_name_meta_data = {
  568. "name": "thread_name",
  569. "pid": int(self._device_id),
  570. "tid": 100000,
  571. "ts": 0,
  572. "ph": "M",
  573. "cat": "__metadata",
  574. "args": {
  575. "name": "Steps"
  576. }
  577. }
  578. tid_name = timeline_dict['tid']
  579. sort_index = 0
  580. if tid_name in self._map_tid_name_to_int.keys():
  581. sort_index, tid = self._map_tid_name_to_int[tid_name]
  582. elif tid_name.startswith("Stream"):
  583. tid = int(tid_name.split("#")[-1])
  584. sort_index = tid
  585. else:
  586. return
  587. if tid_name == self._host_cpu_op_label:
  588. thread_name_meta_data['pid'] = self._HOST_CPU_PID
  589. thread_name_meta_data["tid"] = tid
  590. thread_name_meta_data["args"]["name"] = tid_name
  591. thread_sort_meta_data = thread_name_meta_data.copy()
  592. thread_sort_meta_data['name'] = "thread_sort_index"
  593. thread_sort_meta_data["args"] = {"sort_index": sort_index}
  594. timeline_dict["tid"] = tid
  595. if tid_name in self._thread_processed_list:
  596. return
  597. self._thread_processed_list.append(tid_name)
  598. self._format_meta_data_list.append(thread_name_meta_data)
  599. self._format_meta_data_list.append(thread_sort_meta_data)
  600. def _get_max_scope_name_num(self, timeline_list):
  601. """Get the max number of scope level from all operator."""
  602. max_scope_name_num = 0
  603. for time_item in timeline_list:
  604. cur_scope_name_num = len(time_item[self._op_name_idx].split('/')) - 1
  605. max_scope_name_num = max(cur_scope_name_num, max_scope_name_num)
  606. return max_scope_name_num
  607. def _get_scope_name_time_list(self, timeline_list, subgraph, factor_start_time_to_duration=1):
  608. """Produce the timeline of hierarchical scope name."""
  609. # the key of scope_name_start_duration_dict is scope name, the value is a dict which store the
  610. # start and end index of time_item in timeline_list.
  611. scope_name_start_duration_dict = {}
  612. scope_name_time_list = []
  613. op_full_name_idx, scope_name_idx, invalid_idx = 0, 0, -1
  614. tid = "Scope Name"
  615. for idx, time_item in enumerate(timeline_list):
  616. scope_name_list = time_item[op_full_name_idx].split('/')[:-1]
  617. # skip Default/InitDataSetQueue operator.
  618. if time_item[op_full_name_idx].startswith("Default/InitDataSetQueue"):
  619. scope_name_list = []
  620. # process scope name of subgraph(Default/Gradients/recompute_Default) only.
  621. if scope_name_list and scope_name_list[0] != subgraph:
  622. scope_name_list = []
  623. # add the level of scope name, used to distinguish the same name at different scope level.
  624. scope_name_list = [f"{scope_level}-{scope_name}"
  625. for scope_level, scope_name in enumerate(scope_name_list)]
  626. # update the start and end index of time_item according to current scope_name
  627. for scope_name in scope_name_list:
  628. init_start_end_idx_dict = {'start_item_idx': idx, 'end_item_idx': idx}
  629. if scope_name not in scope_name_start_duration_dict:
  630. scope_name_start_duration_dict[scope_name] = init_start_end_idx_dict
  631. if scope_name_start_duration_dict[scope_name]['start_item_idx'] == invalid_idx:
  632. scope_name_start_duration_dict[scope_name] = init_start_end_idx_dict
  633. else:
  634. scope_name_start_duration_dict[scope_name]['end_item_idx'] = idx
  635. # if the key(scope name) in scope_name_start_duration_dict does not appear in scope_name_list,
  636. # it means this key(scope name) is end and it is append to scope_name_time_list.
  637. for key, val in scope_name_start_duration_dict.items():
  638. if val['start_item_idx'] == invalid_idx:
  639. continue
  640. if (key not in scope_name_list) \
  641. or idx == (len(timeline_list) - 1) \
  642. or time_item[op_full_name_idx] == self._step_end_op_name:
  643. start_time = timeline_list[val['start_item_idx']][self._start_time_idx]
  644. duration = (float(timeline_list[val['end_item_idx']][self._start_time_idx]) - float(start_time)) * \
  645. factor_start_time_to_duration + float(timeline_list[val['end_item_idx']][self._duration_idx])
  646. scope_name_time_item = [key, tid, start_time, duration]
  647. scope_name_time_list.append(scope_name_time_item)
  648. scope_name_start_duration_dict[key]['start_item_idx'] = invalid_idx
  649. # x[scope_name_idx] is a scope name like "0-Default".
  650. # if two element in scope_name_time_list have the same start time,
  651. # the previous element in list will displayed at the higher line in UI page.
  652. scope_name_time_list.sort(
  653. key=lambda x: (float(x[self._start_time_idx]), int(x[scope_name_idx].split('-')[0]))
  654. )
  655. return scope_name_time_list
  656. def _set_step_start_and_end_op_name(self, timeline_list):
  657. """Set the start and end operator full name of each step."""
  658. if not timeline_list:
  659. return
  660. start_op_idx = 0
  661. if timeline_list[0][self._op_name_idx].startswith("Default/InitDataSetQueue"):
  662. start_op_idx = 1
  663. self._step_start_op_name = timeline_list[start_op_idx][self._op_name_idx]
  664. self._step_end_op_name = self._step_start_op_name
  665. if len(timeline_list) > (start_op_idx + 1):
  666. for time_item in timeline_list[start_op_idx + 1:]:
  667. if time_item[self._op_name_idx] != self._step_start_op_name:
  668. self._step_end_op_name = time_item[self._op_name_idx]
  669. else:
  670. break
  671. def _get_step_time_list(self, timeline_list, factor_start_time_to_duration=1):
  672. """Produce the time of each step."""
  673. # Record the time of each step.
  674. step_time_list = []
  675. step_num = 1
  676. tid = "Steps"
  677. cur_step_start_time, cur_step_duration_time = 0, 0
  678. for time_item in timeline_list:
  679. if time_item[self._op_name_idx] == self._step_start_op_name:
  680. cur_step_start_time = time_item[self._start_time_idx]
  681. if time_item[self._op_name_idx] == self._step_end_op_name:
  682. cur_step_duration_time = (float(time_item[self._start_time_idx]) - float(cur_step_start_time)) * \
  683. factor_start_time_to_duration + float(time_item[self._duration_idx])
  684. step_time_item = [str(step_num), tid, float(cur_step_start_time), cur_step_duration_time]
  685. step_time_list.append(step_time_item)
  686. step_num += 1
  687. return step_time_list
  688. class GpuTimelineGenerator(BaseTimelineGenerator):
  689. """Generate gpu Timeline data from file."""
  690. _display_filename = 'gpu_timeline_display_{}.json'
  691. _timeline_summary_filename = 'gpu_timeline_summary_{}.json'
  692. _output_op_execute_time_file_path = "gpu_op_execute_timestamp_{}.txt"
  693. _output_activity_execute_time_file_path = "activity_execute_timestamp_{}.txt"
  694. _output_gpu_activity_info_file_path = "gpu_activity_data_{}.csv"
  695. _step_trace_original_filename = 'step_trace_profiling_{}.txt'
  696. _activity_keys_list = []
  697. def __init__(self, profiling_dir, device_id):
  698. self._profiling_dir = profiling_dir
  699. self._device_id = device_id
  700. self._timeline_meta = []
  701. self._display_filename = self._display_filename.format(device_id)
  702. self._timeline_summary_filename = self._timeline_summary_filename.format(device_id)
  703. def _get_and_validate_path(self, file_name):
  704. """Generate op or activity file path from file name, and validate this path."""
  705. file_path = os.path.join(
  706. self._profiling_dir,
  707. file_name.format(self._device_id)
  708. )
  709. file_path = validate_and_normalize_path(file_path)
  710. if not os.path.exists(file_path):
  711. logger.critical(f"Failed to find parsed timeline file {file_path}.")
  712. raise ProfilerFileNotFoundException('parsed timeline file')
  713. return file_path
  714. def _parse_timeline_data(self, timeline, min_cycle_counter):
  715. """Parse timeline data."""
  716. # factor to convert the time unit of start_time(ts) from 1ns to 1us for timeline display
  717. factor = 1000
  718. op_meta = TimelineContainer(timeline)
  719. timeline_dict = {}
  720. timeline_dict['name'] = op_meta.op_name.split('/')[-1]
  721. timeline_dict['ph'] = 'X'
  722. timeline_dict['tid'] = op_meta.stream_id
  723. timeline_dict['ts'] = (op_meta.start_time - min_cycle_counter) / factor
  724. dur = op_meta.duration
  725. timeline_dict['dur'] = dur
  726. timeline_dict['pid'] = int(self._device_id)
  727. if op_meta.stream_id == "Scope Name":
  728. # remove the level of scope name which has a format like "0-conv2-Conv2d".
  729. timeline_dict['name'] = "-".join(op_meta.op_name.split('-')[1:])
  730. timeline_dict['scope_level'] = int(op_meta.op_name.split('-')[0])
  731. elif op_meta.stream_id == self._host_cpu_op_label:
  732. timeline_dict['pid'] = self._HOST_CPU_PID
  733. if len(timeline) > 4:
  734. # len(timeline) > 4 refers to activity data, else op data.
  735. # Add args for activity data
  736. args_dict = {}
  737. for ix, value in enumerate(timeline[4:]):
  738. args_dict[self._activity_keys_list[ix]] = value
  739. timeline_dict['args'] = args_dict
  740. timeline_dict['tid'] = f"Stream #{timeline_dict['tid']}"
  741. elif op_meta.stream_id not in ["Scope Name", "Steps"]:
  742. # Update total time of operator execution.
  743. self._timeline_summary['total_time'] += dur / factor
  744. self._timeline_summary['op_exe_times'] += 1
  745. self._update_format_meta_data(timeline_dict)
  746. self._timeline_meta.append(timeline_dict)
  747. def _load_timeline_data(self):
  748. """Load timeline data from file."""
  749. op_file_path = self._get_and_validate_path(
  750. self._output_op_execute_time_file_path)
  751. activity_file_path = self._get_and_validate_path(
  752. self._output_activity_execute_time_file_path)
  753. activity_args_file_path = self._get_and_validate_path(
  754. self._output_gpu_activity_info_file_path)
  755. timeline_list = self._load_op_data(op_file_path)
  756. # Add host cpu op timeline.
  757. cpu_timeline_generator = CpuTimelineGenerator(self._profiling_dir, self._device_id)
  758. cpu_timeline_list = cpu_timeline_generator.load_cpu_op_data()
  759. if cpu_timeline_list:
  760. self._clock_synchronize_to_gpu(cpu_timeline_list)
  761. timeline_list.extend(cpu_timeline_list)
  762. timeline_list.sort(key=lambda x: float(x[2]))
  763. self._max_scope_name_num = self._get_max_scope_name_num(timeline_list)
  764. self._timeline_summary['max_scope_name_num'] = self._max_scope_name_num
  765. # Generate step time.
  766. factor_start_time_uint_to_duration = 1e-3
  767. self._set_step_start_and_end_op_name(timeline_list)
  768. # Fit gpu kernel async launch solution.
  769. if self.is_gpu_kernel_async_launch():
  770. step_time_list = self._get_step_time_list_from_step_trace()
  771. else:
  772. step_time_list = self._get_step_time_list(timeline_list, factor_start_time_uint_to_duration)
  773. # Add Scope Name.
  774. default_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Default",
  775. factor_start_time_uint_to_duration)
  776. gradient_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Gradients",
  777. factor_start_time_uint_to_duration)
  778. recompute_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "recompute_Default",
  779. factor_start_time_uint_to_duration)
  780. timeline_list.extend(default_scope_name_time_list)
  781. timeline_list.extend(gradient_scope_name_time_list)
  782. timeline_list.extend(recompute_scope_name_time_list)
  783. timeline_list.extend(step_time_list)
  784. timeline_list.sort(key=lambda x: (float(x[self._start_time_idx]), x[self._tid_idx]))
  785. # Add cuda activity timeline.
  786. activity_timeline_list = self._load_activity_data(activity_file_path, activity_args_file_path)
  787. timeline_list.extend(activity_timeline_list)
  788. timeline_list.sort(key=lambda x: float(x[2]))
  789. return timeline_list
  790. def _clock_synchronize_to_gpu(self, timeline_list):
  791. """Synchronize the timestamp from device to host."""
  792. start_time_file_path = os.path.join(self._profiling_dir, f"start_time_{self._device_id}.txt")
  793. try:
  794. with open(start_time_file_path) as f_obj:
  795. lines = f_obj.readlines()
  796. # lines[0] stores the host monotonic time of start training.
  797. host_monotonic_start_time = int(lines[0].strip().split(':')[-1])
  798. # lines[1] stores the gpu time of start training.
  799. gpu_start_time = int(lines[1].strip().split(':')[-1])
  800. except (IOError, OSError) as err:
  801. logger.critical(f'Error occurred when read {start_time_file_path}: {err}')
  802. raise ProfilerIOException()
  803. time_diff = gpu_start_time - host_monotonic_start_time
  804. for idx, time_item in enumerate(timeline_list):
  805. timeline_list[idx][self._start_time_idx] = int(time_item[self._start_time_idx]) + time_diff
  806. def _load_op_data(self, op_file_path):
  807. """Load operator data from file"""
  808. op_timeline_list = []
  809. try:
  810. with open(op_file_path, 'r') as f_obj:
  811. for line in f_obj:
  812. self._timeline_summary['num_of_ops'] += 1
  813. op_list = line.strip('\n').strip().split(';')
  814. time_arr = op_list[-1]
  815. time_arr = time_arr.split(" ")
  816. for time in time_arr:
  817. time = time.split(",")
  818. line_list = op_list[:2] + time
  819. op_timeline_list.append(line_list)
  820. except (IOError, OSError) as err:
  821. logger.critical('Error occurred when load operator timeline data intermediate file: %s', err)
  822. raise ProfilerIOException()
  823. return op_timeline_list
  824. def _load_activity_data(self, activity_file_path, activity_args_file_path):
  825. """Load activity data from file"""
  826. activity_timeline_list = []
  827. try:
  828. args_dict = {}
  829. with open(activity_args_file_path, 'r') as args_file:
  830. csv_reader = csv.reader(args_file)
  831. keys_list = next(csv_reader)
  832. # keys_list [name, type, op_full_name, stream_id, block_dim, grid_dim, ...]
  833. self._activity_keys_list = keys_list[1:3] + keys_list[4:6]
  834. for info in csv_reader:
  835. args_dict[info[0]] = info[1:3] + info[4:6]
  836. with open(activity_file_path, 'r') as f_obj:
  837. for line in f_obj:
  838. line_list = line.strip('\n').split(';')
  839. # concat activity args info.
  840. line_list += args_dict[line_list[0]]
  841. activity_timeline_list.append(line_list)
  842. except (IOError, OSError) as err:
  843. logger.critical('Error occurred when load activity timeline data intermediate file: %s', err)
  844. raise ProfilerIOException()
  845. return activity_timeline_list
  846. def init_timeline(self):
  847. """Init timeline metadata, adding all collected info."""
  848. timeline_list = self._load_timeline_data()
  849. # Init a dict for counting the num of streams.
  850. stream_count_dict = {}
  851. for timeline in timeline_list:
  852. self._parse_timeline_data(timeline, 0)
  853. # Updating the collection of streams.
  854. if len(timeline) == 4:
  855. self._update_num_of_streams(timeline, stream_count_dict)
  856. # Add format thread meta data.
  857. self._format_meta_data_list.extend(self._timeline_meta)
  858. self._timeline_meta = self._format_meta_data_list
  859. # Update timeline summary info
  860. self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())
  861. def check_op_name(self, op_name):
  862. """
  863. Check whether the operator name exists.
  864. Args:
  865. op_name (str): The operator name or operator name prefix.
  866. Returns:
  867. bool, `True` if the operator name does exist, else `False`.
  868. """
  869. if not op_name:
  870. raise ProfilerParamValueErrorException('The op_name should exist.')
  871. for op_time_info in self._timeline_meta:
  872. full_op_name = op_time_info['name']
  873. if full_op_name and full_op_name.startswith(op_name):
  874. return True
  875. return False
  876. def _get_step_time_list_from_step_trace(self):
  877. """Produce the time of each step based on step_trace_profiling file."""
  878. # Record the time of each step.
  879. step_time_list = []
  880. step_start_op_name = []
  881. step_end_op_name = []
  882. step_num = 1
  883. tid = "Steps"
  884. step_trace_profiling_path = self._get_and_validate_path(
  885. self._step_trace_original_filename
  886. )
  887. try:
  888. with open(step_trace_profiling_path, 'r') as f_obj:
  889. for line in f_obj:
  890. line = line.strip().split()
  891. step_start_op_name.append(line[0].split(',')[0])
  892. step_end_op_name.append(line[3].split(',')[0])
  893. cur_step_start_time = float(line[0].split(',')[1])
  894. cur_step_end_time = float(line[3].split(',')[1])
  895. # convert duration time unit from ns to us.
  896. cur_step_duration_time = (cur_step_end_time - cur_step_start_time) / 1e3
  897. step_time_item = [str(step_num), tid, cur_step_start_time, cur_step_duration_time]
  898. step_time_list.append(step_time_item)
  899. step_num += 1
  900. except (IOError, OSError) as err:
  901. logger.critical(f'Error occurred when read {step_trace_profiling_path}: {err}')
  902. raise ProfilerIOException()
  903. return step_time_list
  904. def is_gpu_kernel_async_launch(self):
  905. """Recognize the solution that launch the gpu kernel async."""
  906. step_trace_profiling_path = self._get_and_validate_path(
  907. self._step_trace_original_filename
  908. )
  909. try:
  910. with open(step_trace_profiling_path, 'r') as f_obj:
  911. line = next(f_obj)
  912. first_string = line.strip().split()[0]
  913. # the data format of launch the gpu kernel async is "Default/op1,160123 op-name"
  914. # otherwise, the data format is "Default/op1 160123,12 "
  915. return bool(len(first_string.split(',')) == 2)
  916. except (IOError, OSError) as err:
  917. logger.critical(f'Error occurred when read {step_trace_profiling_path}: {err}')
  918. raise ProfilerIOException()
  919. class AscendTimelineGenerator(BaseTimelineGenerator):
  920. """Generate ascend Timeline data from file."""
  921. _display_filename = 'ascend_timeline_display_{}.json'
  922. _timeline_summary_filename = 'ascend_timeline_summary_{}.json'
  923. _cluster_analyse_filename = 'ascend_cluster_analyse_{}_{}_{}_{}.csv'
  924. def __init__(self, profiling_dir, device_id, rank_id, rank_size):
  925. self._profiling_dir = profiling_dir
  926. self._device_id = device_id
  927. self._rank_id = rank_id
  928. self._tid_dict = {
  929. "computation_op": (self._MERGED_COMPUTATION_TID, self._OP_OVERLAP_PID),
  930. "communication_not_overlapped": (self._PURE_COMMUNICATION_TID, self._OP_OVERLAP_PID),
  931. "communication": (self._MERGED_COMMUNICATION_TID, self._OP_OVERLAP_PID),
  932. "free_time": (self._FREE_TIME_TID, self._OP_OVERLAP_PID)
  933. }
  934. self._rank_size = rank_size
  935. self._display_filename = self._display_filename.format(rank_id)
  936. self._timeline_summary_filename = self._timeline_summary_filename.format(rank_id)
  937. def _load_timeline_data(self):
  938. """Load timeline data from file."""
  939. file_path = os.path.join(
  940. self._profiling_dir,
  941. self._output_timeline_data_file_path.format(self._rank_id)
  942. )
  943. file_path = validate_and_normalize_path(file_path)
  944. if not os.path.exists(file_path):
  945. logger.critical("Failed to find parsed timeline file.")
  946. raise ProfilerFileNotFoundException('parsed timeline file')
  947. timeline_list = []
  948. try:
  949. with open(file_path, 'r') as f_obj:
  950. for line in f_obj:
  951. if not line.startswith('op_name'):
  952. line_list = line.strip('\n').split(',')
  953. line_list[self._tid_idx] = f"Stream #{line_list[self._tid_idx]}"
  954. timeline_list.append(line_list)
  955. except (IOError, OSError) as err:
  956. logger.critical('Error occurred when read timeline intermediate file: %s', err)
  957. raise ProfilerIOException()
  958. return timeline_list
  959. def _parse_timeline_data(self, timeline, min_cycle_counter):
  960. """Parse timeline data."""
  961. # factor to convert the time unit from 1ms to 1us for timeline display
  962. factor = 1000
  963. op_meta = TimelineContainer(timeline)
  964. timeline_dict = {}
  965. timeline_dict['name'] = op_meta.op_name.split('/')[-1]
  966. timeline_dict['ph'] = 'X'
  967. timeline_dict['tid'] = op_meta.stream_id
  968. timeline_dict['ts'] = (op_meta.start_time - min_cycle_counter) * factor
  969. dur = op_meta.duration * factor
  970. timeline_dict['dur'] = dur
  971. if op_meta.pid is None:
  972. timeline_dict['pid'] = int(self._device_id)
  973. # Update total time of operator execution.
  974. if op_meta.stream_id not in ["Steps", "Scope Name"]:
  975. self._timeline_summary['total_time'] += op_meta.duration
  976. else: # AllReduce and AI CPU pid
  977. timeline_dict['pid'] = op_meta.pid
  978. if op_meta.stream_id == "Scope Name":
  979. # remove the level of scope name which has a format like "0-conv2-Conv2d".
  980. timeline_dict['name'] = "-".join(op_meta.op_name.split('-')[1:])
  981. timeline_dict['scope_level'] = int(op_meta.op_name.split('-')[0])
  982. elif op_meta.stream_id == self._host_cpu_op_label:
  983. timeline_dict['pid'] = self._HOST_CPU_PID
  984. self._update_format_meta_data(timeline_dict)
  985. self._timeline_meta.append(timeline_dict)
  986. def init_timeline(self, communication_info, framework_info, aicpu_info, min_cycle_counter, source_path):
  987. """
  988. Init timeline metadata, adding all collected info.
  989. Args:
  990. communication_info (list[list]): The metadata of communication operator.
  991. framework_info (dict): The framework metadata.
  992. aicpu_info (dict): The metadata of AI CPU operator.
  993. min_cycle_counter (float): The minimum cycle counter of the timeline.
  994. source_path (str): The source of file.
  995. """
  996. if min_cycle_counter == float('inf'):
  997. min_cycle_counter = 0
  998. logger.info('Initiating timeline...')
  999. timeline_list = self._load_timeline_data()
  1000. cpu_timeline_generator = CpuTimelineGenerator(self._profiling_dir, self._rank_id)
  1001. cpu_timeline_list = cpu_timeline_generator.get_timeline_data()
  1002. if cpu_timeline_list:
  1003. self._clock_synchronize_to_device(cpu_timeline_list, source_path)
  1004. timeline_list.extend(cpu_timeline_list)
  1005. timeline_list.sort(key=lambda x: float(x[self._start_time_idx]))
  1006. self._max_scope_name_num = self._get_max_scope_name_num(timeline_list)
  1007. self._timeline_summary['op_exe_times'] = len(timeline_list)
  1008. self._timeline_summary['max_scope_name_num'] = self._max_scope_name_num
  1009. # Generate step time.
  1010. self._set_step_start_and_end_op_name(timeline_list)
  1011. step_time_list = self._get_step_time_list(timeline_list)
  1012. # Add Scope Name.
  1013. default_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Default")
  1014. gradient_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Gradients")
  1015. recompute_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "recompute_Default")
  1016. # Add AI CPU data into timeline temp list and sort by start time.
  1017. aicpu_data = aicpu_info.get('info')
  1018. if aicpu_data:
  1019. timeline_list.extend(aicpu_data)
  1020. self._timeline_summary['op_exe_times'] += aicpu_info.get('op_exe_times', 0)
  1021. self._timeline_summary['num_of_streams'] += aicpu_info.get('num_of_streams', 0)
  1022. self._timeline_summary['num_of_ops'] += aicpu_info.get('num_of_ops', 0)
  1023. self._timeline_summary['total_time'] += aicpu_info.get('total_time', 0)
  1024. timeline_list.sort(key=lambda x: float(x[self._start_time_idx]))
  1025. # Add AllReduce info to timeline temp list and sort by start time.
  1026. if communication_info:
  1027. logger.debug('AllReduce info found. Start adding info into timeline...')
  1028. cluster_related_timeline = self._analyse_and_write_cluster_profiling_data(
  1029. timeline_list, communication_info, step_time_list)
  1030. timeline_list.extend(cluster_related_timeline)
  1031. timeline_list.extend(communication_info)
  1032. timeline_list.sort(key=lambda x: float(x[self._start_time_idx]))
  1033. # Add step time and scope name info.
  1034. timeline_list.extend(step_time_list)
  1035. timeline_list.extend(default_scope_name_time_list)
  1036. timeline_list.extend(recompute_scope_name_time_list)
  1037. timeline_list.extend(gradient_scope_name_time_list)
  1038. timeline_list.sort(key=lambda x: float(x[self._start_time_idx]))
  1039. # Init a dict for counting the num of streams.
  1040. stream_count_dict = {}
  1041. for timeline in timeline_list:
  1042. self._parse_timeline_data(timeline, min_cycle_counter)
  1043. # Updating the collection of streams.
  1044. if len(timeline) == 4:
  1045. self._update_num_of_streams(timeline, stream_count_dict)
  1046. # Add format thread meta data.
  1047. self._format_meta_data_list.extend(self._timeline_meta)
  1048. self._timeline_meta = self._format_meta_data_list
  1049. # Get framework metadata.
  1050. framework_obj_list = framework_info.get('object')
  1051. # The length of list is the number of operators.
  1052. self._timeline_summary['num_of_ops'] += len(framework_obj_list)
  1053. self._add_framework_info(framework_obj_list)
  1054. logger.info('Finished adding info into timeline...')
  1055. # Update timeline summary info
  1056. self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())
  1057. def _clock_synchronize_to_device(self, timeline_list, source_path):
  1058. """Synchronize the timestamp from host to device."""
  1059. host_start_file_path = os.path.join(source_path, f"host_start.log.{self._device_id}")
  1060. dev_start_file_path = os.path.join(source_path, f"dev_start.log.{self._device_id}")
  1061. try:
  1062. with open(host_start_file_path) as f_obj:
  1063. lines = f_obj.readlines()
  1064. # lines[2] stores host monotonic_raw time of start training.
  1065. host_monotonic = int(lines[2].strip().split(':')[1])
  1066. except (IOError, OSError) as err:
  1067. logger.critical('Error occurred when read host_start.log: %s', err)
  1068. raise ProfilerIOException()
  1069. try:
  1070. with open(dev_start_file_path) as f_obj:
  1071. lines = f_obj.readlines()
  1072. # lines[2] stores device cycle counter of start training.
  1073. dev_cntvct = int(lines[2].strip().split(':')[1])
  1074. except (IOError, OSError) as err:
  1075. logger.critical('Error occurred when read dev_start.log: %s', err)
  1076. raise ProfilerIOException()
  1077. factor_ns_to_ms = 1e-6
  1078. factor_ten_ns_to_ns = 10
  1079. factor_ms_to_ns = 1e6
  1080. for idx, time_item in enumerate(timeline_list):
  1081. host_time = int(float(time_item[self._start_time_idx]) * factor_ms_to_ns)
  1082. device_time = dev_cntvct * factor_ten_ns_to_ns + (host_time - host_monotonic)
  1083. timeline_list[idx][self._start_time_idx] = device_time * factor_ns_to_ms
  1084. def _add_framework_info(self, framework_obj_list):
  1085. """
  1086. Add framework info into timeline metadata.
  1087. Args:
  1088. framework_obj_list (list): The framework metadata.
  1089. """
  1090. logger.debug('Start adding framework info into timeline...')
  1091. # Get the framework info that will be written into timeline.
  1092. framework_info_dict = {}
  1093. for framework_obj in framework_obj_list:
  1094. op_name = framework_obj[0]
  1095. op_type = framework_obj[1]
  1096. op_full_name = framework_obj[4]
  1097. op_info = framework_obj[5]
  1098. framework_info_dict[op_full_name] = {
  1099. 'name': op_name,
  1100. 'args': {
  1101. 'type': op_type,
  1102. 'fullname': op_full_name
  1103. }
  1104. }
  1105. framework_info_dict[op_full_name]['args'].update(op_info)
  1106. # Insert framework info into timeline.
  1107. for timeline_item in self._timeline_meta:
  1108. op_full_name = timeline_item.get('name')
  1109. framework_item = framework_info_dict.get(op_full_name)
  1110. if framework_item:
  1111. timeline_item['name'] = framework_item.get('name')
  1112. timeline_item['args'] = framework_item.get('args')
  1113. logger.debug('Finished adding framework info into timeline...')
  1114. def _produce_two_separated_timeline(self, timeline, op_name):
  1115. """Produce two separated timeline based on op_name."""
  1116. timeline_include_op_name = []
  1117. timeline_exclude_op_name = []
  1118. for time_item in timeline:
  1119. if op_name in time_item[self._op_name_idx]:
  1120. timeline_include_op_name.append(time_item)
  1121. else:
  1122. timeline_exclude_op_name.append(time_item)
  1123. return timeline_include_op_name, timeline_exclude_op_name
  1124. def _analyse_and_write_cluster_profiling_data(self, aicore_timeline, communication_timeline, step_time_list):
  1125. """
  1126. Analyse the cluster communication and computation data, and write result to file.
  1127. To analyse the cluster performance bottleneck based on timeline, define the time of a training
  1128. step as "t_total", propose five metrics as follows:
  1129. 1) The time that "receive" operators not overlapped by others(t1)
  1130. 2) The time that is consumed inside the stage(t_total - t1)
  1131. 3) The time that "communication" operators not overlapped by others(t2)
  1132. 4) The time that consumed by computation(t_total - t2)
  1133. 5) The time that "collective communication" operators not overlapped by others(t3)
  1134. In pipeline parallel mode, we can locate slow stage based on t_total - t1. Inside each stage,
  1135. we can locate slow card based on t_total - t2. The value of t1 indicates the degree that
  1136. communication time between stages slow down the training. The value of t3 indicates the degree
  1137. that communication inside each stage slow down the training.
  1138. """
  1139. step_num = len(step_time_list)
  1140. is_pipeline_parallel = False
  1141. comm_merged_timeline, _, comm_display_timeline = self._get_merged_time_list(
  1142. communication_timeline,
  1143. display_name="communication"
  1144. )
  1145. aicore_timeline_interval, _, aicore_display_timeline = self._get_merged_time_list(
  1146. aicore_timeline,
  1147. get_interval_time=True
  1148. )
  1149. # Consider if the overlap will be 0 or not.
  1150. comm_not_overlaped_timeline = self._get_intersection_time(
  1151. aicore_timeline_interval,
  1152. comm_merged_timeline
  1153. )
  1154. # Process receive part.
  1155. all_timeline = aicore_timeline + communication_timeline
  1156. all_timeline.sort(key=lambda x: float(x[self._start_time_idx]))
  1157. receive_op_timeline, timeline_exclude_receive_op = self._produce_two_separated_timeline(
  1158. all_timeline,
  1159. "Receive-op"
  1160. )
  1161. if receive_op_timeline:
  1162. is_pipeline_parallel = True
  1163. receive_op_merged_timeline = self._get_merged_time_list(receive_op_timeline)[0]
  1164. timeline_exclude_receive_op_interval = self._get_merged_time_list(
  1165. timeline_exclude_receive_op,
  1166. get_interval_time=True
  1167. )[0]
  1168. receive_op_not_overlaped_timeline = self._get_intersection_time(
  1169. timeline_exclude_receive_op_interval,
  1170. receive_op_merged_timeline
  1171. )
  1172. # Process collective communication part.
  1173. collective_comm_timeline = self._produce_two_separated_timeline(
  1174. communication_timeline,
  1175. "Receive-op"
  1176. )[-1]
  1177. collective_comm_merged_timeline = self._get_merged_time_list(collective_comm_timeline)[0]
  1178. collective_comm_not_overlaped_timeline = self._get_intersection_time(
  1179. aicore_timeline_interval,
  1180. collective_comm_merged_timeline
  1181. )
  1182. # Generate free time that exclude computation and communication time.
  1183. free_timeline = self._get_merged_time_list(
  1184. all_timeline,
  1185. get_interval_time=True,
  1186. display_name="free_time"
  1187. )[1]
  1188. try:
  1189. # Compute these five metrics mentioned above per step.
  1190. recieve_alone_time = self._compute_time_inside_step(receive_op_not_overlaped_timeline, step_time_list)
  1191. stage_time, computation_time = [], []
  1192. comm_alone_time = self._compute_time_inside_step(comm_not_overlaped_timeline, step_time_list)
  1193. collective_comm_alone_time = self._compute_time_inside_step(
  1194. collective_comm_not_overlaped_timeline,
  1195. step_time_list
  1196. )
  1197. for step in range(step_num):
  1198. if is_pipeline_parallel:
  1199. stage_time.append(step_time_list[step][self._duration_idx] - recieve_alone_time[step])
  1200. computation_time.append(step_time_list[step][self._duration_idx] - comm_alone_time[step])
  1201. metrices_per_step_list = [computation_time, comm_alone_time, stage_time,
  1202. recieve_alone_time, collective_comm_alone_time]
  1203. if step_num > 1:
  1204. for metric in metrices_per_step_list:
  1205. metric.append(sum(metric[1:]) / (step_num - 1))
  1206. self._write_cluster_metrices(metrices_per_step_list, is_pipeline_parallel)
  1207. except IndexError as e:
  1208. logger.error(e)
  1209. res_timeline = []
  1210. res_timeline.extend(comm_not_overlaped_timeline)
  1211. res_timeline.extend(aicore_display_timeline)
  1212. res_timeline.extend(comm_display_timeline)
  1213. res_timeline.extend(free_timeline)
  1214. return res_timeline
  1215. def _write_cluster_metrices(self, metrices, is_pipeline_parallel):
  1216. """Write cluster metric."""
  1217. # Note that the feature of cluster bottleneck analyse is not supported in offline parse mode,
  1218. # due to that parallel context is not set.
  1219. try:
  1220. parallel_mode = get_auto_parallel_context("parallel_mode")
  1221. stage_num = get_auto_parallel_context("pipeline_stages")
  1222. except RuntimeError:
  1223. logger.warning("[profiler] the feature of cluster bottleneck analyse "
  1224. "is not supported in offline parse mode.")
  1225. parallel_mode = "data_parallel"
  1226. stage_num = 1
  1227. if stage_num > 1:
  1228. parallel_mode = "pipeline-parallel"
  1229. elif parallel_mode != "data_parallel":
  1230. parallel_mode = "model-parallel"
  1231. else:
  1232. parallel_mode = "data-parallel"
  1233. cluster_analyse_file_path = os.path.join(
  1234. self._profiling_dir,
  1235. self._cluster_analyse_filename.format(parallel_mode, stage_num, self._rank_size, self._rank_id)
  1236. )
  1237. cluster_analyse_file_path = validate_and_normalize_path(cluster_analyse_file_path)
  1238. try:
  1239. with open(cluster_analyse_file_path, 'w') as file_handle:
  1240. csv_writer = csv.writer(file_handle)
  1241. if is_pipeline_parallel:
  1242. header = ['computation_time', 'communication_alone_time', 'stage_time',
  1243. 'receive_alone_time', 'collective_communication_alone_time']
  1244. zip_metrices = zip(metrices[0], metrices[1], metrices[2], metrices[3], metrices[4])
  1245. else:
  1246. header = ['computation_time', 'communication_alone_time']
  1247. zip_metrices = zip(metrices[0], metrices[1])
  1248. csv_writer.writerow(header)
  1249. for row_data in zip_metrices:
  1250. row_data = [round(val, 4) for val in row_data]
  1251. csv_writer.writerow(row_data)
  1252. os.chmod(cluster_analyse_file_path, stat.S_IREAD | stat.S_IWRITE)
  1253. except (IOError, OSError) as err:
  1254. logger.warning(f'Failed to save {cluster_analyse_file_path}. {err}')
  1255. raise ProfilerIOException
  1256. def _compute_time_inside_step(self, metric_timeline, step_time_list):
  1257. """Compute per step time of metric_timeline."""
  1258. per_step_time_list = []
  1259. step = 0
  1260. cur_step_metric_time = 0
  1261. step_end_time = step_time_list[step][self._start_time_idx] + \
  1262. step_time_list[step][self._duration_idx]
  1263. for time_item in metric_timeline:
  1264. start_time = time_item[self._start_time_idx]
  1265. if start_time > step_end_time:
  1266. per_step_time_list.append(cur_step_metric_time)
  1267. step += 1
  1268. step_end_time = step_time_list[step][self._start_time_idx] + \
  1269. step_time_list[step][self._duration_idx]
  1270. cur_step_metric_time = 0
  1271. cur_step_metric_time += time_item[self._duration_idx]
  1272. per_step_time_list.append(cur_step_metric_time)
  1273. return per_step_time_list
  1274. def _get_merged_time_list(self, time_list, get_interval_time=False, display_name="computation_op"):
  1275. """
  1276. Get merged time segment list.
  1277. The process of merge is, for example, there is a list [[1,5], [2,6], [7,8]],
  1278. each items in this list contains a start_time and end_time,
  1279. the merged result is [[1,6], [7,8]].
  1280. """
  1281. time_merged_segment_list = []
  1282. tid = self._tid_dict[display_name][0]
  1283. pid = self._tid_dict[display_name][1]
  1284. for time_item in time_list:
  1285. time_segment = list(map(float, time_item[self._start_time_idx:self._duration_idx + 1]))
  1286. time_segment[1] += time_segment[0]
  1287. if not time_merged_segment_list or \
  1288. time_segment[0] > time_merged_segment_list[-1]:
  1289. time_merged_segment_list.extend(time_segment)
  1290. else:
  1291. time_merged_segment_list[-1] = max(
  1292. time_merged_segment_list[-1],
  1293. time_segment[1]
  1294. )
  1295. # merged_display_list data used for ui page.
  1296. merged_display_list = [
  1297. [display_name, tid, time_merged_segment_list[i * 2],
  1298. time_merged_segment_list[i * 2 + 1] - time_merged_segment_list[i * 2], pid]
  1299. for i in range(len(time_merged_segment_list) // 2)
  1300. ]
  1301. if get_interval_time:
  1302. time_merged_segment_list = time_merged_segment_list[1:-1]
  1303. # merged_res_list data used to compute overlap with other time_list.
  1304. merged_res_list = [
  1305. [display_name, tid, time_merged_segment_list[i * 2], time_merged_segment_list[i * 2 + 1], pid]
  1306. for i in range(len(time_merged_segment_list) // 2)
  1307. ]
  1308. # interval_display_list is interval time used for ui page.
  1309. interval_display_list = [
  1310. [display_name, tid, time_merged_segment_list[i * 2],
  1311. time_merged_segment_list[i * 2 + 1] - time_merged_segment_list[i * 2], pid]
  1312. for i in range(len(time_merged_segment_list) // 2)
  1313. ]
  1314. return merged_res_list, interval_display_list, merged_display_list
  1315. def _get_intersection_time(self, first_time_list, second_time_list,
  1316. display_name="communication_not_overlapped"):
  1317. """Get intersection time of two time list."""
  1318. first_list_idx, second_list_idx = 0, 0
  1319. first_list_len = len(first_time_list)
  1320. second_list_len = len(second_time_list)
  1321. intersection_segment_display_list = []
  1322. while first_list_idx < first_list_len and \
  1323. second_list_idx < second_list_len:
  1324. intersection_start = max(
  1325. first_time_list[first_list_idx][self._start_time_idx],
  1326. second_time_list[second_list_idx][self._start_time_idx]
  1327. )
  1328. intersection_end = min(
  1329. first_time_list[first_list_idx][self._duration_idx],
  1330. second_time_list[second_list_idx][self._duration_idx]
  1331. )
  1332. if intersection_start < intersection_end:
  1333. intersection_segment_display_list.append(
  1334. [display_name, self._tid_dict[display_name][0],
  1335. intersection_start, intersection_end - intersection_start, self._tid_dict[display_name][1]]
  1336. )
  1337. if first_time_list[first_list_idx][self._duration_idx] >= \
  1338. second_time_list[second_list_idx][self._duration_idx]:
  1339. second_list_idx += 1
  1340. else:
  1341. first_list_idx += 1
  1342. return intersection_segment_display_list
  1343. class CpuTimelineGenerator(GpuTimelineGenerator):
  1344. """Generate cpu Timeline data from file."""
  1345. _output_op_execute_time_file_path = "cpu_op_execute_timestamp_{}.txt"
  1346. def _get_and_validate_path(self, file_name):
  1347. """Generate op or activity file path from file name, and validate this path."""
  1348. file_path = os.path.join(
  1349. self._profiling_dir,
  1350. file_name.format(self._device_id)
  1351. )
  1352. file_path = validate_and_normalize_path(file_path)
  1353. return file_path
  1354. def load_cpu_op_data(self):
  1355. """Load cpu operator data from file"""
  1356. op_file_path = self._get_and_validate_path(
  1357. self._output_op_execute_time_file_path)
  1358. timeline_list = []
  1359. if not os.path.exists(op_file_path):
  1360. logger.info("No cpu operator info.")
  1361. return timeline_list
  1362. timeline_list = self._load_op_data(op_file_path)
  1363. factor_ms_to_us = 1e-3
  1364. for time_item in timeline_list:
  1365. time_item[self._duration_idx] = float(time_item[self._duration_idx]) / factor_ms_to_us
  1366. return timeline_list
  1367. def get_timeline_data(self):
  1368. """Get timeline data from file."""
  1369. timeline_list = self.load_cpu_op_data()
  1370. factor_ns_to_ms = 1e6
  1371. factor_us_to_ms = 1e3
  1372. for time_item in timeline_list:
  1373. time_item[self._start_time_idx] = float(time_item[self._start_time_idx]) / factor_ns_to_ms
  1374. time_item[self._duration_idx] = float(time_item[self._duration_idx]) / factor_us_to_ms
  1375. return timeline_list