You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

integrator.py 41 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997
  1. # Copyright 2020-2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """The integrator for integrating parsed profiling files."""
  16. import csv
  17. import json
  18. import os
  19. import stat
  20. from decimal import Decimal
  21. from mindspore import log as logger
  22. from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException, \
  23. ProfilerFileNotFoundException, ProfilerRawFileException, ProfilerParamValueErrorException
  24. from mindspore.profiler.common.util import query_latest_trace_time_file, to_int, to_millisecond
  25. from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
  26. from mindspore.profiler.parser.container import TimelineContainer
  27. SIZE_LIMIT_DEFAULT = 20 * 1024 * 1024 # 20MB
  28. class Integrator:
  29. """
  30. The integrator for integrating parsed profiling files.
  31. Args:
  32. profiling_dir (str): The directory where the parsed profiling files are
  33. located.
  34. device_id (str): The device ID.
  35. """
  36. _file_name_aicore_detail_time = 'output_op_compute_time_{}.txt'
  37. _file_name_aicpu_time = 'output_data_preprocess_aicpu_{}.txt'
  38. _file_name_framework = 'framework_raw_{}.csv'
  39. _header_aicore_type = ['op_type', 'execution_time', 'execution_frequency',
  40. 'percent']
  41. _header_aicore_detail = ['full_op_name', 'execution_time']
  42. _header_aicpu = ['serial_number', 'op_type', 'total_time', 'dispatch_time',
  43. 'run_start', 'run_end']
  44. _file_name_aicore_type_time = 'aicore_intermediate_{}_type.csv'
  45. _file_name_aicore_detail_info = 'aicore_intermediate_{}_detail.csv'
  46. _col_names_detail = ['op_name', 'op_type', 'avg_execution_time', 'subgraph', 'full_op_name', 'op_info']
  47. _none_filter_condition_key = ['is_display_detail', 'is_display_full_op_name']
  48. _none_sort_col_names = ['op_info']
  49. _aicore_data = []
  50. _aicore_detail_data = []
  51. _aicore_trace_data = []
  52. def __init__(self, profiling_dir, device_id):
  53. self._profiling_dir = profiling_dir
  54. self._device_id = device_id
  55. self._op_time_cache = {}
  56. self._total_time = Decimal('0.0')
    def integrate(self):
        """Integrate the parsed profiling files."""
        # Detail parsing must run first: it fills _op_time_cache and
        # _total_time, which _parse_aicore_type_time aggregates.
        self._parse_aicore_detail_time()
        self._parse_aicore_type_time()
        self._parse_aicpu_time()

    def get_aicore_data(self):
        """Load (if needed) and return the AICORE operator-type summary rows."""
        self._aicore_data_load()
        return self._aicore_data

    def get_aicore_detail_data(self):
        """Load (if needed) and return the AICORE operator detail rows."""
        self._aicore_detail_data_load()
        return self._aicore_detail_data

    def get_aicore_trace_data(self):
        """Load (if needed) and return the AICORE step trace rows."""
        self._aicore_trace_data_load()
        return self._aicore_trace_data

    def query_for_all_reduce(self):
        """Query and return the all-reduce info collected from the step trace."""
        return self._query_for_all_reduce()

    def query_and_sort_by_op_type(self, filter_condition, op_type_order):
        """Query AICORE detail data, filtered and sorted by operator type."""
        return self._query_and_sort_by_op_type(filter_condition, op_type_order)
  75. def _parse_aicore_type_time(self):
  76. """Parse the parsed AICORE operator type file."""
  77. framework_file = os.path.join(
  78. self._profiling_dir,
  79. self._file_name_framework.format(self._device_id)
  80. )
  81. framework_file = validate_and_normalize_path(framework_file)
  82. if not os.path.isfile(framework_file):
  83. return
  84. op_name_type_cache = {}
  85. with open(framework_file, 'r') as src_file:
  86. csv_reader = csv.reader(src_file)
  87. _ = next(csv_reader)
  88. for row in csv_reader:
  89. op_name_type_cache[row[3]] = row[5]
  90. op_type_time_cache = {}
  91. for full_op_name, op_time in self._op_time_cache.items():
  92. op_type = op_name_type_cache.get(full_op_name)
  93. if op_type_time_cache.get(op_type) is None:
  94. op_type_time_cache[op_type] = [op_time, 1]
  95. else:
  96. op_type_time_cache[op_type][0] += op_time
  97. op_type_time_cache[op_type][1] += 1
  98. op_type_file_name = 'aicore_intermediate_' + self._device_id + '_type.csv'
  99. op_type_file_path = os.path.join(self._profiling_dir, op_type_file_name)
  100. with open(op_type_file_path, 'w') as type_file:
  101. csv_writer = csv.writer(type_file)
  102. csv_writer.writerow(self._header_aicore_type)
  103. for op_type, op_type_time_info in op_type_time_cache.items():
  104. type_info = [
  105. op_type, op_type_time_info[0], op_type_time_info[1],
  106. round((op_type_time_info[0] / self._total_time) * 100, 2)
  107. ]
  108. csv_writer.writerow(type_info)
  109. def _parse_aicore_detail_time(self):
  110. """Parse the parsed AICORE operator time file."""
  111. aicore_detail_file = os.path.join(
  112. self._profiling_dir,
  113. self._file_name_aicore_detail_time.format(self._device_id)
  114. )
  115. aicore_detail_file = validate_and_normalize_path(aicore_detail_file)
  116. if not os.path.isfile(aicore_detail_file):
  117. return
  118. op_detail_file_name = 'aicore_intermediate_' + self._device_id + '_detail.csv'
  119. op_detail_file_path = os.path.join(
  120. self._profiling_dir, op_detail_file_name
  121. )
  122. with open(aicore_detail_file, 'r') as src_file:
  123. row = src_file.readline()
  124. if row.startswith('op_name'):
  125. _ = src_file.readline()
  126. elif row.startswith('====='):
  127. _ = src_file.readline()
  128. _ = src_file.readline()
  129. else:
  130. return
  131. with open(op_detail_file_path, 'w') as detail_file:
  132. csv_writer = csv.writer(detail_file)
  133. csv_writer.writerow(self._header_aicore_detail)
  134. while True:
  135. row = src_file.readline()
  136. if not row:
  137. break
  138. op_infos = row.split()
  139. if op_infos[0] == 'total':
  140. self._total_time = Decimal(op_infos[2])
  141. continue
  142. self._op_time_cache[op_infos[0]] = Decimal(op_infos[1])
  143. csv_writer.writerow([op_infos[0], op_infos[1]])
    def _parse_aicpu_time(self):
        """Parse the parsed AICPU operator time file."""
        aicpu_file = os.path.join(
            self._profiling_dir,
            self._file_name_aicpu_time.format(self._device_id)
        )
        aicpu_file = validate_and_normalize_path(aicpu_file)
        if not os.path.isfile(aicpu_file):
            return
        save_file_name = 'aicpu_intermediate_' + self._device_id + '.csv'
        save_file_path = os.path.join(self._profiling_dir, save_file_name)
        with open(aicpu_file, 'r') as src_file:
            row = src_file.readline()
            # Bail out unless the file starts with the expected header.
            if not row.startswith('serial_number'):
                return
            with open(save_file_path, 'w') as save_file:
                csv_writer = csv.writer(save_file)
                csv_writer.writerow(self._header_aicpu)
                while True:
                    row = src_file.readline()
                    if not row:
                        break
                    infos = row.split()
                    if infos[0] == 'AI':
                        # Skip rows whose first token is 'AI' — presumably
                        # "AI CPU ..." summary lines; TODO confirm format.
                        continue
                    csv_writer.writerow(infos)
    def _aicore_data_load(self):
        """Load data according to the parsed AICORE operator types file."""
        op_type_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_aicore_type_time.format(self._device_id)
        )
        op_type_file_path = validate_and_normalize_path(op_type_file_path)
        if not os.path.isfile(op_type_file_path):
            logger.warning('The file <%s> does not exist.', op_type_file_path)
            return
        with open(op_type_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            _ = next(csv_reader)  # skip the header row
            for info in csv_reader:
                # Row layout matches _header_aicore_type:
                # [op_type, execution_time, execution_frequency, percent]
                self._aicore_data.append([info[0], float(info[1]), int(info[2]), float(info[3])])
    def _aicore_detail_data_load(self):
        """Load data according to the parsed AICORE operator file.

        Joins the intermediate detail CSV with the framework raw CSV (by
        full op name) and appends rows shaped like `_col_names_detail`
        to `self._aicore_detail_data`.
        """
        op_detail_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_aicore_detail_info.format(self._device_id)
        )
        framework_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_framework.format(self._device_id)
        )
        op_detail_file_path = validate_and_normalize_path(op_detail_file_path)
        framework_file_path = validate_and_normalize_path(framework_file_path)
        if not os.path.isfile(op_detail_file_path):
            logger.warning('The file <%s> does not exist.', op_detail_file_path)
            return
        if not os.path.isfile(framework_file_path):
            logger.warning('The file <%s> does not exist.', framework_file_path)
            return
        # Index framework rows by full op name; column 7 (op_info) is stored
        # as a parsed dict, or None when that column is empty.
        framework_infos = dict()
        with open(framework_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            _ = next(csv_reader)  # skip the header row
            for info in csv_reader:
                framework_infos[info[3]] = [
                    info[3], info[4], info[5], info[6], json.loads(info[7]) if info[7] else None]
        with open(op_detail_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            _ = next(csv_reader)  # skip the header row
            for info in csv_reader:
                # NOTE(review): if a detail row's op name is missing from the
                # framework file, framework_info is None and the subscription
                # below raises TypeError — presumably both intermediate files
                # are always generated from the same run; verify.
                framework_info = framework_infos.get(info[0])
                self._aicore_detail_data.append(
                    [
                        framework_info[1], framework_info[2], float(info[1]),
                        framework_info[3], framework_info[0], framework_info[4]
                    ]
                )
        del framework_infos
    def _aicore_trace_data_load(self):
        """Load data from the latest parsed step trace time file.

        Raises:
            ProfilerFileNotFoundException: If no parsed step trace time file
                is found in the profiling directory.
        """
        file_path = query_latest_trace_time_file(self._profiling_dir, int(self._device_id))
        if not file_path:
            logger.error("Failed to find parsed trace time file.")
            raise ProfilerFileNotFoundException('parsed step trace time file')
        file_path = validate_and_normalize_path(file_path)
        with open(file_path, 'r') as handle:
            csv_reader = csv.reader(handle)
            # First row is the column header; consumers of the data skip the
            # final row (see _query_for_all_reduce), hence the "- 1" below.
            self.__column__ = next(csv_reader)
            self._aicore_trace_data = list(csv_reader)
        self._size = len(self._aicore_trace_data) - 1
        self._load_point_info()
  235. def _load_point_info(self):
  236. """Load point info."""
  237. file_path = os.path.join(self._profiling_dir, 'step_trace_point_info.json')
  238. file_path = validate_and_normalize_path(file_path)
  239. if os.path.isfile(file_path):
  240. with open(file_path, 'r', encoding='utf-8') as file:
  241. try:
  242. self._point_info = json.load(file)
  243. except (json.JSONDecodeError, TypeError) as err:
  244. logger.warning(err)
  245. raise ProfilerRawFileException('Fail to parse point info file.')
    def _query_for_all_reduce(self):
        """
        Query for all reduce info.

        Returns:
            list[dict], reduce information. Each item is the reduce info for one step.
            The reduce info is format like:
            {stream_id: List[Tuple(start_point, end_point, duration, field_name)]}.
        """
        self._aicore_trace_data_load()
        reduce_infos = []
        # Skip the final trace row (not a regular step row).
        for row_info in self._aicore_trace_data[:-1]:
            # 'systime' keeps the raw values (no millisecond conversion).
            row_info_dict = self._get_info_dict_from_row_data(row_info, 'systime')
            reduce_info = self._sort_reduce_by_time(row_info_dict)
            if reduce_info:
                reduce_infos.extend(reduce_info)
        return reduce_infos
  262. def _get_info_dict_from_row_data(self, row_info, time_type):
  263. """
  264. Get step info in dict format.
  265. Args:
  266. row_info (list[str]): Step info, the value is corresponding to `__column__`.
  267. time_type (str): The value type. `systime` keeps the original value.
  268. `realtime` transforms the value in millisecond. Default: `realtime`.
  269. Returns:
  270. dict, step trace information. The key is in `__column__`.
  271. """
  272. row_info_dict = {}
  273. for key, value in zip(self.__column__, row_info):
  274. if key == 'step_num':
  275. continue
  276. value = to_int(value, key)
  277. row_info_dict[key] = to_millisecond(value) if time_type == 'realtime' else value
  278. return row_info_dict
    def _sort_reduce_by_time(self, row_info_dict):
        """
        Sort reduce info by time.

        Args:
            row_info_dict (dict): Step trace information.

        Returns:
            list, including the all reduce info sorted by start time only.
            [
                [reduce_field, stream_id, reduce_start, reduce_duration],
                [...],
                [...]
            ]
        """
        factor = 1e5  # convert time unit from 10ns to 1ms
        reduce_pid = 10000
        reduce_info = []
        # Reduce columns start with 'stream_'; each has a companion
        # '<field>_start_point' column carrying the start timestamp.
        reduce_fields = [field_name for field_name in self.__column__
                         if field_name.startswith('stream_') and not field_name.endswith('point')]
        for reduce_field in reduce_fields:
            reduce_start = row_info_dict.get(reduce_field + '_start_point')
            reduce_start = reduce_start / factor \
                if reduce_start else 0
            reduce_duration = row_info_dict.get(reduce_field)
            reduce_duration = reduce_duration / factor if reduce_duration else 0
            # Both values must be present (and non-zero) for a valid event.
            if not (reduce_start and reduce_duration):
                logger.info("Reduce event missing value.")
                continue
            # Field name looks like 'stream_<id>_...'; token 1 is the stream id.
            cur_stream_id = reduce_field.split('_', 2)[1]
            reduce_meta = [reduce_field, int(cur_stream_id), reduce_start,
                           reduce_duration, reduce_pid]
            reduce_info.append(reduce_meta)
        return reduce_info
    def _query_and_sort_by_op_type(self, filter_condition, op_type_order: list):
        """
        Query the AICORE operator detail information by `filter_condition`,
        and sort by `op_type_order` and execution time.

        Args:
            filter_condition (dict): The filter condition.
            op_type_order (list[str]): The name of the operator type in order.

        Returns:
            dict, The results are filtered and sorted.
        """
        self._aicore_detail_data_load()
        if filter_condition is None:
            filter_condition = {}
        self._filter(filter_condition)
        # Group the filtered rows by operator type (index 1 of each row),
        # keeping only types present in op_type_order.
        type_detail_cache = {}
        for detail_info in self._result:
            op_type = detail_info[1]
            if op_type not in op_type_order:
                continue
            infos = type_detail_cache.get(op_type)
            if infos:
                infos.append(detail_info)
            else:
                type_detail_cache[op_type] = [detail_info]
        # Emit groups in the caller-requested type order; within each group,
        # sort by average execution time (index 2), descending.
        result = []
        for op_type in op_type_order:
            detail_infos = type_detail_cache.get(op_type)
            if detail_infos is None:
                continue
            detail_infos.sort(key=lambda item: item[2], reverse=True)
            result.extend(detail_infos)
        return {
            'col_name_detail': self._display_col_names_detail,
            'object': result
        }
    def _filter(self, filter_condition):
        """
        Filter the profiling data according to the filter condition.

        Stores the filtered (and optionally column-projected) rows in
        `self._result`.

        Args:
            filter_condition (dict): The filter condition.
        """
        def _inner_filter(item: list):
            return self._default_filter(item, filter_condition)

        def _inner_map(item: list):
            # Always keep the first four columns; append full_op_name and/or
            # op_info according to the display flags captured below.
            inner_item = item[0:4]
            if is_display_full_op_name:
                inner_item.append(item[4])
            if is_display_detail:
                inner_item.append(item[5])
            return inner_item

        is_display_detail = filter_condition.get('is_display_detail', True)
        is_display_full_op_name = filter_condition.get(
            'is_display_full_op_name', True
        )
        self._set_display_col_name(is_display_detail, is_display_full_op_name)
        if is_display_detail and is_display_full_op_name:
            # All columns displayed: no projection needed.
            self._result = list(filter(_inner_filter, self._aicore_detail_data))
        else:
            self._result = list(
                map(_inner_map, filter(_inner_filter, self._aicore_detail_data))
            )
  372. def _default_filter(self, item, condition):
  373. """
  374. The default filter method.
  375. Args:
  376. item (list[Union[str, float, int]]): A piece of data to be filtered.
  377. condition (dict): The filter condition.
  378. Returns:
  379. bool, `True` if the item is satisfied.
  380. """
  381. for condition_key, condition_value in condition.items():
  382. if condition_key in self._none_filter_condition_key:
  383. continue
  384. if condition_key in self._col_names_detail:
  385. index = self._col_names_detail.index(condition_key)
  386. actual_value = item[index]
  387. for exp_key, exp_value in condition_value.items():
  388. if not self._is_match_condition(
  389. exp_key, exp_value, actual_value):
  390. return False
  391. return True
  392. def _is_match_condition(self, exp_key, exp_value, actual_value):
  393. """
  394. Check whether the actual value meets the expect condition.
  395. Args:
  396. exp_key (str): Expect key of the condition.
  397. exp_value (str): Expect value.
  398. actual_value (str): Actual value.
  399. Returns:
  400. bool, `True` if the actual meets the expect condition, else `False`.
  401. """
  402. if exp_key == 'in':
  403. if actual_value not in exp_value:
  404. return False
  405. elif exp_key == 'not_in':
  406. if actual_value in exp_value:
  407. return False
  408. elif exp_key == 'partial_match_str_in':
  409. for partial_match_str in exp_value:
  410. if partial_match_str in actual_value:
  411. return True
  412. return False
  413. else:
  414. return False
  415. return True
  416. def _set_display_col_name(self, is_display_detail, is_display_full_op_name):
  417. """
  418. Set the display column name according to the filter condition.
  419. Args:
  420. is_display_detail (bool): Whether to display the detailed operator
  421. information.
  422. is_display_full_op_name (bool): Whether to display the operator full
  423. name.
  424. """
  425. self._display_col_names_detail = self._col_names_detail[0:4]
  426. if is_display_full_op_name:
  427. self._display_col_names_detail.append(self._col_names_detail[4])
  428. if is_display_detail:
  429. self._display_col_names_detail.append(self._col_names_detail[5])
  430. class BaseTimelineGenerator:
  431. """
  432. Analyse timeline data from file.
  433. """
  434. __col_names__ = ['op_name', 'stream_id', 'start_time', 'duration']
  435. _output_timeline_data_file_path = 'output_timeline_data_{}.txt'
  436. _min_cycle_counter_file_path = 'min_cycle_counter_{}.txt'
  437. _timeline_meta = []
  438. _timeline_summary = {
  439. 'total_time': 0,
  440. 'num_of_streams': 0,
  441. 'num_of_ops': 0,
  442. 'op_exe_times': 0
  443. }
    # The three methods below are no-op hooks; GpuTimelineGenerator overrides
    # them with concrete implementations.
    def _load_timeline_data(self):
        """Load timeline data from file."""

    def _parse_timeline_data(self):
        """Parse timeline data."""

    def init_timeline(self):
        """Init timeline metadata, adding all collected info."""
    def write_timeline(self, size_limit=SIZE_LIMIT_DEFAULT):
        """Write the collected timeline metadata to the display file.

        Args:
            size_limit (int): Maximum size of the output file, in bytes.
                Default: SIZE_LIMIT_DEFAULT (20MB).
        """
        # Write timeline to file.
        logger.info('Writing timeline file...')
        self.write_timeline_to_json_by_limitation(size_limit)
        logger.info('Finished file writing!')
    def write_timeline_to_json_by_limitation(self, size_limit):
        """Write timeline to json by limitation.

        Items are dumped one at a time and writing stops once the file grows
        past `size_limit` bytes, so the output stays a valid (possibly
        truncated) JSON array.

        Raises:
            ProfilerIOException: If writing the display file fails.
        """
        display_filename = self._display_filename.format(self._device_id)
        display_file_path = os.path.join(
            self._profiling_dir,
            display_filename
        )
        display_file_path = validate_and_normalize_path(display_file_path)
        length = len(self._timeline_meta)
        try:
            with open(display_file_path, 'w') as json_file:
                json_file.write('[')
                for index, item in enumerate(self._timeline_meta):
                    json.dump(item, json_file)
                    # NOTE(review): getsize reads the on-disk size, which lags
                    # the write buffer, so the limit check is approximate.
                    file_size = os.path.getsize(display_file_path)
                    if file_size > size_limit:
                        break
                    if index == length - 1:
                        break
                    json_file.write(',')
                json_file.write(']')
            os.chmod(display_file_path, stat.S_IREAD | stat.S_IWRITE)
        except (IOError, OSError) as err:
            logger.error('Error occurred when write timeline display file: %s', err)
            raise ProfilerIOException
    def write_timeline_summary(self):
        """Write timeline summary to json.

        Raises:
            ProfilerIOException: If writing the summary file fails.
        """
        timeline_summary_file_path = os.path.join(
            self._profiling_dir,
            self._timeline_summary_filename.format(self._device_id)
        )
        timeline_summary_file_path = validate_and_normalize_path(timeline_summary_file_path)
        try:
            with open(timeline_summary_file_path, 'w') as json_file:
                json.dump(self._timeline_summary, json_file)
            os.chmod(timeline_summary_file_path, stat.S_IREAD | stat.S_IWRITE)
        except (IOError, OSError) as err:
            logger.error('Error occurred when write timeline summary file: %s', err)
            raise ProfilerIOException
  495. @staticmethod
  496. def _update_num_of_streams(timeline, stream_count_dict):
  497. """Update number of streams."""
  498. stream_id = timeline[1]
  499. if stream_id not in stream_count_dict.keys():
  500. stream_count_dict[stream_id] = 1
  501. else:
  502. stream_count_dict[stream_id] += 1
    def get_min_cycle_counter(self):
        """
        Get minimum cycle counter.

        Returns:
            float, the minimum value of the cycle counter (0 when no counter
            was recorded or the file holds 'inf').

        Raises:
            ProfilerIOException: If reading the counter file fails.
        """
        file_path = os.path.join(
            self._profiling_dir,
            self._min_cycle_counter_file_path.format(self._device_id)
        )
        file_path = validate_and_normalize_path(file_path)
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    min_cycle_counter = f_obj.read()
                    # 'inf' in the file maps to 0 — presumably written when
                    # no counter was measured; TODO confirm with the writer.
                    min_cycle_counter = float(min_cycle_counter) \
                        if not min_cycle_counter == 'inf' else 0
            except (IOError, OSError) as err:
                logger.error('Error occurred when read minimum cycle counter: %s', err)
                raise ProfilerIOException
        else:
            min_cycle_counter = 0
            logger.info("No min cycle counter recorded.")
        return min_cycle_counter
  527. def _add_framework_info(self, framework_obj_list):
  528. """
  529. Add framework info into timeline metadata.
  530. Args:
  531. framework_obj_list (list): The framework metadata.
  532. """
  533. logger.debug('Start adding framework info into timeline...')
  534. # Get the framework info that will be written into timeline.
  535. framework_info_dict = {}
  536. for framework_obj in framework_obj_list:
  537. op_name = framework_obj[0]
  538. op_type = framework_obj[1]
  539. op_full_name = framework_obj[4]
  540. op_info = framework_obj[5]
  541. framework_info_dict[op_full_name] = {
  542. 'name': op_name,
  543. 'args': {
  544. 'type': op_type,
  545. 'fullname': op_full_name
  546. }
  547. }
  548. framework_info_dict[op_full_name]['args'].update(op_info)
  549. # Insert framework info into timeline.
  550. for timeline_item in self._timeline_meta:
  551. op_full_name = timeline_item.get('name')
  552. framework_item = framework_info_dict.get(op_full_name)
  553. if framework_item:
  554. timeline_item['name'] = framework_item.get('name')
  555. timeline_item['args'] = framework_item.get('args')
  556. logger.debug('Finished adding framework info into timeline...')
class GpuTimelineGenerator(BaseTimelineGenerator):
    """Generate gpu Timeline data from file."""
    _display_filename = 'gpu_timeline_display_{}.json'
    _timeline_summary_filename = 'gpu_timeline_summary_{}.json'
    _output_op_execute_time_file_path = "op_execute_timestamp_{}.txt"
    _output_activity_execute_time_file_path = "activity_execute_timestamp_{}.txt"
    _output_gpu_activity_info_file_path = "gpu_activity_data_{}.csv"
    # Column keys for per-activity args; replaced in _load_activity_data.
    _activity_keys_list = []

    def __init__(self, profiling_dir, device_id):
        self._profiling_dir = profiling_dir
        self._device_id = device_id
        # Per-instance copies shadow the mutable class-level defaults
        # inherited from BaseTimelineGenerator.
        self._timeline_meta = []
        self._timeline_summary = {
            'total_time': 0,
            'num_of_streams': 0,
            'num_of_ops': 0,
            'op_exe_times': 0
        }
    def _get_and_validate_path(self, file_name):
        """Generate op or activity file path from file name, and validate this path.

        Args:
            file_name (str): File-name template with a '{}' slot for the
                device id.

        Returns:
            str, the normalized path.

        Raises:
            ProfilerFileNotFoundException: If the file does not exist.
        """
        file_path = os.path.join(
            self._profiling_dir,
            file_name.format(self._device_id)
        )
        file_path = validate_and_normalize_path(file_path)
        if not os.path.exists(file_path):
            logger.error(f"Failed to find parsed timeline file {file_path}.")
            raise ProfilerFileNotFoundException('parsed timeline file')
        return file_path
    def _parse_timeline_data(self, timeline, min_cycle_counter):
        """Parse one timeline row and append it to the timeline metadata.

        The produced dict uses Chrome-trace-event style keys
        (name/ph/tid/ts/dur/pid).

        Args:
            timeline (list): [op_name, stream_id, start_time, duration, ...];
                more than four elements marks an activity row whose extras
                become args.
            min_cycle_counter (float): Base timestamp subtracted from start
                times.
        """
        # factor to convert the time unit of start_time(ts) from 1ns to 1us for timeline display
        factor = 1000
        op_meta = TimelineContainer(timeline)
        timeline_dict = {}
        timeline_dict['name'] = op_meta.op_name
        timeline_dict['ph'] = 'X'
        timeline_dict['tid'] = op_meta.stream_id
        timeline_dict['ts'] = (op_meta.start_time - min_cycle_counter) / factor
        dur = op_meta.duration
        timeline_dict['dur'] = dur
        if op_meta.pid is None:
            timeline_dict['pid'] = int(self._device_id)
        else:  # AllReduce and AI CPU pid
            timeline_dict['pid'] = op_meta.pid
        if len(timeline) > 4:
            # len(timeline) > 4 refers to activity data, else op data.
            # Add args for activity data
            args_dict = {}
            for ix, value in enumerate(timeline[4:]):
                args_dict[self._activity_keys_list[ix]] = value
            timeline_dict['args'] = args_dict
        else:
            # Update total time of operator execution.
            self._timeline_summary['total_time'] += dur / factor
            self._timeline_summary['op_exe_times'] += 1
        self._timeline_meta.append(timeline_dict)
    def _load_timeline_data(self):
        """Load timeline data from file.

        Merges GPU op rows, GPU activity rows and (when available) CPU op
        rows, then sorts everything by start time.
        """
        op_file_path = self._get_and_validate_path(
            self._output_op_execute_time_file_path)
        activity_file_path = self._get_and_validate_path(
            self._output_activity_execute_time_file_path)
        activity_args_file_path = self._get_and_validate_path(
            self._output_gpu_activity_info_file_path)
        timeline_list = self._load_op_data(op_file_path) + \
            self._load_activity_data(activity_file_path, activity_args_file_path)

        # CpuTimelineGenerator is defined elsewhere in this module.
        cpu_timeline_generator = CpuTimelineGenerator(self._profiling_dir, self._device_id)
        cpu_timeline_list = cpu_timeline_generator.load_cpu_op_data()
        if cpu_timeline_list:
            # Shift CPU (host) timestamps onto the GPU clock before merging.
            self._clock_synchronize_to_gpu(cpu_timeline_list)
            timeline_list.extend(cpu_timeline_list)
        timeline_list.sort(key=lambda x: float(x[2]))  # x[2] is start_time
        return timeline_list
    def _clock_synchronize_to_gpu(self, timeline_list):
        """Synchronize the timestamp from device to host.

        Args:
            timeline_list (list): Timeline rows; column 2 (start time) is
                shifted in place by the host/GPU clock difference.

        Raises:
            ProfilerIOException: If the start-time file cannot be read.
        """
        start_time_file_path = os.path.join(self._profiling_dir, f"start_time_{self._device_id}.txt")
        try:
            with open(start_time_file_path) as f:
                lines = f.readlines()
                # Line 0 ends with the host monotonic start time, line 1 with
                # the GPU start time, each after a ':' separator.
                host_monotonic_start_time = int(lines[0].strip().split(':')[-1])
                gpu_start_time = int(lines[1].strip().split(':')[-1])
        except (IOError, OSError) as err:
            logger.error(f'Error occurred when read {start_time_file_path}: {err}')
            raise ProfilerIOException
        time_diff = gpu_start_time - host_monotonic_start_time
        start_time = 2  # index of the start-time column in each timeline row
        for idx, time_item in enumerate(timeline_list):
            timeline_list[idx][start_time] = int(time_item[start_time]) + time_diff
    def _load_op_data(self, op_file_path):
        """Load operator data from file.

        Each source line is ';'-separated; the last field holds
        space-separated "start,duration" pairs, and one output row is
        emitted per pair.

        Raises:
            ProfilerIOException: If reading the intermediate file fails.
        """
        op_timeline_list = []
        try:
            with open(op_file_path, 'r') as f_obj:
                for line in f_obj:
                    self._timeline_summary['num_of_ops'] += 1
                    op_list = line.strip('\n').strip().split(';')
                    time_arr = op_list[-1]
                    time_arr = time_arr.split(" ")
                    for time in time_arr:
                        time = time.split(",")
                        # First two fields + [start, duration] — presumably
                        # matching __col_names__; TODO confirm file format.
                        line_list = op_list[:2] + time
                        op_timeline_list.append(line_list)
        except (IOError, OSError) as err:
            logger.error('Error occurred when load operator timeline data intermediate file: %s', err)
            raise ProfilerIOException
        return op_timeline_list
    def _load_activity_data(self, activity_file_path, activity_args_file_path):
        """Load activity data from file.

        Also records the selected args column names in
        `self._activity_keys_list` for later use by `_parse_timeline_data`.

        Raises:
            ProfilerIOException: If reading either intermediate file fails.
        """
        activity_timeline_list = []
        try:
            args_dict = {}
            with open(activity_args_file_path, 'r') as args_file:
                csv_reader = csv.reader(args_file)
                keys_list = next(csv_reader)
                # keys_list [name, type, op_full_name, stream_id, block_dim, grid_dim, ...]
                self._activity_keys_list = keys_list[1:3] + keys_list[4:6]
                for info in csv_reader:
                    args_dict[info[0]] = info[1:3] + info[4:6]
            with open(activity_file_path, 'r') as f_obj:
                for line in f_obj:
                    line_list = line.strip('\n').split(';')
                    # concat activity args info.
                    # NOTE(review): an activity name absent from the args CSV
                    # raises KeyError here — presumably both files come from
                    # the same run; verify.
                    line_list += args_dict[line_list[0]]
                    activity_timeline_list.append(line_list)
        except (IOError, OSError) as err:
            logger.error('Error occurred when load activity timeline data intermediate file: %s', err)
            raise ProfilerIOException
        return activity_timeline_list
  686. def init_timeline(self):
  687. """Init timeline metadata, adding all collected info."""
  688. timeline_list = self._load_timeline_data()
  689. # Init a dict for counting the num of streams.
  690. stream_count_dict = {}
  691. for timeline in timeline_list:
  692. self._parse_timeline_data(timeline, 0)
  693. # Updating the collection of streams.
  694. if len(timeline) == 4:
  695. self._update_num_of_streams(timeline, stream_count_dict)
  696. # Update timeline summary info
  697. self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())
  698. def check_op_name(self, op_name):
  699. """
  700. Check whether the operator name exists.
  701. Args:
  702. op_name (str): The operator name or operator name prefix.
  703. Returns:
  704. bool, `True` if the operator name does exist, else `False`.
  705. """
  706. if not op_name:
  707. raise ProfilerParamValueErrorException('The op_name should exist.')
  708. for op_time_info in self._timeline_meta:
  709. full_op_name = op_time_info['name']
  710. if full_op_name and full_op_name.startswith(op_name):
  711. return True
  712. return False
class AscendTimelineGenerator(BaseTimelineGenerator):
    """Generate ascend Timeline data from file."""
    # Output filename templates; '{}' is filled with the device id.
    _display_filename = 'ascend_timeline_display_{}.json'
    _timeline_summary_filename = 'ascend_timeline_summary_{}.json'

    def __init__(self, profiling_dir, device_id):
        """
        Args:
            profiling_dir (str): Directory containing the profiling output files.
            device_id: Device id used to resolve per-device file names.
        """
        # NOTE(review): super().__init__() is not called; presumably
        # BaseTimelineGenerator supplies _timeline_meta, _timeline_summary and
        # _output_timeline_data_file_path used below — confirm against the base class.
        self._profiling_dir = profiling_dir
        self._device_id = device_id

    def _load_timeline_data(self):
        """Load timeline data from file.

        Returns:
            list[list[str]], rows of the parsed timeline file, each split on ','.

        Raises:
            ProfilerFileNotFoundException: If the parsed timeline file is missing.
            ProfilerIOException: If the file cannot be read.
        """
        file_path = os.path.join(
            self._profiling_dir,
            self._output_timeline_data_file_path.format(self._device_id)
        )
        file_path = validate_and_normalize_path(file_path)
        if not os.path.exists(file_path):
            logger.error("Failed to find parsed timeline file.")
            raise ProfilerFileNotFoundException('parsed timeline file')
        timeline_list = []
        try:
            with open(file_path, 'r') as f_obj:
                for line in f_obj:
                    # Skip the header row, which begins with 'op_name'.
                    if not line.startswith('op_name'):
                        line_list = line.strip('\n').split(',')
                        timeline_list.append(line_list)
        except (IOError, OSError) as err:
            logger.error('Error occurred when read timeline intermediate file: %s', err)
            raise ProfilerIOException
        return timeline_list

    def _parse_timeline_data(self, timeline, min_cycle_counter):
        """Parse one timeline row into a trace-event dict appended to the metadata.

        Args:
            timeline (list): One timeline row, wrapped by TimelineContainer.
            min_cycle_counter (float): Zero point subtracted from start times.
        """
        # factor to convert the time unit from 1ms to 1us for timeline display
        factor = 1000
        op_meta = TimelineContainer(timeline)
        timeline_dict = {}
        timeline_dict['name'] = op_meta.op_name
        timeline_dict['ph'] = 'X'  # 'X' denotes a complete event in the trace-event format
        timeline_dict['tid'] = op_meta.stream_id
        timeline_dict['ts'] = (op_meta.start_time - min_cycle_counter) * factor
        dur = op_meta.duration * factor
        timeline_dict['dur'] = dur
        if op_meta.pid is None:
            timeline_dict['pid'] = int(self._device_id)
            # Update total time of operator execution.
            self._timeline_summary['total_time'] += dur
        else:  # AllReduce and AI CPU pid
            timeline_dict['pid'] = op_meta.pid
        self._timeline_meta.append(timeline_dict)

    def init_timeline(self, all_reduce_info, framework_info, aicpu_info, min_cycle_counter, source_path):
        """
        Init timeline metadata, adding all collected info.

        Args:
            all_reduce_info (list[list]): The metadata of AllReduce operator.
            framework_info (dict): The framework metadata.
            aicpu_info (dict): The metadata of AI CPU operator.
            min_cycle_counter (float): The minimum cycle counter of the timeline.
            source_path (str): Directory holding host_start/dev_start log files.
        """
        if min_cycle_counter == float('inf'):
            min_cycle_counter = 0
        logger.info('Initiating timeline...')
        timeline_list = self._load_timeline_data()
        cpu_timeline_generator = CpuTimelineGenerator(self._profiling_dir, self._device_id)
        cpu_timeline_list = cpu_timeline_generator.get_timeline_data()
        if cpu_timeline_list:
            # Device rows use the device clock; convert to host time before merging
            # with host-side (cpu) rows, then sort by start time (column 2).
            self._clock_synchronize_to_host(timeline_list, source_path)
            timeline_list.extend(cpu_timeline_list)
            timeline_list.sort(key=lambda x: float(x[2]))
        self._timeline_summary['op_exe_times'] = len(timeline_list)

        # Add AllReduce info to timeline temp list and sort by start time.
        if all_reduce_info:
            logger.debug('AllReduce info found. Start adding info into timeline...')
            timeline_list.extend(all_reduce_info)
            timeline_list.sort(key=lambda x: float(x[2]))

        # Add AI CPU data into timeline temp list and sort by start time.
        aicpu_data = aicpu_info.get('info')
        if aicpu_data:
            timeline_list.extend(aicpu_data)
            timeline_list.sort(key=lambda x: float(x[2]))
            self._timeline_summary['op_exe_times'] += aicpu_info.get('op_exe_times', 0)
            self._timeline_summary['num_of_streams'] += aicpu_info.get('num_of_streams', 0)
            self._timeline_summary['num_of_ops'] += aicpu_info.get('num_of_ops', 0)
            self._timeline_summary['total_time'] += aicpu_info.get('total_time', 0)

        # Init a dict for counting the num of streams.
        stream_count_dict = {}
        for timeline in timeline_list:
            self._parse_timeline_data(timeline, min_cycle_counter)
            # Updating the collection of streams.
            if len(timeline) == 4:
                self._update_num_of_streams(timeline, stream_count_dict)

        # Get framework metadata.
        # NOTE(review): framework_info.get('object') may be None if the key is
        # absent, which would raise here — presumably callers always supply it.
        framework_obj_list = framework_info.get('object')
        # The length of list is the number of operators.
        self._timeline_summary['num_of_ops'] += len(framework_obj_list)
        self._add_framework_info(framework_obj_list)
        logger.info('Finished adding info into timeline...')

        # Update timeline summary info
        self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())

    def _clock_synchronize_to_host(self, timeline_list, source_path):
        """Synchronize the timestamp from device to host.

        Rewrites column 2 (start time) of every row in ``timeline_list``
        in place, converting device time to host monotonic time in ms.

        Args:
            timeline_list (list[list]): Timeline rows to adjust in place.
            source_path (str): Directory holding host_start/dev_start logs.

        Raises:
            ProfilerIOException: If either log file cannot be read.
        """
        host_start_file_path = os.path.join(source_path, f"host_start.log.{self._device_id}")
        dev_start_file_path = os.path.join(source_path, f"dev_start.log.{self._device_id}")
        try:
            with open(host_start_file_path) as f:
                lines = f.readlines()
                # Third line carries 'label:value'; the value is host monotonic time.
                host_monotonic = int(lines[2].strip().split(':')[1])
        except (IOError, OSError) as err:
            logger.error('Error occurred when read host_start.log: %s', err)
            raise ProfilerIOException
        try:
            with open(dev_start_file_path) as f:
                lines = f.readlines()
                # Device counter value sampled at the same moment as host_monotonic.
                dev_cntvct = int(lines[2].strip().split(':')[1])
        except (IOError, OSError) as err:
            logger.error('Error occurred when read dev_start.log: %s', err)
            raise ProfilerIOException
        factor_ns_to_ms = 1e6       # nanoseconds per millisecond
        factor_ms_to_ten_ns = 1e5   # 10ns ticks per millisecond
        factor_ten_ns_to_ns = 10    # nanoseconds per 10ns tick
        start_time = 2              # column index of the start timestamp
        for idx, time_item in enumerate(timeline_list):
            # Start time (ms) -> device ticks; offset by the device counter at
            # sync point; express the result as host monotonic time in ms.
            # (assumes dev_cntvct is in 10ns units — TODO confirm)
            cycle_counter = int(float(time_item[start_time]) * factor_ms_to_ten_ns)
            host_monotonic_time = host_monotonic + (cycle_counter - dev_cntvct) * factor_ten_ns_to_ns
            timeline_list[idx][start_time] = host_monotonic_time / factor_ns_to_ms
class CpuTimelineGenerator(GpuTimelineGenerator):
    """Generate cpu Timeline data from file."""
    # Input filename template; '{}' is filled with the device id.
    _output_op_execute_time_file_path = "cpu_op_execute_timestamp_{}.txt"

    def _get_and_validate_path(self, file_name):
        """Generate op or activity file path from file name, and validate this path."""
        file_path = os.path.join(
            self._profiling_dir,
            file_name.format(self._device_id)
        )
        file_path = validate_and_normalize_path(file_path)
        return file_path

    def load_cpu_op_data(self):
        """Load cpu operator data from file.

        Returns:
            list, parsed cpu operator rows; empty when the file is absent.
        """
        op_file_path = self._get_and_validate_path(
            self._output_op_execute_time_file_path)
        timeline_list = []
        if not os.path.exists(op_file_path):
            # A missing file simply means no cpu operators were profiled.
            logger.info("No cpu operator info.")
            return timeline_list
        timeline_list = self._load_op_data(op_file_path)
        return timeline_list

    def get_timeline_data(self):
        """Get timeline data from file.

        Converts, in place, column 2 (start time) from ns to ms and
        column 3 (duration) from us to ms for every loaded row.
        """
        timeline_list = self.load_cpu_op_data()
        factor_ns_to_ms = 1e6   # nanoseconds per millisecond
        factor_us_to_ms = 1e3   # microseconds per millisecond
        start_time = 2          # column index of the start timestamp
        duration = 3            # column index of the duration
        for idx, time_item in enumerate(timeline_list):
            time_item[start_time] = float(time_item[start_time]) / factor_ns_to_ms
            time_item[duration] = float(time_item[duration]) / factor_us_to_ms
            timeline_list[idx] = time_item
        return timeline_list