
step_trace_parser.py

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The parser for step trace data."""
import csv
import json
import os
import stat
import struct
from collections import namedtuple
from decimal import Decimal
from abc import abstractmethod

from mindspore.profiler.common.exceptions.exceptions import ProfilerPathErrorException, \
    ProfilerIOException, ProfilerRawFileException
from mindspore import log
from mindspore.profiler.common.util import get_summary_for_step_trace
from mindspore.profiler.common.validator.validate_path import \
    validate_and_normalize_path

ProfilingHeadStruct = namedtuple(
    'ProfilingHeadStruct', ['mode', 'rptType', 'bufSize']
)

StepTraceStruct = namedtuple(
    'StepTraceStruct', ['timeStamp', 'index_id', 'model_id', 'stream_id', 'task_id', 'tag_id']
)
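
# Record layout, as inferred from the unpack calls in
# BaseStepTraceParser._get_next_step_trace below: each binary record starts with
# a 4-byte head unpacked as 'BBH' -> (mode, rptType, bufSize), followed by 4
# bytes of padding and a 30-byte payload unpacked as 'QQQHHH' ->
# (timeStamp, index_id, model_id, stream_id, task_id, tag_id). Only records with
# rptType == 10 are decoded as StepTraceStruct, and bufSize gives the offset to
# the next record.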


class BaseStepTraceParser:
    """
    The parser for step trace data.

    Args:
        input_dir (str): The directory that contains original step trace data.
        output_file_path (str): The output file path.
        job_id (int): The job id used to define the start of new step. Default: 0.
        skip_first_step (bool): Whether to skip the first step or not.
        is_training_mode (bool): Whether in training mode or not.
        is_gpu_kernel_async_launch (bool): Whether the GPU kernels are launched asynchronously or not.
    """

    def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False,
                 is_training_mode=True, is_gpu_kernel_async_launch=False):
        self._input_dir = input_dir
        self._output_path = output_file_path
        self._job_id = job_id
        self._skip_first_step = skip_first_step
        self._result = []
        self._header = []
        self._step_num = 0
        self._tag_map = {}
        self._is_training_mode = is_training_mode
        self._step_end_tag_id = 4
        self._is_gpu_kernel_async_launch = is_gpu_kernel_async_launch
        self._model_start_tag_id = 0
        self._model_end_tag_id = 1
        self._fp_tag_id = 2
        self._bp_tag_id = 3
        self._reduce_min_tag_id = 10000
        self._reduce_max_tag_id = 20000
        self._profiling_head_len = 4
        self._profiling_head_pad_len = 4
        self._st_data_len = 8 + 8 + 8 + 2 + 2 + 2

    @property
    def output_file(self):
        """The property of output file name."""
        file_name = self._output_path.rsplit('/', 2)
        return file_name[-1] if len(file_name) == 3 else ''

    def show(self):
        """Show the summary of step trace info."""
        summary_info = {}
        if self._result:
            summary_info = get_summary_for_step_trace(self._result[-1], self._header, self._is_training_mode)
            summary_info['total_steps'] = len(self._result) - 1
        print('\nStep trace summary info (unit: syscnt):')
        print(summary_info)
        print('\nThe step trace parse result is saved under ${summary_dir}/profiler/%s'
              % self.output_file)

    def parse_and_save(self):
        """Parse step trace files and save the result."""
        try:
            source_files = self._get_step_trace_files()
            if self._is_gpu_kernel_async_launch:
                self._parse_async_launch(source_files)
            else:
                self._parse(source_files)
            self._save()
        except IOError as err:
            log.warning(err)
            raise ProfilerIOException()
        else:
            log.info("Finished saving the intermediate result for the step trace file.")

    def record_point_info(self, point_info, output_path):
        """
        Record point info into json.

        Args:
            point_info (dict): The point info about tag id and relative op name.
            output_path (str): The output path for saving point info.

        Returns:
            dict, parsed point info.
        """

    def update_tag_op_type_map(self, point_info):
        """
        Update the map from tag id to op type.

        Args:
            point_info (dict): The point info about tag id and relative op name.
        """
        self._get_step_trace_files()
        tag_map = {}
        for tag, op_name in point_info.items():
            op_type = self._get_op_type(tag, op_name)
            tag_map[tag] = op_type
        log.info("Get tag types for step trace analysis: %s", tag_map)
        self._tag_map = tag_map

    def _get_op_type(self, tag, name):
        """
        Get op type from tag and name.

        Args:
            tag (int): The tag id.
            name (str): The op name.

        Returns:
            str, the op type or communication op name.
        """
        tag_map = {self._fp_tag: 'fp', self._bp_tag: 'bp', self._step_end_tag_id: 'end'}
        # get solid tag type
        op_type = tag_map.get(tag, '')
        if op_type:
            return op_type
        # check if the tag is step tag.
        if tag == 0:
            return 'start'
        # analyze the reduce tag
        op_name = name.rsplit('/', 1)[-1]
        if not op_name:
            log.warning("Unexpected op name: %s", name)
        return op_name

    def _get_step_trace_files(self):
        """Get step trace files."""
        return self._input_dir

    @staticmethod
    def _search_file(input_dir):
        """Search step trace file under specific input directory."""
        # validate input_dir
        if not os.path.isdir(input_dir):
            raise ProfilerPathErrorException(
                '{} does not exist or is not a dir'.format(input_dir)
            )
        # get step trace files
        files = os.listdir(input_dir)
        step_trace_files = list(
            filter(
                lambda file: file.startswith('ts_track.data') and not file.endswith('.done'),
                files
            )
        )
        # validate result
        if len(step_trace_files) > 1:
            # the format of file name is like
            # `training_trace.46.dev.profiler_default_tag.$id.slice_$number`
            # use the $number as the sorted key
            try:
                step_trace_files.sort(key=lambda path: int(path.rsplit('_', 1)[-1]))
            except ValueError as err:
                log.warning("Unable to parse file names: %s. %s", step_trace_files, err)
                step_trace_files = []
        else:
            training_trace_files = list(
                filter(
                    lambda file: file.startswith('training_trace') and not file.endswith('.done'),
                    files
                )
            )
            if len(training_trace_files) >= 1:
                log.warning("The training_trace file structure is changed, please upgrade "
                            "mindspore and regenerate profiling data")
        file_paths = [os.path.join(input_dir, file) for file in step_trace_files]
        log.info("Find %d step trace files.", len(file_paths))
        return file_paths

    @abstractmethod
    def _parse(self, source_files):
        """Parse source step trace files."""

    def _get_next_step_trace(self, content, event_info):
        """
        Get next step trace info.

        Args:
            content (bytes): The input step trace info.
            event_info (dict): The event info.

        Returns:
            Generator, return the step trace one by one.
        """
        start_time = event_info.get('end', '-')
        event_info['start'] = start_time
        if 'reduce' not in event_info.keys():
            event_info['reduce'] = {}
        i = 0
        while i < len(content):
            profiling_head_data = content[i:i + self._profiling_head_len]
            parsed_head = struct.unpack('BBH', profiling_head_data)
            profiling_head = ProfilingHeadStruct(*parsed_head)
            if profiling_head.rptType == 10:
                st_data = content[i + self._profiling_head_len + self._profiling_head_pad_len:
                                  i + self._profiling_head_len + self._profiling_head_pad_len + self._st_data_len]
                parsed_data = struct.unpack('QQQHHH', st_data)
                next_event = StepTraceStruct(*parsed_data)
                self._construct_event_info(next_event, event_info)
                if event_info.get('end'):
                    yield event_info
                    start_time = event_info.get('end', '-')
                    event_info.clear()
                    event_info['start'] = start_time
                    event_info['reduce'] = {}
            i = i + profiling_head.bufSize

    def _construct_event_info(self, next_event, event_info):
        """Construct event info according to next_event."""
        end_flag: bool = lambda tag: tag == self._step_end_tag_id
        fp_flag: bool = lambda tag: tag == self._fp_tag_id
        bp_flag: bool = lambda tag: tag == self._bp_tag_id
        reduce_flag: bool = lambda tag: self._reduce_min_tag_id <= tag < self._reduce_max_tag_id

        def _on_reduce_event(reduce_tag_id):
            """Handle reduce event."""
            stream_id = next_event.stream_id
            if event_info['reduce'].get(stream_id):
                event_info['reduce'][stream_id].append((reduce_tag_id, time_stamp))
            else:
                event_info['reduce'][stream_id] = [(reduce_tag_id, time_stamp)]

        tag_id = next_event.tag_id
        time_stamp = next_event.timeStamp
        if end_flag(tag_id):
            event_info['end'] = time_stamp
        elif fp_flag(tag_id):
            event_info['fp'] = time_stamp
        elif bp_flag(tag_id):
            event_info['bp'] = time_stamp
        elif reduce_flag(tag_id):
            _on_reduce_event(tag_id)

    def _record_trace_event(self, step_trace):
        """Record trace event."""
        self._step_num += 1
        start_time = step_trace.get('start')
        end_time = step_trace.get('end')
        fp_time = step_trace.get('fp')
        bp_time = step_trace.get('bp')
        if not (start_time and end_time and fp_time and bp_time):
            log.warning("The step %d lacks basic time.", self._step_num)
            return
        if start_time == '-':
            start_time = fp_time
        row_data = {
            'step_num': self._step_num,
            'start_point': start_time,
            'end_point': end_time,
            'total': end_time - start_time,
            'fp_point': fp_time,
            'bp_point': bp_time,
            'iteration_interval': fp_time - start_time,
            'fp_and_bp': bp_time - fp_time,
            'tail': end_time - bp_time
        }
        # update reduce info
        self._update_reduce_info(step_trace, row_data)
        # save the row data
        if not self._header:
            self._header = list(row_data.keys())
        row_data_list = [row_data.get(header_name, 0) for header_name in self._header]
        self._result.append(row_data_list)
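
    # The keys of row_data above (step_num, start_point, end_point, total, fp_point,
    # bp_point, iteration_interval, fp_and_bp, tail, plus any reduce fields added by
    # _update_reduce_info) become the header of the CSV later written by _save().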

    def _update_reduce_info(self, step_trace, row_data):
        """Extract reduce info."""
        reduce_time = step_trace.get('reduce', {})
        for stream_id, time_points in reduce_time.items():
            time_point_num = len(time_points)
            if time_point_num % 2:
                log.warning("Stream %d has %d reduce time points.", stream_id, time_point_num)
                continue
            for index, point_id in enumerate(range(0, time_point_num, 2)):
                field_name = f'stream_{stream_id}_{index}'
                reduce_info = self._get_single_reduce_event_info(
                    field_name, time_points[point_id], time_points[point_id + 1])
                row_data.update(reduce_info)

    def _get_single_reduce_event_info(self, field_name, start_point, end_point):
        """
        Get single reduce info.

        Args:
            field_name (str): The field name.
            start_point (Tuple[int, int]): Start point time info, including (tag_id, sys_count).
            end_point (Tuple[int, int]): End point time info, including (tag_id, sys_count).

        Returns:
            dict, reduce info.
        """
        ret_dict = {}
        return ret_dict

    def _record_average_info(self):
        """Calculate average info."""
        result_size = len(self._result)
        # calculate average data for each column in result data
        average_data = [0] * len(self._header)
        if result_size >= 2:
            for row_info in self._result[1:]:
                average_data = [
                    Decimal(i) + Decimal(j) for i, j in zip(row_info, average_data)
                ]
            average_data = [
                round((item / (result_size - 1))) for item in average_data
            ]
            # change step num info in average_data to None
            step_num_index = self._header.index('step_num')
            average_data[step_num_index] = '-'
        self._result.append(average_data)
        log.info("Finished adding average info for step trace.")

    def _save(self):
        """Save step trace file."""
        bp_point, tail, fp_duration = 5, -1, -2
        log.info("Start to save step trace file.")
        if not self._header:
            return
        try:
            with open(self._output_path, 'w') as file_handle:
                csv_writer = csv.writer(file_handle)
                if not self._is_training_mode:
                    self._header[fp_duration] = 'fp'
                    self._header = self._header[:bp_point] + self._header[bp_point + 1:tail]
                csv_writer.writerow(self._header)
                for row_data in self._result:
                    if not self._is_training_mode:
                        row_data[fp_duration] += row_data[tail]
                        row_data = row_data[:bp_point] + row_data[bp_point + 1:tail]
                    csv_writer.writerow(row_data)
            os.chmod(self._output_path, stat.S_IREAD | stat.S_IWRITE)
        except (IOError, OSError) as err:
            log.warning('Failed to save step trace raw info. %s', err)
            raise ProfilerIOException
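
    # Note: when _is_training_mode is False, _save() folds the 'tail' duration into
    # 'fp_and_bp', renames that column 'fp', and drops the 'bp_point' and 'tail'
    # columns (indices assume the default header laid out in _record_trace_event).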


class GpuStepTraceParser(BaseStepTraceParser):
    """The parser for gpu step trace data."""

    def get_fp_bp(self, f_obj, all_step_fp, all_step_bp):
        """Parse the fp and bp."""
        fp_start, bp_end = 0, 1
        if self._is_gpu_kernel_async_launch:
            for line in f_obj:
                line = line.strip().split()
                all_step_fp.append(line[1].split(',')[0])
                all_step_bp.append(line[2].split(',')[0])
        else:
            lines = f_obj.readlines()
            all_step_fp.append(lines[fp_start].split()[0])
            all_step_bp.append(lines[bp_end].split()[0])

    def record_point_info(self, source_file, output_path):
        """
        Record point info into json.

        Args:
            source_file (str): The file path of step trace original data.
            output_path (str): The output path for saving point info.

        Returns:
            dict, parsed point info.
        """
        all_step_points = []
        all_step_fp = []
        all_step_bp = []
        try:
            with open(source_file, 'r') as f_obj:
                self.get_fp_bp(f_obj, all_step_fp, all_step_bp)
        except (IOError, OSError) as err:
            log.warning('Failed to read %s. %s', source_file, err)
            raise ProfilerIOException

        for fp_name, bp_name in zip(all_step_fp, all_step_bp):
            if self._is_training_mode:
                points = {
                    'fp_start': fp_name,
                    'bp_end': bp_name
                }
            else:
                points = {
                    'fp_start': fp_name,
                }
            all_step_points.append(points)

        try:
            with open(output_path, 'w') as json_file:
                if self._is_gpu_kernel_async_launch:
                    json.dump(all_step_points, json_file)
                else:
                    json.dump(all_step_points[0], json_file)
            os.chmod(output_path, stat.S_IREAD | stat.S_IWRITE)
        except (IOError, OSError) as err:
            log.warning('Failed to save point info. %s', err)
            raise ProfilerIOException

        return all_step_points[0]

    def _get_step_trace_files(self):
        """Get step trace files."""
        return self._input_dir

    def _parse(self, source_file):
        """Parse source step trace files."""
        log.info("Start to parse step trace file.")
        fp_start, bp_end, iter_end, iter_start = 0, 1, 2, 3
        reduce_start = 4
        start_time, end_time = 0, 1
        step_trace_point_count = 3

        source_file = validate_and_normalize_path(source_file)
        try:
            with open(source_file, 'r') as f:
                lines = f.readlines()
                if len(lines) < step_trace_point_count:
                    raise ProfilerRawFileException(
                        f"Failed to parse {source_file} file. The FP_POINT/BP_POINT/ITER_END_POINT "
                        f"are not recognized correctly. Try to set the environment variables 'PROFILING_FP_START' "
                        f"and 'PROFILING_BP_END' to solve this problem. For example, "
                        f"'export PROFILING_FP_START=Default/xxx/Conv2d-op1' ")
                step_trace_info_all = [line.strip().split()[1:] for line in lines]
                num_of_step = len(step_trace_info_all[0])
                for step_trace_point in step_trace_info_all:
                    if len(step_trace_point) != num_of_step:
                        raise ProfilerRawFileException(
                            f"Failed to parse {source_file} file, because the profiled "
                            f"step_num of the FP/BP/ITER_END points are not equal")
                iter_start_info = [step_trace_info_all[fp_start][0]] + \
                                  step_trace_info_all[iter_end][:num_of_step]
                step_trace_info_all.insert(iter_start, iter_start_info)
        except (IOError, OSError) as err:
            log.warning('Failed to read %s. %s', source_file, err)
            raise ProfilerIOException

        for step_num in range(num_of_step):
            step_trace = {
                'start': int(step_trace_info_all[iter_start][step_num].split(',')[start_time]),
                'fp': int(step_trace_info_all[fp_start][step_num].split(',')[start_time]),
                'bp': int(step_trace_info_all[bp_end][step_num].split(',')[end_time]),
                'end': int(step_trace_info_all[iter_end][step_num].split(',')[end_time]),
                'reduce': {}
            }
            num_of_step_point = len(step_trace_info_all)
            if num_of_step_point > reduce_start:
                reduce_info = {}
                reduce_time_info = []
                for reduce_idx in range(reduce_start, num_of_step_point):
                    cur_reduce_time = step_trace_info_all[reduce_idx][step_num]
                    reduce_time_info += cur_reduce_time.split(',')
                reduce_info['ops'] = reduce_time_info
                step_trace['reduce'] = reduce_info
            self._record_trace_event(step_trace)
        self._record_average_info()
        log.info("Finished parsing the step trace file.")

    def _parse_one_step(self, line):
        """
        Parse step text line to dict obj.

        Args:
            line (str): The step trace line text; it contains five parts, each part separated by a space.
                part 1: start_op_name,start_op_time
                part 2: fp_op_name,fp_time
                part 3: bp_op_name,bp_time
                part 4: end_op_name,end_time
                part 5: [reduce_op_name,reduce1_start], it may contain multiple reduce ops,
                    each separated by a space.
        """
        line = line.strip().split()
        start_time = int(line[0].split(',')[1][:-1])
        fp_time = int(line[1].split(',')[1][:-1])
        bp_time = int(line[2].split(',')[1][:-1])
        end_time = int(line[3].split(',')[1][:-1])
        reduce_info = {}
        reduce_time_info = []
        for reduce_item in line[4:]:
            # add communication op start and end time, time unit from ns to 10ns.
            reduce_time_info.append(reduce_item.split(',')[1][:-1])
            reduce_time_info.append(reduce_item.split(',')[2][:-1])
        step_trace = {
            'start': start_time,
            'fp': fp_time,
            'bp': bp_time,
            'end': end_time
        }
        if reduce_time_info:
            reduce_info['ops'] = reduce_time_info
            step_trace['reduce'] = reduce_info
        self._record_trace_event(step_trace)

    def _parse_async_launch(self, source_file):
        """Parse source step trace files generated from async launch kernel."""
        log.info("Start to parse step trace file.")
        source_file = validate_and_normalize_path(source_file)

        try:
            with open(source_file, 'r') as f_obj:
                for line in f_obj:
                    self._parse_one_step(line)
        except (IOError, OSError) as err:
            log.warning('Failed to read %s. %s', source_file, err)
            raise ProfilerIOException

        self._record_average_info()
        log.info("Finished parsing the step trace file.")

    def _get_single_reduce_event_info(self, field_name, start_point, end_point):
        """
        Get single reduce info.

        Args:
            field_name (str): The field name.
            start_point (str): Start point time.
            end_point (str): End point time.

        Returns:
            dict, reduce info.
        """
        reduce_info = {}
        op_type = 'AllReduce'
        # append field name with op type.
        field_name += '_' + op_type
        reduce_info[field_name] = int(end_point) - int(start_point)
        reduce_info[field_name + '_start_point'] = start_point
        reduce_info[field_name + '_end_point'] = end_point
        return reduce_info
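
    # Each communication op therefore contributes three columns to the step row: the
    # duration under a key such as 'stream_<stream_id>_<index>_AllReduce' (the prefix
    # comes from _update_reduce_info), plus the matching '_start_point' and
    # '_end_point' entries.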


class AscendStepTraceParser(BaseStepTraceParser):
    """The parser for ascend step trace data."""
    _event_size = 20
    _fp_tag = 2
    _bp_tag = 3
    _step_trace_files = []

    def record_point_info(self, point_info, output_path):
        """
        Record point info into json.

        Args:
            point_info (dict): The point info about tag id and relative op name.
            output_path (str): The output path for saving point info.

        Returns:
            dict, parsed point info.
        """
        if self._is_training_mode:
            points = {
                'fp_start': point_info.get(self._fp_tag, ''),
                'bp_end': point_info.get(self._bp_tag, '')
            }
        else:
            points = {
                'fp_start': point_info.get(self._fp_tag, ''),
            }
        if os.path.exists(output_path):
            return points
        try:
            with open(output_path, 'w') as json_file:
                json.dump(points, json_file)
            os.chmod(output_path, stat.S_IREAD | stat.S_IWRITE)
        except (IOError, OSError) as err:
            log.warning('Failed to save point info. %s', err)
            raise ProfilerIOException
        return points

    def _get_step_trace_files(self):
        """Get step trace files."""
        # step trace files may be under $profiler_dir or $profiler_dir/data
        if self._step_trace_files:
            return self._step_trace_files

        profiler_dir = self._input_dir
        step_trace_files = self._search_file(profiler_dir)
        if not step_trace_files:
            # try to find step trace files under $profiler_dir/data
            profiler_dir = os.path.join(profiler_dir, 'data')
            step_trace_files = self._search_file(profiler_dir)
        if not step_trace_files:
            raise ProfilerPathErrorException('Training trace file does not exist.')
        self._step_trace_files = step_trace_files

        return step_trace_files

    def _parse(self, source_files):
        """Parse source step trace files."""
        log.info("Start to parse step trace file.")
        event_info = {}

        for source_file in source_files:
            source_file = validate_and_normalize_path(source_file)
            try:
                with open(source_file, 'rb') as handler:
                    content = handler.read()
                    for step_trace in self._get_next_step_trace(content, event_info):
                        if self._skip_first_step:
                            self._skip_first_step = False
                            continue
                        self._record_trace_event(step_trace)
            except (IOError, OSError) as err:
                log.warning('Failed to read %s. %s', source_file, err)
                raise ProfilerIOException

        self._record_average_info()
        log.info("Finished parsing the step trace file.")

    def _get_single_reduce_event_info(self, field_name, start_point, end_point):
        """
        Get single reduce info.

        Args:
            field_name (str): The field name.
            start_point (Tuple[int, int]): Start point time info, including (tag_id, sys_count).
            end_point (Tuple[int, int]): End point time info, including (tag_id, sys_count).

        Returns:
            dict, reduce info.
        """
        reduce_info = {}
        if end_point[0] - start_point[0] != 1 or start_point[0] % 2:
            log.warning("Unmatched reduce event <%s, %s>.", start_point, end_point)
            return reduce_info
        op_type = self._tag_map.get(start_point[0])
        # append field name with op type.
        if not op_type:
            log.warning("Can't recognize the inner type for point tag: %d.", start_point[0])
            field_name += '_parallel'
        else:
            field_name += '_' + op_type
        reduce_info[field_name] = end_point[1] - start_point[1]
        reduce_info[field_name + '_start_point'] = start_point[1]
        reduce_info[field_name + '_end_point'] = end_point[1]
        return reduce_info
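

# Minimal usage sketch (illustrative only; the paths below are placeholders, not
# part of this module). It builds the Ascend parser over a profiler output
# directory containing ts_track.data.* files, parses the binary step trace
# records into the summary CSV, then prints the per-step summary.
if __name__ == '__main__':
    _demo_parser = AscendStepTraceParser(
        input_dir='/path/to/profiler_dir',           # placeholder profiler directory
        output_file_path='/path/to/step_trace.csv',  # placeholder output CSV path
        skip_first_step=True,
        is_training_mode=True)
    _demo_parser.parse_and_save()
    _demo_parser.show()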