You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or number, can include dashes ('-') and can be up to 35 characters long.

profiling.py 22 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """Profiling api file."""
  16. import os
  17. import time
  18. from enum import Enum
  19. from mindspore import log as logger, context
  20. from mindspore.communication.management import release, get_rank
  21. from mindspore.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \
  22. ProfilerIOException, ProfilerException
  23. from mindspore.profiler.common.util import get_file_names, fwrite_format
  24. from mindspore.profiler.common.validator.validate_path import \
  25. validate_and_normalize_path
  26. from mindspore.profiler.parser.aicpu_data_parser import DataPreProcessParser
  27. from mindspore.profiler.parser.framework_parser import FrameworkParser
  28. from mindspore.profiler.parser.hwts_log_parser import HWTSLogParser
  29. from mindspore.profiler.parser.integrator import Integrator
  30. from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator
  31. from mindspore.profiler.parser.minddata_parser import MinddataParser
  32. from mindspore.profiler.parser.minddata_pipeline_parser import \
  33. MinddataPipelineParser
  34. from mindspore.profiler.parser.optime_parser import OPComputeTimeParser
  35. from mindspore.profiler.parser.step_trace_parser import StepTraceParser
  36. from mindspore.nn.cell import Cell
# Default root directory where the Ascend profiling (ada) service writes job data.
PROFILING_LOG_BASE_PATH = "/var/log/npu/profiling"
# Full op name of the dataset-queue init op; its presence decides whether the
# first step is skipped during step-trace parsing.
INIT_OP_NAME = 'Default/InitDataSetQueue'
  39. class ProfileOption(Enum):
  40. """
  41. Profile Option Enum which be used in Profiler.profile.
  42. """
  43. trainable_parameters = 0
class Profiler:
    """
    Performance profiling API.

    This API enables MindSpore users to profile the performance of neural network.
    Profiler supports Ascend and GPU, both of them are used in the same way,
    but only output_path in args works on GPU.

    Args:
        output_path (str): Output data path.
        optypes_not_deal (str): (Ascend only) Op type names, determine the data of which optype should be collected
            and analysed, will deal with all op if null; Different op types should be separated by comma.
        ascend_job_id (str): (Ascend only) The directory where the profiling files to be parsed are located;
            This parameter is used to support offline parsing.

    Examples:
        >>> from mindspore.profiler import Profiler
        >>> import mindspore.context
        >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
        >>>                     device_id=int(os.environ["DEVICE_ID"]))
        >>> profiler = Profiler()
        >>> model = Model()
        >>> model.train()
        >>> profiler.analyse()
    """

    # Root of the per-container profiling data directory on Ascend hosts.
    _base_profiling_container_path = "/var/log/npu/profiling/container"
    # File-name prefixes for the intermediate parser output files
    # (the device id and ".txt" suffix are appended at use sites).
    _hwts_output_filename_target = "output_format_data_hwts_"
    _opcompute_output_filename_target = "output_op_compute_time_"
    _aicpu_op_output_filename_target = "output_data_preprocess_aicpu_"
    def __init__(self, **kwargs):
        """Set up output dirs, env vars and the device-specific profiler backend."""
        # get device_id and device_target
        self._get_devid_and_devtarget()
        output_path = kwargs.pop("output_path", "./data")
        self._output_path = validate_and_normalize_path(output_path)
        self._output_path = os.path.join(self._output_path, "profiler")
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path, exist_ok=True)
        else:
            logger.warning("The target dir already exists. "
                           "There may be some old profiling data, and they will be rewrote in the end.")
        # These env vars are read by the runtime/minddata to enable collection.
        os.environ['PROFILING_MODE'] = 'true'
        os.environ['MINDDATA_PROFILING_DIR'] = self._output_path
        if self._device_target and self._device_target == "GPU":
            # Import lazily so non-GPU installs never load the GPU binding.
            from mindspore._c_expression import GPUProfiler
            self._gpu_profiler = GPUProfiler.get_instance()
            self._gpu_profiler.init(self._output_path)
            self._gpu_profiler.step_profiling_enable(True)
            # In distributed runs the rank id replaces the context device id.
            if context.get_auto_parallel_context('device_num') > 1:
                self._dev_id = get_rank()
            os.environ['DEVICE_ID'] = str(self._dev_id)
            if kwargs:
                logger.warning("Params not be supported yet on GPU.")
        elif self._device_target and self._device_target == "Ascend":
            optypes_not_deal = kwargs.pop("optypes_not_deal", "Variable")
            if not isinstance(optypes_not_deal, str):
                raise TypeError("The parameter optypes_not_deal must be str.")
            job_id = kwargs.pop("ascend_job_id", "")
            if kwargs:
                logger.warning("There are invalid params which don't work.")
            os.environ['DEVICE_ID'] = self._dev_id
            os.environ['AICPU_PROFILING_MODE'] = 'true'
            # use context interface to open profiling, for the new mindspore version(after 2020.5.21)
            context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")
            self._container_path = os.path.join(self._base_profiling_container_path, self._dev_id)
            data_path = os.path.join(self._container_path, "data")
            data_path = validate_and_normalize_path(data_path)
            if not os.path.exists(data_path):
                os.makedirs(data_path, exist_ok=True)
            self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
            self._profiling_job_id = job_id
            # add job id env through user input later
            self._job_id_env = 0
            # NOTE(review): time.time() * 1e7 looks like 100 ns ticks, presumably
            # to match the resolution of the host_start.log timestamps — confirm.
            self._start_time = int(time.time() * 10000000)
            logger.info("Profiling: profiling start time: %d", self._start_time)
    def analyse(self):
        """
        Collect and analyse performance data, called after training or during training.

        Examples:
            >>> from mindspore.profiler import Profiler
            >>> import mindspore.context
            >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
            >>>                     device_id=int(os.environ["DEVICE_ID"]))
            >>> profiler = Profiler()
            >>> model = Model()
            >>> model.train()
            >>> profiler.analyse()
        """
        if self._device_target and self._device_target == "GPU":
            self._gpu_profiler.stop()
            self._generate_timeline()

            # parse minddata pipeline operator and queue for GPU
            try:
                pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
                pipeline_parser.parse()
            except ProfilerException as err:
                logger.warning(err.message)

            os.environ['PROFILING_MODE'] = str("false")
        elif self._device_target and self._device_target == "Ascend":
            # Release collective-communication resources before parsing.
            release()

            job_id = self._get_profiling_job_id()
            logger.info("Profiling: job id is %s ", job_id)

            source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id)
            # parse hwts.log.data.45.dev file, and get task profiling data
            hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt"
            hwts_output_filename = os.path.join(self._output_path, hwts_output_filename)
            source_path = validate_and_normalize_path(source_path)
            hwts_output_filename = validate_and_normalize_path(hwts_output_filename)
            hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
            _ = hwtslog_parser.execute()

            # parse Framework file, and get the relation of op and tasks
            framework_parser = FrameworkParser(job_id, self._dev_id, self._output_path)
            framework_parser.parse()
            op_task_dict = framework_parser.to_task_id_full_op_name_dict()
            if not op_task_dict:
                logger.error("Profiling: fail to parse framework files.")
                return

            # get op compute time from hwts data and framework data, write output_op_compute_time.txt
            opcompute_output_filename = self._opcompute_output_filename_target + self._dev_id + ".txt"
            opcompute_output_filename = os.path.join(self._output_path, opcompute_output_filename)
            opcompute_output_filename = validate_and_normalize_path(opcompute_output_filename)
            optime_parser = OPComputeTimeParser(
                hwts_output_filename, opcompute_output_filename,
                op_task_dict, self._output_path, self._dev_id
            )
            optime_parser.execute()

            # parse DATA_PREPROCESS.dev.AICPU file, write output_data_preprocess_aicpu_x.txt
            output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._dev_id + ".txt"
            output_data_preprocess_aicpu = os.path.join(self._output_path, output_data_preprocess_aicpu)
            output_data_preprocess_aicpu = validate_and_normalize_path(output_data_preprocess_aicpu)
            aicpu_data_parser = DataPreProcessParser(source_path, output_data_preprocess_aicpu)
            aicpu_data_parser.execute()

            # Parsing minddata AICPU profiling
            MinddataParser.execute(source_path, self._output_path, self._dev_id)

            # parse minddata pipeline operator and queue
            try:
                pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
                pipeline_parser.parse()
            except ProfilerException as err:
                logger.warning(err.message)

            # analyse op compute time info
            try:
                self._analyser_op_info()
            except ProfilerException as err:
                logger.warning(err.message)

            # analyse step trace info
            try:
                self._analyse_step_trace(source_path, framework_parser)
            except ProfilerException as err:
                logger.warning(err.message)

            # analyse timeline info
            try:
                self._analyse_timeline(aicpu_data_parser, optime_parser)
            except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
                logger.warning('Fail to write timeline data: %s', err)

            # Turn profiling back off once parsing is complete.
            os.environ['PROFILING_MODE'] = str("false")
            context.set_context(enable_profiling=False)
    def _analyse_step_trace(self, source_path, framework_parser):
        """
        Analyse step trace data and save the result.

        Args:
            source_path (str): The directory that contains the step trace original data.
            framework_parser (FrameworkParser): The framework parse instance.
        """
        logger.info("Begin to parse step trace.")
        # construct output path
        step_trace_intermediate_file_path = os.path.join(
            self._output_path,
            f'step_trace_raw_{self._dev_id}_detail_time.csv'
        )
        point_info_file_path = os.path.join(
            self._output_path,
            'step_trace_point_info.json'
        )
        step_trace_intermediate_file_path = validate_and_normalize_path(step_trace_intermediate_file_path)
        point_info_file_path = validate_and_normalize_path(point_info_file_path)
        # whether keep the first step: skip it when the dataset-queue init op is
        # present (its first iteration includes queue warm-up overhead).
        skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME)
        point_info = framework_parser.point_info
        # parser the step trace files and save the result to disk
        source_path = validate_and_normalize_path(source_path)
        parser = StepTraceParser(input_dir=source_path,
                                 output_file_path=step_trace_intermediate_file_path,
                                 job_id=self._job_id_env,
                                 skip_first_step=skip_first_step_flag)
        parser.update_tag_op_type_map(point_info)
        parser.parse_and_save()
        point_info = parser.record_point_info(point_info, point_info_file_path)
        # print parser result
        parser.show()
        logger.info("Finish saving the intermediate result: %s", step_trace_intermediate_file_path)
        logger.info("The point info is: %s", point_info)
    def _analyse_timeline(self, aicpu_parser, optime_parser):
        """
        Analyse and parse timeline info.

        Args:
            aicpu_parser (DataPreProcessParser): The parser instance for AI CPU operator
                execution time calculation.
            optime_parser (OPComputeTimeParserParser): The parser instance for AI Core
                operator execution time calculation.
        """
        timeline_analyser = AscendTimelineGenerator(self._output_path, self._dev_id)
        # Get framework info
        integrator = Integrator(self._output_path, self._dev_id)
        aicore_detail_data = integrator.get_aicore_detail_data()
        aicore_detail_data_size = len(aicore_detail_data)
        col_names = ['op_name', 'op_type', 'avg_execution_time', 'subgraph',
                     'full_op_name', 'op_info']
        framework_info = {
            'col_name': col_names,
            'object': aicore_detail_data,
            'size': aicore_detail_data_size
        }

        all_reduce_info = integrator.query_for_all_reduce()

        # Get timeline info
        logger.info('Start writing timeline info...')
        logger.info('Warm Prompt: It could take a few minutes if you are training '
                    'with a complex network or more than 10 steps.')
        # Add info into timeline, such as AI CPU, AllReduce, framework info.
        aicpu_info = aicpu_parser.query_aicpu_data()
        # The smaller of the two counters is the common time origin for both streams.
        min_cycle_counter = min(aicpu_parser.min_cycle_counter, optime_parser.min_cycle_counter)
        timeline_analyser.init_timeline(all_reduce_info, framework_info, aicpu_info, min_cycle_counter)
        size_limit = 20 * 1024 * 1024  # 20MB
        timeline_analyser.write_timeline(size_limit)
        timeline_analyser.write_timeline_summary()
  265. def _generate_timeline(self):
  266. """Used for gpu, generate timeline info, write to json format file."""
  267. try:
  268. size_limit = 100 * 1024 * 1024 # 100MB
  269. timeline_generator = GpuTimelineGenerator(self._output_path, self._dev_id)
  270. timeline_generator.init_timeline()
  271. timeline_generator.write_timeline(size_limit)
  272. timeline_generator.write_timeline_summary()
  273. except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
  274. logger.warning('Fail to write timeline data: %s', err)
  275. raise RuntimeError('Fail to write timeline data.')
  276. def _get_profiling_job_id(self):
  277. """Get profiling job id, which was generated by ada service.
  278. Returns:
  279. str: profiling jon id.
  280. """
  281. if self._profiling_job_id:
  282. return self._profiling_job_id
  283. job_id = ""
  284. cmd = "ls -t " + PROFILING_LOG_BASE_PATH + "|grep JOB|awk '{print $1}'"
  285. r = os.popen(cmd)
  286. profiling_job_dirs = r.readlines()
  287. r.close()
  288. for item in profiling_job_dirs:
  289. path = os.path.join(PROFILING_LOG_BASE_PATH, item.strip())
  290. log_file = get_file_names(path, "host_start.log")
  291. if not log_file:
  292. logger.error("Profiling: job path %s, host_start.log not exist.", path)
  293. continue
  294. log_file = os.path.join(path, log_file[0])
  295. item_dict = self._parse_host_start_log(log_file)
  296. if not item_dict:
  297. logger.error("Profiling: job path %s, fail to get job start info.", path)
  298. continue
  299. if self._dev_id != item_dict["device_id"]:
  300. logger.info("Profiling: job path %s, dev id %s, training device id %s.",
  301. path, item_dict["device_id"], self._dev_id)
  302. continue
  303. if self._start_time > int(item_dict["start_time"]):
  304. logger.info("Profiling: job path %s, start_time %s, training start_time %d.",
  305. path, item_dict["start_time"], self._start_time)
  306. break
  307. job_id = item.strip()
  308. break
  309. if not job_id:
  310. msg = "Fail to get profiling job, please check whether job dir was generated"
  311. raise RuntimeError(msg)
  312. return job_id
  313. def _parse_host_start_log(self, input_file):
  314. """
  315. Parse host start log file, get the device id and start time of the job.
  316. Args:
  317. input_file (str): The file path of the host start log file.
  318. Returns:
  319. dict, job start time and device id.
  320. """
  321. item_dict = {}
  322. for line in open(input_file):
  323. if "Device" in line:
  324. item_dict["device_id"] = line[7:len(line)-2]
  325. elif "clock_realtime" in line:
  326. item_dict["start_time"] = line[16:len(line)-3]
  327. return item_dict
  328. def _analyser_op_info(self):
  329. """Analyse the operator information."""
  330. integrator = Integrator(self._output_path, self._dev_id)
  331. integrator.integrate()
  332. aicore_type_result = self._query_op_type_info()
  333. detail_file_path = os.path.join(
  334. self._output_path,
  335. 'output_op_compute_time_detail_{}.txt'.format(self._dev_id)
  336. )
  337. fwrite_format(detail_file_path, data_source='title:op compute time')
  338. display_names = [
  339. 'optype_name', 'compute_time(ms, per-step)',
  340. 'called_times(per-step)', 'percent'
  341. ]
  342. fwrite_format(detail_file_path, data_source=" ".join(display_names), is_print=True)
  343. fwrite_format(detail_file_path, data_source=aicore_type_result, is_print=True)
  344. op_type_order = [item[0] for item in aicore_type_result]
  345. aicore_detail_result = self._query_op_detail_info(op_type_order)
  346. fwrite_format(detail_file_path, data_source='', is_print=True)
  347. fwrite_format(detail_file_path, data_source='Detail:', is_print=True)
  348. fwrite_format(detail_file_path, data_source=" ".join(aicore_detail_result.get('col_name_detail')),
  349. is_print=True)
  350. fwrite_format(detail_file_path, data_source=aicore_detail_result.get('object'), is_print=True)
  351. def _query_op_type_info(self):
  352. """
  353. Query AICORE operator type information.
  354. Returns:
  355. list[list], the AICORE operator type and execution time information.
  356. """
  357. integrator = Integrator(self._output_path, self._dev_id)
  358. return integrator.get_aicore_data()
  359. def _query_op_detail_info(self, op_type_order):
  360. """
  361. Query AICORE operator detail information.
  362. Args:
  363. op_type_order(list): The name of the op type in order.
  364. Returns:
  365. dict, the AICORE operator detail information.
  366. """
  367. op_type_condition = {}
  368. if self._filt_optype_names:
  369. op_type_condition['not_in'] = self._filt_optype_names
  370. filter_condition = {
  371. 'op_type': op_type_condition,
  372. 'is_display_detail': False,
  373. }
  374. integrator = Integrator(self._output_path, self._dev_id)
  375. return integrator.query_and_sort_by_op_type(filter_condition, op_type_order)
  376. def _get_devid_and_devtarget(self):
  377. """Get device id and target of this training."""
  378. device_target = ""
  379. dev_id = ""
  380. try:
  381. dev_id = str(context.get_context("device_id"))
  382. device_target = context.get_context("device_target")
  383. except ValueError as err:
  384. logger.error("Profiling: fail to get context, %s", err)
  385. if not dev_id or not dev_id.isdigit():
  386. dev_id = os.getenv('DEVICE_ID')
  387. if not dev_id or not dev_id.isdigit():
  388. dev_id = "0"
  389. logger.error("Fail to get DEVICE_ID, use 0 instead.")
  390. if device_target and device_target not in ["Ascend", "GPU"]:
  391. msg = "Profiling: unsupported backend: %s" % device_target
  392. raise RuntimeError(msg)
  393. self._dev_id = dev_id
  394. self._device_target = device_target
  395. @staticmethod
  396. def profile(network=None, profile_option=None):
  397. """
  398. Get the number of trainable parameters in the training network.
  399. Args:
  400. network (Cell): The training network.
  401. profile_option (ProfileOption): The profile option.
  402. Returns:
  403. dict, the key is the option name, the value is the result of option.
  404. """
  405. result = dict()
  406. if not profile_option:
  407. raise ValueError("The parameter profile_option must pass a value using ProfileOption.")
  408. if profile_option == ProfileOption.trainable_parameters:
  409. if not isinstance(network, Cell):
  410. msg = "Profiling: The network should be an object of nn.Cell"
  411. raise ValueError(msg)
  412. param_nums = len(network.parameters_dict())
  413. result = {"trainable_parameters": param_nums}
  414. else:
  415. raise ValueError("Wrong options.")
  416. return result