profiling.py
# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Profiling api file."""
import os
import stat
import time
import json
from enum import Enum

from mindspore import log as logger, context
from mindspore.communication.management import GlobalComm, release, get_rank, get_group_size
import mindspore._c_expression as c_expression
import mindspore._c_dataengine as cde
from mindspore.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \
    ProfilerIOException, ProfilerException, ProfilerRawFileException
from mindspore.profiler.common.util import get_file_path, fwrite_format
from mindspore.profiler.common.validator.validate_path import \
    validate_and_normalize_path
from mindspore.profiler.parser.aicpu_data_parser import DataPreProcessParser
from mindspore.profiler.parser.framework_parser import FrameworkParser
from mindspore.profiler.parser.hwts_log_parser import HWTSLogParser
from mindspore.profiler.parser.integrator import Integrator
from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator
from mindspore.profiler.parser.memory_usage_parser import MemoryUsageParser
from mindspore.profiler.parser.minddata_parser import MinddataParser
from mindspore.profiler.parser.minddata_analyzer import MinddataProfilingAnalyzer
from mindspore.profiler.parser.flops_parser import FlopsParser
from mindspore.profiler.parser.minddata_pipeline_parser import \
    MinddataPipelineParser
from mindspore.profiler.parser.optime_parser import OPComputeTimeParser
from mindspore.profiler.parser.step_trace_parser import GpuStepTraceParser, AscendStepTraceParser
from mindspore.profiler.parser.hccl_parser import HcclParser
from mindspore.nn.cell import Cell

INIT_OP_NAME = 'Default/InitDataSetQueue'


def _environment_check():
    if c_expression.security.enable_security():
        raise RuntimeError("Profiler is not supported if compiled with '-s on'")
    if context.get_context("mode") == context.PYNATIVE_MODE:
        raise RuntimeError("Profiler is not supported in pynative mode currently, "
                           "and it is only supported in graph mode.")


class ProfileOption(Enum):
    """
    Profile Option Enum which is used in Profiler.profile.
    """
    trainable_parameters = 0
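
# Illustrative use of the enum (a sketch): it is passed to the static
# Profiler.profile method defined further below, e.g.
#     Profiler.profile(net, ProfileOption.trainable_parameters)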


class Profiler:
    """
    Performance profiling API.

    This API enables MindSpore users to profile the performance of neural networks.
    Profiler supports Ascend and GPU; both are used in the same way,
    but only output_path in args works on GPU. The Profiler can only be initialized once.

    Args:
        output_path (str): Output data path.
        optypes_not_deal (str): (Ascend only) Op type names, determining which op types' data should be collected
            and analysed; all op types are dealt with if it is empty. Different op types should be separated by commas.
        ascend_job_id (str): (Ascend only) The directory where the profiling files to be parsed are located.
            This parameter is used to support offline parsing.
        profile_communication (bool): Whether to collect communication performance data in multi-device training;
            collect when True. Default is False. Setting this parameter has no effect during single-device training.
        profile_memory (bool): Whether to collect tensor memory data; collect when True. Default is False.

    Examples:
        >>> import numpy as np
        >>> from mindspore import nn, context
        >>> from mindspore import Model
        >>> import mindspore.dataset as ds
        >>> from mindspore.profiler import Profiler
        >>>
        >>>
        >>> class Net(nn.Cell):
        ...     def __init__(self):
        ...         super(Net, self).__init__()
        ...         self.fc = nn.Dense(2, 2)
        ...     def construct(self, x):
        ...         return self.fc(x)
        >>>
        >>> def generator():
        ...     for i in range(2):
        ...         yield (np.ones([2, 2]).astype(np.float32), np.ones([2]).astype(np.int32))
        >>>
        >>> def train(net):
        ...     optimizer = nn.Momentum(net.trainable_params(), 1, 0.9)
        ...     loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
        ...     data = ds.GeneratorDataset(generator, ["data", "label"])
        ...     model = Model(net, loss, optimizer)
        ...     model.train(1, data)
        >>>
        >>> if __name__ == '__main__':
        ...     # If the device_target is GPU, set the device_target to "GPU"
        ...     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
        ...
        ...     # Init Profiler
        ...     # Note that the Profiler should be initialized after context.set_context and before model.train.
        ...     # If you are running in parallel mode on Ascend, the Profiler should be initialized before HCCL is
        ...     # initialized.
        ...     profiler = Profiler()
        ...
        ...     # Train Model
        ...     net = Net()
        ...     train(net)
        ...
        ...     # Profiler end
        ...     profiler.analyse()
    """

    _hwts_output_filename_target = "output_format_data_hwts_"
    _opcompute_output_filename_target = "output_op_compute_time_"
    _aicpu_op_output_filename_target = "output_data_preprocess_aicpu_"
    _has_analysed = False
    _has_initialized = False

    def __init__(self, **kwargs):
        if Profiler._has_initialized:
            msg = "The profiler can not be initialized twice."
            raise RuntimeError(msg)
        Profiler._has_initialized = True
        _environment_check()
        # get device_id and device_target
        self._get_devid_rankid_and_devtarget()
        self._get_output_path(kwargs)
        self._profile_communication = False
        self._has_started = False
        self.start_profile = True
        # Setup and start MindData Profiling
        self._md_profiler = cde.GlobalContext.profiling_manager()
        self._md_profiler.init()
        if self._device_target:
            cpu_profiler = c_expression.CPUProfiler
            self._cpu_profiler = cpu_profiler.get_instance()
            self._cpu_profiler.init(self._output_path)
        if self._device_target and self._device_target == "GPU":
            gpu_profiler = c_expression.GPUProfiler
            self._gpu_profiler = gpu_profiler.get_instance()
            self._gpu_profiler.init(self._output_path)
            if GlobalComm.WORLD_COMM_GROUP == "nccl_world_group":
                self._dev_id = str(get_rank())
                os.environ['DEVICE_ID'] = self._dev_id
            self.start_profile = kwargs.pop("start_profile", True)
            if not isinstance(self.start_profile, bool):
                raise TypeError("The parameter start_profile must be bool.")
            if kwargs:
                logger.warning("The parameters are not supported on GPU yet.")
        elif self._device_target and self._device_target == "Ascend":
            self._init_time = int(time.time() * 10000000)
            logger.info("Profiling: profiling init time: %d", self._init_time)
            self._parse_parameter_for_ascend(**kwargs)
            os.environ['DEVICE_ID'] = self._dev_id
            profiling_options = json.dumps(self._construct_profiling_options())
            # Options longer than 2048 characters are ignored, resulting in profiling option resolution errors
            if len(profiling_options) > 2048:
                msg = "The parameter length exceeds the limit (2048), please input valid parameters."
                logger.critical(msg)
                raise ValueError(msg)
            # use context interface to open profiling, for the new mindspore version (after 2020.5.21)
            self._ascend_profiler = c_expression.AscendProfiler.get_instance()
            self._ascend_profiler.init(self._output_path, int(self._dev_id), profiling_options)
            base_profiling_container_path = os.path.join(self._output_path, "container")
            container_path = os.path.join(base_profiling_container_path, self._dev_id)
            data_path = os.path.join(container_path, "data")
            data_path = validate_and_normalize_path(data_path)
            if not os.path.exists(data_path):
                os.makedirs(data_path, exist_ok=True)
            # add job id env through user input later
            self._job_id_env = 0
        if self.start_profile:
            self.start()
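
    # A construction sketch (illustrative values): on Ascend, the keyword
    # arguments parsed by _parse_parameter_for_ascend below could look like
    #     profiler = Profiler(output_path="./data",
    #                         optypes_not_deal="Variable",
    #                         profile_communication=True,
    #                         profile_memory=True)
    # while on GPU only output_path and start_profile take effect and any other
    # kwargs trigger a warning.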

    def _construct_profiling_options(self):
        """
        Construct profiling options to determine which profiling data should be collected.
        """
        profile_memory = "off"
        if self._profile_memory:
            profile_memory = "on"
        fp_point = os.environ.get("PROFILING_FP_START", "")
        bp_point = os.environ.get("PROFILING_BP_END", "")
        profiling_options = {
            "output": self._output_path,
            "fp_point": fp_point,
            "bp_point": bp_point,
            "training_trace": "on",
            "task_trace": "on",
            "aic_metrics": "ArithmeticUtilization",
            "aicpu": "on",
            "profile_memory": profile_memory
        }
        return profiling_options
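
    # For reference, a sketch of the resulting payload (assuming defaults and no
    # PROFILING_FP_START / PROFILING_BP_END environment variables set):
    #     {"output": "<output_path>", "fp_point": "", "bp_point": "",
    #      "training_trace": "on", "task_trace": "on",
    #      "aic_metrics": "ArithmeticUtilization", "aicpu": "on",
    #      "profile_memory": "off"}
    # __init__ serializes this dict with json.dumps and hands it to the Ascend
    # profiler, subject to the 2048-character limit checked there.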

    def _parse_parameter_for_ascend(self, **kwargs):
        """Parse parameters in Profiler when the device target is Ascend."""
        optypes_not_deal = kwargs.pop("optypes_not_deal", "Variable")
        if not isinstance(optypes_not_deal, str):
            raise TypeError("The parameter optypes_not_deal must be str.")
        self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
        job_dir = kwargs.pop("ascend_job_id", "")
        if job_dir:
            job_dir = validate_and_normalize_path(job_dir)
            if not os.path.exists(job_dir):
                msg = f"Invalid ascend_job_id: {job_dir}, please pass the absolute path of the JOB dir."
                logger.critical(msg)
                raise ValueError(msg)
            self._output_path, _ = os.path.split(job_dir)
        self.start_profile = kwargs.pop("start_profile", True)
        if not isinstance(self.start_profile, bool):
            raise TypeError("The parameter start_profile must be bool.")
        self._profile_communication = kwargs.pop("profile_communication", False)
        if not isinstance(self._profile_communication, bool):
            raise TypeError("The parameter profile_communication must be bool.")
        if self._profile_communication:
            hccl_option = {"output": self._output_path, "task_trace": "on"}
            os.environ['PROFILING_OPTIONS'] = json.dumps(hccl_option)
            if not self.start_profile:
                raise TypeError("The parameter profile_communication can not be True if you want to start "
                                "the profiler in the process of training.")
        self._profile_memory = kwargs.pop("profile_memory", False)
        if not isinstance(self._profile_memory, bool):
            raise TypeError("The parameter profile_memory must be bool.")
        if kwargs:
            logger.warning("There are invalid parameters which don't work.")
        task_sink = os.getenv("GRAPH_OP_RUN")
        if task_sink and task_sink == "1":
            logger.warning("Profiling is not supported when task is not sink.")

    def analyse(self):
        """
        Collect and analyse performance data, called after training or during training. The example is shown above.
        """
        if Profiler._has_analysed:
            msg = "The function analyse() can not be called twice."
            raise RuntimeError(msg)
        Profiler._has_analysed = True
        _environment_check()
        self._cpu_profiler.stop()
        if self._device_target and self._device_target == "GPU":
            self._gpu_analyse()
        elif self._device_target and self._device_target == "Ascend":
            self._ascend_analyse()
        logger.info("Profiling: all the data have been analysed.")

    def _ascend_analyse(self):
        """Collect and analyse ascend performance data."""
        self._rank_size = 1
        if self._profile_communication and not GlobalComm.INITED:
            self._profile_communication = False
        if GlobalComm.INITED:
            self._rank_size = get_group_size()
            release()
        if self._has_started:
            self.stop()
        else:
            logger.info("No need to stop profiler because profiler has been stopped.")
        self._ascend_profiler.finalize()
        job_id = self._get_profiling_job_id()
        logger.info("Profiling: job id is %s ", job_id)
        source_path = os.path.join(self._output_path, job_id)
        # parse hwts.log.data.45.dev file, and get task profiling data
        hwts_output_filename = self._hwts_output_filename_target + self._rank_id + ".txt"
        hwts_output_filename = os.path.join(self._output_path, hwts_output_filename)
        source_path = validate_and_normalize_path(source_path)
        hwts_output_filename = validate_and_normalize_path(hwts_output_filename)
        hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
        logger.info("Profiling: analyzing hwts data.")
        hwtslog_parser.execute()
        # parse Framework file, and get the relation of op and tasks
        framework_parser = FrameworkParser(job_id, self._dev_id, self._rank_id, self._output_path)
        logger.info("Profiling: analyzing framework data.")
        framework_parser.parse()
        op_task_dict = framework_parser.to_task_id_full_op_name_dict()
        if not op_task_dict:
            logger.error("Profiling: fail to parse framework files.")
            return
        # get op compute time from hwts data and framework data, write output_op_compute_time.txt
        opcompute_output_filename = self._opcompute_output_filename_target + self._rank_id + ".txt"
        opcompute_output_filename = os.path.join(self._output_path, opcompute_output_filename)
        opcompute_output_filename = validate_and_normalize_path(opcompute_output_filename)
        optime_parser = OPComputeTimeParser(
            hwts_output_filename, opcompute_output_filename,
            op_task_dict, self._output_path, self._rank_id
        )
        logger.info("Profiling: analyzing the operation compute time.")
        optime_parser.execute()
        # parse DATA_PREPROCESS.dev.AICPU file, write output_data_preprocess_aicpu_x.txt
        output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._rank_id + ".txt"
        output_data_preprocess_aicpu = os.path.join(self._output_path, output_data_preprocess_aicpu)
        output_data_preprocess_aicpu = validate_and_normalize_path(output_data_preprocess_aicpu)
        aicpu_data_parser = DataPreProcessParser(source_path, output_data_preprocess_aicpu)
        logger.info("Profiling: analyzing the data preprocess data.")
        aicpu_data_parser.execute()
        # Parsing minddata AICPU profiling
        logger.info("Profiling: analyzing the minddata AICPU data.")
        MinddataParser.execute(source_path, self._output_path, self._rank_id)
        # parse minddata pipeline operator and queue
        try:
            pipeline_parser = MinddataPipelineParser(self._output_path, self._rank_id, self._output_path)
            logger.info("Profiling: analyzing the minddata pipeline operator and queue.")
            pipeline_parser.parse()
        except ProfilerException as err:
            logger.warning(err.message)
        # Analyze minddata information
        try:
            md_analyzer = MinddataProfilingAnalyzer(self._output_path, self._rank_id, self._output_path)
            logger.info("Profiling: analyzing the minddata information.")
            md_analyzer.analyze()
        except ProfilerException as err:
            logger.warning(err.message)
        # analyse op compute time info
        try:
            logger.info("Profiling: analyzing the operation compute time.")
            self._analyser_op_info()
        except ProfilerException as err:
            logger.warning(err.message)
        # analyse step trace info
        points = None
        is_training_mode_flag = False
        try:
            logger.info("Profiling: analyzing the step trace data.")
            points, is_training_mode_flag = self._analyse_step_trace(source_path, framework_parser)
        except ProfilerException as err:
            logger.warning(err.message)
        # analyse timeline info
        try:
            logger.info("Profiling: analyzing the timeline data.")
            self._analyse_timeline(aicpu_data_parser, optime_parser, source_path)
        except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
            logger.warning('Fail to write timeline data: %s', err)
        # analyse memory usage info
        if self._profile_memory:
            try:
                logger.info("Profiling: analyzing the memory usage info.")
                self._analyse_memory_usage(points)
            except (ProfilerIOException, ProfilerFileNotFoundException, ProfilerRawFileException) as err:
                logger.warning(err.message)
        # analyse hccl profiler info
        if self._profile_communication:
            try:
                logger.info("Profiling: analyzing the hccl profiler info.")
                self._analyse_hccl_info()
            except (ProfilerIOException, ProfilerFileNotFoundException, ProfilerRawFileException) as err:
                logger.warning(err.message)
        # get op FLOPs from aicore.data.x.slice.0 file, compute FLOPS, and write output_op_flops_x.txt
        flops_parser = FlopsParser(source_path, self._output_path, op_task_dict,
                                   self._dev_id, self._rank_id, is_training_mode_flag)
        logger.info("Profiling: analyzing the operation FLOPs.")
        flops_parser.execute()

    def start(self):
        """Start profiling."""
        self._start_time = int(time.time() * 10000000)
        logger.info("Profiling: start time: %d", self._start_time)
        if not self._has_started:
            self._has_started = True
        else:
            raise RuntimeError("The profiler has already started.")
        self._md_profiler.start()
        self._cpu_profiler.step_profiling_enable(True)
        if self._device_target and self._device_target == "GPU":
            self._gpu_profiler.step_profiling_enable(True)
        elif self._device_target and self._device_target == "Ascend":
            self._ascend_profiler.start()

    def stop(self):
        """Stop profiling."""
        if self._has_started:
            self._has_started = False
        else:
            raise RuntimeError("The profiler has not started, so it can not be stopped.")
        self._md_profiler.stop()
        self._md_profiler.save(self._output_path)
        if self._device_target and self._device_target == "GPU":
            self._gpu_profiler.stop()
        elif self._device_target and self._device_target == "Ascend":
            self._ascend_profiler.stop()
        self._stop_time = int(time.time() * 10000000)
        logger.info("Profiling: stop time: %d", self._stop_time)

    def _gpu_analyse(self):
        """Collect and analyse gpu performance data."""
        self._dev_id = context.get_context("device_id")
        if GlobalComm.WORLD_COMM_GROUP == "nccl_world_group":
            self._dev_id = str(get_rank())
        if self._has_started:
            self.stop()
        else:
            logger.info("No need to stop profiler because profiler has been stopped.")
        timeline_generator = self._generate_timeline()
        # parse minddata pipeline operator and queue for GPU
        try:
            pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
            logger.info("Profiling: analyzing the minddata pipeline operator and queue for GPU.")
            pipeline_parser.parse()
        except ProfilerException as err:
            logger.warning(err.message)
        # Analyze minddata information
        try:
            md_analyzer = MinddataProfilingAnalyzer(self._output_path, self._dev_id, self._output_path)
            logger.info("Profiling: analyzing the minddata information.")
            md_analyzer.analyze()
        except ProfilerException as err:
            logger.warning(err.message)
        # analyse step trace info
        try:
            logger.info("Profiling: analyzing the step trace info.")
            self._analyse_step_trace(
                is_training_mode_flag=timeline_generator.check_op_name('Gradients'),
                is_gpu_kernel_async_launch_flag=timeline_generator.is_gpu_kernel_async_launch()
            )
        except ProfilerException as err:
            logger.warning(err.message)
        logger.warning(
            '\nMemory Usage is not supported on GPU currently.\n'
            'Please run on Ascend if you would like to see memory analysis, '
            'otherwise, this warning can be ignored.'
        )

    def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True,
                            is_gpu_kernel_async_launch_flag=False):
        """
        Analyse step trace data and save the result.

        Args:
            source_path (str): The directory that contains the step trace original data.
            framework_parser (FrameworkParser): The framework parse instance.
            is_training_mode_flag (bool): Whether in training mode or not.
        """
        logger.info("Begin to parse step trace.")
        # construct output path
        dev_id = self._rank_id if self._device_target == "Ascend" else self._dev_id
        step_trace_intermediate_file_path = os.path.join(
            self._output_path,
            f'step_trace_raw_{dev_id}_detail_time.csv'
        )
        point_info_file_path = os.path.join(
            self._output_path,
            f'step_trace_point_info_{dev_id}.json'
        )
        step_trace_intermediate_file_path = validate_and_normalize_path(step_trace_intermediate_file_path)
        point_info_file_path = validate_and_normalize_path(point_info_file_path)
        if self._device_target and self._device_target == 'GPU':
            input_file_path = os.path.join(
                self._output_path,
                f'step_trace_profiling_{self._dev_id}.txt'
            )
            parser = GpuStepTraceParser(input_dir=input_file_path,
                                        output_file_path=step_trace_intermediate_file_path,
                                        is_training_mode=is_training_mode_flag,
                                        is_gpu_kernel_async_launch=is_gpu_kernel_async_launch_flag)
            parser.parse_and_save()
            point_info = parser.record_point_info(input_file_path, point_info_file_path)
        else:
            # whether to keep the first step
            skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME)
            point_info = framework_parser.point_info
            # recognize inference or training mode
            is_training_mode_flag = framework_parser.check_op_name("Gradients")
            # parse the step trace files and save the result to disk
            source_path = validate_and_normalize_path(source_path)
            parser = AscendStepTraceParser(input_dir=source_path,
                                           output_file_path=step_trace_intermediate_file_path,
                                           job_id=self._job_id_env,
                                           skip_first_step=skip_first_step_flag,
                                           is_training_mode=is_training_mode_flag)
            parser.update_tag_op_type_map(point_info)
            parser.parse_and_save()
            point_info = parser.record_point_info(point_info, point_info_file_path)
        # print parser result
        parser.show()
        logger.info("Finish saving the intermediate result: %s", step_trace_intermediate_file_path)
        logger.info("The point info is: %s", point_info)
        return point_info, is_training_mode_flag
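
    # Note: the returned (point_info, is_training_mode_flag) pair is consumed by
    # _ascend_analyse above, where point_info feeds _analyse_memory_usage and the
    # flag is forwarded to FlopsParser to separate training from inference runs.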

    def _analyse_timeline(self, aicpu_parser, optime_parser, source_path):
        """
        Analyse and parse timeline info.

        Args:
            aicpu_parser (DataPreProcessParser): The parser instance for AI CPU operator
                execution time calculation.
            optime_parser (OPComputeTimeParser): The parser instance for AI Core
                operator execution time calculation.
        """
        timeline_analyser = AscendTimelineGenerator(self._output_path, self._dev_id, self._rank_id, self._rank_size)
        # Get framework info
        integrator = Integrator(self._output_path, self._rank_id)
        aicore_detail_data = integrator.get_aicore_detail_data()
        aicore_detail_data_size = len(aicore_detail_data)
        col_names = ['op_name', 'op_type', 'avg_execution_time', 'subgraph',
                     'full_op_name', 'op_info']
        framework_info = {
            'col_name': col_names,
            'object': aicore_detail_data,
            'size': aicore_detail_data_size
        }
        all_reduce_info = integrator.query_for_all_reduce()
        # Get timeline info
        logger.info('Start writing timeline info...')
        logger.info('Warm Prompt: It could take a few minutes if you are training '
                    'with a complex network or more than 10 steps.')
        # Add info into timeline, such as AI CPU, AllReduce, and framework info.
        aicpu_info = aicpu_parser.query_aicpu_data()
        min_cycle_counter = min(aicpu_parser.min_cycle_counter, optime_parser.min_cycle_counter)
        timeline_analyser.init_timeline(all_reduce_info, framework_info, aicpu_info,
                                        min_cycle_counter, source_path)
        size_limit = 100 * 1024 * 1024  # 100MB
        timeline_analyser.write_timeline(size_limit)
        timeline_analyser.write_timeline_summary()
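
    # The timeline files written here use the Chrome trace event format, so
    # they can typically be inspected by loading them into chrome://tracing or
    # through the MindInsight timeline view.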

    def _generate_timeline(self):
        """Used for GPU: generate timeline info and write it to a json format file."""
        try:
            size_limit = 100 * 1024 * 1024  # 100MB
            timeline_generator = GpuTimelineGenerator(self._output_path, self._dev_id)
            timeline_generator.init_timeline()
            timeline_generator.write_timeline(size_limit)
            timeline_generator.write_timeline_summary()
            return timeline_generator
        except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
            logger.warning('Fail to write timeline data: %s', err)
            raise RuntimeError('Fail to write timeline data.')

    def _analyse_memory_usage(self, points):
        """Analyse memory usage data."""
        integrator = Integrator(self._output_path, self._rank_id)
        aicore_detail_data = integrator.get_aicore_detail_data()
        memory_parser = MemoryUsageParser(self._output_path, self._rank_id)
        memory_parser.init_memory_usage_info(aicore_detail_data, points)
        memory_parser.write_memory_files()

    def _get_profiling_job_id(self):
        """Get profiling job id, which was generated by the ada service.

        Returns:
            str, profiling job id.
        """
        job_id = ""
        job_dirs = filter(lambda item: item.startswith('JOB') and os.path.isdir(os.path.join(self._output_path, item)),
                          os.listdir(self._output_path))
        sorted_job_dirs = sorted(job_dirs, key=lambda x: os.path.getmtime(os.path.join(self._output_path, x)),
                                 reverse=True)
        for dir_name in sorted_job_dirs:
            job_dir = os.path.join(self._output_path, dir_name)
            host_start_file_path = get_file_path(job_dir, "host_start.log")
            if host_start_file_path is None:
                logger.warning("Found profiling job path %s, but host_start.log does not exist, "
                               "profiler will ignore this job dir.", job_dir)
                continue
            training_device_id = host_start_file_path.split('.')[-1]
            if self._dev_id != training_device_id:
                logger.warning("Found profiling job path %s, but it does not belong to the current training device. "
                               "The current training device id is %s, but the job path device id is %s; "
                               "profiler will ignore this job dir.", job_dir, self._dev_id, training_device_id)
                continue
            job_start_time = self._parse_host_start_log(host_start_file_path)
            if not job_start_time:
                logger.warning("Found profiling job path %s, but fail to get job start info, "
                               "profiler will ignore this job dir.", job_dir)
                continue
            if int(job_start_time) < self._start_time:
                logger.warning("Found profiling job path %s, but its start_time(%d) is earlier than this training "
                               "start_time(%d), profiler will ignore this job dir.",
                               job_dir, int(job_start_time), self._start_time)
                continue
            job_id = dir_name
            break
        if not job_id:
            msg = "Fail to get the profiling job, output path is {}, " \
                  "please check whether a job dir (name starting with JOB) was generated in the output path, " \
                  "or maybe the device id from the job dir mismatches the " \
                  "device_id in the current process.".format(self._output_path)
            raise RuntimeError(msg)
        return job_id

    @staticmethod
    def _parse_host_start_log(input_file):
        """
        Parse host start log file, get the start time of the job.

        Args:
            input_file (str): The file path of the host start log file.

        Returns:
            str, job start time.
        """
        job_start_time = ""
        with open(input_file) as f:
            for line in f.readlines():
                if "clock_realtime" in line:
                    # 16 means the first digit of the timestamp, len(line)-3 means the last.
                    job_start_time = line[16:len(line) - 3]
        return job_start_time
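
    # The slice above encodes a fixed-layout assumption (inferred from the code,
    # not from any log specification): the first 16 characters of the
    # "clock_realtime" line are a constant prefix, and the final three characters
    # (including the trailing newline) are dropped, leaving the timestamp digits.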

    def _analyser_op_info(self):
        """Analyse the operator information."""
        integrator = Integrator(self._output_path, self._rank_id)
        integrator.integrate()
        aicore_type_result = self._query_op_type_info()
        detail_file_path = os.path.join(
            self._output_path,
            'output_op_compute_time_detail_{}.txt'.format(self._rank_id)
        )
        fwrite_format(detail_file_path, data_source='title:op compute time')
        display_names = [
            'optype_name', 'compute_time(ms, per-step)',
            'called_times(per-step)', 'percent'
        ]
        fwrite_format(detail_file_path, data_source=" ".join(display_names), is_print=True)
        fwrite_format(detail_file_path, data_source=aicore_type_result, is_print=True)
        op_type_order = [item[0] for item in aicore_type_result]
        aicore_detail_result = self._query_op_detail_info(op_type_order)
        fwrite_format(detail_file_path, data_source='', is_print=True)
        fwrite_format(detail_file_path, data_source='Detail:', is_print=True)
        fwrite_format(detail_file_path, data_source=" ".join(aicore_detail_result.get('col_name_detail')),
                      is_print=True)
        fwrite_format(detail_file_path, data_source=aicore_detail_result.get('object'), is_print=True)

    def _query_op_type_info(self):
        """
        Query AICORE operator type information.

        Returns:
            list[list], the AICORE operator type and execution time information.
        """
        integrator = Integrator(self._output_path, self._rank_id)
        return integrator.get_aicore_data()

    def _query_op_detail_info(self, op_type_order):
        """
        Query AICORE operator detail information.

        Args:
            op_type_order (list): The name of the op type in order.

        Returns:
            dict, the AICORE operator detail information.
        """
        op_type_condition = {}
        if self._filt_optype_names:
            op_type_condition['not_in'] = self._filt_optype_names
        filter_condition = {
            'op_type': op_type_condition,
            'is_display_detail': False,
        }
        integrator = Integrator(self._output_path, self._rank_id)
        return integrator.query_and_sort_by_op_type(filter_condition, op_type_order)

    def _get_devid_rankid_and_devtarget(self):
        """Get device id and rank id and target of this training."""
        device_target = ""
        dev_id = ""
        rank_id = ""
        try:
            dev_id = str(context.get_context("device_id"))
            device_target = context.get_context("device_target")
        except ValueError as err:
            logger.error("Profiling: fail to get context, %s", err)
        if not dev_id or not dev_id.isdigit():
            dev_id = os.getenv('DEVICE_ID')
        if not dev_id or not dev_id.isdigit():
            dev_id = "0"
            logger.warning("Fail to get DEVICE_ID, use 0 instead.")
        if device_target and device_target not in ["Ascend", "GPU", "CPU"]:
            msg = "Profiling: unsupported backend: %s" % device_target
            raise RuntimeError(msg)
        rank_id = os.getenv("RANK_ID")
        if not rank_id or not rank_id.isdigit():
            rank_id = "0"
            logger.info("Fail to get RANK_ID, use 0 instead.")
        self._dev_id = dev_id
        self._device_target = device_target
        self._rank_id = rank_id

    def _get_output_path(self, kwargs):
        """Get output path of profiling data."""
        if os.getenv("MS_DIAGNOSTIC_DATA_PATH") and kwargs.get("output_path") is not None:
            logger.warning("Both the parameter output_path and the environment variable MS_DIAGNOSTIC_DATA_PATH"
                           " have values set, and the profiling data saving path is the value set "
                           "in the parameter output_path.")
        if kwargs.get("output_path") is None:
            if "output_path" in kwargs:
                kwargs.pop("output_path")
            # Environment variables are mainly set for the convenience of the cloud profiler.
            output_path = os.getenv("MS_DIAGNOSTIC_DATA_PATH")
            if output_path:
                self._output_path = validate_and_normalize_path(output_path)
            else:
                output_path = "data"
                self._output_path = validate_and_normalize_path(output_path)
        else:
            output_path = kwargs.pop("output_path")
            self._output_path = validate_and_normalize_path(output_path)
        self._output_path = os.path.join(self._output_path, "profiler")
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path, exist_ok=True)
            os.chmod(self._output_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
        else:
            logger.warning("The target dir already exists. "
                           "There may be some old profiling data, and they will be overwritten in the end.")

    def _analyse_hccl_info(self):
        """Analyse hccl info."""
        hccl_path = os.path.join(self._output_path, "hccl_info_{}".format(self._rank_id))
        if not os.path.exists(hccl_path):
            os.makedirs(hccl_path, exist_ok=True)
            os.chmod(hccl_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
        logger.info("Start calling the interface HCCLParseOP to parse hccl info...")
        logger.info('Warm Prompt: It could take a few minutes if you are training '
                    'with a complex network or more than 10 steps.')
        # Call the interface HCCLParseOP to parse hccl info.
        try:
            from hccl_parser.entry import hccl_parse_op
            hccl_parse_op(self._dev_id, self._output_path, hccl_path, op_type='all')
        except ImportError as err:
            logger.critical("%s, please check if the hccl_parser-{version}-py3-none-any.whl is installed. "
                            "The hccl_parser-{version}-py3-none-any.whl package is usually located "
                            "in the /usr/local/Ascend/tools directory.", err)
            raise ImportError(err)
        logger.info("Parse hccl info successfully.")
        logger.info("Start analysing hccl info.")
        hccl_parse = HcclParser(hccl_path, self._dev_id, self._rank_id, self._output_path)
        hccl_parse.parse()
        logger.info("Analyse hccl info successfully.")

    @staticmethod
    def profile(network, profile_option):
        """
        Get the number of trainable parameters in the training network.

        Args:
            network (Cell): The training network.
            profile_option (ProfileOption): The profile option.

        Returns:
            dict, the key is the option name and the value is the result of the option.
        """
        result = dict()
        if not profile_option:
            raise ValueError("The parameter profile_option must pass a value using ProfileOption.")
        if profile_option == ProfileOption.trainable_parameters:
            if not isinstance(network, Cell):
                msg = "Profiling: The network should be an object of nn.Cell."
                raise ValueError(msg)
            param_nums = len(network.parameters_dict())
            result = {"trainable_parameters": param_nums}
        else:
            raise ValueError("Wrong options.")
        return result
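
# A usage sketch for the static helper above (illustrative; assumes `net` is an
# nn.Cell instance):
#     result = Profiler.profile(net, ProfileOption.trainable_parameters)
#     # result == {"trainable_parameters": len(net.parameters_dict())}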