| @@ -22,6 +22,7 @@ from enum import Enum | |||
| from mindspore import log as logger, context | |||
| from mindspore.communication.management import release, get_rank | |||
| import mindspore._c_expression as c_expression | |||
| from mindspore.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \ | |||
| ProfilerIOException, ProfilerException, ProfilerRawFileException | |||
| from mindspore.profiler.common.util import get_file_names, fwrite_format | |||
| @@ -86,18 +87,18 @@ class Profiler: | |||
| os.environ['MINDDATA_PROFILING_DIR'] = self._output_path | |||
| if self._device_target: | |||
| from mindspore._c_expression import CPUProfiler | |||
| CPUProfiler = c_expression.CPUProfiler | |||
| self._cpu_profiler = CPUProfiler.get_instance() | |||
| self._cpu_profiler.init(self._output_path) | |||
| self._cpu_profiler.step_profiling_enable(True) | |||
| if self._device_target and self._device_target == "GPU": | |||
| from mindspore._c_expression import GPUProfiler | |||
| GPUProfiler = c_expression.GPUProfiler | |||
| self._gpu_profiler = GPUProfiler.get_instance() | |||
| self._gpu_profiler.init(self._output_path) | |||
| self._gpu_profiler.step_profiling_enable(True) | |||
| if context.get_auto_parallel_context('device_num') > 1: | |||
| self._dev_id = get_rank() | |||
| os.environ['DEVICE_ID'] = str(self._dev_id) | |||
| self._dev_id = str(get_rank()) | |||
| os.environ['DEVICE_ID'] = self._dev_id | |||
| if kwargs: | |||
| logger.warning("Params not be supported yet on GPU.") | |||
| @@ -253,8 +254,8 @@ class Profiler: | |||
| def _gpu_analyse(self): | |||
| """Collect and analyse gpu performance data""" | |||
| if context.get_auto_parallel_context('device_num') > 1 and self._dev_id != get_rank(): | |||
| self._dev_id = get_rank() | |||
| if context.get_auto_parallel_context('device_num') > 1 and self._dev_id != str(get_rank()): | |||
| self._dev_id = str(get_rank()) | |||
| logger.error('Please check the Profiler object initialized after set_auto_parallel_context() ' | |||
| 'and init(). Profiler should be initialized after these code. ') | |||
| self._gpu_profiler.stop() | |||
| @@ -403,6 +404,7 @@ class Profiler: | |||
| """ | |||
| job_id = "" | |||
| for item in os.listdir(self._output_path): | |||
| if item.startswith('JOB'): | |||
| path = os.path.join(self._output_path, item) | |||
| @@ -410,25 +412,23 @@ class Profiler: | |||
| log_file = get_file_names(path, "host_start.log") | |||
| if not log_file: | |||
| logger.error("Profiling: job path %s, host_start.log not exist.", path) | |||
| continue | |||
| training_device_id = log_file[0].split('.')[-1] | |||
| if self._dev_id == training_device_id: | |||
| log_file = os.path.join(path, log_file[0]) | |||
| job_start_time = self._parse_host_start_log(log_file) | |||
| if not job_start_time: | |||
| logger.error("Profiling: job path %s, fail to get job start info.", path) | |||
| break | |||
| job_id = item | |||
| if self._start_time > int(job_start_time): | |||
| logger.info("Profiling: job path %s, start_time %s, training start_time %d.", | |||
| path, job_start_time, self._start_time) | |||
| break | |||
| log_file = os.path.join(path, log_file[0]) | |||
| item_dict = self._parse_host_start_log(log_file) | |||
| if not item_dict: | |||
| logger.error("Profiling: job path %s, fail to get job start info.", path) | |||
| break | |||
| job_id = item | |||
| if self._dev_id != item_dict["device_id"]: | |||
| else: | |||
| logger.info("Profiling: job path %s, dev id %s, training device id %s.", | |||
| path, item_dict["device_id"], self._dev_id) | |||
| if self._start_time > int(item_dict["start_time"]): | |||
| logger.info("Profiling: job path %s, start_time %s, training start_time %d.", | |||
| path, item_dict["start_time"], self._start_time) | |||
| break | |||
| path, training_device_id, self._dev_id) | |||
| if not job_id: | |||
| msg = "Fail to get profiling job, please check whether job dir was generated" | |||
| @@ -438,23 +438,23 @@ class Profiler: | |||
| def _parse_host_start_log(self, input_file): | |||
| """ | |||
| Parse host start log file, get the device id and start time of the job. | |||
| Parse host start log file, get the start time of the job. | |||
| Args: | |||
| input_file (str): The file path of the host start log file. | |||
| Returns: | |||
| dict, job start time and device id. | |||
| str, job start time. | |||
| """ | |||
| item_dict = {} | |||
| for line in open(input_file): | |||
| if "Device" in line: | |||
| item_dict["device_id"] = line[7:len(line)-2] | |||
| elif "clock_realtime" in line: | |||
| item_dict["start_time"] = line[16:len(line)-3] | |||
| job_start_time = "" | |||
| with open(input_file) as f: | |||
| for line in f.readlines(): | |||
| if "clock_realtime" in line: | |||
| # 16 means the first digit of the timestamp, len(line)-3 means the last. | |||
| job_start_time = line[16:len(line)-3] | |||
| return item_dict | |||
| return job_start_time | |||
| def _analyser_op_info(self): | |||
| """Analyse the operator information.""" | |||