| @@ -22,6 +22,7 @@ from enum import Enum | |||||
| from mindspore import log as logger, context | from mindspore import log as logger, context | ||||
| from mindspore.communication.management import release, get_rank | from mindspore.communication.management import release, get_rank | ||||
| import mindspore._c_expression as c_expression | |||||
| from mindspore.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \ | from mindspore.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \ | ||||
| ProfilerIOException, ProfilerException, ProfilerRawFileException | ProfilerIOException, ProfilerException, ProfilerRawFileException | ||||
| from mindspore.profiler.common.util import get_file_names, fwrite_format | from mindspore.profiler.common.util import get_file_names, fwrite_format | ||||
| @@ -86,18 +87,18 @@ class Profiler: | |||||
| os.environ['MINDDATA_PROFILING_DIR'] = self._output_path | os.environ['MINDDATA_PROFILING_DIR'] = self._output_path | ||||
| if self._device_target: | if self._device_target: | ||||
| from mindspore._c_expression import CPUProfiler | |||||
| CPUProfiler = c_expression.CPUProfiler | |||||
| self._cpu_profiler = CPUProfiler.get_instance() | self._cpu_profiler = CPUProfiler.get_instance() | ||||
| self._cpu_profiler.init(self._output_path) | self._cpu_profiler.init(self._output_path) | ||||
| self._cpu_profiler.step_profiling_enable(True) | self._cpu_profiler.step_profiling_enable(True) | ||||
| if self._device_target and self._device_target == "GPU": | if self._device_target and self._device_target == "GPU": | ||||
| from mindspore._c_expression import GPUProfiler | |||||
| GPUProfiler = c_expression.GPUProfiler | |||||
| self._gpu_profiler = GPUProfiler.get_instance() | self._gpu_profiler = GPUProfiler.get_instance() | ||||
| self._gpu_profiler.init(self._output_path) | self._gpu_profiler.init(self._output_path) | ||||
| self._gpu_profiler.step_profiling_enable(True) | self._gpu_profiler.step_profiling_enable(True) | ||||
| if context.get_auto_parallel_context('device_num') > 1: | if context.get_auto_parallel_context('device_num') > 1: | ||||
| self._dev_id = get_rank() | |||||
| os.environ['DEVICE_ID'] = str(self._dev_id) | |||||
| self._dev_id = str(get_rank()) | |||||
| os.environ['DEVICE_ID'] = self._dev_id | |||||
| if kwargs: | if kwargs: | ||||
| logger.warning("Params not be supported yet on GPU.") | logger.warning("Params not be supported yet on GPU.") | ||||
| @@ -253,8 +254,8 @@ class Profiler: | |||||
| def _gpu_analyse(self): | def _gpu_analyse(self): | ||||
| """Collect and analyse gpu performance data""" | """Collect and analyse gpu performance data""" | ||||
| if context.get_auto_parallel_context('device_num') > 1 and self._dev_id != get_rank(): | |||||
| self._dev_id = get_rank() | |||||
| if context.get_auto_parallel_context('device_num') > 1 and self._dev_id != str(get_rank()): | |||||
| self._dev_id = str(get_rank()) | |||||
| logger.error('Please check the Profiler object initialized after set_auto_parallel_context() ' | logger.error('Please check the Profiler object initialized after set_auto_parallel_context() ' | ||||
| 'and init(). Profiler should be initialized after these code. ') | 'and init(). Profiler should be initialized after these code. ') | ||||
| self._gpu_profiler.stop() | self._gpu_profiler.stop() | ||||
| @@ -403,6 +404,7 @@ class Profiler: | |||||
| """ | """ | ||||
| job_id = "" | job_id = "" | ||||
| for item in os.listdir(self._output_path): | for item in os.listdir(self._output_path): | ||||
| if item.startswith('JOB'): | if item.startswith('JOB'): | ||||
| path = os.path.join(self._output_path, item) | path = os.path.join(self._output_path, item) | ||||
| @@ -410,25 +412,23 @@ class Profiler: | |||||
| log_file = get_file_names(path, "host_start.log") | log_file = get_file_names(path, "host_start.log") | ||||
| if not log_file: | if not log_file: | ||||
| logger.error("Profiling: job path %s, host_start.log not exist.", path) | logger.error("Profiling: job path %s, host_start.log not exist.", path) | ||||
| continue | |||||
| training_device_id = log_file[0].split('.')[-1] | |||||
| if self._dev_id == training_device_id: | |||||
| log_file = os.path.join(path, log_file[0]) | |||||
| job_start_time = self._parse_host_start_log(log_file) | |||||
| if not job_start_time: | |||||
| logger.error("Profiling: job path %s, fail to get job start info.", path) | |||||
| break | |||||
| job_id = item | |||||
| if self._start_time > int(job_start_time): | |||||
| logger.info("Profiling: job path %s, start_time %s, training start_time %d.", | |||||
| path, job_start_time, self._start_time) | |||||
| break | break | ||||
| log_file = os.path.join(path, log_file[0]) | |||||
| item_dict = self._parse_host_start_log(log_file) | |||||
| if not item_dict: | |||||
| logger.error("Profiling: job path %s, fail to get job start info.", path) | |||||
| break | |||||
| job_id = item | |||||
| if self._dev_id != item_dict["device_id"]: | |||||
| else: | |||||
| logger.info("Profiling: job path %s, dev id %s, training device id %s.", | logger.info("Profiling: job path %s, dev id %s, training device id %s.", | ||||
| path, item_dict["device_id"], self._dev_id) | |||||
| if self._start_time > int(item_dict["start_time"]): | |||||
| logger.info("Profiling: job path %s, start_time %s, training start_time %d.", | |||||
| path, item_dict["start_time"], self._start_time) | |||||
| break | |||||
| path, training_device_id, self._dev_id) | |||||
| if not job_id: | if not job_id: | ||||
| msg = "Fail to get profiling job, please check whether job dir was generated" | msg = "Fail to get profiling job, please check whether job dir was generated" | ||||
| @@ -438,23 +438,23 @@ class Profiler: | |||||
| def _parse_host_start_log(self, input_file): | def _parse_host_start_log(self, input_file): | ||||
| """ | """ | ||||
| Parse host start log file, get the device id and start time of the job. | |||||
| Parse host start log file, get the start time of the job. | |||||
| Args: | Args: | ||||
| input_file (str): The file path of the host start log file. | input_file (str): The file path of the host start log file. | ||||
| Returns: | Returns: | ||||
| dict, job start time and device id. | |||||
| str, job start time. | |||||
| """ | """ | ||||
| item_dict = {} | |||||
| for line in open(input_file): | |||||
| if "Device" in line: | |||||
| item_dict["device_id"] = line[7:len(line)-2] | |||||
| elif "clock_realtime" in line: | |||||
| item_dict["start_time"] = line[16:len(line)-3] | |||||
| job_start_time = "" | |||||
| with open(input_file) as f: | |||||
| for line in f.readlines(): | |||||
| if "clock_realtime" in line: | |||||
| # 16 means the first digit of the timestamp, len(line)-3 means the last. | |||||
| job_start_time = line[16:len(line)-3] | |||||
| return item_dict | |||||
| return job_start_time | |||||
| def _analyser_op_info(self): | def _analyser_op_info(self): | ||||
| """Analyse the operator information.""" | """Analyse the operator information.""" | ||||