| @@ -1,4 +1,4 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| @@ -83,7 +83,6 @@ class Profiler: | |||
| output_path = kwargs.pop("output_path", f"data-{format_time}") | |||
| self._output_path = validate_and_normalize_path(output_path) | |||
| self._output_path = os.path.join(self._output_path, f"profiler-{format_time}") | |||
| self._base_profiling_container_path = os.path.join(self._output_path, "container") | |||
| if not os.path.exists(self._output_path): | |||
| os.makedirs(self._output_path, exist_ok=True) | |||
| os.chmod(self._output_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR) | |||
| @@ -109,7 +108,14 @@ class Profiler: | |||
| optypes_not_deal = kwargs.pop("optypes_not_deal", "Variable") | |||
| if not isinstance(optypes_not_deal, str): | |||
| raise TypeError("The parameter optypes_not_deal must be str.") | |||
| job_id = kwargs.pop("ascend_job_id", "") | |||
| job_dir = kwargs.pop("ascend_job_id", "") | |||
| if job_dir: | |||
| job_dir = validate_and_normalize_path(job_dir) | |||
| if not os.path.exists(job_dir): | |||
| msg = f"Invalid ascend_job_id: {job_dir}, Please pass the absolute path of the JOB dir" | |||
| logger.error(msg) | |||
| raise ValueError(msg) | |||
| self._output_path, _ = os.path.split(job_dir) | |||
| if kwargs: | |||
| logger.warning("There are invalid params which don't work.") | |||
| @@ -130,18 +136,19 @@ class Profiler: | |||
| profiling_options = json.dumps(profiling_options) | |||
| # Characters beyond the 2048-character limit are ignored, resulting in profiling option resolution errors | |||
| if len(profiling_options) > 2048: | |||
| raise ValueError("The parameter length exceeds the limit (2048), please input valid parameters.") | |||
| msg = "The parameter length exceeds the limit (2048), please input valid parameters." | |||
| logger.error(msg) | |||
| raise ValueError(msg) | |||
| # Use the context interface to enable profiling, for newer MindSpore versions (after 2020.5.21) | |||
| context.set_context(enable_profiling=True, profiling_options=profiling_options) | |||
| self._container_path = os.path.join(self._base_profiling_container_path, self._dev_id) | |||
| data_path = os.path.join(self._container_path, "data") | |||
| base_profiling_container_path = os.path.join(self._output_path, "container") | |||
| container_path = os.path.join(base_profiling_container_path, self._dev_id) | |||
| data_path = os.path.join(container_path, "data") | |||
| data_path = validate_and_normalize_path(data_path) | |||
| if not os.path.exists(data_path): | |||
| os.makedirs(data_path, exist_ok=True) | |||
| self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else [] | |||
| self._profiling_job_id = job_id | |||
| # add job id env through user input later | |||
| self._job_id_env = 0 | |||
| self._start_time = int(time.time() * 10000000) | |||
| @@ -362,27 +369,27 @@ class Profiler: | |||
| """Get profiling job id, which was generated by ada service. | |||
| Returns: | |||
| str: profiling jon id. | |||
| str, profiling job id. | |||
| """ | |||
| if self._profiling_job_id: | |||
| return self._profiling_job_id | |||
| job_id = "" | |||
| for item in os.listdir(self._output_path): | |||
| if item.startswith('JOB'): | |||
| path = os.path.join(self._output_path, item) | |||
| job_id = item | |||
| log_file = get_file_names(path, "host_start.log") | |||
| if not log_file: | |||
| logger.error("Profiling: job path %s, host_start.log not exist.", path) | |||
| break | |||
| log_file = os.path.join(path, log_file[0]) | |||
| item_dict = self._parse_host_start_log(log_file) | |||
| if not item_dict: | |||
| logger.error("Profiling: job path %s, fail to get job start info.", path) | |||
| break | |||
| job_id = item | |||
| if self._dev_id != item_dict["device_id"]: | |||
| logger.info("Profiling: job path %s, dev id %s, training device id %s.", | |||
| @@ -391,7 +398,6 @@ class Profiler: | |||
| if self._start_time > int(item_dict["start_time"]): | |||
| logger.info("Profiling: job path %s, start_time %s, training start_time %d.", | |||
| path, item_dict["start_time"], self._start_time) | |||
| break | |||
| if not job_id: | |||