From: @gzhcv | Reviewed-by: | Signed-off-by: | tags/v1.1.0
| @@ -214,11 +214,8 @@ class _Context: | |||
| self.set_param(ms_ctx_param.max_call_depth, max_call_depth) | |||
| def set_profiling_options(self, option): | |||
| options = ["training_trace", "task_trace", | |||
| "task_trace:training_trace", "training_trace:task_trace", "op_trace"] | |||
| if option not in options: | |||
| raise ValueError("Profiling options must be in 'training_trace' 'task_trace' " | |||
| "'task_trace:training_trace' 'training_trace:task_trace' or 'op_trace'.") | |||
| if not isinstance(option, str): | |||
| raise TypeError("The parameter option must be str.") | |||
| self.set_param(ms_ctx_param.profiling_options, option) | |||
| def set_variable_memory_max_size(self, variable_memory_max_size): | |||
| @@ -174,7 +174,6 @@ class FrameworkParser: | |||
| device_id (str): The device ID. | |||
| output_path (str): The directory of the parsed file. Default: `./`. | |||
| """ | |||
| _raw_data_dir = '/var/log/npu/profiling' | |||
| _regex_framework = r'Framework\.(?P<data_type>.+)\.(?P<device_id>\d).+' | |||
| _regex_framework_in_data = r'Framework\.(?P<data_type>.+)\.' \ | |||
| r'(?P<device_id>\d)\.(?P<profiling_id>[a-zA-Z0-9]+).+' | |||
| @@ -193,6 +192,7 @@ class FrameworkParser: | |||
| _task_id_threshold = 25000 | |||
| def __init__(self, profiling_id, device_id, output_path='./'): | |||
| self._raw_data_dir = output_path | |||
| self._profiling_path = self._get_raw_profiling_path(profiling_id) | |||
| self._backend_type = None | |||
| self._framework_path = {'graph': [], 'task': [], 'point': []} | |||
| @@ -16,6 +16,7 @@ | |||
| import os | |||
| import stat | |||
| import time | |||
| import json | |||
| from enum import Enum | |||
| from mindspore import log as logger, context | |||
| @@ -37,7 +38,6 @@ from mindspore.profiler.parser.optime_parser import OPComputeTimeParser | |||
| from mindspore.profiler.parser.step_trace_parser import GpuStepTraceParser, AscendStepTraceParser | |||
| from mindspore.nn.cell import Cell | |||
| PROFILING_LOG_BASE_PATH = "/var/log/npu/profiling" | |||
| INIT_OP_NAME = 'Default/InitDataSetQueue' | |||
| class ProfileOption(Enum): | |||
| @@ -72,7 +72,6 @@ class Profiler: | |||
| >>> profiler.analyse() | |||
| """ | |||
| _base_profiling_container_path = "/var/log/npu/profiling/container" | |||
| _hwts_output_filename_target = "output_format_data_hwts_" | |||
| _opcompute_output_filename_target = "output_op_compute_time_" | |||
| _aicpu_op_output_filename_target = "output_data_preprocess_aicpu_" | |||
| @@ -80,9 +79,11 @@ class Profiler: | |||
| def __init__(self, **kwargs): | |||
| # get device_id and device_target | |||
| self._get_devid_and_devtarget() | |||
| output_path = kwargs.pop("output_path", "./data") | |||
| format_time = int(time.time()) | |||
| output_path = kwargs.pop("output_path", f"data-{format_time}") | |||
| self._output_path = validate_and_normalize_path(output_path) | |||
| self._output_path = os.path.join(self._output_path, "profiler") | |||
| self._output_path = os.path.join(self._output_path, f"profiler-{format_time}") | |||
| self._base_profiling_container_path = os.path.join(self._output_path, "container") | |||
| if not os.path.exists(self._output_path): | |||
| os.makedirs(self._output_path, exist_ok=True) | |||
| os.chmod(self._output_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR) | |||
| @@ -113,10 +114,25 @@ class Profiler: | |||
| logger.warning("There are invalid params which don't work.") | |||
| os.environ['DEVICE_ID'] = self._dev_id | |||
| os.environ['AICPU_PROFILING_MODE'] = 'true' | |||
| fp_point = os.environ.get("PROFILING_FP_START", "") | |||
| bp_point = os.environ.get("PROFILING_BP_END", "") | |||
| profiling_options = { | |||
| "result_path": self._output_path, | |||
| "fp_point": fp_point, | |||
| "bp_point": bp_point, | |||
| "training_trace": "on", | |||
| "task_trace": "on", | |||
| "ai_core_metrics": "PipeUtilization", | |||
| "aicpu_trace": "on" | |||
| } | |||
| profiling_options = json.dumps(profiling_options) | |||
| # Characters beyond the 2048-character limit are ignored, which would cause profiling option parsing errors | |||
| if len(profiling_options) > 2048: | |||
| raise ValueError("The parameter length exceeds the limit (2048)") | |||
| # Use the context interface to enable profiling, for newer MindSpore versions (after 2020.5.21) | |||
| context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace") | |||
| context.set_context(enable_profiling=True, profiling_options=profiling_options) | |||
| self._container_path = os.path.join(self._base_profiling_container_path, self._dev_id) | |||
| data_path = os.path.join(self._container_path, "data") | |||
| @@ -174,7 +190,7 @@ class Profiler: | |||
| job_id = self._get_profiling_job_id() | |||
| logger.info("Profiling: job id is %s ", job_id) | |||
| source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id) | |||
| source_path = os.path.join(self._output_path, job_id) | |||
| # parse hwts.log.data.45.dev file, and get task profiling data | |||
| hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt" | |||
| hwts_output_filename = os.path.join(self._output_path, hwts_output_filename) | |||
| @@ -353,12 +369,12 @@ class Profiler: | |||
| return self._profiling_job_id | |||
| job_id = "" | |||
| cmd = "ls -t " + PROFILING_LOG_BASE_PATH + "|grep JOB|awk '{print $1}'" | |||
| cmd = "ls -t " + self._output_path + "|grep JOB|awk '{print $1}'" | |||
| r = os.popen(cmd) | |||
| profiling_job_dirs = r.readlines() | |||
| r.close() | |||
| for item in profiling_job_dirs: | |||
| path = os.path.join(PROFILING_LOG_BASE_PATH, item.strip()) | |||
| path = os.path.join(self._output_path, item.strip()) | |||
| log_file = get_file_names(path, "host_start.log") | |||
| if not log_file: | |||
| logger.error("Profiling: job path %s, host_start.log not exist.", path) | |||
| @@ -128,7 +128,6 @@ def cleanup(): | |||
| class TestProfiler: | |||
| device_id = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0 | |||
| mnist_path = '/home/workspace/mindspore_dataset/mnist' | |||
| profiler_path = os.path.join(os.getcwd(), 'data/profiler/') | |||
| @classmethod | |||
| def teardown_class(cls): | |||
| @@ -140,7 +139,9 @@ class TestProfiler: | |||
| @pytest.mark.env_onecard | |||
| def test_gpu_profiler(self): | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="GPU") | |||
| profiler = Profiler() | |||
| profiler = Profiler(output_path='data') | |||
| profiler_name = os.listdir(os.path.join(os.getcwd(), 'data'))[0] | |||
| self.profiler_path = os.path.join(os.getcwd(), f'data/{profiler_name}/') | |||
| ds_train = create_dataset(os.path.join(self.mnist_path, "train")) | |||
| if ds_train.get_dataset_size() == 0: | |||
| raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | |||
| @@ -49,13 +49,15 @@ class TestFrameworkParser: | |||
| """Test the class of `FrameworkParser`.""" | |||
| def setup_method(self): | |||
| """Initialization before test case execution.""" | |||
| with mock.patch.object(FrameworkParser, '_raw_data_dir', RAW_DATA_BASE): | |||
| self._output_path_1 = tempfile.mkdtemp(prefix='test_framework_parser_') | |||
| self._parser_1 = FrameworkParser('JOB1', '0', self._output_path_1) | |||
| self._output_path_2 = tempfile.mkdtemp(prefix='test_framework_parser_') | |||
| self._parser_2 = FrameworkParser('JOB2', '0', self._output_path_2) | |||
| self._output_path_4 = tempfile.mkdtemp(prefix='test_framework_parser_') | |||
| self._parser_4 = FrameworkParser('JOB4', '0', self._output_path_4) | |||
| self._output_path_1 = tempfile.NamedTemporaryFile(prefix='test_framework_parser_').name | |||
| shutil.copytree(RAW_DATA_BASE, self._output_path_1) | |||
| self._parser_1 = FrameworkParser('JOB1', '0', self._output_path_1) | |||
| self._output_path_2 = tempfile.NamedTemporaryFile(prefix='test_framework_parser_').name | |||
| shutil.copytree(RAW_DATA_BASE, self._output_path_2) | |||
| self._parser_2 = FrameworkParser('JOB2', '0', self._output_path_2) | |||
| self._output_path_4 = tempfile.NamedTemporaryFile(prefix='test_framework_parser_').name | |||
| shutil.copytree(RAW_DATA_BASE, self._output_path_4) | |||
| self._parser_4 = FrameworkParser('JOB4', '0', self._output_path_4) | |||
| def teardown_method(self) -> None: | |||
| """Clear up after test case execution.""" | |||
| @@ -15,6 +15,7 @@ | |||
| """ test_context """ | |||
| import os | |||
| import shutil | |||
| import json | |||
| import pytest | |||
| from mindspore import context | |||
| @@ -94,14 +95,18 @@ def test_profiling_options(): | |||
| context.set_context(profiling_options=True) | |||
| with pytest.raises(TypeError): | |||
| context.set_context(profiling_options=1) | |||
| with pytest.raises(ValueError): | |||
| context.set_context(profiling_options="training_") | |||
| with pytest.raises(ValueError): | |||
| context.set_context(profiling_options="training_trace:op_trace") | |||
| context.set_context(profiling_options="training_trace") | |||
| assert context.get_context("profiling_options") == "training_trace" | |||
| context.set_context(profiling_options="training_trace:task_trace") | |||
| assert context.get_context("profiling_options") == "training_trace:task_trace" | |||
| profiling_options = { | |||
| "result_path": "", | |||
| "fp_point": "", | |||
| "bp_point": "", | |||
| "training_trace": "on", | |||
| "task_trace": "on", | |||
| "ai_core_metrics": "PipeUtilization", | |||
| "aicpu_trace": "on" | |||
| } | |||
| profiling_options = json.dumps(profiling_options) | |||
| context.set_context(profiling_options=profiling_options) | |||
| assert context.get_context("profiling_options") == profiling_options | |||
| def test_variable_memory_max_size(): | |||