From: @anzhengqi Reviewed-by: @liucunwei,@jonyguo Signed-off-by: @liucunwei pull/15651/MERGE
| @@ -0,0 +1,51 @@ | |||||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| import os | |||||
| import pytest | |||||
| from tests.st.model_zoo_tests import utils | |||||
@pytest.mark.level0
@pytest.mark.platform_x86_ascend_training
@pytest.mark.platform_arm_ascend_training
@pytest.mark.env_onecard
def test_lenet_MNIST():
    """Train and evaluate the model_zoo LeNet on MNIST, then check speed and accuracy.

    The ``lenet`` directory is copied from ``model_zoo/official/cv`` next to
    this test file. ``train.py`` and ``eval.py`` are run through the shell with
    their output redirected to log files inside that copy. The test fails if
    either script exits non-zero, if the mean step time parsed from the
    training log is not below 1.3, or if the first accuracy value parsed from
    the evaluation log is not above 0.98.
    """
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/cv".format(cur_path)
    model_name = "lenet"
    utils.copy_files(model_path, cur_path, model_name)

    cur_model_path = os.path.join(cur_path, model_name)
    train_log = os.path.join(cur_model_path, "train_ascend.log")
    ckpt_file = os.path.join(cur_model_path, "ckpt/checkpoint_lenet-10_1875.ckpt")
    infer_log = os.path.join(cur_model_path, "infer_ascend.log")
    dataset_path = os.path.join(utils.data_root, "mnist")

    # cd into the absolute path of the copy (not a path relative to the
    # process CWD) and chain with `&&` so a failed cd is reported through the
    # exit status instead of running train.py from the wrong directory.
    exec_network_shell = "cd {0} && python train.py --data_path={1} > {2} 2>&1"\
        .format(cur_model_path, dataset_path, train_log)
    ret = os.system(exec_network_shell)
    assert ret == 0

    exec_network_shell = "cd {0} && python eval.py --data_path={1} --ckpt_path={2} > {3} 2>&1"\
        .format(cur_model_path, dataset_path, ckpt_file, infer_log)
    ret = os.system(exec_network_shell)
    assert ret == 0

    # NOTE(review): the threshold unit is presumably milliseconds per step -- confirm against the log format.
    per_step_time = utils.get_perf_data(train_log)
    print("per_step_time is", per_step_time)
    assert per_step_time < 1.3

    pattern = r"'Accuracy': ([\d\.]+)}"
    acc = utils.parse_log_file(pattern, infer_log)
    print("acc is", acc)
    assert acc[0] > 0.98
| @@ -0,0 +1,80 @@ | |||||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| import os | |||||
| import pytest | |||||
| from mindspore import log as logger | |||||
| from tests.st.model_zoo_tests import utils | |||||
@pytest.mark.level0
@pytest.mark.platform_x86_ascend_training
@pytest.mark.platform_arm_ascend_training
@pytest.mark.env_single
def test_resnet50_cifar10_ascend():
    """Run distributed ResNet-50 training on CIFAR-10 and check speed and loss.

    The ``resnet`` directory is copied from ``model_zoo/official/cv`` next to
    this test file and ``train.py`` is patched with sed so that the epoch count
    is fixed to 10. ``run_distribute_train.sh`` is then launched and the test
    polls until no ``train.py`` python process is listed by ``ps``. For each of
    the eight ``train_parallel<i>/log`` files the mean step time must be below
    20.0, and the mean of the last loss value of each log must be below 0.70.
    """
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/cv".format(cur_path)
    model_name = "resnet"
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, model_name)

    old_list = ["total_epochs=config.epoch_size", "config.epoch_size - config.pretrain_epoch_size"]
    new_list = ["total_epochs=10", "10"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))

    dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin")
    # Use the absolute scripts directory of the copy so the launch does not
    # depend on the process CWD; `&&` prevents running the script elsewhere
    # if the cd fails.
    scripts_path = os.path.join(cur_model_path, "scripts")
    exec_network_shell = "cd {} && bash run_distribute_train.sh resnet50 cifar10 {} {}"\
        .format(scripts_path, utils.rank_table_path, dataset_path)
    os.system(exec_network_shell)

    # The launch command returns while training is still running, so wait
    # until no matching process is listed any more.
    cmd = "ps -ef | grep python | grep train.py | grep -v grep"
    ret = utils.process_check(100, cmd)
    assert ret

    device_num = 8  # one train_parallel<i>/log is read per index 0..7
    log_file = os.path.join(cur_model_path, "scripts/train_parallel{}/log")
    for i in range(device_num):
        per_step_time = utils.get_perf_data(log_file.format(i))
        assert per_step_time < 20.0

    loss_list = []
    for i in range(device_num):
        loss = utils.get_loss_data_list(log_file.format(i))
        loss_list.append(loss[-1])
    assert sum(loss_list) / len(loss_list) < 0.70
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_single
def test_resnet50_cifar10_gpu():
    """Run distributed ResNet-50 training on CIFAR-10 via the GPU script and check speed and loss.

    The ``resnet`` directory is copied from ``model_zoo/official/cv`` next to
    this test file and ``train.py`` is patched with sed so that the epoch count
    is fixed to 10. ``run_distribute_train_gpu.sh`` is then launched and the
    test polls until no ``train.py`` python process is listed by ``ps``. The
    mean of the parsed step times (skipping the first eight) must be below 115
    and the mean of the last eight loss values must be below 0.70.
    """
    # Resolve paths from this file's location (as the Ascend test does), not
    # from the process CWD: the "../../../../model_zoo" hop is only meaningful
    # relative to where this file lives.
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/cv".format(cur_path)
    model_name = "resnet"
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, model_name)

    old_list = ["total_epochs=config.epoch_size", "config.epoch_size - config.pretrain_epoch_size"]
    new_list = ["total_epochs=10", "10"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))

    dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin")
    # Absolute scripts directory plus `&&`: never run the launcher from the
    # wrong directory if the cd fails.
    scripts_path = os.path.join(cur_model_path, "scripts")
    exec_network_shell = "cd {} && sh run_distribute_train_gpu.sh resnet50 cifar10 {}".format(scripts_path,
                                                                                                dataset_path)
    logger.warning("cmd [{}] is running...".format(exec_network_shell))
    os.system(exec_network_shell)

    cmd = "ps -ef | grep python | grep train.py | grep -v grep"
    ret = utils.process_check(100, cmd)
    assert ret

    log_file = os.path.join(cur_model_path, "scripts/train_parallel/log")
    pattern = r"per step time: ([\d\.]+) ms"
    # NOTE(review): the first eight samples are skipped -- presumably one
    # warm-up entry per process; confirm against the log layout.
    step_time_list = utils.parse_log_file(pattern, log_file)[8:]
    print("step time list is", step_time_list)
    # Fail with a readable message instead of a ZeroDivisionError below.
    assert step_time_list, "fewer than nine step-time samples found in {}".format(log_file)
    per_step_time = sum(step_time_list) / len(step_time_list)
    assert per_step_time < 115

    loss_list = utils.get_loss_data_list(log_file)[-8:]
    print("loss_list is", loss_list)
    assert sum(loss_list) / len(loss_list) < 0.70
| @@ -0,0 +1,114 @@ | |||||
| #!/usr/bin/env python | |||||
| # -*- coding: utf-8 -*- | |||||
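| """Shared helpers for the model_zoo system tests. | |||||
| Copies model directories, patches scripts with sed, polls for running processes, and extracts step-time, loss and regex-matched values from log files. | |||||
| """ | |||||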
| import os | |||||
| import shutil | |||||
| import subprocess | |||||
| import time | |||||
| import re | |||||
| from mindspore import log as logger | |||||
# Absolute, machine-specific locations of shared test resources
# (presumably provisioned on the test host -- verify before running elsewhere).
rank_table_path = "/home/workspace/mindspore_config/hccl/rank_table_8p.json"
data_root = "/home/workspace/mindspore_dataset/"
ckpt_root = "/home/workspace/mindspore_ckpt/"

# Directory that contains this file, with symlinks resolved; the paths below
# are built relative to it.
cur_path = os.path.dirname(os.path.realpath(__file__))
geir_root = os.path.join(cur_path, "mindspore_geir")
arm_main_path = os.path.join(cur_path, "mindir_310infer_exe")
model_zoo_path = os.path.join(cur_path, "../../../model_zoo")
def copy_files(from_, to_, model_name):
    """Copy ``from_/model_name`` into ``to_``, replacing any existing copy.

    Args:
        from_: Directory that contains the entry to copy.
        to_: Destination directory.
        model_name: Name of the entry inside ``from_``.

    Returns:
        The status returned by ``os.system`` for the ``cp -r`` command.

    Raises:
        ValueError: If ``from_/model_name`` does not exist.
    """
    source = os.path.join(from_, model_name)
    target = os.path.join(to_, model_name)

    if not os.path.exists(source):
        raise ValueError("There is no file or path", source)

    # Remove a previous copy first so the result holds only the fresh files.
    if os.path.exists(target):
        shutil.rmtree(target)

    return os.system("cp -r {0} {1}".format(source, to_))
def exec_sed_command(old_list, new_list, file):
    """Replace text in ``file`` in place by running one ``sed -i`` per pair.

    Each pair is applied as ``sed -i "s#old#new#g" file`` through the shell,
    so ``old`` is interpreted as a sed regular expression and neither string
    may contain ``#``.

    Args:
        old_list: Pattern, or list of patterns, to search for.
        new_list: Replacement, or list of replacements, matched to
            ``old_list`` by position.
        file: Path of the file to edit.

    Returns:
        The status of the last sed command (0), or 0 when there was nothing
        to replace.

    Raises:
        ValueError: If the two lists differ in length, or a sed command
            returns a non-zero status.
    """
    if isinstance(old_list, str):
        old_list = [old_list]
    if isinstance(new_list, str):
        # Wrap the replacement itself; the original overwrote old_list here,
        # which broke every call that passed plain strings.
        new_list = [new_list]
    if len(old_list) != len(new_list):
        raise ValueError("len(old_list) should be equal to len(new_list)")

    ret = 0  # defined even when the lists are empty
    for old, new in zip(old_list, new_list):
        sed_cmd = 'sed -i "s#{0}#{1}#g" {2}'.format(old, new, file)
        ret = os.system(sed_cmd)
        if ret != 0:
            raise ValueError('exec `{}` failed.'.format(sed_cmd))
    return ret
def process_check(cycle_time, cmd, wait_time=5):
    """Poll a shell command until it prints nothing on stdout.

    Empty stdout is treated as "the watched process has finished"; callers
    typically pass a ``ps | grep`` pipeline.

    Args:
        cycle_time: Maximum number of polls.
        cmd: Shell command line to run on every poll.
        wait_time: Seconds to sleep before each poll.

    Returns:
        True as soon as a poll produces no stdout, False if every poll
        produced output (or ``cycle_time`` is 0).
    """
    for i in range(cycle_time):
        # Sleep first: the first poll also happens only after wait_time.
        time.sleep(wait_time)
        sub = subprocess.Popen(args="{}".format(cmd), shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, universal_newlines=True)
        stdout_data, _ = sub.communicate()
        if not stdout_data:
            logger.info("process execute success.")
            return True
        logger.warning("process is running, please wait {}".format(i))
    logger.error("process execute timeout.")
    return False
def get_perf_data(log_path, search_str="per step time", cmd=None):
    """Return the mean step time extracted from a training log.

    By default the values are collected with a shell pipeline: lines of
    ``log_path`` containing ``search_str`` are kept, lines containing
    ``loss``, ``[`` or ``]`` are dropped, and the second-to-last
    whitespace-separated field of each remaining line is taken as the value.

    Args:
        log_path: Path of the log file.
        search_str: Text a line must contain to be considered.
        cmd: Optional shell command that replaces the default pipeline; it
            must print one number per line.

    Returns:
        The arithmetic mean of all extracted values except the first one.

    Raises:
        RuntimeError: If the command exits non-zero, or fewer than two values
            were extracted.
        ValueError: If an extracted value is not a valid float.
    """
    if cmd is None:
        # `grep -E` is the supported spelling of the obsolescent `egrep`.
        get_step_times_cmd = r"""grep -a "{0}" {1}|grep -Ev "loss|\]|\["|awk '{{print $(NF-1)}}'""" \
            .format(search_str, log_path)
    else:
        get_step_times_cmd = cmd
    sub = subprocess.Popen(args="{}".format(get_step_times_cmd), shell=True,
                           stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE, universal_newlines=True)
    stdout, _ = sub.communicate()
    # Report the command that was actually run; `cmd` is None on the default path.
    if sub.returncode != 0:
        raise RuntimeError("exec {} failed".format(get_step_times_cmd))
    logger.info("execute {} success".format(get_step_times_cmd))

    # The first sample is discarded -- presumably because it includes
    # warm-up/compile time; confirm if the log format changes.
    samples = stdout.strip().split("\n")[1:]
    step_time_list = list(map(float, samples))
    if not step_time_list:
        # Dump the log so the failure can be diagnosed from the test output.
        os.system("cat {}".format(log_path))
        raise RuntimeError("step_time_list is empty")
    return sum(step_time_list) / len(step_time_list)
def get_loss_data_list(log_path, search_str="loss is", cmd=None):
    """Return the loss values found in a training log, in file order.

    By default the values are collected with a shell pipeline that keeps the
    lines of ``log_path`` containing ``search_str`` and takes the last
    whitespace-separated field of each as the value.

    Args:
        log_path: Path of the log file.
        search_str: Text a line must contain to be considered.
        cmd: Optional shell command that replaces the default pipeline; it
            must print one number per line.

    Returns:
        A non-empty list of floats.

    Raises:
        RuntimeError: If the command exits non-zero, or no value was found.
        ValueError: If an extracted value is not a valid float.
    """
    if cmd is None:
        loss_value_cmd = """ grep -a '{}' {}| awk '{{print $NF}}' """.format(search_str, log_path)
    else:
        loss_value_cmd = cmd
    sub = subprocess.Popen(args="{}".format(loss_value_cmd), shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE, universal_newlines=True)
    stdout, _ = sub.communicate()
    if sub.returncode != 0:
        raise RuntimeError("get loss from {} failed".format(log_path))
    # Log the command that was actually run; `cmd` is None on the default path.
    logger.info("execute {} success".format(loss_value_cmd))

    # "".split("\n") yields [""], so blank entries are skipped; otherwise an
    # empty result would die in float("") before reaching the check below.
    loss_list = [float(item) for item in stdout.strip().split("\n") if item.strip()]
    if not loss_list:
        # Dump the log so the failure can be diagnosed from the test output.
        os.system("cat {}".format(log_path))
        raise RuntimeError("loss_list is empty")
    return loss_list
def parse_log_file(pattern, log_path):
    """Collect the first capture group of ``pattern`` from every matching line.

    Args:
        pattern: Regular expression with at least one capture group whose
            text can be converted with ``float``.
        log_path: Path of the text file to scan.

    Returns:
        The captured values as floats, in file order. When nothing matches,
        the pattern and the file content are printed for diagnosis and an
        empty list is returned.
    """
    with open(log_path, "r") as log:
        hits = (re.search(pattern, text) for text in log)
        value_list = [float(hit.group(1)) for hit in hits if hit is not None]

    if value_list:
        return value_list

    # Nothing matched: show what was searched for and what the file held.
    print("pattern is", pattern)
    os.system("cat {}".format(log_path))
    return value_list