| @@ -1,101 +0,0 @@ | |||||
| # MobileNetV2 Example | |||||
| ## Description | |||||
| This is an example of training MobileNetV2 with ImageNet2012 dataset in MindSpore. | |||||
| ## Requirements | |||||
| * Install [MindSpore](https://www.mindspore.cn/install/en). | |||||
| * Download the dataset [ImageNet2012]. | |||||
| > Unzip the ImageNet2012 dataset to any path you want and the folder structure should be as follows: | |||||
| > ``` | |||||
| > . | |||||
| > ├── train # train dataset | |||||
| > └── val # infer dataset | |||||
| > ``` | |||||
| ## Example structure | |||||
| ``` shell | |||||
| . | |||||
| ├── config.py # parameter configuration | |||||
| ├── dataset.py # data preprocessing | |||||
| ├── eval.py # infer script | |||||
| ├── launch.py # launcher for distributed training | |||||
| ├── lr_generator.py # generate learning rate for each step | |||||
| ├── run_infer.sh # launch inference | |||||
| ├── run_train.sh # launch training | |||||
| └── train.py # train script | |||||
| ``` | |||||
| ## Parameter configuration | |||||
| Parameters for both training and inference can be set in 'config.py'. | |||||
| ``` | |||||
| "num_classes": 1000, # dataset class num | |||||
| "image_height": 224, # image height | |||||
| "image_width": 224, # image width | |||||
| "batch_size": 256, # training or inference batch size | |||||
| "epoch_size": 200, # total training epochs, including warmup_epochs | |||||
| "warmup_epochs": 4, # warmup epochs | |||||
| "lr": 0.4, # base learning rate | |||||
| "momentum": 0.9, # momentum | |||||
| "weight_decay": 4e-5, # weight decay | |||||
| "loss_scale": 1024, # loss scale | |||||
| "save_checkpoint": True, # whether save checkpoint | |||||
| "save_checkpoint_epochs": 1, # the epoch interval between two checkpoints | |||||
| "keep_checkpoint_max": 200, # only keep the last keep_checkpoint_max checkpoint | |||||
| "save_checkpoint_path": "./checkpoint" # path to save checkpoint | |||||
| ``` | |||||
| ## Running the example | |||||
| ### Train | |||||
| #### Usage | |||||
| Usage: sh run_train.sh [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] | |||||
| #### Launch | |||||
| ``` | |||||
| # training example | |||||
| sh run_train.sh 8 192.168.0.1 0,1,2,3,4,5,6,7 ~/imagenet | |||||
| ``` | |||||
| #### Result | |||||
| Training results will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default, and the training log will be redirected to `./train/train.log`, as shown below. | |||||
| ``` | |||||
| epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] | |||||
| epoch time: 140522.500, per step time: 224.836, avg loss: 5.258 | |||||
| epoch: [ 1/200], step:[ 624/ 625], loss:[3.917/3.917], time:[138221.250], lr:[0.200] | |||||
| epoch time: 138331.250, per step time: 221.330, avg loss: 3.917 | |||||
| ``` | |||||
| ### Infer | |||||
| #### Usage | |||||
| Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH] | |||||
| #### Launch | |||||
| ``` | |||||
| # infer example | |||||
| sh run_infer.sh ~/imagenet ~/train/mobilenet-200_625.ckpt | |||||
| ``` | |||||
| > checkpoint can be produced in training process. | |||||
| #### Result | |||||
| Inference results will be stored in the example path; you can find a result like the following in `./eval/infer.log`. | |||||
| ``` | |||||
| result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.ckpt | |||||
| ``` | |||||
| @@ -1,33 +0,0 @@ | |||||
#!/usr/bin/env bash
# Launch single-device inference in the background.
# Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH]
# All output of eval.py is written to ./eval/infer.log.

if [ $# -ne 2 ]
then
    echo "Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH]"
    exit 1
fi

if [ ! -d "$1" ]
then
    echo "error: DATASET_PATH=$1 is not a directory"
    exit 1
fi

if [ ! -f "$2" ]
then
    echo "error: CHECKPOINT_PATH=$2 is not a file"
    exit 1
fi

# Resolve both arguments to absolute paths now: the script changes into
# ./eval below, which would silently invalidate relative paths.
DATASET_PATH=$(cd "$1" && pwd) || exit 1
CHECKPOINT_PATH=$(cd "$(dirname "$2")" && pwd)/$(basename "$2") || exit 1

BASEPATH=$(cd "$(dirname "$0")" && pwd) || exit 1
export PYTHONPATH=${BASEPATH}:$PYTHONPATH
export DEVICE_ID=0
export RANK_ID=0
export RANK_SIZE=1

# Start from an empty working directory on every run.
rm -rf ./eval
mkdir ./eval
cd ./eval || exit

# "> file 2>&1" instead of "&>" so the redirection also works when the
# script is started with a plain POSIX sh, as the usage line suggests.
python "${BASEPATH}/eval.py" \
    --checkpoint_path="${CHECKPOINT_PATH}" \
    --dataset_path="${DATASET_PATH}" > infer.log 2>&1 &
| @@ -1,33 +0,0 @@ | |||||
#!/usr/bin/env bash
# Launch (distributed) training in the background through launch.py.
# Usage: sh run_train.sh [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
# All output is written to ./train/train.log.

if [ $# -ne 4 ]
then
    echo "Usage: sh run_train.sh [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]"
    exit 1
fi

# A value is out of range when it is below 1 OR above 8; the previous
# "&&" combination could never be true, so the check never fired.
if [ "$1" -lt 1 ] || [ "$1" -gt 8 ]
then
    echo "error: DEVICE_NUM=$1 is not in (1-8)"
    exit 1
fi

if [ ! -d "$4" ]
then
    echo "error: DATASET_PATH=$4 is not a directory"
    exit 1
fi

# Resolve the dataset path before changing directory so that a relative
# argument keeps pointing at the same folder.
DATASET_PATH=$(cd "$4" && pwd) || exit 1

BASEPATH=$(cd "$(dirname "$0")" && pwd) || exit 1
export PYTHONPATH=${BASEPATH}:$PYTHONPATH

# Start from an empty working directory on every run.
rm -rf ./train
mkdir ./train
cd ./train || exit

# "> file 2>&1" instead of "&>" so the redirection also works under POSIX sh.
python "${BASEPATH}/launch.py" \
    --nproc_per_node="$1" \
    --visible_devices="$3" \
    --server_id="$2" \
    --training_script="${BASEPATH}/train.py" \
    --dataset_path="${DATASET_PATH}" > train.log 2>&1 &
| @@ -1,188 +0,0 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """train_imagenet.""" | |||||
| import os | |||||
| import time | |||||
| import argparse | |||||
| import random | |||||
| import numpy as np | |||||
| from dataset import create_dataset | |||||
| from lr_generator import get_lr | |||||
| from config import config | |||||
| from mindspore import context | |||||
| from mindspore import Tensor | |||||
| from mindspore import nn | |||||
| from mindspore.model_zoo.mobilenet import mobilenet_v2 | |||||
| from mindspore.parallel._auto_parallel_context import auto_parallel_context | |||||
| from mindspore.nn.optim.momentum import Momentum | |||||
| from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits | |||||
| from mindspore.nn.loss.loss import _Loss | |||||
| from mindspore.ops import operations as P | |||||
| from mindspore.ops import functional as F | |||||
| from mindspore.common import dtype as mstype | |||||
| from mindspore.train.model import Model, ParallelMode | |||||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback | |||||
| from mindspore.train.loss_scale_manager import FixedLossScaleManager | |||||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||||
| import mindspore.dataset.engine as de | |||||
| from mindspore.communication.management import init | |||||
# Seed Python, NumPy and the MindSpore dataset engine with the same fixed value.
random.seed(1)
np.random.seed(1)
de.config.set_seed(1)

# Command-line options of the training script.
parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
args_opt = parser.parse_args()

# Device/rank layout is read from the environment. All three variables must
# be set: int(None) raises TypeError when one of them is missing.
device_id = int(os.environ.get('DEVICE_ID'))
rank_id = int(os.environ.get('RANK_ID'))
rank_size = int(os.environ.get('RANK_SIZE'))
run_distribute = rank_size > 1

context.set_context(mode=context.GRAPH_MODE,
                    device_target="Ascend",
                    device_id=device_id,
                    save_graphs=False)
class CrossEntropyWithLabelSmooth(_Loss):
    """
    Softmax cross entropy evaluated against label-smoothed one-hot targets.

    Integer labels are expanded to one-hot vectors in which the labelled class
    is set to ``1 - smooth_factor`` and every other class is set to
    ``smooth_factor / (num_classes - 1)``.

    Args:
        smooth_factor (float): smooth factor, default=0.
        num_classes (int): num classes, used to compute the off value, default=1000.

    Returns:
        ``construct`` returns the cross entropy reduced by ReduceMean along axis 0.

    Examples:
        >>> CrossEntropyWithLabelSmooth(smooth_factor=0., num_classes=1000)
    """

    def __init__(self, smooth_factor=0., num_classes=1000):
        super(CrossEntropyWithLabelSmooth, self).__init__()
        self.onehot = P.OneHot()
        # Value written at the labelled position / at all other positions.
        self.on_value = Tensor(1.0 - smooth_factor, mstype.float32)
        self.off_value = Tensor(1.0 * smooth_factor / (num_classes - 1), mstype.float32)
        self.ce = nn.SoftmaxCrossEntropyWithLogits()
        self.mean = P.ReduceMean(False)
        self.cast = P.Cast()

    def construct(self, logit, label):
        # The one-hot depth is taken from the second dimension of the logits.
        depth = F.shape(logit)[1]
        int_label = self.cast(label, mstype.int32)
        smoothed_target = self.onehot(int_label, depth, self.on_value, self.off_value)
        per_sample_loss = self.ce(logit, smoothed_target)
        return self.mean(per_sample_loss, 0)
class Monitor(Callback):
    """
    Callback that prints loss and timing for every step and every epoch.

    Args:
        lr_init (numpy array): train lr, indexed by global step
            (``cur_step_num - 1``). May be None; the learning rate is then
            printed as nan.

    Returns:
        None

    Examples:
        >>> Monitor(lr_init=Tensor([0.05]*100).asnumpy())
    """

    def __init__(self, lr_init=None):
        super(Monitor, self).__init__()
        self.lr_init = lr_init
        # len(None) would raise, although None is the declared default.
        self.lr_init_len = 0 if lr_init is None else len(lr_init)
        # Create the per-epoch/per-step state up front so that the *_end hooks
        # never hit a missing attribute.
        self.losses = []
        self.epoch_time = time.time()
        self.step_time = time.time()

    def epoch_begin(self, run_context):
        # Reset the loss history and start the epoch timer.
        self.losses = []
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()

        # Times are reported in milliseconds.
        epoch_mseconds = (time.time() - self.epoch_time) * 1000
        per_step_mseconds = epoch_mseconds / cb_params.batch_num
        print("epoch time: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}".format(epoch_mseconds,
                                                                                      per_step_mseconds,
                                                                                      np.mean(self.losses)
                                                                                      ))

    def step_begin(self, run_context):
        self.step_time = time.time()

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        step_mseconds = (time.time() - self.step_time) * 1000

        # The network output is either the loss tensor itself or a
        # tuple/list whose first element is the loss tensor.
        step_loss = cb_params.net_outputs
        if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], Tensor):
            step_loss = step_loss[0]
        if isinstance(step_loss, Tensor):
            step_loss = np.mean(step_loss.asnumpy())

        self.losses.append(step_loss)
        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num

        # Report nan instead of crashing when no schedule was given or the
        # schedule is shorter than the number of executed steps.
        lr_index = cb_params.cur_step_num - 1
        cur_lr = self.lr_init[lr_index] if lr_index < self.lr_init_len else float('nan')

        print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.3f}/{:5.3f}], time:[{:5.3f}], lr:[{:5.3f}]".format(
            cb_params.cur_epoch_num - 1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss,
            np.mean(self.losses), step_mseconds, cur_lr))
if __name__ == '__main__':
    # Data-parallel setup is only needed when more than one device takes part.
    if run_distribute:
        context.set_auto_parallel_context(device_num=rank_size,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          parameter_broadcast=True,
                                          mirror_mean=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
        init()

    epoch_size = config.epoch_size

    # Network: run in float16, with the Dense cells flagged to stay in float32.
    net = mobilenet_v2(num_classes=config.num_classes)
    net.to_float(mstype.float16)
    for _, sub_cell in net.cells_and_names():
        if isinstance(sub_cell, nn.Dense):
            sub_cell.add_flags_recursive(fp32=True)

    # Loss: label smoothing is used only when a positive factor is configured.
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth,
                                           num_classes=config.num_classes)
    else:
        loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')

    print("train args: ", args_opt, "\ncfg: ", config,
          "\nparallel args: rank_id {}, device_id {}, rank_size {}".format(rank_id, device_id, rank_size))

    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             repeat_num=epoch_size,
                             batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()

    # Optionally start from a pretrained checkpoint.
    if args_opt.pre_trained:
        pretrained_params = load_checkpoint(args_opt.pre_trained)
        load_param_into_net(net, pretrained_params)

    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    lr_schedule = get_lr(global_step=0, lr_init=0, lr_end=0, lr_max=config.lr,
                         warmup_epochs=config.warmup_epochs, total_epochs=epoch_size,
                         steps_per_epoch=step_size)
    lr = Tensor(lr_schedule)
    trainable_params = filter(lambda x: x.requires_grad, net.get_parameters())
    opt = Momentum(trainable_params, lr, config.momentum, config.weight_decay, config.loss_scale)
    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale)

    # Only rank 0 logs progress and writes checkpoints.
    cb = None
    if rank_id == 0:
        cb = [Monitor(lr_init=lr.asnumpy())]
        if config.save_checkpoint:
            ckpt_config = CheckpointConfig(
                save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(prefix="mobilenet",
                                      directory=config.save_checkpoint_path,
                                      config=ckpt_config)
            cb.append(ckpt_cb)

    model.train(epoch_size, dataset, callbacks=cb)
| @@ -0,0 +1,151 @@ | |||||
| # MobileNetV2 Description | |||||
| MobileNetV2 is tuned to mobile phone CPUs through a combination of hardware-aware network architecture search (NAS) complemented by the NetAdapt algorithm, and then subsequently improved through novel architecture advances. (Nov 20, 2019) | |||||
| [Paper](https://arxiv.org/pdf/1905.02244) Howard, Andrew, Mark Sandler, Grace Chu, Liang-Chieh Chen, Bo Chen, Mingxing Tan, Weijun Wang et al. "Searching for MobileNetV3." In Proceedings of the IEEE International Conference on Computer Vision, pp. 1314-1324. 2019. | |||||
| # Model architecture | |||||
| The overall network architecture of MobileNetV2 is shown below: | |||||
| [Link](https://arxiv.org/pdf/1905.02244) | |||||
| # Dataset | |||||
| Dataset used: [imagenet](http://www.image-net.org/) | |||||
| - Dataset size: ~125G, about 1.28 million color images in 1000 classes | |||||
| - Train: 120G, about 1.28 million images | |||||
| - Test: 5G, 50000 images | |||||
| - Data format: RGB images. | |||||
| - Note: Data will be processed in src/dataset.py | |||||
| # Features | |||||
| # Environment Requirements | |||||
| - Hardware(Ascend/GPU) | |||||
| - Prepare hardware environment with Ascend or GPU processor. If you want to try Ascend , please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you can get the resources. | |||||
| - Framework | |||||
| - [MindSpore](https://www.mindspore.cn/install/en) | |||||
| - For more information, please check the resources below: | |||||
| - [MindSpore tutorials](https://www.mindspore.cn/tutorial/zh-CN/master/index.html) | |||||
| - [MindSpore API](https://www.mindspore.cn/api/zh-CN/master/index.html) | |||||
| # Script description | |||||
| ## Script and sample code | |||||
| ```python | |||||
| ├── MobileNetV2 | |||||
| ├── Readme.md | |||||
| ├── scripts | |||||
| │ ├──run_train.sh | |||||
| │ ├──run_eval.sh | |||||
| ├── src | |||||
| │ ├──config.py | |||||
| │ ├──dataset.py | |||||
| │ ├──launch.py | |||||
| │ ├──lr_generator.py | |||||
| │ ├──mobilenetV2.py | |||||
| ├── train.py | |||||
| ├── eval.py | |||||
| ``` | |||||
| ## Training process | |||||
| ### Usage | |||||
| - Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] | |||||
| - GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] | |||||
| ### Launch | |||||
| ``` | |||||
| # training example | |||||
| Ascend: sh run_train.sh Ascend 8 192.168.0.1 0,1,2,3,4,5,6,7 ~/imagenet/train/ | |||||
| GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ | |||||
| ``` | |||||
| ### Result | |||||
| Training results will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default, and the training log will be redirected to `./train/train.log`, as shown below. | |||||
| ``` | |||||
| epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] | |||||
| epoch time: 140522.500, per step time: 224.836, avg loss: 5.258 | |||||
| epoch: [ 1/200], step:[ 624/ 625], loss:[3.917/3.917], time:[138221.250], lr:[0.200] | |||||
| epoch time: 138331.250, per step time: 221.330, avg loss: 3.917 | |||||
| ``` | |||||
| ## Eval process | |||||
| ### Usage | |||||
| - Ascend: sh run_infer.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH] | |||||
| - GPU: sh run_infer.sh GPU [DATASET_PATH] [CHECKPOINT_PATH] | |||||
| ### Launch | |||||
| ``` | |||||
| # infer example | |||||
| Ascend: sh run_infer.sh Ascend ~/imagenet/val/ ~/train/mobilenet-200_625.ckpt | |||||
| GPU: sh run_infer.sh GPU ~/imagenet/val/ ~/train/mobilenet-200_625.ckpt | |||||
| ``` | |||||
| > checkpoint can be produced in training process. | |||||
| ### Result | |||||
| Inference results will be stored in the example path; you can find a result like the following in `infer.log` inside the `eval` directory. | |||||
| ``` | |||||
| result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.ckpt | |||||
| ``` | |||||
| # Model description | |||||
| ## Performance | |||||
| ### Training Performance | |||||
| | Parameters | MobilenetV2 | | | |||||
| | -------------------------- | ---------------------------------------------------------- | ------------------------- | | |||||
| | Model Version | | large | | |||||
| | Resource | Ascend 910, cpu:2.60GHz 56cores, memory:314G | NV SMX2 V100-32G | | |||||
| | uploaded Date | 05/06/2020 | 05/06/2020 | | |||||
| | MindSpore Version | 0.3.0 | 0.3.0 | | |||||
| | Dataset | ImageNet | ImageNet | | |||||
| | Training Parameters | src/config.py | src/config.py | | |||||
| | Optimizer | Momentum | Momentum | | |||||
| | Loss Function | SoftmaxCrossEntropy | SoftmaxCrossEntropy | | |||||
| | outputs | | | | |||||
| | Loss | | 1.913 | | |||||
| | Accuracy | | ACC1[77.09%] ACC5[92.57%] | | |||||
| | Total time | | | | |||||
| | Params (M) | | | | |||||
| | Checkpoint for Fine tuning | | | | |||||
| | Model for inference | | | | |||||
| ### Inference Performance | |||||
| | Parameters | MobilenetV2 | | | | |||||
| | -------------------------- | ----------------------------- | ------------------------- | -------------------- | | |||||
| | Model Version | V1 | | | | |||||
| | Resource | Huawei 910 | NV SMX2 V100-32G | Huawei 310 | | |||||
| | uploaded Date | 05/06/2020 | 05/22/2020 | | | |||||
| | MindSpore Version | 0.2.0 | 0.2.0 | 0.2.0 | | |||||
| | Dataset | ImageNet, 1.2W | ImageNet, 1.2W | ImageNet, 1.2W | | |||||
| | batch_size | | 130(8P) | | | |||||
| | outputs | | | | | |||||
| | Accuracy | | ACC1[72.07%] ACC5[90.90%] | | | |||||
| | Speed | | | | | |||||
| | Total time | | | | | |||||
| | Model for inference | | | | | |||||
| # ModelZoo Homepage | |||||
| [Link](https://gitee.com/mindspore/mindspore/tree/master/mindspore/model_zoo) | |||||
| @@ -0,0 +1,75 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """ | |||||
| eval. | |||||
| """ | |||||
| import os | |||||
| import argparse | |||||
| from mindspore import context | |||||
| from mindspore import nn | |||||
| from mindspore.train.model import Model | |||||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||||
| from mindspore.common import dtype as mstype | |||||
| from src.dataset import create_dataset | |||||
| from src.config import config_ascend, config_gpu | |||||
| from src.mobilenetV2 import mobilenet_v2 | |||||
# Command-line options of the evaluation script.
parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--platform', type=str, default=None, help='run platform')
args_opt = parser.parse_args()

if __name__ == '__main__':
    # Select the configuration and the execution context for the platform.
    config_platform = None
    if args_opt.platform == "Ascend":
        config_platform = config_ascend
        # On Ascend the device is selected through the DEVICE_ID variable.
        device_id = int(os.getenv('DEVICE_ID'))
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="Ascend",
                            device_id=device_id,
                            save_graphs=False)
    elif args_opt.platform == "GPU":
        config_platform = config_gpu
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="GPU",
                            save_graphs=False)
    else:
        raise ValueError("Unsupport platform.")

    loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')
    net = mobilenet_v2(num_classes=config_platform.num_classes)

    # Ascend only: run the network in float16 but keep Dense cells in float32.
    if args_opt.platform == "Ascend":
        net.to_float(mstype.float16)
        for _, sub_cell in net.cells_and_names():
            if isinstance(sub_cell, nn.Dense):
                sub_cell.to_float(mstype.float32)

    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=False,
                             config=config_platform,
                             platform=args_opt.platform,
                             batch_size=config_platform.batch_size)
    step_size = dataset.get_dataset_size()

    # Without --checkpoint_path the freshly initialised network is evaluated.
    if args_opt.checkpoint_path:
        checkpoint_params = load_checkpoint(args_opt.checkpoint_path)
        load_param_into_net(net, checkpoint_params)
    net.set_train(False)

    model = Model(net, loss_fn=loss, metrics={'acc'})
    res = model.eval(dataset)
    print("result:", res, "ckpt=", args_opt.checkpoint_path)
| @@ -0,0 +1,55 @@ | |||||
| #!/usr/bin/env bash | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
# Launch single-device evaluation in the background.
# Usage: sh run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH]
# All output of eval.py is written to ../eval/infer.log.
if [ $# -ne 3 ]
then
    echo "Ascend: sh run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH]"
    echo "GPU: sh run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH]"
    exit 1
fi

# check dataset path
if [ ! -d "$2" ]
then
    echo "error: DATASET_PATH=$2 is not a directory"
    exit 1
fi

# check checkpoint file
if [ ! -f "$3" ]
then
    echo "error: CHECKPOINT_PATH=$3 is not a file"
    exit 1
fi

# Resolve both paths before changing directory; relative arguments would
# otherwise point somewhere else once the script is inside ../eval.
DATASET_PATH=$(cd "$2" && pwd) || exit 1
CHECKPOINT_PATH=$(cd "$(dirname "$3")" && pwd)/$(basename "$3") || exit 1

# set environment
BASEPATH=$(cd "$(dirname "$0")" && pwd) || exit 1
export PYTHONPATH=${BASEPATH}:$PYTHONPATH
export DEVICE_ID=0
export RANK_ID=0
export RANK_SIZE=1

# Recreate the working directory. The directory that is removed must be the
# one that is created afterwards (../eval, not ./eval), otherwise mkdir fails
# on every run after the first.
rm -rf ../eval
mkdir ../eval
cd ../eval || exit

# launch
# "> file 2>&1" instead of "&>" so the redirection also works under POSIX sh.
python "${BASEPATH}/../eval.py" \
    --platform="$1" \
    --dataset_path="${DATASET_PATH}" \
    --checkpoint_path="${CHECKPOINT_PATH}" \
    > infer.log 2>&1 &
| @@ -0,0 +1,95 @@ | |||||
| #!/usr/bin/env bash | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
# Launch distributed training on Ascend devices in the background.
# Arguments: $1 platform, $2 DEVICE_NUM, $3 SERVER_IP, $4 visible devices,
#            $5 DATASET_PATH. Output goes to ../train/train.log.
run_ascend()
{
    # Out of range means below 1 OR above 8; with "&&" the check never fired.
    if [ "$2" -lt 1 ] || [ "$2" -gt 8 ]
    then
        echo "error: DEVICE_NUM=$2 is not in (1-8)"
        exit 1
    fi

    if [ ! -d "$5" ]
    then
        echo "error: DATASET_PATH=$5 is not a directory"
        exit 1
    fi

    # Resolve the dataset path before changing directory so that a relative
    # argument keeps pointing at the same folder.
    ascend_dataset_path=$(cd "$5" && pwd) || exit 1

    BASEPATH=$(cd "$(dirname "$0")" && pwd) || exit 1
    export PYTHONPATH=${BASEPATH}:$PYTHONPATH

    # Remove the same directory that is created afterwards (../train).
    rm -rf ../train
    mkdir ../train
    cd ../train || exit

    # This script lives in scripts/; the launcher is in src/ and train.py is
    # in the project root, one level above scripts/.
    python "${BASEPATH}/../src/launch.py" \
        --nproc_per_node="$2" \
        --visible_devices="$4" \
        --server_id="$3" \
        --training_script="${BASEPATH}/../train.py" \
        --dataset_path="${ascend_dataset_path}" \
        --platform="$1" > train.log 2>&1 &
}
# Launch training on GPUs through mpirun in the background.
# Arguments: $1 platform, $2 DEVICE_NUM, $3 visible devices, $4 DATASET_PATH.
# Output goes to ../train/train.log.
run_gpu()
{
    # Out of range means below 1 OR above 8; with "&&" the check never fired.
    if [ "$2" -lt 1 ] || [ "$2" -gt 8 ]
    then
        echo "error: DEVICE_NUM=$2 is not in (1-8)"
        exit 1
    fi

    if [ ! -d "$4" ]
    then
        echo "error: DATASET_PATH=$4 is not a directory"
        exit 1
    fi

    # Resolve the dataset path before changing directory so that a relative
    # argument keeps pointing at the same folder.
    gpu_dataset_path=$(cd "$4" && pwd) || exit 1

    BASEPATH=$(cd "$(dirname "$0")" && pwd) || exit 1
    export PYTHONPATH=${BASEPATH}:$PYTHONPATH

    # Remove the same directory that is created afterwards (../train).
    rm -rf ../train
    mkdir ../train
    cd ../train || exit

    export CUDA_VISIBLE_DEVICES="$3"
    # One process per device; "> file 2>&1" keeps the redirection POSIX-sh safe.
    mpirun -n "$2" --allow-run-as-root \
        python "${BASEPATH}/../train.py" \
        --dataset_path="${gpu_dataset_path}" \
        --platform="$1" \
        > train.log 2>&1 &
}
# Print the usage text and abort with a non-zero status.
usage()
{
    # printf is used because "echo" does not expand \n consistently across shells.
    printf '%s\n' \
        "Usage:" \
        "  Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]" \
        "  GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]"
    exit 1
}

if [ $# -gt 5 ] || [ $# -lt 4 ]
then
    usage
fi

# Dispatch on the platform. Each platform has its own argument count; checking
# it here prevents run_ascend from running with an empty DATASET_PATH when
# only four arguments are given.
if [ "$1" = "Ascend" ] ; then
    [ $# -eq 5 ] || usage
    run_ascend "$@"
elif [ "$1" = "GPU" ] ; then
    [ $# -eq 4 ] || usage
    run_gpu "$@"
else
    echo "not support platform"
    # Signal the failure to the caller instead of exiting with status 0.
    exit 1
fi;
| @@ -17,7 +17,7 @@ network config setting, will be used in train.py and eval.py | |||||
| """ | """ | ||||
| from easydict import EasyDict as ed | from easydict import EasyDict as ed | ||||
| config = ed({ | |||||
| config_ascend = ed({ | |||||
| "num_classes": 1000, | "num_classes": 1000, | ||||
| "image_height": 224, | "image_height": 224, | ||||
| "image_width": 224, | "image_width": 224, | ||||
| @@ -34,3 +34,21 @@ config = ed({ | |||||
| "keep_checkpoint_max": 200, | "keep_checkpoint_max": 200, | ||||
| "save_checkpoint_path": "./checkpoint", | "save_checkpoint_path": "./checkpoint", | ||||
| }) | }) | ||||
| config_gpu = ed({ | |||||
| "num_classes": 1000, | |||||
| "image_height": 224, | |||||
| "image_width": 224, | |||||
| "batch_size": 64, | |||||
| "epoch_size": 200, | |||||
| "warmup_epochs": 4, | |||||
| "lr": 0.5, | |||||
| "momentum": 0.9, | |||||
| "weight_decay": 4e-5, | |||||
| "label_smooth": 0.1, | |||||
| "loss_scale": 1024, | |||||
| "save_checkpoint": True, | |||||
| "save_checkpoint_epochs": 1, | |||||
| "keep_checkpoint_max": 200, | |||||
| "save_checkpoint_path": "./checkpoint", | |||||
| }) | |||||
| @@ -20,10 +20,9 @@ import mindspore.common.dtype as mstype | |||||
| import mindspore.dataset.engine as de | import mindspore.dataset.engine as de | ||||
| import mindspore.dataset.transforms.vision.c_transforms as C | import mindspore.dataset.transforms.vision.c_transforms as C | ||||
| import mindspore.dataset.transforms.c_transforms as C2 | import mindspore.dataset.transforms.c_transforms as C2 | ||||
| from config import config | |||||
| def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): | |||||
| def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch_size=32): | |||||
| """ | """ | ||||
| create a train or eval dataset | create a train or eval dataset | ||||
| @@ -36,14 +35,18 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): | |||||
| Returns: | Returns: | ||||
| dataset | dataset | ||||
| """ | """ | ||||
| rank_size = int(os.getenv("RANK_SIZE")) | |||||
| rank_id = int(os.getenv("RANK_ID")) | |||||
| if rank_size == 1: | |||||
| if platform == "Ascend": | |||||
| rank_size = int(os.getenv("RANK_SIZE")) | |||||
| rank_id = int(os.getenv("RANK_ID")) | |||||
| if rank_size == 1: | |||||
| ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True) | |||||
| else: | |||||
| ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=rank_size, shard_id=rank_id) | |||||
| elif platform == "GPU": | |||||
| ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True) | ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True) | ||||
| else: | else: | ||||
| ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True, | |||||
| num_shards=rank_size, shard_id=rank_id) | |||||
| raise ValueError("Unsupport platform.") | |||||
| resize_height = config.image_height | resize_height = config.image_height | ||||
| resize_width = config.image_width | resize_width = config.image_width | ||||
| @@ -20,20 +20,10 @@ from mindspore.ops.operations import TensorAdd | |||||
| from mindspore import Parameter, Tensor | from mindspore import Parameter, Tensor | ||||
| from mindspore.common.initializer import initializer | from mindspore.common.initializer import initializer | ||||
| __all__ = ['MobileNetV2', 'mobilenet_v2'] | |||||
| __all__ = ['mobilenet_v2'] | |||||
| def _make_divisible(v, divisor, min_value=None): | def _make_divisible(v, divisor, min_value=None): | ||||
| """ | |||||
| This function is taken from the original tf repo. | |||||
| It ensures that all layers have a channel number that is divisible by 8 | |||||
| It can be seen here: | |||||
| https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py | |||||
| :param v: | |||||
| :param divisor: | |||||
| :param min_value: | |||||
| :return: | |||||
| """ | |||||
| if min_value is None: | if min_value is None: | ||||
| min_value = divisor | min_value = divisor | ||||
| new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) | ||||
| @@ -55,6 +45,7 @@ class GlobalAvgPooling(nn.Cell): | |||||
| Examples: | Examples: | ||||
| >>> GlobalAvgPooling() | >>> GlobalAvgPooling() | ||||
| """ | """ | ||||
| def __init__(self): | def __init__(self): | ||||
| super(GlobalAvgPooling, self).__init__() | super(GlobalAvgPooling, self).__init__() | ||||
| self.mean = P.ReduceMean(keep_dims=False) | self.mean = P.ReduceMean(keep_dims=False) | ||||
| @@ -82,6 +73,7 @@ class DepthwiseConv(nn.Cell): | |||||
| Examples: | Examples: | ||||
| >>> DepthwiseConv(16, 3, 1, 'pad', 1, channel_multiplier=1) | >>> DepthwiseConv(16, 3, 1, 'pad', 1, channel_multiplier=1) | ||||
| """ | """ | ||||
| def __init__(self, in_planes, kernel_size, stride, pad_mode, pad, channel_multiplier=1, has_bias=False): | def __init__(self, in_planes, kernel_size, stride, pad_mode, pad, channel_multiplier=1, has_bias=False): | ||||
| super(DepthwiseConv, self).__init__() | super(DepthwiseConv, self).__init__() | ||||
| self.has_bias = has_bias | self.has_bias = has_bias | ||||
| @@ -126,14 +118,19 @@ class ConvBNReLU(nn.Cell): | |||||
| Examples: | Examples: | ||||
| >>> ConvBNReLU(16, 256, kernel_size=1, stride=1, groups=1) | >>> ConvBNReLU(16, 256, kernel_size=1, stride=1, groups=1) | ||||
| """ | """ | ||||
| def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): | |||||
| def __init__(self, platform, in_planes, out_planes, kernel_size=3, stride=1, groups=1): | |||||
| super(ConvBNReLU, self).__init__() | super(ConvBNReLU, self).__init__() | ||||
| padding = (kernel_size - 1) // 2 | padding = (kernel_size - 1) // 2 | ||||
| if groups == 1: | if groups == 1: | ||||
| conv = nn.Conv2d(in_planes, out_planes, kernel_size, stride, pad_mode='pad', | |||||
| padding=padding) | |||||
| conv = nn.Conv2d(in_planes, out_planes, kernel_size, stride, pad_mode='pad', padding=padding) | |||||
| else: | else: | ||||
| conv = DepthwiseConv(in_planes, kernel_size, stride, pad_mode='pad', pad=padding) | |||||
| if platform == "Ascend": | |||||
| conv = DepthwiseConv(in_planes, kernel_size, stride, pad_mode='pad', pad=padding) | |||||
| elif platform == "GPU": | |||||
| conv = nn.Conv2d(in_planes, out_planes, kernel_size, stride, | |||||
| group=in_planes, pad_mode='pad', padding=padding) | |||||
| layers = [conv, nn.BatchNorm2d(out_planes), nn.ReLU6()] | layers = [conv, nn.BatchNorm2d(out_planes), nn.ReLU6()] | ||||
| self.features = nn.SequentialCell(layers) | self.features = nn.SequentialCell(layers) | ||||
| @@ -158,7 +155,8 @@ class InvertedResidual(nn.Cell): | |||||
| Examples: | Examples: | ||||
| >>> ResidualBlock(3, 256, 1, 1) | >>> ResidualBlock(3, 256, 1, 1) | ||||
| """ | """ | ||||
| def __init__(self, inp, oup, stride, expand_ratio): | |||||
| def __init__(self, platform, inp, oup, stride, expand_ratio): | |||||
| super(InvertedResidual, self).__init__() | super(InvertedResidual, self).__init__() | ||||
| assert stride in [1, 2] | assert stride in [1, 2] | ||||
| @@ -167,12 +165,14 @@ class InvertedResidual(nn.Cell): | |||||
| layers = [] | layers = [] | ||||
| if expand_ratio != 1: | if expand_ratio != 1: | ||||
| layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1)) | |||||
| layers.append(ConvBNReLU(platform, inp, hidden_dim, kernel_size=1)) | |||||
| layers.extend([ | layers.extend([ | ||||
| # dw | # dw | ||||
| ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), | |||||
| ConvBNReLU(platform, hidden_dim, hidden_dim, | |||||
| stride=stride, groups=hidden_dim), | |||||
| # pw-linear | # pw-linear | ||||
| nn.Conv2d(hidden_dim, oup, kernel_size=1, stride=1, has_bias=False), | |||||
| nn.Conv2d(hidden_dim, oup, kernel_size=1, | |||||
| stride=1, has_bias=False), | |||||
| nn.BatchNorm2d(oup), | nn.BatchNorm2d(oup), | ||||
| ]) | ]) | ||||
| self.conv = nn.SequentialCell(layers) | self.conv = nn.SequentialCell(layers) | ||||
| @@ -203,7 +203,8 @@ class MobileNetV2(nn.Cell): | |||||
| Examples: | Examples: | ||||
| >>> MobileNetV2(num_classes=1000) | >>> MobileNetV2(num_classes=1000) | ||||
| """ | """ | ||||
| def __init__(self, num_classes=1000, width_mult=1., | |||||
| def __init__(self, platform, num_classes=1000, width_mult=1., | |||||
| has_dropout=False, inverted_residual_setting=None, round_nearest=8): | has_dropout=False, inverted_residual_setting=None, round_nearest=8): | ||||
| super(MobileNetV2, self).__init__() | super(MobileNetV2, self).__init__() | ||||
| block = InvertedResidual | block = InvertedResidual | ||||
| @@ -226,16 +227,16 @@ class MobileNetV2(nn.Cell): | |||||
| # building first layer | # building first layer | ||||
| input_channel = _make_divisible(input_channel * width_mult, round_nearest) | input_channel = _make_divisible(input_channel * width_mult, round_nearest) | ||||
| self.out_channels = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) | self.out_channels = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) | ||||
| features = [ConvBNReLU(3, input_channel, stride=2)] | |||||
| features = [ConvBNReLU(platform, 3, input_channel, stride=2)] | |||||
| # building inverted residual blocks | # building inverted residual blocks | ||||
| for t, c, n, s in self.cfgs: | for t, c, n, s in self.cfgs: | ||||
| output_channel = _make_divisible(c * width_mult, round_nearest) | output_channel = _make_divisible(c * width_mult, round_nearest) | ||||
| for i in range(n): | for i in range(n): | ||||
| stride = s if i == 0 else 1 | stride = s if i == 0 else 1 | ||||
| features.append(block(input_channel, output_channel, stride, expand_ratio=t)) | |||||
| features.append(block(platform, input_channel, output_channel, stride, expand_ratio=t)) | |||||
| input_channel = output_channel | input_channel = output_channel | ||||
| # building last several layers | # building last several layers | ||||
| features.append(ConvBNReLU(input_channel, self.out_channels, kernel_size=1)) | |||||
| features.append(ConvBNReLU(platform, input_channel, self.out_channels, kernel_size=1)) | |||||
| # make it nn.CellList | # make it nn.CellList | ||||
| self.features = nn.SequentialCell(features) | self.features = nn.SequentialCell(features) | ||||
| # mobilenet head | # mobilenet head | ||||
| @@ -268,14 +269,19 @@ class MobileNetV2(nn.Cell): | |||||
| m.weight.set_parameter_data(Tensor(np.random.normal(0, np.sqrt(2. / n), | m.weight.set_parameter_data(Tensor(np.random.normal(0, np.sqrt(2. / n), | ||||
| m.weight.data.shape()).astype("float32"))) | m.weight.data.shape()).astype("float32"))) | ||||
| if m.bias is not None: | if m.bias is not None: | ||||
| m.bias.set_parameter_data(Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) | |||||
| m.bias.set_parameter_data( | |||||
| Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) | |||||
| elif isinstance(m, nn.BatchNorm2d): | elif isinstance(m, nn.BatchNorm2d): | ||||
| m.gamma.set_parameter_data(Tensor(np.ones(m.gamma.data.shape(), dtype="float32"))) | |||||
| m.beta.set_parameter_data(Tensor(np.zeros(m.beta.data.shape(), dtype="float32"))) | |||||
| m.gamma.set_parameter_data( | |||||
| Tensor(np.ones(m.gamma.data.shape(), dtype="float32"))) | |||||
| m.beta.set_parameter_data( | |||||
| Tensor(np.zeros(m.beta.data.shape(), dtype="float32"))) | |||||
| elif isinstance(m, nn.Dense): | elif isinstance(m, nn.Dense): | ||||
| m.weight.set_parameter_data(Tensor(np.random.normal(0, 0.01, m.weight.data.shape()).astype("float32"))) | |||||
| m.weight.set_parameter_data(Tensor(np.random.normal( | |||||
| 0, 0.01, m.weight.data.shape()).astype("float32"))) | |||||
| if m.bias is not None: | if m.bias is not None: | ||||
| m.bias.set_parameter_data(Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) | |||||
| m.bias.set_parameter_data( | |||||
| Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) | |||||
| def mobilenet_v2(**kwargs): | def mobilenet_v2(**kwargs): | ||||
| @@ -0,0 +1,267 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """train_imagenet.""" | |||||
| import os | |||||
| import time | |||||
| import argparse | |||||
| import random | |||||
| import numpy as np | |||||
| from mindspore import context | |||||
| from mindspore import Tensor | |||||
| from mindspore import nn | |||||
| from mindspore.parallel._auto_parallel_context import auto_parallel_context | |||||
| from mindspore.nn.optim.momentum import Momentum | |||||
| from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits | |||||
| from mindspore.nn.loss.loss import _Loss | |||||
| from mindspore.ops import operations as P | |||||
| from mindspore.ops import functional as F | |||||
| from mindspore.common import dtype as mstype | |||||
| from mindspore.train.model import Model, ParallelMode | |||||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback | |||||
| from mindspore.train.loss_scale_manager import FixedLossScaleManager | |||||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||||
| from mindspore.communication.management import init | |||||
| import mindspore.dataset.engine as de | |||||
| from src.dataset import create_dataset | |||||
| from src.lr_generator import get_lr | |||||
| from src.config import config_gpu, config_ascend | |||||
| from src.mobilenetV2 import mobilenet_v2 | |||||
# Fix every seed the script controls so repeated runs see the same random streams.
random.seed(1)
np.random.seed(1)
de.config.set_seed(1)

# Command-line interface of the training script.
parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
parser.add_argument('--platform', type=str, default=None, help='run platform')
args_opt = parser.parse_args()

# Configure the MindSpore execution context for the requested platform.
if args_opt.platform == "Ascend":
    # The Ascend launcher passes the device/rank layout through environment variables.
    device_id = int(os.getenv('DEVICE_ID'))
    rank_id = int(os.getenv('RANK_ID'))
    rank_size = int(os.getenv('RANK_SIZE'))
    # More than one rank means data-parallel training is set up in the main block.
    run_distribute = rank_size > 1
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=device_id, save_graphs=False)
elif args_opt.platform == "GPU":
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="GPU", save_graphs=False)
else:
    raise ValueError("Unsupport platform.")
class CrossEntropyWithLabelSmooth(_Loss):
    """
    Softmax cross entropy evaluated against label-smoothed one-hot targets.

    Args:
        smooth_factor (float): smooth factor, default=0. The true class is
            encoded as ``1 - smooth_factor`` and every other class as
            ``smooth_factor / (num_classes - 1)``.
        num_classes (int): num classes, default=1000.

    Returns:
        Tensor, the cross entropy output reduced by a mean over axis 0.

    Examples:
        >>> CrossEntropyWithLabelSmooth(smooth_factor=0., num_classes=1000)
    """

    def __init__(self, smooth_factor=0., num_classes=1000):
        super(CrossEntropyWithLabelSmooth, self).__init__()
        # Values written into the one-hot encoding for the true / other classes.
        true_class_value = 1.0 - smooth_factor
        other_class_value = 1.0 * smooth_factor / (num_classes - 1)
        self.onehot = P.OneHot()
        self.on_value = Tensor(true_class_value, mstype.float32)
        self.off_value = Tensor(other_class_value, mstype.float32)
        self.ce = nn.SoftmaxCrossEntropyWithLogits()
        self.mean = P.ReduceMean(False)
        self.cast = P.Cast()

    def construct(self, logit, label):
        # The one-hot depth is taken from the second dimension of the logits.
        depth = F.shape(logit)[1]
        int_label = self.cast(label, mstype.int32)
        smoothed_target = self.onehot(int_label, depth, self.on_value, self.off_value)
        raw_loss = self.ce(logit, smoothed_target)
        return self.mean(raw_loss, 0)
class Monitor(Callback):
    """
    Monitor loss and time.

    Prints the loss, elapsed time and learning rate after every step, and the
    epoch time, per-step time and average loss after every epoch.

    Args:
        lr_init (numpy array): train lr, looked up with ``cur_step_num - 1``.
            May be None, in which case the learning rate is reported as nan.

    Returns:
        None

    Examples:
        >>> Monitor(lr_init=Tensor([0.05]*100).asnumpy())
    """

    def __init__(self, lr_init=None):
        super(Monitor, self).__init__()
        self.lr_init = lr_init
        # len(None) would raise, although None is the declared default.
        self.lr_init_len = 0 if lr_init is None else len(lr_init)
        # Initialise the per-epoch/per-step state here so that step_end and
        # epoch_end are safe even if the matching *_begin hook has not run yet.
        self.losses = []
        self.epoch_time = time.time()
        self.step_time = time.time()

    def epoch_begin(self, run_context):
        # Start a fresh loss history and timer for the new epoch.
        self.losses = []
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()

        epoch_mseconds = (time.time() - self.epoch_time) * 1000
        per_step_mseconds = epoch_mseconds / cb_params.batch_num
        print("epoch time: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}".format(epoch_mseconds,
                                                                                      per_step_mseconds,
                                                                                      np.mean(self.losses)))

    def step_begin(self, run_context):
        self.step_time = time.time()

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        step_mseconds = (time.time() - self.step_time) * 1000

        # net_outputs may be a Tensor or a tuple/list whose first item is the loss Tensor.
        step_loss = cb_params.net_outputs
        if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], Tensor):
            step_loss = step_loss[0]
        if isinstance(step_loss, Tensor):
            step_loss = np.mean(step_loss.asnumpy())

        self.losses.append(step_loss)
        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num

        # Clamp the lookup to the last schedule entry instead of raising
        # IndexError when training runs past the schedule; nan when there is none.
        lr_index = min(cb_params.cur_step_num - 1, self.lr_init_len - 1)
        cur_lr = self.lr_init[lr_index] if lr_index >= 0 else float("nan")

        print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.3f}/{:5.3f}], time:[{:5.3f}], lr:[{:5.3f}]".format(
            cb_params.cur_epoch_num -
            1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss,
            np.mean(self.losses), step_mseconds, cur_lr))
if __name__ == '__main__':
    # Entry point: build the network, loss, dataset, optimizer and callbacks
    # for the selected platform, then run training.
    if args_opt.platform == "GPU":
        # train on gpu
        print("train args: ", args_opt, "\ncfg: ", config_gpu)

        # define net
        net = mobilenet_v2(num_classes=config_gpu.num_classes, platform="GPU")
        # define loss
        if config_gpu.label_smooth > 0:
            loss = CrossEntropyWithLabelSmooth(
                smooth_factor=config_gpu.label_smooth, num_classes=config_gpu.num_classes)
        else:
            loss = SoftmaxCrossEntropyWithLogits(
                is_grad=False, sparse=True, reduction='mean')
        # define dataset
        epoch_size = config_gpu.epoch_size
        dataset = create_dataset(dataset_path=args_opt.dataset_path,
                                 do_train=True,
                                 config=config_gpu,
                                 platform=args_opt.platform,
                                 repeat_num=epoch_size,
                                 batch_size=config_gpu.batch_size)
        step_size = dataset.get_dataset_size()
        # resume
        if args_opt.pre_trained:
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)
        # define optimizer
        loss_scale = FixedLossScaleManager(
            config_gpu.loss_scale, drop_overflow_update=False)
        lr = Tensor(get_lr(global_step=0,
                           lr_init=0,
                           lr_end=0,
                           lr_max=config_gpu.lr,
                           warmup_epochs=config_gpu.warmup_epochs,
                           total_epochs=epoch_size,
                           steps_per_epoch=step_size))
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config_gpu.momentum,
                       config_gpu.weight_decay, config_gpu.loss_scale)
        # define model
        model = Model(net, loss_fn=loss, optimizer=opt,
                      loss_scale_manager=loss_scale)

        cb = [Monitor(lr_init=lr.asnumpy())]
        if config_gpu.save_checkpoint:
            # save_checkpoint_epochs is converted to a step interval.
            config_ck = CheckpointConfig(save_checkpoint_steps=config_gpu.save_checkpoint_epochs * step_size,
                                         keep_checkpoint_max=config_gpu.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(
                prefix="mobilenet", directory=config_gpu.save_checkpoint_path, config=config_ck)
            cb += [ckpt_cb]
        # begin train
        model.train(epoch_size, dataset, callbacks=cb)
    elif args_opt.platform == "Ascend":
        # train on ascend
        print("train args: ", args_opt, "\ncfg: ", config_ascend,
              "\nparallel args: rank_id {}, device_id {}, rank_size {}".format(rank_id, device_id, rank_size))

        if run_distribute:
            context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL,
                                              parameter_broadcast=True, mirror_mean=True)
            auto_parallel_context().set_all_reduce_fusion_split_indices([140])
            init()

        epoch_size = config_ascend.epoch_size
        # The network constructor takes a platform argument, exactly as in the
        # GPU branch; presumably mobilenet_v2 forwards its keyword arguments.
        net = mobilenet_v2(num_classes=config_ascend.num_classes, platform="Ascend")
        # Run the network in float16 but keep Dense cells in float32.
        net.to_float(mstype.float16)
        for _, cell in net.cells_and_names():
            if isinstance(cell, nn.Dense):
                cell.to_float(mstype.float32)
        if config_ascend.label_smooth > 0:
            # Only config_gpu and config_ascend are imported; a bare `config`
            # would raise NameError here.
            loss = CrossEntropyWithLabelSmooth(
                smooth_factor=config_ascend.label_smooth, num_classes=config_ascend.num_classes)
        else:
            loss = SoftmaxCrossEntropyWithLogits(
                is_grad=False, sparse=True, reduction='mean')
        dataset = create_dataset(dataset_path=args_opt.dataset_path,
                                 do_train=True,
                                 config=config_ascend,
                                 platform=args_opt.platform,
                                 repeat_num=epoch_size,
                                 batch_size=config_ascend.batch_size)
        step_size = dataset.get_dataset_size()
        if args_opt.pre_trained:
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)

        loss_scale = FixedLossScaleManager(
            config_ascend.loss_scale, drop_overflow_update=False)
        lr = Tensor(get_lr(global_step=0,
                           lr_init=0,
                           lr_end=0,
                           lr_max=config_ascend.lr,
                           warmup_epochs=config_ascend.warmup_epochs,
                           total_epochs=epoch_size,
                           steps_per_epoch=step_size))
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config_ascend.momentum,
                       config_ascend.weight_decay, config_ascend.loss_scale)

        model = Model(net, loss_fn=loss, optimizer=opt,
                      loss_scale_manager=loss_scale)

        # Only rank 0 logs progress and writes checkpoints.
        cb = None
        if rank_id == 0:
            cb = [Monitor(lr_init=lr.asnumpy())]
            if config_ascend.save_checkpoint:
                config_ck = CheckpointConfig(save_checkpoint_steps=config_ascend.save_checkpoint_epochs * step_size,
                                             keep_checkpoint_max=config_ascend.keep_checkpoint_max)
                ckpt_cb = ModelCheckpoint(
                    prefix="mobilenet", directory=config_ascend.save_checkpoint_path, config=config_ck)
                cb += [ckpt_cb]
        model.train(epoch_size, dataset, callbacks=cb)
    else:
        raise ValueError("Unsupport platform.")
| @@ -0,0 +1,152 @@ | |||||
| # MobileNetV3 Description | |||||
| MobileNetV3 is tuned to mobile phone CPUs through a combination of hardware-aware network architecture search (NAS) complemented by the NetAdapt algorithm and then subsequently improved through novel architecture advances. (Nov 20, 2019) | |||||
| [Paper](https://arxiv.org/pdf/1905.02244) Howard, Andrew, Mark Sandler, Grace Chu, Liang-Chieh Chen, Bo Chen, Mingxing Tan, Weijun Wang et al. "Searching for mobilenetv3." In Proceedings of the IEEE International Conference on Computer Vision, pp. 1314-1324. 2019. | |||||
| # Model architecture | |||||
| The overall network architecture of MobileNetV3 is shown below: | |||||
| [Link](https://arxiv.org/pdf/1905.02244) | |||||
| # Dataset | |||||
| Dataset used: [imagenet](http://www.image-net.org/) | |||||
| - Dataset size: ~125G, 1.2W colorful images in 1000 classes | |||||
| - Train: 120G, 1.2W images | |||||
| - Test: 5G, 50000 images | |||||
| - Data format: RGB images. | |||||
| - Note: Data will be processed in src/dataset.py | |||||
| # Features | |||||
| # Environment Requirements | |||||
| - Hardware(Ascend/GPU) | |||||
| - Prepare hardware environment with Ascend or GPU processor. If you want to try Ascend, please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you can get the resources. | |||||
| - Framework | |||||
| - [MindSpore](http://10.90.67.50/mindspore/archive/20200506/OpenSource/me_vm_x86/) | |||||
| - For more information, please check the resources below: | |||||
| - [MindSpore tutorials](https://www.mindspore.cn/tutorial/zh-CN/master/index.html) | |||||
| - [MindSpore API](https://www.mindspore.cn/api/zh-CN/master/index.html) | |||||
| # Script description | |||||
| ## Script and sample code | |||||
| ```python | |||||
| ├── MobilenetV3 | |||||
| ├── Readme.md | |||||
| ├── scripts | |||||
| │ ├──run_train.sh | |||||
| │ ├──run_eval.sh | |||||
| ├── src | |||||
| │ ├──config.py | |||||
| │ ├──dataset.py | |||||
| │ ├──launch.py | |||||
| │ ├──lr_generator.py | |||||
| │ ├──mobilenetV2.py | |||||
| ├── train.py | |||||
| ├── eval.py | |||||
| ``` | |||||
| ## Training process | |||||
| ### Usage | |||||
| - Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] | |||||
| - GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] | |||||
| ### Launch | |||||
| ``` | |||||
| # training example | |||||
| Ascend: sh run_train.sh Ascend 8 192.168.0.1 0,1,2,3,4,5,6,7 ~/imagenet/train/ | |||||
| GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ | |||||
| ``` | |||||
| ### Result | |||||
| Training result will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default, and the training log will be redirected to `./train/train.log`, like the following. | |||||
| ``` | |||||
| epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] | |||||
| epoch time: 140522.500, per step time: 224.836, avg loss: 5.258 | |||||
| epoch: [ 1/200], step:[ 624/ 625], loss:[3.917/3.917], time:[138221.250], lr:[0.200] | |||||
| epoch time: 138331.250, per step time: 221.330, avg loss: 3.917 | |||||
| ``` | |||||
| ## Eval process | |||||
| ### Usage | |||||
| - Ascend: sh run_infer.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH] | |||||
| - GPU: sh run_infer.sh GPU [DATASET_PATH] [CHECKPOINT_PATH] | |||||
| ### Launch | |||||
| ``` | |||||
| # infer example | |||||
| Ascend: sh run_infer.sh Ascend ~/imagenet/val/ ~/train/mobilenet-200_625.ckpt | |||||
| GPU: sh run_infer.sh GPU ~/imagenet/val/ ~/train/mobilenet-200_625.ckpt | |||||
| ``` | |||||
| > checkpoint can be produced in training process. | |||||
| ### Result | |||||
| Inference result will be stored in the example path; you can find a result like the following in `./eval/infer.log`. | |||||
| ``` | |||||
| result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.ckpt | |||||
| ``` | |||||
| # Model description | |||||
| ## Performance | |||||
| ### Training Performance | |||||
| | Parameters | MobilenetV3 | | | |||||
| | -------------------------- | ---------------------------------------------------------- | ------------------------- | | |||||
| | Model Version | | large | | |||||
| | Resource | Ascend 910, cpu:2.60GHz 56cores, memory:314G | NV SMX2 V100-32G | | |||||
| | uploaded Date | 05/06/2020 | 05/06/2020 | | |||||
| | MindSpore Version | 0.3.0 | 0.3.0 | | |||||
| | Dataset | ImageNet | ImageNet | | |||||
| | Training Parameters | src/config.py | src/config.py | | |||||
| | Optimizer | Momentum | Momentum | | |||||
| | Loss Function | SoftmaxCrossEntropy | SoftmaxCrossEntropy | | |||||
| | outputs | | | | |||||
| | Loss | | 1.913 | | |||||
| | Accuracy | | ACC1[77.57%] ACC5[92.51%] | | |||||
| | Total time | | | | |||||
| | Params (M) | | | | |||||
| | Checkpoint for Fine tuning | | | | |||||
| | Model for inference | | | | |||||
| ### Inference Performance | |||||
| | Parameters | GoogLeNet | | | | |||||
| | -------------------------- | ----------------------------- | ------------------------- | -------------------- | | |||||
| | Model Version | V1 | | | | |||||
| | Resource | Huawei 910 | NV SMX2 V100-32G | Huawei 310 | | |||||
| | uploaded Date | 05/06/2020 | 05/22/2020 | | | |||||
| | MindSpore Version | 0.2.0 | 0.2.0 | 0.2.0 | | |||||
| | Dataset | ImageNet, 1.2W | ImageNet, 1.2W | ImageNet, 1.2W | | |||||
| | batch_size | | 130(8P) | | | |||||
| | outputs | | | | | |||||
| | Accuracy | | ACC1[75.43%] ACC5[92.51%] | | | |||||
| | Speed | | | | | |||||
| | Total time | | | | | |||||
| | Model for inference | | | | | |||||
| # ModelZoo Homepage | |||||
| [Link](https://gitee.com/mindspore/mindspore/tree/master/mindspore/model_zoo) | |||||
| @@ -17,33 +17,51 @@ eval. | |||||
| """ | """ | ||||
| import os | import os | ||||
| import argparse | import argparse | ||||
| from dataset import create_dataset | |||||
| from config import config | |||||
| from mindspore import context | from mindspore import context | ||||
| from mindspore.model_zoo.mobilenet import mobilenet_v2 | |||||
| from mindspore import nn | |||||
| from mindspore.train.model import Model | from mindspore.train.model import Model | ||||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | from mindspore.train.serialization import load_checkpoint, load_param_into_net | ||||
| from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits | |||||
| from mindspore.common import dtype as mstype | from mindspore.common import dtype as mstype | ||||
| from src.dataset import create_dataset | |||||
| from src.config import config_ascend, config_gpu | |||||
| from src.mobilenetV2 import mobilenet_v2 | |||||
| parser = argparse.ArgumentParser(description='Image classification') | parser = argparse.ArgumentParser(description='Image classification') | ||||
| parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path') | parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path') | ||||
| parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') | parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') | ||||
| parser.add_argument('--platform', type=str, default=None, help='run platform') | |||||
| args_opt = parser.parse_args() | args_opt = parser.parse_args() | ||||
| device_id = int(os.getenv('DEVICE_ID')) | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id, save_graphs=False) | |||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean') | |||||
| net = mobilenet_v2(num_classes=config.num_classes) | |||||
| net.to_float(mstype.float16) | |||||
| for _, cell in net.cells_and_names(): | |||||
| if isinstance(cell, nn.Dense): | |||||
| cell.add_flags_recursive(fp32=True) | |||||
| dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size) | |||||
| config_platform = None | |||||
| if args_opt.platform == "Ascend": | |||||
| config_platform = config_ascend | |||||
| device_id = int(os.getenv('DEVICE_ID')) | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", | |||||
| device_id=device_id, save_graphs=False) | |||||
| elif args_opt.platform == "GPU": | |||||
| config_platform = config_gpu | |||||
| context.set_context(mode=context.GRAPH_MODE, | |||||
| device_target="GPU", save_graphs=False) | |||||
| else: | |||||
| raise ValueError("Unsupport platform.") | |||||
| loss = nn.SoftmaxCrossEntropyWithLogits( | |||||
| is_grad=False, sparse=True, reduction='mean') | |||||
| net = mobilenet_v2(num_classes=config_platform.num_classes) | |||||
| if args_opt.platform == "Ascend": | |||||
| net.to_float(mstype.float16) | |||||
| for _, cell in net.cells_and_names(): | |||||
| if isinstance(cell, nn.Dense): | |||||
| cell.to_float(mstype.float32) | |||||
| dataset = create_dataset(dataset_path=args_opt.dataset_path, | |||||
| do_train=False, | |||||
| config=config_platform, | |||||
| platform=args_opt.platform, | |||||
| batch_size=config_platform.batch_size) | |||||
| step_size = dataset.get_dataset_size() | step_size = dataset.get_dataset_size() | ||||
| if args_opt.checkpoint_path: | if args_opt.checkpoint_path: | ||||
| @@ -0,0 +1,55 @@ | |||||
| #!/usr/bin/env bash | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
# Expect exactly three arguments: PLATFORM DATASET_PATH CHECKPOINT_PATH.
if [ $# != 3 ]
then
    echo "Ascend: sh run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH] \
          GPU: sh run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH]"
exit 1
fi

# check dataset path
if [ ! -d "$2" ]
then
    echo "error: DATASET_PATH=$2 is not a directory"
exit 1
fi

# check checkpoint file
if [ ! -f "$3" ]
then
    echo "error: CHECKPOINT_PATH=$3 is not a file"
exit 1
fi

# Resolve both paths to absolute form now: the script changes into ./eval
# before launching, which would otherwise invalidate relative arguments.
DATASET_PATH=$(cd "$2" || exit; pwd)
CHECKPOINT_PATH=$(cd "$(dirname "$3")" || exit; pwd)/$(basename "$3")

# set environment
BASEPATH=$(cd "$(dirname "$0")" || exit; pwd)
export PYTHONPATH=${BASEPATH}:$PYTHONPATH
export DEVICE_ID=0
export RANK_ID=0
export RANK_SIZE=1

# Start from an empty ./eval working directory.
if [ -d "eval" ];
then
    rm -rf ./eval
fi
mkdir ./eval
cd ./eval || exit

# launch in the background; stdout and stderr both go to infer.log.
# "> file 2>&1" is used instead of "&>" so the redirection also works
# when the script is run with a POSIX sh.
python "${BASEPATH}/eval.py" \
    --platform="$1" \
    --dataset_path="${DATASET_PATH}" \
    --checkpoint_path="${CHECKPOINT_PATH}" \
    > infer.log 2>&1 &
| @@ -0,0 +1,94 @@ | |||||
| #!/usr/bin/env bash | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
# Launch distributed training on Ascend.
# Arguments: $1 platform, $2 DEVICE_NUM, $3 SERVER_IP, $4 VISIABLE_DEVICES, $5 DATASET_PATH
run_ascend()
{
    # Reject a device count outside 1..8. The two bounds must be joined with
    # "||": a number can never be both below 1 and above 8.
    if [ "$2" -lt 1 ] || [ "$2" -gt 8 ]
    then
        echo "error: DEVICE_NUM=$2 is not in (1-8)"
        exit 1
    fi

    if [ ! -d "$5" ]
    then
        echo "error: DATASET_PATH=$5 is not a directory"
        exit 1
    fi

    # Resolve the dataset path before changing into ./train so that a
    # relative argument stays valid.
    dataset_path=$(cd "$5" || exit; pwd)

    BASEPATH=$(cd "$(dirname "$0")" || exit; pwd)
    export PYTHONPATH=${BASEPATH}:$PYTHONPATH

    # Start from an empty ./train working directory.
    if [ -d "train" ];
    then
        rm -rf ./train
    fi
    mkdir ./train
    cd ./train || exit

    # Run in the background; stdout and stderr both go to train.log.
    # "> file 2>&1" also works when the script is run with a POSIX sh.
    python "${BASEPATH}/launch.py" \
        --nproc_per_node="$2" \
        --visible_devices="$4" \
        --server_id="$3" \
        --training_script="${BASEPATH}/train.py" \
        --dataset_path="${dataset_path}" \
        --platform="$1" > train.log 2>&1 &
}
# Launch GPU training with mpirun inside a fresh ./train directory.
# Arguments: $1 platform, $2 DEVICE_NUM, $3 VISIABLE_DEVICES, $4 DATASET_PATH
run_gpu()
{
    # out of range means below 1 OR above 8 (the two tests can never both hold)
    if [ "$2" -lt 1 ] || [ "$2" -gt 8 ]
    then
        echo "error: DEVICE_NUM=$2 is not in (1-8)"
        exit 1
    fi

    # quoted so an empty or space-containing path is still checked
    if [ ! -d "$4" ]
    then
        echo "error: DATASET_PATH=$4 is not a directory"
        exit 1
    fi

    BASEPATH=$(cd "$(dirname "$0")" || exit; pwd)
    export PYTHONPATH=${BASEPATH}:$PYTHONPATH

    # start from an empty ./train working directory
    if [ -d "train" ];
    then
        rm -rf ./train
    fi
    mkdir ./train
    cd ./train || exit

    export CUDA_VISIBLE_DEVICES="$3"

    # $4 is the dataset train folder; output goes to train.log.
    # "> file 2>&1" works under both bash and plain sh, unlike "&>".
    mpirun -n "$2" --allow-run-as-root \
        python "${BASEPATH}/train.py" \
        --dataset_path="$4" \
        --platform="$1" \
        > train.log 2>&1 &
}
# Entry point: validate the argument count, then dispatch on the platform name.
if [ $# -gt 5 ] || [ $# -lt 4 ]
then
    # printf expands \n consistently; bash's builtin echo prints it literally
    printf 'Usage:\n'
    printf '  Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n'
    printf '  GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n'
    exit 1
fi

if [ "$1" = "Ascend" ] ; then
    run_ascend "$@"
elif [ "$1" = "GPU" ] ; then
    run_gpu "$@"
else
    echo "not support platform"
    # an unknown platform is a usage error, so do not report success
    exit 1
fi;
| @@ -0,0 +1,54 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """ | |||||
| network config setting, will be used in train.py and eval.py | |||||
| """ | |||||
| from easydict import EasyDict as ed | |||||
# Settings for the Ascend platform, exposed with attribute access through EasyDict.
config_ascend = ed(dict(
    num_classes=1000,                      # dataset class num
    image_height=224,                      # image height
    image_width=224,                       # image width
    batch_size=256,                        # training or inferring batch size
    epoch_size=200,                        # total training epochs, including warmup_epochs
    warmup_epochs=4,                       # warmup epochs
    lr=0.4,                                # base learning rate
    momentum=0.9,                          # momentum
    weight_decay=4e-5,                     # weight decay
    label_smooth=0.1,                      # presumably the label-smoothing factor -- confirm in train.py
    loss_scale=1024,                       # loss scale
    save_checkpoint=True,                  # whether to save checkpoints
    save_checkpoint_epochs=1,              # epoch interval between two checkpoints
    keep_checkpoint_max=200,               # only keep the last keep_checkpoint_max checkpoints
    save_checkpoint_path="./checkpoint",   # path to save checkpoints
))
# Settings for the GPU platform, exposed with attribute access through EasyDict.
config_gpu = ed(dict(
    num_classes=1000,                      # dataset class num
    image_height=224,                      # image height
    image_width=224,                       # image width
    batch_size=64,                         # training or inferring batch size
    epoch_size=300,                        # total training epochs, including warmup_epochs
    warmup_epochs=4,                       # warmup epochs
    lr=0.5,                                # base learning rate
    momentum=0.9,                          # momentum
    weight_decay=4e-5,                     # weight decay
    label_smooth=0.1,                      # presumably the label-smoothing factor -- confirm in train.py
    loss_scale=1024,                       # loss scale
    save_checkpoint=True,                  # whether to save checkpoints
    save_checkpoint_epochs=1,              # epoch interval between two checkpoints
    keep_checkpoint_max=500,               # only keep the last keep_checkpoint_max checkpoints
    save_checkpoint_path="./checkpoint",   # path to save checkpoints
))
| @@ -0,0 +1,85 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """ | |||||
| create train or eval dataset. | |||||
| """ | |||||
| import os | |||||
| import mindspore.common.dtype as mstype | |||||
| import mindspore.dataset.engine as de | |||||
| import mindspore.dataset.transforms.vision.c_transforms as C | |||||
| import mindspore.dataset.transforms.c_transforms as C2 | |||||
def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch_size=32):
    """
    Build the image-folder pipeline used for training or evaluation.

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): True selects the training augmentations, False the eval transforms.
        config: object providing ``image_height`` and ``image_width``.
        platform(string): "Ascend" or "GPU".
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32

    Returns:
        dataset

    Raises:
        ValueError: if ``platform`` is neither "Ascend" nor "GPU".
    """
    if platform not in ("Ascend", "GPU"):
        raise ValueError("Unsupport platform.")

    reader_kwargs = {"num_parallel_workers": 8, "shuffle": True}
    if platform == "Ascend":
        # RANK_SIZE / RANK_ID come from the environment; shard only when multi-device
        rank_size = int(os.getenv("RANK_SIZE"))
        rank_id = int(os.getenv("RANK_ID"))
        if rank_size != 1:
            reader_kwargs["num_shards"] = rank_size
            reader_kwargs["shard_id"] = rank_id
    ds = de.ImageFolderDatasetV2(dataset_path, **reader_kwargs)

    resize_height = config.image_height
    resize_width = config.image_width
    buffer_size = 1000

    # define map operations (both op lists are built regardless of do_train)
    decode_op = C.Decode()
    resize_crop_op = C.RandomCropDecodeResize(resize_height, scale=(0.08, 1.0), ratio=(0.75, 1.333))
    horizontal_flip_op = C.RandomHorizontalFlip(prob=0.5)
    resize_op = C.Resize((256, 256))
    center_crop = C.CenterCrop(resize_width)
    color_op = C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)
    normalize_op = C.Normalize(mean=[0.485*255, 0.456*255, 0.406*255], std=[0.229*255, 0.224*255, 0.225*255])
    to_chw_op = C.HWC2CHW()

    train_ops = [resize_crop_op, horizontal_flip_op, color_op, normalize_op, to_chw_op]
    eval_ops = [decode_op, resize_op, center_crop, normalize_op, to_chw_op]
    image_ops = train_ops if do_train else eval_ops
    label_op = C2.TypeCast(mstype.int32)

    ds = ds.map(input_columns="image", operations=image_ops, num_parallel_workers=8)
    ds = ds.map(input_columns="label", operations=label_op, num_parallel_workers=8)

    # shuffle, then batch (dropping the incomplete tail), then repeat
    ds = ds.shuffle(buffer_size=buffer_size)
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds.repeat(repeat_num)
| @@ -0,0 +1,163 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """launch train script""" | |||||
| import os | |||||
| import sys | |||||
| import json | |||||
| import subprocess | |||||
| import shutil | |||||
| from argparse import ArgumentParser | |||||
def parse_args():
    """
    Parse the launcher's command line.

    Options the launcher does not recognise are not an error: they are collected
    and stored on the result as ``training_script_args``.

    Returns:
        args, the parsed namespace with the extra ``training_script_args`` list.

    Examples:
        >>> parse_args()
    """
    description = ("mindspore distributed training launch "
                   "helper utilty that will spawn up "
                   "multiple distributed processes")
    nproc_help = ("The number of processes to launch on each node, "
                  "for D training, this is recommended to be set "
                  "to the number of D in your system so that "
                  "each process can be bound to a single D.")
    script_help = ("The full path to the single D training "
                   "program/script to be launched in parallel, "
                   "followed by all the arguments for the "
                   "training script")

    cli = ArgumentParser(description=description)
    cli.add_argument("--nproc_per_node", type=int, default=1, help=nproc_help)
    cli.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7",
                     help="will use the visible devices sequentially")
    cli.add_argument("--server_id", type=str, default="", help="server ip")
    cli.add_argument("--training_script", type=str, help=script_help)

    # everything not declared above belongs to the training program
    known, extras = cli.parse_known_args()
    known.training_script_args = extras
    return known
def main():
    """
    Write the rank table for this server and spawn one training process per device.

    Device IPs are read from ``/etc/hccn.conf`` (lines starting with ``address_``).
    A ``rank_table_*.json`` file is written into the current working directory, then
    ``args.training_script`` is started once per rank from its own ``device<rank>``
    directory with RANK_SIZE, RANK_ID and DEVICE_ID set in its environment.

    Raises:
        subprocess.CalledProcessError: if a spawned process exits with a non-zero code.
    """
    print("start", __file__)
    args = parse_args()
    print(args)
    visible_devices = args.visible_devices.split(',')
    assert os.path.isfile(args.training_script)
    assert len(visible_devices) >= args.nproc_per_node
    print('visible_devices:{}'.format(visible_devices))
    if not args.server_id:
        print('pleaser input server ip!!!')
        # a missing server ip is a usage error, so exit with a failure status
        sys.exit(1)
    print('server_id:{}'.format(args.server_id))

    # construct hccn_table: map device id -> device ip from /etc/hccn.conf
    device_ips = {}
    with open('/etc/hccn.conf', 'r') as hccn_fp:
        for hccn_item in hccn_fp:
            hccn_item = hccn_item.strip()
            if hccn_item.startswith('address_'):
                device_id, device_ip = hccn_item.split('=')
                device_id = device_id.split('_')[1]
                device_ips[device_id] = device_ip
                print('device_id:{}, device_ip:{}'.format(device_id, device_ip))

    hccn_table = {}
    hccn_table['board_id'] = '0x0000'
    hccn_table['chip_info'] = '910'
    hccn_table['deploy_mode'] = 'lab'
    hccn_table['group_count'] = '1'
    hccn_table['group_list'] = []
    instance_list = []
    used_devices = []
    for instance_id in range(args.nproc_per_node):
        device_id = visible_devices[instance_id]
        device_ip = device_ips[device_id]
        used_devices.append(str(device_id))
        instance_list.append({
            'devices': [{
                'device_id': device_id,
                'device_ip': device_ip,
            }],
            'rank_id': str(instance_id),
            'server_id': args.server_id,
        })
    # concatenated device ids, used only in the rank table file name
    usable_dev = ''.join(used_devices)
    hccn_table['group_list'].append({
        'device_num': str(args.nproc_per_node),
        'server_num': '1',
        'group_name': '',
        'instance_count': str(args.nproc_per_node),
        'instance_list': instance_list,
    })
    hccn_table['para_plane_nic_location'] = 'device'
    hccn_table['para_plane_nic_name'] = [
        'eth{}'.format(visible_devices[instance_id])
        for instance_id in range(args.nproc_per_node)
    ]
    hccn_table['para_plane_nic_num'] = str(args.nproc_per_node)
    hccn_table['status'] = 'completed'

    # save hccn_table to file in the current working directory
    cur_path = os.getcwd()
    table_fn = os.path.join(cur_path,
                            'rank_table_{}p_{}_{}.json'.format(args.nproc_per_node, usable_dev, args.server_id))
    with open(table_fn, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
    sys.stdout.flush()

    # spawn the processes
    processes = []
    cmds = []
    log_files = []
    env = os.environ.copy()
    env['RANK_SIZE'] = str(args.nproc_per_node)
    for rank_id in range(0, args.nproc_per_node):
        os.chdir(cur_path)
        device_id = visible_devices[rank_id]
        device_dir = os.path.join(cur_path, 'device{}'.format(rank_id))
        env['RANK_ID'] = str(rank_id)
        env['DEVICE_ID'] = str(device_id)
        if args.nproc_per_node > 1:
            # the rank table is only exported for multi-device runs
            env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
            env['RANK_TABLE_FILE'] = table_fn
        if os.path.exists(device_dir):
            shutil.rmtree(device_dir)
        os.mkdir(device_dir)
        # each rank runs from its own directory
        os.chdir(device_dir)
        cmd = [sys.executable, '-u']
        cmd.append(args.training_script)
        cmd.extend(args.training_script_args)
        log_file = open('{dir}/log{id}.log'.format(dir=device_dir, id=rank_id), 'w')
        process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env)
        processes.append(process)
        cmds.append(cmd)
        log_files.append(log_file)

    try:
        for process, cmd in zip(processes, cmds):
            process.wait()
            if process.returncode != 0:
                # report the integer exit status, not the Popen object itself
                raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
    finally:
        # close every log handle even when a rank failed
        for log_file in log_files:
            log_file.close()


if __name__ == "__main__":
    main()
| @@ -0,0 +1,54 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """learning rate generator""" | |||||
| import math | |||||
| import numpy as np | |||||
def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch):
    """
    Generate the per-step learning rates: linear warmup followed by cosine decay.

    Args:
        global_step(int): step to resume from; earlier entries are dropped from the result
        lr_init(float): learning rate at the first warmup step
        lr_end(float): learning rate the cosine decay ends at
        lr_max(float): learning rate reached at the end of warmup
        warmup_epochs(int): number of warmup epochs
        total_epochs(int): total epoch of training
        steps_per_epoch(int): steps of one epoch

    Returns:
        np.array, float32 learning rates for steps ``global_step`` .. ``total_steps - 1``
    """
    total_steps = steps_per_epoch * total_epochs
    warmup_steps = steps_per_epoch * warmup_epochs

    def _lr_at(step):
        """Learning rate of a single step, clamped so it is never negative."""
        if step < warmup_steps:
            value = lr_init + (lr_max - lr_init) * step / warmup_steps
        else:
            value = lr_end + \
                (lr_max - lr_end) * \
                (1. + math.cos(math.pi * (step - warmup_steps) / (total_steps - warmup_steps))) / 2.
        return 0.0 if value < 0.0 else value

    schedule = np.array([_lr_at(step) for step in range(total_steps)]).astype(np.float32)
    return schedule[global_step:]
| @@ -0,0 +1,390 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """MobileNetV3 model define""" | |||||
| from functools import partial | |||||
| import numpy as np | |||||
| import mindspore.nn as nn | |||||
| from mindspore.ops import operations as P | |||||
| from mindspore import Tensor | |||||
| __all__ = ['mobilenet_v3_large', | |||||
| 'mobilenet_v3_small'] | |||||
| def _make_divisible(x, divisor=8): | |||||
| return int(np.ceil(x * 1. / divisor) * divisor) | |||||
class Activation(nn.Cell):
    """
    Activation selected by name.

    Args:
        act_func(string): one of 'relu', 'relu6', 'hsigmoid'/'hard_sigmoid',
            'hswish'/'hard_swish'.

    Returns:
        Tensor, output tensor.

    Raises:
        NotImplementedError: if ``act_func`` is not one of the names above.
    """

    def __init__(self, act_func):
        super().__init__()
        # (accepted names, cell class) pairs, checked in order
        choices = (
            (('relu',), nn.ReLU),
            (('relu6',), nn.ReLU6),
            (('hsigmoid', 'hard_sigmoid'), nn.HSigmoid),
            (('hswish', 'hard_swish'), nn.HSwish),
        )
        for names, cell_cls in choices:
            if act_func in names:
                self.act = cell_cls()
                break
        else:
            raise NotImplementedError

    def construct(self, x):
        return self.act(x)
class GlobalAvgPooling(nn.Cell):
    """
    Global average pooling: ReduceMean over axes (2, 3).

    Args:
        keep_dims (bool): forwarded to ``P.ReduceMean``. Default is False.

    Returns:
        Tensor, output tensor.

    Examples:
        >>> GlobalAvgPooling()
    """

    def __init__(self, keep_dims=False):
        super().__init__()
        self.mean = P.ReduceMean(keep_dims=keep_dims)

    def construct(self, x):
        return self.mean(x, (2, 3))
class SE(nn.Cell):
    """
    SE wrapper: pool -> 1x1 conv -> relu -> 1x1 conv -> hsigmoid, then scale the input.

    Args:
        num_out (int): Output channel (also the input channel of the block).
        ratio (int): reduction ratio for the middle channel count. Default is 4.

    Returns:
        Tensor, output tensor.

    Examples:
        >>> SE(4)
    """

    def __init__(self, num_out, ratio=4):
        super().__init__()
        squeezed = _make_divisible(num_out // ratio)
        pointwise = dict(kernel_size=1, has_bias=True, pad_mode='pad')
        self.pool = GlobalAvgPooling(keep_dims=True)
        self.conv1 = nn.Conv2d(in_channels=num_out, out_channels=squeezed, **pointwise)
        self.act1 = Activation('relu')
        self.conv2 = nn.Conv2d(in_channels=squeezed, out_channels=num_out, **pointwise)
        self.act2 = Activation('hsigmoid')
        self.mul = P.Mul()

    def construct(self, x):
        scale = self.pool(x)
        scale = self.act1(self.conv1(scale))
        scale = self.act2(self.conv2(scale))
        return self.mul(x, scale)
class Unit(nn.Cell):
    """
    Unit wrapper: bias-free Conv2d, BatchNorm2d and an optional activation.

    Args:
        num_in (int): Input channel.
        num_out (int): Output channel.
        kernel_size (int): Input kernel size.
        stride (int): Stride size.
        padding (int): Padding number.
        num_groups (int): value passed as ``group`` to the convolution.
        use_act (bool): Used activation or not.
        act_type (string): Activation type.

    Returns:
        Tensor, output tensor.

    Examples:
        >>> Unit(3, 3)
    """

    def __init__(self, num_in, num_out, kernel_size=1, stride=1, padding=0, num_groups=1,
                 use_act=True, act_type='relu'):
        super().__init__()
        conv_kwargs = dict(in_channels=num_in, out_channels=num_out,
                           kernel_size=kernel_size, stride=stride,
                           padding=padding, group=num_groups,
                           has_bias=False, pad_mode='pad')
        self.conv = nn.Conv2d(**conv_kwargs)
        self.bn = nn.BatchNorm2d(num_out)
        self.use_act = use_act
        if use_act:
            self.act = Activation(act_type)
        else:
            self.act = None

    def construct(self, x):
        y = self.bn(self.conv(x))
        if self.use_act:
            y = self.act(y)
        return y
class ResUnit(nn.Cell):
    """
    ResUnit wrapper: optional 1x1 expand, depthwise conv, optional SE, 1x1 projection,
    plus a residual add when input and output shapes match.

    Args:
        num_in (int): Input channel.
        num_mid (int): Middle channel.
        num_out (int): Output channel.
        kernel_size (int): Input kernel size (1, 3, 5 or 7).
        stride (int): Stride size.
        act_type (str): Activation type.
        use_se (bool): Use SE wrapper or not.

    Returns:
        Tensor, output tensor.

    Examples:
        >>> ResUnit(16, 3, 1, 1)
    """

    def __init__(self, num_in, num_mid, num_out, kernel_size, stride=1, act_type='relu', use_se=False):
        super().__init__()
        self.use_se = use_se
        # the expand conv is skipped when middle and output widths are equal
        self.first_conv = (num_out != num_mid)
        # the residual add is only used when channels and resolution are unchanged
        self.use_short_cut_conv = (num_in == num_out and stride == 1)

        if self.first_conv:
            self.expand = Unit(num_in, num_mid, kernel_size=1,
                               stride=1, padding=0, act_type=act_type)
        else:
            self.expand = None
        # num_groups == num_mid: one group per channel
        self.conv1 = Unit(num_mid, num_mid, kernel_size=kernel_size, stride=stride,
                          padding=self._get_pad(kernel_size), act_type=act_type, num_groups=num_mid)
        if use_se:
            self.se = SE(num_mid)
        self.conv2 = Unit(num_mid, num_out, kernel_size=1, stride=1,
                          padding=0, act_type=act_type, use_act=False)
        if self.use_short_cut_conv:
            self.add = P.TensorAdd()
        else:
            self.add = None

    def construct(self, x):
        y = x
        if self.first_conv:
            y = self.expand(x)
        y = self.conv1(y)
        if self.use_se:
            y = self.se(y)
        y = self.conv2(y)
        if self.use_short_cut_conv:
            y = self.add(x, y)
        return y

    def _get_pad(self, kernel_size):
        """set the padding number"""
        for size, pad in ((1, 0), (3, 1), (5, 2), (7, 3)):
            if kernel_size == size:
                return pad
        raise NotImplementedError
class MobileNetV3(nn.Cell):
    """
    MobileNetV3 architecture.

    Args:
        model_cfgs (dict): architecture description with keys 'cfg' (rows of
            [kernel, exp, out, use_se, act, stride]), 'cls_ch_squeeze' and 'cls_ch_expand'.
        num_classes (int): Output number classes.
        multiplier (float): Channel multiplier; scaled widths go through
            ``_make_divisible``. Default is 1.
        final_drop (float): value passed to ``nn.Dropout``; the layer is only added
            when this is greater than 0.
        round_nearest (int): accepted but not used by this class. Default is 8.

    Returns:
        Tensor, output tensor.

    Examples:
        >>> MobileNetV3(num_classes=1000)
    """

    def __init__(self, model_cfgs, num_classes=1000, multiplier=1., final_drop=0., round_nearest=8):
        super().__init__()
        self.cfgs = model_cfgs['cfg']
        self.inplanes = 16

        # stem: 3x3 stride-2 conv + BN + hswish
        # NOTE(review): the first ResUnit below receives the unscaled self.inplanes (16)
        # as num_in even when multiplier != 1 -- confirm this is intended.
        stem_out = _make_divisible(multiplier * self.inplanes)
        layers = [
            nn.Conv2d(in_channels=3,
                      out_channels=stem_out,
                      kernel_size=3, padding=1, stride=2,
                      has_bias=False, pad_mode='pad'),
            nn.BatchNorm2d(stem_out),
            Activation('hswish'),
        ]

        # one ResUnit per configuration row, built in order
        for row in self.cfgs:
            layers.append(self._make_layer(kernel_size=row[0],
                                           exp_ch=_make_divisible(multiplier * row[1]),
                                           out_channel=_make_divisible(multiplier * row[2]),
                                           use_se=row[3],
                                           act_func=row[4],
                                           stride=row[5]))

        # head: 1x1 conv + BN + hswish, global pooling, 1x1 expand conv + hswish
        squeeze_ch = _make_divisible(multiplier * model_cfgs["cls_ch_squeeze"])
        layers.append(nn.Conv2d(in_channels=_make_divisible(multiplier * self.cfgs[-1][2]),
                                out_channels=squeeze_ch,
                                kernel_size=1, padding=0, stride=1,
                                has_bias=False, pad_mode='pad'))
        layers.append(nn.BatchNorm2d(squeeze_ch))
        layers.append(Activation('hswish'))
        layers.append(GlobalAvgPooling(keep_dims=True))
        layers.append(nn.Conv2d(in_channels=squeeze_ch,
                                out_channels=model_cfgs['cls_ch_expand'],
                                kernel_size=1, padding=0, stride=1,
                                has_bias=False, pad_mode='pad'))
        layers.append(Activation('hswish'))
        if final_drop > 0:
            layers.append(nn.Dropout(final_drop))

        # wrap the layer list in a SequentialCell
        self.features = nn.SequentialCell(layers)
        # classifier implemented as a 1x1 convolution; axes 2 and 3 are squeezed afterwards
        self.output = nn.Conv2d(in_channels=model_cfgs['cls_ch_expand'],
                                out_channels=num_classes,
                                kernel_size=1, has_bias=True, pad_mode='pad')
        self.squeeze = P.Squeeze(axis=(2, 3))
        self._initialize_weights()

    def construct(self, x):
        return self.squeeze(self.output(self.features(x)))

    def _make_layer(self, kernel_size, exp_ch, out_channel, use_se, act_func, stride=1):
        """Build one ResUnit fed by ``self.inplanes`` and record its output width."""
        unit = ResUnit(self.inplanes, exp_ch, out_channel,
                       kernel_size, stride=stride, act_type=act_func, use_se=use_se)
        self.inplanes = out_channel
        return unit

    def _initialize_weights(self):
        """
        Initialize weights.

        Conv2d weights are drawn from N(0, sqrt(2 / (kh * kw * out_channels))),
        Dense weights from N(0, 0.01); biases and BatchNorm beta are set to zero,
        BatchNorm gamma to one.

        Returns:
            None.
        """
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                fan_out = cell.kernel_size[0] * cell.kernel_size[1] * cell.out_channels
                cell.weight.set_parameter_data(Tensor(np.random.normal(0, np.sqrt(2. / fan_out),
                                                                       cell.weight.data.shape()).astype("float32")))
                if cell.bias is not None:
                    cell.bias.set_parameter_data(
                        Tensor(np.zeros(cell.bias.data.shape(), dtype="float32")))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_parameter_data(
                    Tensor(np.ones(cell.gamma.data.shape(), dtype="float32")))
                cell.beta.set_parameter_data(
                    Tensor(np.zeros(cell.beta.data.shape(), dtype="float32")))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_parameter_data(Tensor(np.random.normal(
                    0, 0.01, cell.weight.data.shape()).astype("float32")))
                if cell.bias is not None:
                    cell.bias.set_parameter_data(
                        Tensor(np.zeros(cell.bias.data.shape(), dtype="float32")))
def mobilenet_v3(model_name, **kwargs):
    """
    Constructs a MobileNet V3 model.

    Args:
        model_name (str): "large" or "small".
        **kwargs: forwarded unchanged to ``MobileNetV3``.

    Returns:
        MobileNetV3 built from the selected configuration.
    """
    # each row: kernel, expansion channels, output channels, use SE, activation, stride
    large_rows = [
        [3, 16, 16, False, 'relu', 1],
        [3, 64, 24, False, 'relu', 2],
        [3, 72, 24, False, 'relu', 1],
        [5, 72, 40, True, 'relu', 2],
        [5, 120, 40, True, 'relu', 1],
        [5, 120, 40, True, 'relu', 1],
        [3, 240, 80, False, 'hswish', 2],
        [3, 200, 80, False, 'hswish', 1],
        [3, 184, 80, False, 'hswish', 1],
        [3, 184, 80, False, 'hswish', 1],
        [3, 480, 112, True, 'hswish', 1],
        [3, 672, 112, True, 'hswish', 1],
        [5, 672, 160, True, 'hswish', 2],
        [5, 960, 160, True, 'hswish', 1],
        [5, 960, 160, True, 'hswish', 1],
    ]
    small_rows = [
        [3, 16, 16, True, 'relu', 2],
        [3, 72, 24, False, 'relu', 2],
        [3, 88, 24, False, 'relu', 1],
        [5, 96, 40, True, 'hswish', 2],
        [5, 240, 40, True, 'hswish', 1],
        [5, 240, 40, True, 'hswish', 1],
        [5, 120, 48, True, 'hswish', 1],
        [5, 144, 48, True, 'hswish', 1],
        [5, 288, 96, True, 'hswish', 2],
        [5, 576, 96, True, 'hswish', 1],
        [5, 576, 96, True, 'hswish', 1],
    ]
    model_cfgs = dict(
        large=dict(cfg=large_rows, cls_ch_squeeze=960, cls_ch_expand=1280),
        small=dict(cfg=small_rows, cls_ch_squeeze=576, cls_ch_expand=1280),
    )
    return MobileNetV3(model_cfgs[model_name], **kwargs)


# public constructors with the variant name pre-bound
mobilenet_v3_large = partial(mobilenet_v3, model_name="large")
mobilenet_v3_small = partial(mobilenet_v3, model_name="small")
| @@ -0,0 +1,267 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """train_imagenet.""" | |||||
| import os | |||||
| import time | |||||
| import argparse | |||||
| import random | |||||
| import numpy as np | |||||
| from mindspore import context | |||||
| from mindspore import Tensor | |||||
| from mindspore import nn | |||||
| from mindspore.parallel._auto_parallel_context import auto_parallel_context | |||||
| from mindspore.nn.optim.momentum import Momentum | |||||
| from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits | |||||
| from mindspore.nn.loss.loss import _Loss | |||||
| from mindspore.ops import operations as P | |||||
| from mindspore.ops import functional as F | |||||
| from mindspore.common import dtype as mstype | |||||
| from mindspore.train.model import Model, ParallelMode | |||||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback | |||||
| from mindspore.train.loss_scale_manager import FixedLossScaleManager | |||||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||||
| import mindspore.dataset.engine as de | |||||
| from mindspore.communication.management import init | |||||
| from src.dataset import create_dataset | |||||
| from src.lr_generator import get_lr | |||||
| from src.config import config_gpu, config_ascend | |||||
| from src.mobilenetV3 import mobilenet_v3_large | |||||
# Seed Python's `random`, NumPy and the MindSpore dataset engine with the same fixed value.
random.seed(1)
np.random.seed(1)
de.config.set_seed(1)

# Command-line options. They are parsed at import time because the context
# set-up below and the `__main__` section both read `args_opt`.
parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
parser.add_argument('--platform', type=str, default=None, help='run platform')
args_opt = parser.parse_args()

# Select the execution context for the requested backend.
if args_opt.platform == "Ascend":
    # DEVICE_ID, RANK_ID and RANK_SIZE are taken from the environment;
    # int() raises if any of them is unset (os.getenv then returns None).
    device_id = int(os.getenv('DEVICE_ID'))
    rank_id = int(os.getenv('RANK_ID'))
    rank_size = int(os.getenv('RANK_SIZE'))
    # More than one rank means the job runs distributed.
    run_distribute = rank_size > 1
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=device_id, save_graphs=False)
elif args_opt.platform == "GPU":
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="GPU", save_graphs=False)
else:
    raise ValueError("Unsupport platform.")
class CrossEntropyWithLabelSmooth(_Loss):
    """
    Softmax cross entropy evaluated against label-smoothed one-hot targets.

    The target class receives `1 - smooth_factor`; every other class receives
    `smooth_factor / (num_classes - 1)`.

    Args:
        smooth_factor (float): smooth factor, default=0.
        num_classes (int): num classes, default=1000.

    Returns:
        The output of `construct`: the cross-entropy result reduced with
        ReduceMean over axis 0.

    Examples:
        >>> CrossEntropyWithLabelSmooth(smooth_factor=0., num_classes=1000)
    """

    def __init__(self, smooth_factor=0., num_classes=1000):
        super().__init__()
        self.onehot = P.OneHot()
        # Value written at the labelled class position.
        self.on_value = Tensor(1.0 - smooth_factor, mstype.float32)
        # Value written at every other class position.
        smoothed_off = 1.0 * smooth_factor / (num_classes - 1)
        self.off_value = Tensor(smoothed_off, mstype.float32)
        self.ce = nn.SoftmaxCrossEntropyWithLogits()
        self.mean = P.ReduceMean(False)
        self.cast = P.Cast()

    def construct(self, logit, label):
        # The one-hot depth is read from the second dimension of the logits.
        depth = F.shape(logit)[1]
        int_label = self.cast(label, mstype.int32)
        soft_target = self.onehot(int_label, depth, self.on_value, self.off_value)
        raw_loss = self.ce(logit, soft_target)
        return self.mean(raw_loss, 0)
class Monitor(Callback):
    """
    Monitor loss and time.

    Prints the loss, elapsed time and learning rate after every step, and the
    total time, per-step time and average loss after every epoch.

    Args:
        lr_init (numpy array): train lr, indexed by global step
            (`cur_step_num - 1`). May be None; the learning rate is then
            printed as nan.

    Returns:
        None

    Examples:
        >>> Monitor(lr_init=Tensor([0.05]*100).asnumpy())
    """

    def __init__(self, lr_init=None):
        super(Monitor, self).__init__()
        self.lr_init = lr_init
        # The default lr_init=None must not crash in len().
        self.lr_init_len = 0 if lr_init is None else len(lr_init)
        # Create the per-epoch / per-step state up front so that step_end and
        # epoch_end are safe even if the matching *_begin hook was not called.
        self.losses = []
        self.epoch_time = time.time()
        self.step_time = time.time()

    def epoch_begin(self, run_context):
        """Reset the loss history and start the epoch timer."""
        self.losses = []
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        """Print epoch time, per-step time (both in ms) and the average loss."""
        cb_params = run_context.original_args()

        epoch_mseconds = (time.time() - self.epoch_time) * 1000
        per_step_mseconds = epoch_mseconds / cb_params.batch_num
        print("epoch time: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}".format(epoch_mseconds,
                                                                                      per_step_mseconds,
                                                                                      np.mean(self.losses)))

    def step_begin(self, run_context):
        """Start the step timer."""
        self.step_time = time.time()

    def step_end(self, run_context):
        """Record the step loss and print step-level progress."""
        cb_params = run_context.original_args()
        step_mseconds = (time.time() - self.step_time) * 1000
        step_loss = cb_params.net_outputs

        # The network output may be a sequence whose first element is the loss tensor.
        if isinstance(step_loss, (tuple, list)) and step_loss and isinstance(step_loss[0], Tensor):
            step_loss = step_loss[0]
        if isinstance(step_loss, Tensor):
            step_loss = np.mean(step_loss.asnumpy())

        self.losses.append(step_loss)
        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num

        # Look up this step's learning rate; report nan when no schedule was
        # given or the step lies outside of it.
        lr_index = cb_params.cur_step_num - 1
        if 0 <= lr_index < self.lr_init_len:
            cur_lr = self.lr_init[lr_index]
        else:
            cur_lr = float('nan')

        print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.3f}/{:5.3f}], time:[{:5.3f}], lr:[{:5.3f}]".format(
            cb_params.cur_epoch_num -
            1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss,
            np.mean(self.losses), step_mseconds, cur_lr))
if __name__ == '__main__':
    # Script entry point: build network, loss, dataset, optimizer and callbacks
    # for the selected platform, then run training.
    if args_opt.platform == "GPU":
        # train on gpu
        print("train args: ", args_opt, "\ncfg: ", config_gpu)

        # define net
        net = mobilenet_v3_large(num_classes=config_gpu.num_classes)
        # define loss: label smoothing only when the config asks for it
        if config_gpu.label_smooth > 0:
            loss = CrossEntropyWithLabelSmooth(
                smooth_factor=config_gpu.label_smooth, num_classes=config_gpu.num_classes)
        else:
            loss = SoftmaxCrossEntropyWithLogits(
                is_grad=False, sparse=True, reduction='mean')
        # define dataset
        epoch_size = config_gpu.epoch_size
        dataset = create_dataset(dataset_path=args_opt.dataset_path,
                                 do_train=True,
                                 config=config_gpu,
                                 platform=args_opt.platform,
                                 repeat_num=epoch_size,
                                 batch_size=config_gpu.batch_size)
        step_size = dataset.get_dataset_size()
        # resume: load weights from the checkpoint given on the command line
        if args_opt.pre_trained:
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)
        # define optimizer; the same loss scale value is given to both the
        # loss scale manager and the Momentum optimizer
        loss_scale = FixedLossScaleManager(
            config_gpu.loss_scale, drop_overflow_update=False)
        lr = Tensor(get_lr(global_step=0,
                           lr_init=0,
                           lr_end=0,
                           lr_max=config_gpu.lr,
                           warmup_epochs=config_gpu.warmup_epochs,
                           total_epochs=epoch_size,
                           steps_per_epoch=step_size))
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config_gpu.momentum,
                       config_gpu.weight_decay, config_gpu.loss_scale)
        # define model
        model = Model(net, loss_fn=loss, optimizer=opt,
                      loss_scale_manager=loss_scale)

        cb = [Monitor(lr_init=lr.asnumpy())]
        if config_gpu.save_checkpoint:
            # save_checkpoint_epochs is converted to a step interval
            config_ck = CheckpointConfig(save_checkpoint_steps=config_gpu.save_checkpoint_epochs * step_size,
                                         keep_checkpoint_max=config_gpu.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(
                prefix="mobilenet", directory=config_gpu.save_checkpoint_path, config=config_ck)
            cb += [ckpt_cb]
        # begin train
        model.train(epoch_size, dataset, callbacks=cb)
    elif args_opt.platform == "Ascend":
        # train on ascend
        print("train args: ", args_opt, "\ncfg: ", config_ascend,
              "\nparallel args: rank_id {}, device_id {}, rank_size {}".format(rank_id, device_id, rank_size))

        if run_distribute:
            # data-parallel training across rank_size devices
            context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL,
                                              parameter_broadcast=True, mirror_mean=True)
            auto_parallel_context().set_all_reduce_fusion_split_indices([140])
            init()

        epoch_size = config_ascend.epoch_size
        # define net: cast to float16, then switch the Dense cells back to float32
        net = mobilenet_v3_large(num_classes=config_ascend.num_classes)
        net.to_float(mstype.float16)
        for _, cell in net.cells_and_names():
            if isinstance(cell, nn.Dense):
                cell.to_float(mstype.float32)
        # define loss: label smoothing only when the config asks for it
        if config_ascend.label_smooth > 0:
            # Read num_classes from config_ascend; the name `config` is not defined in this module.
            loss = CrossEntropyWithLabelSmooth(
                smooth_factor=config_ascend.label_smooth, num_classes=config_ascend.num_classes)
        else:
            loss = SoftmaxCrossEntropyWithLogits(
                is_grad=False, sparse=True, reduction='mean')
        # define dataset
        dataset = create_dataset(dataset_path=args_opt.dataset_path,
                                 do_train=True,
                                 config=config_ascend,
                                 platform=args_opt.platform,
                                 repeat_num=epoch_size,
                                 batch_size=config_ascend.batch_size)
        step_size = dataset.get_dataset_size()
        # resume: load weights from the checkpoint given on the command line
        if args_opt.pre_trained:
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)

        # define optimizer; the same loss scale value is given to both the
        # loss scale manager and the Momentum optimizer
        loss_scale = FixedLossScaleManager(
            config_ascend.loss_scale, drop_overflow_update=False)
        lr = Tensor(get_lr(global_step=0,
                           lr_init=0,
                           lr_end=0,
                           lr_max=config_ascend.lr,
                           warmup_epochs=config_ascend.warmup_epochs,
                           total_epochs=epoch_size,
                           steps_per_epoch=step_size))
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config_ascend.momentum,
                       config_ascend.weight_decay, config_ascend.loss_scale)

        # define model
        model = Model(net, loss_fn=loss, optimizer=opt,
                      loss_scale_manager=loss_scale)

        # Only rank 0 attaches the monitor and checkpoint callbacks.
        cb = None
        if rank_id == 0:
            cb = [Monitor(lr_init=lr.asnumpy())]
            if config_ascend.save_checkpoint:
                # save_checkpoint_epochs is converted to a step interval
                config_ck = CheckpointConfig(save_checkpoint_steps=config_ascend.save_checkpoint_epochs * step_size,
                                             keep_checkpoint_max=config_ascend.keep_checkpoint_max)
                ckpt_cb = ModelCheckpoint(
                    prefix="mobilenet", directory=config_ascend.save_checkpoint_path, config=config_ck)
                cb += [ckpt_cb]
        # begin train
        model.train(epoch_size, dataset, callbacks=cb)
    else:
        # Same failure as the module-level platform check, with an explicit message.
        raise ValueError("Unsupport platform.")