From 82fc2f7ebd17158d30869a6e968c8ea28f95a6cd Mon Sep 17 00:00:00 2001 From: chenzomi Date: Tue, 19 May 2020 19:01:55 +0800 Subject: [PATCH] add mobilenetv2 and mobilenetv3 --- example/mobilenetv2_imagenet2012/README.md | 101 ----- example/mobilenetv2_imagenet2012/run_infer.sh | 33 -- example/mobilenetv2_imagenet2012/run_train.sh | 33 -- example/mobilenetv2_imagenet2012/train.py | 188 --------- mindspore/model_zoo/mobilenetv2/Readme.md | 151 +++++++ mindspore/model_zoo/mobilenetv2/eval.py | 75 ++++ .../mobilenetv2/scripts/run_infer.sh | 55 +++ .../mobilenetv2/scripts/run_train.sh | 95 +++++ .../model_zoo/mobilenetv2/src}/config.py | 20 +- .../model_zoo/mobilenetv2/src}/dataset.py | 19 +- .../model_zoo/mobilenetv2/src}/launch.py | 0 .../mobilenetv2/src}/lr_generator.py | 0 .../src/mobilenetV2.py} | 62 +-- mindspore/model_zoo/mobilenetv2/train.py | 267 ++++++++++++ mindspore/model_zoo/mobilenetv3/Readme.md | 152 +++++++ .../model_zoo/mobilenetv3}/eval.py | 48 ++- .../mobilenetv3/scripts/run_infer.sh | 55 +++ .../mobilenetv3/scripts/run_train.sh | 94 +++++ mindspore/model_zoo/mobilenetv3/src/config.py | 54 +++ .../model_zoo/mobilenetv3/src/dataset.py | 85 ++++ mindspore/model_zoo/mobilenetv3/src/launch.py | 163 ++++++++ .../model_zoo/mobilenetv3/src/lr_generator.py | 54 +++ .../model_zoo/mobilenetv3/src/mobilenetV3.py | 390 ++++++++++++++++++ mindspore/model_zoo/mobilenetv3/train.py | 267 ++++++++++++ 24 files changed, 2054 insertions(+), 407 deletions(-) delete mode 100644 example/mobilenetv2_imagenet2012/README.md delete mode 100644 example/mobilenetv2_imagenet2012/run_infer.sh delete mode 100644 example/mobilenetv2_imagenet2012/run_train.sh delete mode 100644 example/mobilenetv2_imagenet2012/train.py create mode 100644 mindspore/model_zoo/mobilenetv2/Readme.md create mode 100644 mindspore/model_zoo/mobilenetv2/eval.py create mode 100644 mindspore/model_zoo/mobilenetv2/scripts/run_infer.sh create mode 100644 mindspore/model_zoo/mobilenetv2/scripts/run_train.sh 
rename {example/mobilenetv2_imagenet2012 => mindspore/model_zoo/mobilenetv2/src}/config.py (72%) rename {example/mobilenetv2_imagenet2012 => mindspore/model_zoo/mobilenetv2/src}/dataset.py (81%) rename {example/mobilenetv2_imagenet2012 => mindspore/model_zoo/mobilenetv2/src}/launch.py (100%) rename {example/mobilenetv2_imagenet2012 => mindspore/model_zoo/mobilenetv2/src}/lr_generator.py (100%) rename mindspore/model_zoo/{mobilenet.py => mobilenetv2/src/mobilenetV2.py} (82%) create mode 100644 mindspore/model_zoo/mobilenetv2/train.py create mode 100644 mindspore/model_zoo/mobilenetv3/Readme.md rename {example/mobilenetv2_imagenet2012 => mindspore/model_zoo/mobilenetv3}/eval.py (54%) create mode 100644 mindspore/model_zoo/mobilenetv3/scripts/run_infer.sh create mode 100644 mindspore/model_zoo/mobilenetv3/scripts/run_train.sh create mode 100644 mindspore/model_zoo/mobilenetv3/src/config.py create mode 100644 mindspore/model_zoo/mobilenetv3/src/dataset.py create mode 100644 mindspore/model_zoo/mobilenetv3/src/launch.py create mode 100644 mindspore/model_zoo/mobilenetv3/src/lr_generator.py create mode 100644 mindspore/model_zoo/mobilenetv3/src/mobilenetV3.py create mode 100644 mindspore/model_zoo/mobilenetv3/train.py diff --git a/example/mobilenetv2_imagenet2012/README.md b/example/mobilenetv2_imagenet2012/README.md deleted file mode 100644 index deee9017d8..0000000000 --- a/example/mobilenetv2_imagenet2012/README.md +++ /dev/null @@ -1,101 +0,0 @@ -# MobileNetV2 Example - -## Description - -This is an example of training MobileNetV2 with ImageNet2012 dataset in MindSpore. - -## Requirements - -* Install [MindSpore](https://www.mindspore.cn/install/en). - -* Download the dataset [ImageNet2012]. - -> Unzip the ImageNet2012 dataset to any path you want and the folder structure should be as follows: -> ``` -> . -> ├── train # train dataset -> └── val # infer dataset -> ``` - -## Example structure - -``` shell -. 
-├── config.py # parameter configuration -├── dataset.py # data preprocessing -├── eval.py # infer script -├── launch.py # launcher for distributed training -├── lr_generator.py # generate learning rate for each step -├── run_infer.sh # launch infering -├── run_train.sh # launch training -└── train.py # train script -``` - -## Parameter configuration - -Parameters for both training and inference can be set in 'config.py'. - -``` -"num_classes": 1000, # dataset class num -"image_height": 224, # image height -"image_width": 224, # image width -"batch_size": 256, # training or infering batch size -"epoch_size": 200, # total training epochs, including warmup_epochs -"warmup_epochs": 4, # warmup epochs -"lr": 0.4, # base learning rate -"momentum": 0.9, # momentum -"weight_decay": 4e-5, # weight decay -"loss_scale": 1024, # loss scale -"save_checkpoint": True, # whether save checkpoint -"save_checkpoint_epochs": 1, # the epoch interval between two checkpoints -"keep_checkpoint_max": 200, # only keep the last keep_checkpoint_max checkpoint -"save_checkpoint_path": "./checkpoint" # path to save checkpoint -``` - -## Running the example - -### Train - -#### Usage -Usage: sh run_train.sh [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] - -#### Launch - -``` -# training example -sh run_train.sh 8 192.168.0.1 0,1,2,3,4,5,6,7 ~/imagenet -``` - -#### Result - -Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log will be redirected to `./train/train.log` like followings. 
- -``` -epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] -epoch time: 140522.500, per step time: 224.836, avg loss: 5.258 -epoch: [ 1/200], step:[ 624/ 625], loss:[3.917/3.917], time:[138221.250], lr:[0.200] -epoch time: 138331.250, per step time: 221.330, avg loss: 3.917 -``` - -### Infer - -#### Usage - -Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH] - -#### Launch - -``` -# infer example -sh run_infer.sh ~/imagenet ~/train/mobilenet-200_625.ckpt -``` - -> checkpoint can be produced in training process. - -#### Result - -Inference result will be stored in the example path, you can find result like the followings in `val.log`. - -``` -result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.ckpt -``` diff --git a/example/mobilenetv2_imagenet2012/run_infer.sh b/example/mobilenetv2_imagenet2012/run_infer.sh deleted file mode 100644 index dc1e4d0b5d..0000000000 --- a/example/mobilenetv2_imagenet2012/run_infer.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash -if [ $# != 2 ] -then - echo "Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH]" -exit 1 -fi - -if [ ! -d $1 ] -then - echo "error: DATASET_PATH=$1 is not a directory" -exit 1 -fi - -if [ ! 
-f $2 ] -then - echo "error: CHECKPOINT_PATH=$2 is not a file" -exit 1 -fi - -BASEPATH=$(cd "`dirname $0`" || exit; pwd) -export PYTHONPATH=${BASEPATH}:$PYTHONPATH -export DEVICE_ID=0 -export RANK_ID=0 -export RANK_SIZE=1 -if [ -d "eval" ]; -then - rm -rf ./eval -fi -mkdir ./eval -cd ./eval || exit -python ${BASEPATH}/eval.py \ - --checkpoint_path=$2 \ - --dataset_path=$1 &> infer.log & # dataset val folder path diff --git a/example/mobilenetv2_imagenet2012/run_train.sh b/example/mobilenetv2_imagenet2012/run_train.sh deleted file mode 100644 index 3f92b4f172..0000000000 --- a/example/mobilenetv2_imagenet2012/run_train.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash -if [ $# != 4 ] -then - echo "Usage: sh run_train.sh [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]" -exit 1 -fi - -if [ $1 -lt 1 ] && [ $1 -gt 8 ] -then - echo "error: DEVICE_NUM=$1 is not in (1-8)" -exit 1 -fi - -if [ ! -d $4 ] -then - echo "error: DATASET_PATH=$4 is not a directory" -exit 1 -fi - -BASEPATH=$(cd "`dirname $0`" || exit; pwd) -export PYTHONPATH=${BASEPATH}:$PYTHONPATH -if [ -d "train" ]; -then - rm -rf ./train -fi -mkdir ./train -cd ./train || exit -python ${BASEPATH}/launch.py \ - --nproc_per_node=$1 \ - --visible_devices=$3 \ - --server_id=$2 \ - --training_script=${BASEPATH}/train.py \ - --dataset_path=$4 &> train.log & # dataset train folder diff --git a/example/mobilenetv2_imagenet2012/train.py b/example/mobilenetv2_imagenet2012/train.py deleted file mode 100644 index 72dfe78857..0000000000 --- a/example/mobilenetv2_imagenet2012/train.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""train_imagenet.""" -import os -import time -import argparse -import random -import numpy as np -from dataset import create_dataset -from lr_generator import get_lr -from config import config -from mindspore import context -from mindspore import Tensor -from mindspore import nn -from mindspore.model_zoo.mobilenet import mobilenet_v2 -from mindspore.parallel._auto_parallel_context import auto_parallel_context -from mindspore.nn.optim.momentum import Momentum -from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits -from mindspore.nn.loss.loss import _Loss -from mindspore.ops import operations as P -from mindspore.ops import functional as F -from mindspore.common import dtype as mstype - -from mindspore.train.model import Model, ParallelMode - -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback -from mindspore.train.loss_scale_manager import FixedLossScaleManager -from mindspore.train.serialization import load_checkpoint, load_param_into_net -import mindspore.dataset.engine as de -from mindspore.communication.management import init - -random.seed(1) -np.random.seed(1) -de.config.set_seed(1) - -parser = argparse.ArgumentParser(description='Image classification') -parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') -parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path') -args_opt = parser.parse_args() - -device_id = int(os.getenv('DEVICE_ID')) -rank_id = 
int(os.getenv('RANK_ID')) -rank_size = int(os.getenv('RANK_SIZE')) -run_distribute = rank_size > 1 - -context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id, save_graphs=False) - -class CrossEntropyWithLabelSmooth(_Loss): - """ - CrossEntropyWith LabelSmooth. - - Args: - smooth_factor (float): smooth factor, default=0. - num_classes (int): num classes - - Returns: - None. - - Examples: - >>> CrossEntropyWithLabelSmooth(smooth_factor=0., num_classes=1000) - """ - - def __init__(self, smooth_factor=0., num_classes=1000): - super(CrossEntropyWithLabelSmooth, self).__init__() - self.onehot = P.OneHot() - self.on_value = Tensor(1.0 - smooth_factor, mstype.float32) - self.off_value = Tensor(1.0 * smooth_factor / (num_classes - 1), mstype.float32) - self.ce = nn.SoftmaxCrossEntropyWithLogits() - self.mean = P.ReduceMean(False) - self.cast = P.Cast() - - def construct(self, logit, label): - one_hot_label = self.onehot(self.cast(label, mstype.int32), F.shape(logit)[1], self.on_value, self.off_value) - out_loss = self.ce(logit, one_hot_label) - out_loss = self.mean(out_loss, 0) - return out_loss - -class Monitor(Callback): - """ - Monitor loss and time. 
- - Args: - lr_init (numpy array): train lr - - Returns: - None - - Examples: - >>> Monitor(100,lr_init=Tensor([0.05]*100).asnumpy()) - """ - - def __init__(self, lr_init=None): - super(Monitor, self).__init__() - self.lr_init = lr_init - self.lr_init_len = len(lr_init) - - def epoch_begin(self, run_context): - self.losses = [] - self.epoch_time = time.time() - - def epoch_end(self, run_context): - cb_params = run_context.original_args() - - epoch_mseconds = (time.time() - self.epoch_time) * 1000 - per_step_mseconds = epoch_mseconds / cb_params.batch_num - print("epoch time: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}".format(epoch_mseconds, - per_step_mseconds, - np.mean(self.losses) - )) - - def step_begin(self, run_context): - self.step_time = time.time() - - def step_end(self, run_context): - cb_params = run_context.original_args() - step_mseconds = (time.time() - self.step_time) * 1000 - step_loss = cb_params.net_outputs - - if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], Tensor): - step_loss = step_loss[0] - if isinstance(step_loss, Tensor): - step_loss = np.mean(step_loss.asnumpy()) - - self.losses.append(step_loss) - cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num - - print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.3f}/{:5.3f}], time:[{:5.3f}], lr:[{:5.3f}]".format( - cb_params.cur_epoch_num - 1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss, - np.mean(self.losses), step_mseconds, self.lr_init[cb_params.cur_step_num - 1])) - - -if __name__ == '__main__': - if run_distribute: - context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL, - parameter_broadcast=True, mirror_mean=True) - auto_parallel_context().set_all_reduce_fusion_split_indices([140]) - init() - - epoch_size = config.epoch_size - net = mobilenet_v2(num_classes=config.num_classes) - net.to_float(mstype.float16) - for _, cell in net.cells_and_names(): - if isinstance(cell, 
nn.Dense): - cell.add_flags_recursive(fp32=True) - if config.label_smooth > 0: - loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth, num_classes=config.num_classes) - else: - loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean') - - print("train args: ", args_opt, "\ncfg: ", config, - "\nparallel args: rank_id {}, device_id {}, rank_size {}".format(rank_id, device_id, rank_size)) - - dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, - repeat_num=epoch_size, batch_size=config.batch_size) - step_size = dataset.get_dataset_size() - if args_opt.pre_trained: - param_dict = load_checkpoint(args_opt.pre_trained) - load_param_into_net(net, param_dict) - - loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) - lr = Tensor(get_lr(global_step=0, lr_init=0, lr_end=0, lr_max=config.lr, - warmup_epochs=config.warmup_epochs, total_epochs=epoch_size, steps_per_epoch=step_size)) - opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, - config.weight_decay, config.loss_scale) - - model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale) - - cb = None - if rank_id == 0: - cb = [Monitor(lr_init=lr.asnumpy())] - if config.save_checkpoint: - config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size, - keep_checkpoint_max=config.keep_checkpoint_max) - ckpt_cb = ModelCheckpoint(prefix="mobilenet", directory=config.save_checkpoint_path, config=config_ck) - cb += [ckpt_cb] - model.train(epoch_size, dataset, callbacks=cb) diff --git a/mindspore/model_zoo/mobilenetv2/Readme.md b/mindspore/model_zoo/mobilenetv2/Readme.md new file mode 100644 index 0000000000..0244aeb1bf --- /dev/null +++ b/mindspore/model_zoo/mobilenetv2/Readme.md @@ -0,0 +1,151 @@ +# MobileNetV2 Description + + +MobileNetV2 is tuned to mobile phone CPUs through a combination of hardware- aware network architecture search (NAS) 
complemented by the NetAdapt algorithm and then subsequently improved through novel architecture advances. Nov 20, 2019. + +[Paper](https://arxiv.org/pdf/1905.02244) Howard, Andrew, Mark Sandler, Grace Chu, Liang-Chieh Chen, Bo Chen, Mingxing Tan, Weijun Wang et al. "Searching for MobileNetV2." In Proceedings of the IEEE International Conference on Computer Vision, pp. 1314-1324. 2019. + +# Model architecture + +The overall network architecture of MobileNetV2 is shown below: + +[Link](https://arxiv.org/pdf/1905.02244) + +# Dataset + +Dataset used: [imagenet](http://www.image-net.org/) + +- Dataset size: ~125G, 1.2W colorful images in 1000 classes + - Train: 120G, 1.2W images + - Test: 5G, 50000 images +- Data format: RGB images. + - Note: Data will be processed in src/dataset.py + + +# Features + + +# Environment Requirements + +- Hardware(Ascend/GPU) + - Prepare hardware environment with Ascend or GPU processor. If you want to try Ascend, please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you can get the resources. 
+- Framework + - [MindSpore](http://10.90.67.50/mindspore/archive/20200506/OpenSource/me_vm_x86/) +- For more information, please check the resources below: + - [MindSpore tutorials](https://www.mindspore.cn/tutorial/zh-CN/master/index.html) + - [MindSpore API](https://www.mindspore.cn/api/zh-CN/master/index.html) + + +# Script description + +## Script and sample code + +```python +├── MobileNetV2 + ├── Readme.md + ├── scripts + │ ├──run_train.sh + │ ├──run_infer.sh + ├── src + │ ├──config.py + │ ├──dataset.py + │ ├──launch.py + │ ├──lr_generator.py + │ ├──mobilenetV2.py + ├── train.py + ├── eval.py +``` + +## Training process + +### Usage + +- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] +- GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] + +### Launch + +``` +# training example + Ascend: sh run_train.sh Ascend 8 192.168.0.1 0,1,2,3,4,5,6,7 ~/imagenet/train/ + GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ +``` + +### Result + +Training result will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default, and training log will be redirected to `./train/train.log` as follows. + +``` +epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] +epoch time: 140522.500, per step time: 224.836, avg loss: 5.258 +epoch: [ 1/200], step:[ 624/ 625], loss:[3.917/3.917], time:[138221.250], lr:[0.200] +epoch time: 138331.250, per step time: 221.330, avg loss: 3.917 +``` + +## Eval process + +### Usage + +- Ascend: sh run_infer.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH] +- GPU: sh run_infer.sh GPU [DATASET_PATH] [CHECKPOINT_PATH] + +### Launch + +``` +# infer example + Ascend: sh run_infer.sh Ascend ~/imagenet/val/ ~/train/mobilenet-200_625.ckpt + GPU: sh run_infer.sh GPU ~/imagenet/val/ ~/train/mobilenet-200_625.ckpt +``` + +> checkpoint can be produced in training process. 
+ +### Result + +Inference results will be stored in the example path; you can find results like the following in `infer.log`. + +``` +result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.ckpt +``` + +# Model description + +## Performance + +### Training Performance + +| Parameters | MobilenetV2 | | +| -------------------------- | ---------------------------------------------------------- | ------------------------- | +| Model Version | | large | +| Resource | Ascend 910, cpu:2.60GHz 56cores, memory:314G | NV SMX2 V100-32G | +| uploaded Date | 05/06/2020 | 05/06/2020 | +| MindSpore Version | 0.3.0 | 0.3.0 | +| Dataset | ImageNet | ImageNet | +| Training Parameters | src/config.py | src/config.py | +| Optimizer | Momentum | Momentum | +| Loss Function | SoftmaxCrossEntropy | SoftmaxCrossEntropy | +| outputs | | | +| Loss | | 1.913 | +| Accuracy | | ACC1[77.09%] ACC5[92.57%] | +| Total time | | | +| Params (M) | | | +| Checkpoint for Fine tuning | | | +| Model for inference | | | + +### Inference Performance + +| Parameters | MobilenetV2 | | | +| -------------------------- | ----------------------------- | ------------------------- | -------------------- | +| Model Version | V1 | | | +| Resource | Huawei 910 | NV SMX2 V100-32G | Huawei 310 | +| uploaded Date | 05/06/2020 | 05/22/2020 | | +| MindSpore Version | 0.2.0 | 0.2.0 | 0.2.0 | +| Dataset | ImageNet, 1.2W | ImageNet, 1.2W | ImageNet, 1.2W | +| batch_size | | 130(8P) | | +| outputs | | | | +| Accuracy | | ACC1[72.07%] ACC5[90.90%] | | +| Speed | | | | +| Total time | | | | +| Model for inference | | | | + +# ModelZoo Homepage + [Link](https://gitee.com/mindspore/mindspore/tree/master/mindspore/model_zoo) \ No newline at end of file diff --git a/mindspore/model_zoo/mobilenetv2/eval.py b/mindspore/model_zoo/mobilenetv2/eval.py new file mode 100644 index 0000000000..9e08749289 --- /dev/null +++ b/mindspore/model_zoo/mobilenetv2/eval.py @@ -0,0 +1,75 @@ +# Copyright 2020 Huawei Technologies 
Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +eval. +""" +import os +import argparse +from mindspore import context +from mindspore import nn +from mindspore.train.model import Model +from mindspore.train.serialization import load_checkpoint, load_param_into_net +from mindspore.common import dtype as mstype +from src.dataset import create_dataset +from src.config import config_ascend, config_gpu +from src.mobilenetV2 import mobilenet_v2 + + +parser = argparse.ArgumentParser(description='Image classification') +parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path') +parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') +parser.add_argument('--platform', type=str, default=None, help='run platform') +args_opt = parser.parse_args() + + +if __name__ == '__main__': + config_platform = None + if args_opt.platform == "Ascend": + config_platform = config_ascend + device_id = int(os.getenv('DEVICE_ID')) + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", + device_id=device_id, save_graphs=False) + elif args_opt.platform == "GPU": + config_platform = config_gpu + context.set_context(mode=context.GRAPH_MODE, + device_target="GPU", save_graphs=False) + else: + raise ValueError("Unsupport platform.") + + loss = nn.SoftmaxCrossEntropyWithLogits( + is_grad=False, sparse=True, reduction='mean') + net = 
mobilenet_v2(num_classes=config_platform.num_classes) + + if args_opt.platform == "Ascend": + net.to_float(mstype.float16) + for _, cell in net.cells_and_names(): + if isinstance(cell, nn.Dense): + cell.to_float(mstype.float32) + + dataset = create_dataset(dataset_path=args_opt.dataset_path, + do_train=False, + config=config_platform, + platform=args_opt.platform, + batch_size=config_platform.batch_size) + step_size = dataset.get_dataset_size() + + if args_opt.checkpoint_path: + param_dict = load_checkpoint(args_opt.checkpoint_path) + load_param_into_net(net, param_dict) + net.set_train(False) + + model = Model(net, loss_fn=loss, metrics={'acc'}) + res = model.eval(dataset) + print("result:", res, "ckpt=", args_opt.checkpoint_path) diff --git a/mindspore/model_zoo/mobilenetv2/scripts/run_infer.sh b/mindspore/model_zoo/mobilenetv2/scripts/run_infer.sh new file mode 100644 index 0000000000..ee2660be7b --- /dev/null +++ b/mindspore/model_zoo/mobilenetv2/scripts/run_infer.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +if [ $# != 3 ] +then + echo "Ascend: sh run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH] \ + GPU: sh run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH]" +exit 1 +fi + +# check dataset path +if [ ! 
-d $2 ] +then + echo "error: DATASET_PATH=$2 is not a directory" +exit 1 +fi + +# check checkpoint file +if [ ! -f $3 ] +then + echo "error: CHECKPOINT_PATH=$3 is not a file" +exit 1 +fi + +# set environment +BASEPATH=$(cd "`dirname $0`" || exit; pwd) +export PYTHONPATH=${BASEPATH}:$PYTHONPATH +export DEVICE_ID=0 +export RANK_ID=0 +export RANK_SIZE=1 +if [ -d "eval" ]; +then + rm -rf ../eval +fi +mkdir ../eval +cd ../eval || exit + +# luanch +python ${BASEPATH}/../eval.py \ + --platform=$1 \ + --dataset_path=$2 \ + --checkpoint_path=$3 \ + &> infer.log & # dataset val folder path diff --git a/mindspore/model_zoo/mobilenetv2/scripts/run_train.sh b/mindspore/model_zoo/mobilenetv2/scripts/run_train.sh new file mode 100644 index 0000000000..5812a37305 --- /dev/null +++ b/mindspore/model_zoo/mobilenetv2/scripts/run_train.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +run_ascend() +{ + if [ $2 -lt 1 ] && [ $2 -gt 8 ] + then + echo "error: DEVICE_NUM=$2 is not in (1-8)" + exit 1 + fi + + if [ ! 
-d $5 ] + then + echo "error: DATASET_PATH=$5 is not a directory" + exit 1 + fi + + BASEPATH=$(cd "`dirname $0`" || exit; pwd) + export PYTHONPATH=${BASEPATH}:$PYTHONPATH + if [ -d "train" ]; + then + rm -rf ../train + fi + mkdir ../train + cd ../train || exit + python ${BASEPATH}/../launch.py \ + --nproc_per_node=$2 \ + --visible_devices=$4 \ + --server_id=$3 \ + --training_script=${BASEPATH}/train.py \ + --dataset_path=$5 \ + --platform=$1 &> train.log & # dataset train folder +} + +run_gpu() +{ + if [ $2 -lt 1 ] && [ $2 -gt 8 ] + then + echo "error: DEVICE_NUM=$2 is not in (1-8)" + exit 1 + fi + + if [ ! -d $4 ] + then + echo "error: DATASET_PATH=$4 is not a directory" + exit 1 + fi + + BASEPATH=$(cd "`dirname $0`" || exit; pwd) + export PYTHONPATH=${BASEPATH}:$PYTHONPATH + if [ -d "train" ]; + then + rm -rf ../train + fi + mkdir ../train + cd ../train || exit + + export CUDA_VISIBLE_DEVICES="$3" + mpirun -n $2 --allow-run-as-root \ + python ${BASEPATH}/../train.py \ + --dataset_path=$4 \ + --platform=$1 \ + &> train.log & # dataset train folder +} + +if [ $# -gt 5 ] || [ $# -lt 4 ] +then + echo "Usage:\n \ + Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \ + GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \ + " +exit 1 +fi + +if [ $1 = "Ascend" ] ; then + run_ascend "$@" +elif [ $1 = "GPU" ] ; then + run_gpu "$@" +else + echo "not support platform" +fi; + diff --git a/example/mobilenetv2_imagenet2012/config.py b/mindspore/model_zoo/mobilenetv2/src/config.py similarity index 72% rename from example/mobilenetv2_imagenet2012/config.py rename to mindspore/model_zoo/mobilenetv2/src/config.py index 2a8d37b6fc..c8885336b2 100644 --- a/example/mobilenetv2_imagenet2012/config.py +++ b/mindspore/model_zoo/mobilenetv2/src/config.py @@ -17,7 +17,7 @@ network config setting, will be used in train.py and eval.py """ from easydict import EasyDict as ed -config = 
ed({ +config_ascend = ed({ "num_classes": 1000, "image_height": 224, "image_width": 224, @@ -34,3 +34,21 @@ config = ed({ "keep_checkpoint_max": 200, "save_checkpoint_path": "./checkpoint", }) + +config_gpu = ed({ + "num_classes": 1000, + "image_height": 224, + "image_width": 224, + "batch_size": 64, + "epoch_size": 200, + "warmup_epochs": 4, + "lr": 0.5, + "momentum": 0.9, + "weight_decay": 4e-5, + "label_smooth": 0.1, + "loss_scale": 1024, + "save_checkpoint": True, + "save_checkpoint_epochs": 1, + "keep_checkpoint_max": 200, + "save_checkpoint_path": "./checkpoint", +}) diff --git a/example/mobilenetv2_imagenet2012/dataset.py b/mindspore/model_zoo/mobilenetv2/src/dataset.py similarity index 81% rename from example/mobilenetv2_imagenet2012/dataset.py rename to mindspore/model_zoo/mobilenetv2/src/dataset.py index 908ce87aa1..a1a77a8495 100644 --- a/example/mobilenetv2_imagenet2012/dataset.py +++ b/mindspore/model_zoo/mobilenetv2/src/dataset.py @@ -20,10 +20,9 @@ import mindspore.common.dtype as mstype import mindspore.dataset.engine as de import mindspore.dataset.transforms.vision.c_transforms as C import mindspore.dataset.transforms.c_transforms as C2 -from config import config -def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): +def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch_size=32): """ create a train or eval dataset @@ -36,14 +35,18 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): Returns: dataset """ - rank_size = int(os.getenv("RANK_SIZE")) - rank_id = int(os.getenv("RANK_ID")) - - if rank_size == 1: + if platform == "Ascend": + rank_size = int(os.getenv("RANK_SIZE")) + rank_id = int(os.getenv("RANK_ID")) + if rank_size == 1: + ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True) + else: + ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=rank_size, shard_id=rank_id) + elif platform == "GPU": ds = 
de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=rank_size, shard_id=rank_id) + raise ValueError("Unsupport platform.") resize_height = config.image_height resize_width = config.image_width diff --git a/example/mobilenetv2_imagenet2012/launch.py b/mindspore/model_zoo/mobilenetv2/src/launch.py similarity index 100% rename from example/mobilenetv2_imagenet2012/launch.py rename to mindspore/model_zoo/mobilenetv2/src/launch.py diff --git a/example/mobilenetv2_imagenet2012/lr_generator.py b/mindspore/model_zoo/mobilenetv2/src/lr_generator.py similarity index 100% rename from example/mobilenetv2_imagenet2012/lr_generator.py rename to mindspore/model_zoo/mobilenetv2/src/lr_generator.py diff --git a/mindspore/model_zoo/mobilenet.py b/mindspore/model_zoo/mobilenetv2/src/mobilenetV2.py similarity index 82% rename from mindspore/model_zoo/mobilenet.py rename to mindspore/model_zoo/mobilenetv2/src/mobilenetV2.py index 6539c3e269..df35c5f369 100644 --- a/mindspore/model_zoo/mobilenet.py +++ b/mindspore/model_zoo/mobilenetv2/src/mobilenetV2.py @@ -20,20 +20,10 @@ from mindspore.ops.operations import TensorAdd from mindspore import Parameter, Tensor from mindspore.common.initializer import initializer -__all__ = ['MobileNetV2', 'mobilenet_v2'] +__all__ = ['mobilenet_v2'] def _make_divisible(v, divisor, min_value=None): - """ - This function is taken from the original tf repo. 
- It ensures that all layers have a channel number that is divisible by 8 - It can be seen here: - https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py - :param v: - :param divisor: - :param min_value: - :return: - """ if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) @@ -55,6 +45,7 @@ class GlobalAvgPooling(nn.Cell): Examples: >>> GlobalAvgPooling() """ + def __init__(self): super(GlobalAvgPooling, self).__init__() self.mean = P.ReduceMean(keep_dims=False) @@ -82,6 +73,7 @@ class DepthwiseConv(nn.Cell): Examples: >>> DepthwiseConv(16, 3, 1, 'pad', 1, channel_multiplier=1) """ + def __init__(self, in_planes, kernel_size, stride, pad_mode, pad, channel_multiplier=1, has_bias=False): super(DepthwiseConv, self).__init__() self.has_bias = has_bias @@ -126,14 +118,19 @@ class ConvBNReLU(nn.Cell): Examples: >>> ConvBNReLU(16, 256, kernel_size=1, stride=1, groups=1) """ - def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): + + def __init__(self, platform, in_planes, out_planes, kernel_size=3, stride=1, groups=1): super(ConvBNReLU, self).__init__() padding = (kernel_size - 1) // 2 if groups == 1: - conv = nn.Conv2d(in_planes, out_planes, kernel_size, stride, pad_mode='pad', - padding=padding) + conv = nn.Conv2d(in_planes, out_planes, kernel_size, stride, pad_mode='pad', padding=padding) else: - conv = DepthwiseConv(in_planes, kernel_size, stride, pad_mode='pad', pad=padding) + if platform == "Ascend": + conv = DepthwiseConv(in_planes, kernel_size, stride, pad_mode='pad', pad=padding) + elif platform == "GPU": + conv = nn.Conv2d(in_planes, out_planes, kernel_size, stride, + group=in_planes, pad_mode='pad', padding=padding) + layers = [conv, nn.BatchNorm2d(out_planes), nn.ReLU6()] self.features = nn.SequentialCell(layers) @@ -158,7 +155,8 @@ class InvertedResidual(nn.Cell): Examples: >>> ResidualBlock(3, 256, 1, 1) """ - def __init__(self, inp, oup, 
stride, expand_ratio): + + def __init__(self, platform, inp, oup, stride, expand_ratio): super(InvertedResidual, self).__init__() assert stride in [1, 2] @@ -167,12 +165,14 @@ class InvertedResidual(nn.Cell): layers = [] if expand_ratio != 1: - layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1)) + layers.append(ConvBNReLU(platform, inp, hidden_dim, kernel_size=1)) layers.extend([ # dw - ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), + ConvBNReLU(platform, hidden_dim, hidden_dim, + stride=stride, groups=hidden_dim), # pw-linear - nn.Conv2d(hidden_dim, oup, kernel_size=1, stride=1, has_bias=False), + nn.Conv2d(hidden_dim, oup, kernel_size=1, + stride=1, has_bias=False), nn.BatchNorm2d(oup), ]) self.conv = nn.SequentialCell(layers) @@ -203,7 +203,8 @@ class MobileNetV2(nn.Cell): Examples: >>> MobileNetV2(num_classes=1000) """ - def __init__(self, num_classes=1000, width_mult=1., + + def __init__(self, platform, num_classes=1000, width_mult=1., has_dropout=False, inverted_residual_setting=None, round_nearest=8): super(MobileNetV2, self).__init__() block = InvertedResidual @@ -226,16 +227,16 @@ class MobileNetV2(nn.Cell): # building first layer input_channel = _make_divisible(input_channel * width_mult, round_nearest) self.out_channels = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) - features = [ConvBNReLU(3, input_channel, stride=2)] + features = [ConvBNReLU(platform, 3, input_channel, stride=2)] # building inverted residual blocks for t, c, n, s in self.cfgs: output_channel = _make_divisible(c * width_mult, round_nearest) for i in range(n): stride = s if i == 0 else 1 - features.append(block(input_channel, output_channel, stride, expand_ratio=t)) + features.append(block(platform, input_channel, output_channel, stride, expand_ratio=t)) input_channel = output_channel # building last several layers - features.append(ConvBNReLU(input_channel, self.out_channels, kernel_size=1)) + features.append(ConvBNReLU(platform, 
input_channel, self.out_channels, kernel_size=1)) # make it nn.CellList self.features = nn.SequentialCell(features) # mobilenet head @@ -268,14 +269,19 @@ class MobileNetV2(nn.Cell): m.weight.set_parameter_data(Tensor(np.random.normal(0, np.sqrt(2. / n), m.weight.data.shape()).astype("float32"))) if m.bias is not None: - m.bias.set_parameter_data(Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) + m.bias.set_parameter_data( + Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) elif isinstance(m, nn.BatchNorm2d): - m.gamma.set_parameter_data(Tensor(np.ones(m.gamma.data.shape(), dtype="float32"))) - m.beta.set_parameter_data(Tensor(np.zeros(m.beta.data.shape(), dtype="float32"))) + m.gamma.set_parameter_data( + Tensor(np.ones(m.gamma.data.shape(), dtype="float32"))) + m.beta.set_parameter_data( + Tensor(np.zeros(m.beta.data.shape(), dtype="float32"))) elif isinstance(m, nn.Dense): - m.weight.set_parameter_data(Tensor(np.random.normal(0, 0.01, m.weight.data.shape()).astype("float32"))) + m.weight.set_parameter_data(Tensor(np.random.normal( + 0, 0.01, m.weight.data.shape()).astype("float32"))) if m.bias is not None: - m.bias.set_parameter_data(Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) + m.bias.set_parameter_data( + Tensor(np.zeros(m.bias.data.shape(), dtype="float32"))) def mobilenet_v2(**kwargs): diff --git a/mindspore/model_zoo/mobilenetv2/train.py b/mindspore/model_zoo/mobilenetv2/train.py new file mode 100644 index 0000000000..90def9989a --- /dev/null +++ b/mindspore/model_zoo/mobilenetv2/train.py @@ -0,0 +1,267 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""train_imagenet.""" +import os +import time +import argparse +import random +import numpy as np +from mindspore import context +from mindspore import Tensor +from mindspore import nn +from mindspore.parallel._auto_parallel_context import auto_parallel_context +from mindspore.nn.optim.momentum import Momentum +from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits +from mindspore.nn.loss.loss import _Loss +from mindspore.ops import operations as P +from mindspore.ops import functional as F +from mindspore.common import dtype as mstype +from mindspore.train.model import Model, ParallelMode +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback +from mindspore.train.loss_scale_manager import FixedLossScaleManager +from mindspore.train.serialization import load_checkpoint, load_param_into_net +from mindspore.communication.management import init +import mindspore.dataset.engine as de +from src.dataset import create_dataset +from src.lr_generator import get_lr +from src.config import config_gpu, config_ascend +from src.mobilenetV2 import mobilenet_v2 + +random.seed(1) +np.random.seed(1) +de.config.set_seed(1) + +parser = argparse.ArgumentParser(description='Image classification') +parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') +parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path') +parser.add_argument('--platform', type=str, default=None, help='run platform') +args_opt = 
parser.parse_args() + +if args_opt.platform == "Ascend": + device_id = int(os.getenv('DEVICE_ID')) + rank_id = int(os.getenv('RANK_ID')) + rank_size = int(os.getenv('RANK_SIZE')) + run_distribute = rank_size > 1 + device_id = int(os.getenv('DEVICE_ID')) + context.set_context(mode=context.GRAPH_MODE, + device_target="Ascend", + device_id=device_id, save_graphs=False) +elif args_opt.platform == "GPU": + context.set_context(mode=context.GRAPH_MODE, + device_target="GPU", save_graphs=False) +else: + raise ValueError("Unsupport platform.") + + +class CrossEntropyWithLabelSmooth(_Loss): + """ + CrossEntropyWith LabelSmooth. + + Args: + smooth_factor (float): smooth factor, default=0. + num_classes (int): num classes + + Returns: + None. + + Examples: + >>> CrossEntropyWithLabelSmooth(smooth_factor=0., num_classes=1000) + """ + + def __init__(self, smooth_factor=0., num_classes=1000): + super(CrossEntropyWithLabelSmooth, self).__init__() + self.onehot = P.OneHot() + self.on_value = Tensor(1.0 - smooth_factor, mstype.float32) + self.off_value = Tensor(1.0 * smooth_factor / + (num_classes - 1), mstype.float32) + self.ce = nn.SoftmaxCrossEntropyWithLogits() + self.mean = P.ReduceMean(False) + self.cast = P.Cast() + + def construct(self, logit, label): + one_hot_label = self.onehot(self.cast(label, mstype.int32), F.shape(logit)[1], + self.on_value, self.off_value) + out_loss = self.ce(logit, one_hot_label) + out_loss = self.mean(out_loss, 0) + return out_loss + + +class Monitor(Callback): + """ + Monitor loss and time. 
+ + Args: + lr_init (numpy array): train lr + + Returns: + None + + Examples: + >>> Monitor(100,lr_init=Tensor([0.05]*100).asnumpy()) + """ + + def __init__(self, lr_init=None): + super(Monitor, self).__init__() + self.lr_init = lr_init + self.lr_init_len = len(lr_init) + + def epoch_begin(self, run_context): + self.losses = [] + self.epoch_time = time.time() + + def epoch_end(self, run_context): + cb_params = run_context.original_args() + + epoch_mseconds = (time.time() - self.epoch_time) * 1000 + per_step_mseconds = epoch_mseconds / cb_params.batch_num + print("epoch time: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}".format(epoch_mseconds, + per_step_mseconds, + np.mean(self.losses))) + + def step_begin(self, run_context): + self.step_time = time.time() + + def step_end(self, run_context): + cb_params = run_context.original_args() + step_mseconds = (time.time() - self.step_time) * 1000 + step_loss = cb_params.net_outputs + + if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], Tensor): + step_loss = step_loss[0] + if isinstance(step_loss, Tensor): + step_loss = np.mean(step_loss.asnumpy()) + + self.losses.append(step_loss) + cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + + print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.3f}/{:5.3f}], time:[{:5.3f}], lr:[{:5.3f}]".format( + cb_params.cur_epoch_num - + 1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss, + np.mean(self.losses), step_mseconds, self.lr_init[cb_params.cur_step_num - 1])) + + +if __name__ == '__main__': + if args_opt.platform == "GPU": + # train on gpu + print("train args: ", args_opt, "\ncfg: ", config_gpu) + + # define net + net = mobilenet_v2(num_classes=config_gpu.num_classes, platform="GPU") + # define loss + if config_gpu.label_smooth > 0: + loss = CrossEntropyWithLabelSmooth( + smooth_factor=config_gpu.label_smooth, num_classes=config_gpu.num_classes) + else: + loss = SoftmaxCrossEntropyWithLogits( + is_grad=False, 
sparse=True, reduction='mean') + # define dataset + epoch_size = config_gpu.epoch_size + dataset = create_dataset(dataset_path=args_opt.dataset_path, + do_train=True, + config=config_gpu, + platform=args_opt.platform, + repeat_num=epoch_size, + batch_size=config_gpu.batch_size) + step_size = dataset.get_dataset_size() + # resume + if args_opt.pre_trained: + param_dict = load_checkpoint(args_opt.pre_trained) + load_param_into_net(net, param_dict) + # define optimizer + loss_scale = FixedLossScaleManager( + config_gpu.loss_scale, drop_overflow_update=False) + lr = Tensor(get_lr(global_step=0, + lr_init=0, + lr_end=0, + lr_max=config_gpu.lr, + warmup_epochs=config_gpu.warmup_epochs, + total_epochs=epoch_size, + steps_per_epoch=step_size)) + opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config_gpu.momentum, + config_gpu.weight_decay, config_gpu.loss_scale) + # define model + model = Model(net, loss_fn=loss, optimizer=opt, + loss_scale_manager=loss_scale) + + cb = [Monitor(lr_init=lr.asnumpy())] + if config_gpu.save_checkpoint: + config_ck = CheckpointConfig(save_checkpoint_steps=config_gpu.save_checkpoint_epochs * step_size, + keep_checkpoint_max=config_gpu.keep_checkpoint_max) + ckpt_cb = ModelCheckpoint( + prefix="mobilenet", directory=config_gpu.save_checkpoint_path, config=config_ck) + cb += [ckpt_cb] + # begine train + model.train(epoch_size, dataset, callbacks=cb) + elif args_opt.platform == "Ascend": + # train on ascend + print("train args: ", args_opt, "\ncfg: ", config_ascend, + "\nparallel args: rank_id {}, device_id {}, rank_size {}".format(rank_id, device_id, rank_size)) + + if run_distribute: + context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL, + parameter_broadcast=True, mirror_mean=True) + auto_parallel_context().set_all_reduce_fusion_split_indices([140]) + init() + + epoch_size = config_ascend.epoch_size + net = mobilenet_v2(num_classes=config_ascend.num_classes) + 
net.to_float(mstype.float16) + for _, cell in net.cells_and_names(): + if isinstance(cell, nn.Dense): + cell.to_float(mstype.float32) + if config_ascend.label_smooth > 0: + loss = CrossEntropyWithLabelSmooth( + smooth_factor=config_ascend.label_smooth, num_classes=config.num_classes) + else: + loss = SoftmaxCrossEntropyWithLogits( + is_grad=False, sparse=True, reduction='mean') + dataset = create_dataset(dataset_path=args_opt.dataset_path, + do_train=True, + config=config_ascend, + platform=args_opt.platform, + repeat_num=epoch_size, + batch_size=config_ascend.batch_size) + step_size = dataset.get_dataset_size() + if args_opt.pre_trained: + param_dict = load_checkpoint(args_opt.pre_trained) + load_param_into_net(net, param_dict) + + loss_scale = FixedLossScaleManager( + config_ascend.loss_scale, drop_overflow_update=False) + lr = Tensor(get_lr(global_step=0, + lr_init=0, + lr_end=0, + lr_max=config_ascend.lr, + warmup_epochs=config_ascend.warmup_epochs, + total_epochs=epoch_size, + steps_per_epoch=step_size)) + opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config_ascend.momentum, + config_ascend.weight_decay, config_ascend.loss_scale) + + model = Model(net, loss_fn=loss, optimizer=opt, + loss_scale_manager=loss_scale) + + cb = None + if rank_id == 0: + cb = [Monitor(lr_init=lr.asnumpy())] + if config_ascend.save_checkpoint: + config_ck = CheckpointConfig(save_checkpoint_steps=config_ascend.save_checkpoint_epochs * step_size, + keep_checkpoint_max=config_ascend.keep_checkpoint_max) + ckpt_cb = ModelCheckpoint( + prefix="mobilenet", directory=config_ascend.save_checkpoint_path, config=config_ck) + cb += [ckpt_cb] + model.train(epoch_size, dataset, callbacks=cb) + else: + raise ValueError("Unsupport platform.") diff --git a/mindspore/model_zoo/mobilenetv3/Readme.md b/mindspore/model_zoo/mobilenetv3/Readme.md new file mode 100644 index 0000000000..75f3e2fbe2 --- /dev/null +++ b/mindspore/model_zoo/mobilenetv3/Readme.md @@ -0,0 +1,152 @@ +# 
MobileNetV3 Description + + +MobileNetV3 is tuned to mobile phone CPUs through a combination of hardware-aware network architecture search (NAS) complemented by the NetAdapt algorithm and then subsequently improved through novel architecture advances. Nov 20, 2019. + +[Paper](https://arxiv.org/pdf/1905.02244) Howard, Andrew, Mark Sandler, Grace Chu, Liang-Chieh Chen, Bo Chen, Mingxing Tan, Weijun Wang et al. "Searching for mobilenetv3." In Proceedings of the IEEE International Conference on Computer Vision, pp. 1314-1324. 2019. + +# Model architecture + +The overall network architecture of MobileNetV3 is shown below: + +[Link](https://arxiv.org/pdf/1905.02244) + +# Dataset + +Dataset used: [imagenet](http://www.image-net.org/) + +- Dataset size: ~125G, 1.2M color images in 1000 classes + - Train: 120G, 1.2M images + - Test: 5G, 50000 images +- Data format: RGB images. + - Note: Data will be processed in src/dataset.py + + +# Features + + +# Environment Requirements + +- Hardware (Ascend/GPU) + - Prepare hardware environment with Ascend or GPU processor. If you want to try Ascend, please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you can get the resources.
+- Framework + - [MindSpore](https://www.mindspore.cn/install/en) +- For more information, please check the resources below: + - [MindSpore tutorials](https://www.mindspore.cn/tutorial/zh-CN/master/index.html) + - [MindSpore API](https://www.mindspore.cn/api/zh-CN/master/index.html) + + +# Script description + +## Script and sample code + +```python +├── MobilenetV3 + ├── Readme.md + ├── scripts + │ ├──run_train.sh + │ ├──run_infer.sh + ├── src + │ ├──config.py + │ ├──dataset.py + │ ├──launch.py + │ ├──lr_generator.py + │ ├──mobilenetV3.py + ├── train.py + ├── eval.py +``` + +## Training process + +### Usage + +- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] +- GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] + +### Launch + +``` +# training example + Ascend: sh run_train.sh Ascend 8 192.168.0.1 0,1,2,3,4,5,6,7 ~/imagenet/train/ + GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ +``` + +### Result + +Training result will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default, and training log will be redirected to `./train/train.log` like the following. + +``` +epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] +epoch time: 140522.500, per step time: 224.836, avg loss: 5.258 +epoch: [ 1/200], step:[ 624/ 625], loss:[3.917/3.917], time:[138221.250], lr:[0.200] +epoch time: 138331.250, per step time: 221.330, avg loss: 3.917 +``` + +## Eval process + +### Usage + +- Ascend: sh run_infer.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH] +- GPU: sh run_infer.sh GPU [DATASET_PATH] [CHECKPOINT_PATH] + +### Launch + +``` +# infer example + Ascend: sh run_infer.sh Ascend ~/imagenet/val/ ~/train/mobilenet-200_625.ckpt + GPU: sh run_infer.sh GPU ~/imagenet/val/ ~/train/mobilenet-200_625.ckpt +``` + +> checkpoint can be produced in training process.
+ +### Result + +Inference result will be stored in the example path, you can find results like the following in `./eval/infer.log`. + +``` +result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.ckpt +``` + +# Model description + +## Performance + +### Training Performance + +| Parameters | MobilenetV3 | | +| -------------------------- | ---------------------------------------------------------- | ------------------------- | +| Model Version | | large | +| Resource | Ascend 910, cpu:2.60GHz 56cores, memory:314G | NV SMX2 V100-32G | +| uploaded Date | 05/06/2020 | 05/06/2020 | +| MindSpore Version | 0.3.0 | 0.3.0 | +| Dataset | ImageNet | ImageNet | +| Training Parameters | src/config.py | src/config.py | +| Optimizer | Momentum | Momentum | +| Loss Function | SoftmaxCrossEntropy | SoftmaxCrossEntropy | +| outputs | | | +| Loss | | 1.913 | +| Accuracy | | ACC1[77.57%] ACC5[92.51%] | +| Total time | | | +| Params (M) | | | +| Checkpoint for Fine tuning | | | +| Model for inference | | | + +### Inference Performance + +| Parameters | MobilenetV3 | | | +| -------------------------- | ----------------------------- | ------------------------- | -------------------- | +| Model Version | V1 | | | +| Resource | Huawei 910 | NV SMX2 V100-32G | Huawei 310 | +| uploaded Date | 05/06/2020 | 05/22/2020 | | +| MindSpore Version | 0.2.0 | 0.2.0 | 0.2.0 | +| Dataset | ImageNet, 1.2M | ImageNet, 1.2M | ImageNet, 1.2M | +| batch_size | | 130(8P) | | +| outputs | | | | +| Accuracy | | ACC1[75.43%] ACC5[92.51%] | | +| Speed | | | | +| Total time | | | | +| Model for inference | | | | + + +# ModelZoo Homepage + [Link](https://gitee.com/mindspore/mindspore/tree/master/mindspore/model_zoo) \ No newline at end of file diff --git a/example/mobilenetv2_imagenet2012/eval.py b/mindspore/model_zoo/mobilenetv3/eval.py similarity index 54% rename from example/mobilenetv2_imagenet2012/eval.py rename to mindspore/model_zoo/mobilenetv3/eval.py index 79df8ea8f2..7428b748f4 100644 ---
a/example/mobilenetv2_imagenet2012/eval.py +++ b/mindspore/model_zoo/mobilenetv3/eval.py @@ -17,33 +17,51 @@ eval. """ import os import argparse -from dataset import create_dataset -from config import config from mindspore import context -from mindspore.model_zoo.mobilenet import mobilenet_v2 +from mindspore import nn from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from mindspore.common import dtype as mstype +from src.dataset import create_dataset +from src.config import config_ascend, config_gpu +from src.mobilenetV2 import mobilenet_v2 parser = argparse.ArgumentParser(description='Image classification') parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path') parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') +parser.add_argument('--platform', type=str, default=None, help='run platform') args_opt = parser.parse_args() -device_id = int(os.getenv('DEVICE_ID')) - -context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id, save_graphs=False) if __name__ == '__main__': - loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean') - net = mobilenet_v2(num_classes=config.num_classes) - net.to_float(mstype.float16) - for _, cell in net.cells_and_names(): - if isinstance(cell, nn.Dense): - cell.add_flags_recursive(fp32=True) - - dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size) + config_platform = None + if args_opt.platform == "Ascend": + config_platform = config_ascend + device_id = int(os.getenv('DEVICE_ID')) + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", + device_id=device_id, save_graphs=False) + elif args_opt.platform == "GPU": + config_platform = config_gpu + context.set_context(mode=context.GRAPH_MODE, + device_target="GPU", 
save_graphs=False) + else: + raise ValueError("Unsupport platform.") + + loss = nn.SoftmaxCrossEntropyWithLogits( + is_grad=False, sparse=True, reduction='mean') + net = mobilenet_v2(num_classes=config_platform.num_classes) + + if args_opt.platform == "Ascend": + net.to_float(mstype.float16) + for _, cell in net.cells_and_names(): + if isinstance(cell, nn.Dense): + cell.to_float(mstype.float32) + + dataset = create_dataset(dataset_path=args_opt.dataset_path, + do_train=False, + config=config_platform, + platform=args_opt.platform, + batch_size=config_platform.batch_size) step_size = dataset.get_dataset_size() if args_opt.checkpoint_path: diff --git a/mindspore/model_zoo/mobilenetv3/scripts/run_infer.sh b/mindspore/model_zoo/mobilenetv3/scripts/run_infer.sh new file mode 100644 index 0000000000..0254d1c554 --- /dev/null +++ b/mindspore/model_zoo/mobilenetv3/scripts/run_infer.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +if [ $# != 3 ] +then + echo "Ascend: sh run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH] \ + GPU: sh run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH]" +exit 1 +fi + +# check dataset path +if [ ! -d $2 ] +then + echo "error: DATASET_PATH=$2 is not a directory" +exit 1 +fi + +# check checkpoint file +if [ ! 
-f $3 ] +then + echo "error: CHECKPOINT_PATH=$3 is not a file" +exit 1 +fi + +# set environment +BASEPATH=$(cd "`dirname $0`" || exit; pwd) +export PYTHONPATH=${BASEPATH}:$PYTHONPATH +export DEVICE_ID=0 +export RANK_ID=0 +export RANK_SIZE=1 +if [ -d "eval" ]; +then + rm -rf ./eval +fi +mkdir ./eval +cd ./eval || exit + +# luanch +python ${BASEPATH}/eval.py \ + --platform=$1 \ + --dataset_path=$2 \ + --checkpoint_path=$3 \ + &> infer.log & # dataset val folder path diff --git a/mindspore/model_zoo/mobilenetv3/scripts/run_train.sh b/mindspore/model_zoo/mobilenetv3/scripts/run_train.sh new file mode 100644 index 0000000000..ee4edb93eb --- /dev/null +++ b/mindspore/model_zoo/mobilenetv3/scripts/run_train.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +run_ascend() +{ + if [ $2 -lt 1 ] && [ $2 -gt 8 ] + then + echo "error: DEVICE_NUM=$2 is not in (1-8)" + exit 1 + fi + + if [ ! 
-d $5 ] + then + echo "error: DATASET_PATH=$5 is not a directory" + exit 1 + fi + + BASEPATH=$(cd "`dirname $0`" || exit; pwd) + export PYTHONPATH=${BASEPATH}:$PYTHONPATH + if [ -d "train" ]; + then + rm -rf ./train + fi + mkdir ./train + cd ./train || exit + python ${BASEPATH}/launch.py \ + --nproc_per_node=$2 \ + --visible_devices=$4 \ + --server_id=$3 \ + --training_script=${BASEPATH}/train.py \ + --dataset_path=$5 \ + --platform=$1 &> train.log & # dataset train folder +} + +run_gpu() +{ + if [ $2 -lt 1 ] && [ $2 -gt 8 ] + then + echo "error: DEVICE_NUM=$2 is not in (1-8)" + exit 1 + fi + + if [ ! -d $4 ] + then + echo "error: DATASET_PATH=$4 is not a directory" + exit 1 + fi + + BASEPATH=$(cd "`dirname $0`" || exit; pwd) + export PYTHONPATH=${BASEPATH}:$PYTHONPATH + if [ -d "train" ]; + then + rm -rf ./train + fi + mkdir ./train + cd ./train || exit + + export CUDA_VISIBLE_DEVICES="$3" + mpirun -n $2 --allow-run-as-root \ + python ${BASEPATH}/train.py \ + --dataset_path=$4 \ + --platform=$1 \ + &> train.log & # dataset train folder +} + +if [ $# -gt 5 ] || [ $# -lt 4 ] +then + echo "Usage:\n \ + Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \ + GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \ + " +exit 1 +fi + +if [ $1 = "Ascend" ] ; then + run_ascend "$@" +elif [ $1 = "GPU" ] ; then + run_gpu "$@" +else + echo "not support platform" +fi; + diff --git a/mindspore/model_zoo/mobilenetv3/src/config.py b/mindspore/model_zoo/mobilenetv3/src/config.py new file mode 100644 index 0000000000..b6b4cd4e9b --- /dev/null +++ b/mindspore/model_zoo/mobilenetv3/src/config.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +network config setting, will be used in train.py and eval.py +""" +from easydict import EasyDict as ed + +config_ascend = ed({ + "num_classes": 1000, + "image_height": 224, + "image_width": 224, + "batch_size": 256, + "epoch_size": 200, + "warmup_epochs": 4, + "lr": 0.4, + "momentum": 0.9, + "weight_decay": 4e-5, + "label_smooth": 0.1, + "loss_scale": 1024, + "save_checkpoint": True, + "save_checkpoint_epochs": 1, + "keep_checkpoint_max": 200, + "save_checkpoint_path": "./checkpoint", +}) + +config_gpu = ed({ + "num_classes": 1000, + "image_height": 224, + "image_width": 224, + "batch_size": 64, + "epoch_size": 300, + "warmup_epochs": 4, + "lr": 0.5, + "momentum": 0.9, + "weight_decay": 4e-5, + "label_smooth": 0.1, + "loss_scale": 1024, + "save_checkpoint": True, + "save_checkpoint_epochs": 1, + "keep_checkpoint_max": 500, + "save_checkpoint_path": "./checkpoint", +}) diff --git a/mindspore/model_zoo/mobilenetv3/src/dataset.py b/mindspore/model_zoo/mobilenetv3/src/dataset.py new file mode 100644 index 0000000000..a1a77a8495 --- /dev/null +++ b/mindspore/model_zoo/mobilenetv3/src/dataset.py @@ -0,0 +1,85 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +create train or eval dataset. +""" +import os +import mindspore.common.dtype as mstype +import mindspore.dataset.engine as de +import mindspore.dataset.transforms.vision.c_transforms as C +import mindspore.dataset.transforms.c_transforms as C2 + + +def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch_size=32): + """ + create a train or eval dataset + + Args: + dataset_path(string): the path of dataset. + do_train(bool): whether dataset is used for train or eval. + repeat_num(int): the repeat times of dataset. Default: 1 + batch_size(int): the batch size of dataset. 
Default: 32 + + Returns: + dataset + """ + if platform == "Ascend": + rank_size = int(os.getenv("RANK_SIZE")) + rank_id = int(os.getenv("RANK_ID")) + if rank_size == 1: + ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True) + else: + ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=rank_size, shard_id=rank_id) + elif platform == "GPU": + ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True) + else: + raise ValueError("Unsupport platform.") + + resize_height = config.image_height + resize_width = config.image_width + buffer_size = 1000 + + # define map operations + decode_op = C.Decode() + resize_crop_op = C.RandomCropDecodeResize(resize_height, scale=(0.08, 1.0), ratio=(0.75, 1.333)) + horizontal_flip_op = C.RandomHorizontalFlip(prob=0.5) + + resize_op = C.Resize((256, 256)) + center_crop = C.CenterCrop(resize_width) + rescale_op = C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4) + normalize_op = C.Normalize(mean=[0.485*255, 0.456*255, 0.406*255], std=[0.229*255, 0.224*255, 0.225*255]) + change_swap_op = C.HWC2CHW() + + if do_train: + trans = [resize_crop_op, horizontal_flip_op, rescale_op, normalize_op, change_swap_op] + else: + trans = [decode_op, resize_op, center_crop, normalize_op, change_swap_op] + + type_cast_op = C2.TypeCast(mstype.int32) + + ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8) + ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8) + + # apply shuffle operations + ds = ds.shuffle(buffer_size=buffer_size) + + # apply batch operations + ds = ds.batch(batch_size, drop_remainder=True) + + # apply dataset repeat operation + ds = ds.repeat(repeat_num) + + return ds diff --git a/mindspore/model_zoo/mobilenetv3/src/launch.py b/mindspore/model_zoo/mobilenetv3/src/launch.py new file mode 100644 index 0000000000..48c8159664 --- /dev/null +++ 
# b/mindspore/model_zoo/mobilenetv3/src/launch.py
# @@ -0,0 +1,163 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""launch train script"""
import os
import sys
import json
import subprocess
import shutil
from argparse import ArgumentParser


def parse_args():
    """
    Parse the launcher's own options and collect everything else.

    Options the launcher does not recognise are stored, in order, in
    `args.training_script_args` and later appended to the training command.

    Returns:
        argparse.Namespace, the parsed args.

    Examples:
        >>> parse_args()
    """
    parser = ArgumentParser(description="mindspore distributed training launch "
                                        "helper utilty that will spawn up "
                                        "multiple distributed processes")
    parser.add_argument("--nproc_per_node", type=int, default=1,
                        help="The number of processes to launch on each node, "
                             "for D training, this is recommended to be set "
                             "to the number of D in your system so that "
                             "each process can be bound to a single D.")
    parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7",
                        help="will use the visible devices sequentially")
    parser.add_argument("--server_id", type=str, default="",
                        help="server ip")
    parser.add_argument("--training_script", type=str,
                        help="The full path to the single D training "
                             "program/script to be launched in parallel, "
                             "followed by all the arguments for the "
                             "training script")
    # rest from the training program
    args, unknown = parser.parse_known_args()
    args.training_script_args = unknown
    return args


def _read_device_ips(conf_path='/etc/hccn.conf'):
    """Return {device_id: device_ip} parsed from the `address_<id>=<ip>` lines of conf_path."""
    device_ips = {}
    with open(conf_path, 'r') as conf_fp:
        for hccn_item in conf_fp:
            hccn_item = hccn_item.strip()
            if hccn_item.startswith('address_'):
                device_id, device_ip = hccn_item.split('=', 1)
                device_id = device_id.split('_')[1]
                device_ips[device_id] = device_ip
                print('device_id:{}, device_ip:{}'.format(device_id, device_ip))
    return device_ips


def _build_rank_table(args, visible_devices, device_ips):
    """Build the rank-table dict and the concatenated id string of the devices in use."""
    hccn_table = {}
    hccn_table['board_id'] = '0x0000'
    hccn_table['chip_info'] = '910'
    hccn_table['deploy_mode'] = 'lab'
    hccn_table['group_count'] = '1'
    hccn_table['group_list'] = []
    instance_list = []
    usable_dev = ''
    for instance_id in range(args.nproc_per_node):
        device_id = visible_devices[instance_id]
        device_ip = device_ips[device_id]
        usable_dev += str(device_id)
        instance_list.append({
            'devices': [{
                'device_id': device_id,
                'device_ip': device_ip,
            }],
            'rank_id': str(instance_id),
            'server_id': args.server_id,
        })
    hccn_table['group_list'].append({
        'device_num': str(args.nproc_per_node),
        'server_num': '1',
        'group_name': '',
        'instance_count': str(args.nproc_per_node),
        'instance_list': instance_list,
    })
    hccn_table['para_plane_nic_location'] = 'device'
    hccn_table['para_plane_nic_name'] = [
        'eth{}'.format(visible_devices[instance_id])
        for instance_id in range(args.nproc_per_node)
    ]
    hccn_table['para_plane_nic_num'] = str(args.nproc_per_node)
    hccn_table['status'] = 'completed'
    return hccn_table, usable_dev


def _spawn_workers(args, visible_devices, table_fn):
    """Start one training process per rank in its own `device<rank>` directory, then wait for all."""
    processes = []
    cmds = []
    log_files = []
    env = os.environ.copy()
    env['RANK_SIZE'] = str(args.nproc_per_node)
    cur_path = os.getcwd()
    try:
        for rank_id in range(0, args.nproc_per_node):
            os.chdir(cur_path)
            device_id = visible_devices[rank_id]
            device_dir = os.path.join(cur_path, 'device{}'.format(rank_id))
            env['RANK_ID'] = str(rank_id)
            env['DEVICE_ID'] = str(device_id)
            if args.nproc_per_node > 1:
                env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
                env['RANK_TABLE_FILE'] = table_fn
            # every launch starts from an empty per-rank working directory
            if os.path.exists(device_dir):
                shutil.rmtree(device_dir)
            os.mkdir(device_dir)
            os.chdir(device_dir)
            cmd = [sys.executable, '-u']
            cmd.append(args.training_script)
            cmd.extend(args.training_script_args)
            log_file = open('{dir}/log{id}.log'.format(dir=device_dir, id=rank_id), 'w')
            log_files.append(log_file)
            process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env)
            processes.append(process)
            cmds.append(cmd)
        for process, cmd in zip(processes, cmds):
            process.wait()
            if process.returncode != 0:
                # report the integer exit status, not the Popen object
                raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
    finally:
        # close every log handle, including when a worker failed
        for log_file in log_files:
            log_file.close()


def main():
    """
    Generate the rank table for the selected devices and launch the training processes.

    Reads device addresses from /etc/hccn.conf, writes
    `rank_table_<n>p_<devices>_<server_id>.json` into the current directory,
    then runs `--training_script` once per rank with RANK_SIZE, RANK_ID and
    DEVICE_ID set in its environment (plus MINDSPORE_HCCL_CONFIG_PATH and
    RANK_TABLE_FILE when more than one process is started).

    Raises:
        subprocess.CalledProcessError: if a training process exits non-zero.
    """
    print("start", __file__)
    args = parse_args()
    print(args)
    visible_devices = args.visible_devices.split(',')
    assert os.path.isfile(args.training_script)
    assert len(visible_devices) >= args.nproc_per_node
    print('visible_devices:{}'.format(visible_devices))
    if not args.server_id:
        print('pleaser input server ip!!!')
        # missing mandatory option is an error: leave with a non-zero status
        sys.exit(1)
    print('server_id:{}'.format(args.server_id))

    # construct hccn_table
    device_ips = _read_device_ips()
    hccn_table, usable_dev = _build_rank_table(args, visible_devices, device_ips)

    # save hccn_table to file
    table_path = os.getcwd()
    if not os.path.exists(table_path):
        os.mkdir(table_path)
    table_fn = os.path.join(table_path,
                            'rank_table_{}p_{}_{}.json'.format(args.nproc_per_node, usable_dev, args.server_id))
    with open(table_fn, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
    sys.stdout.flush()

    # spawn the processes
    _spawn_workers(args, visible_devices, table_fn)


if __name__ == "__main__":
    main()

# diff --git a/mindspore/model_zoo/mobilenetv3/src/lr_generator.py b/mindspore/model_zoo/mobilenetv3/src/lr_generator.py
# new file mode 100644
# index 0000000000..68bbfe3158
# --- /dev/null
# +++ b/mindspore/model_zoo/mobilenetv3/src/lr_generator.py
# @@ -0,0 +1,54 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""learning rate generator"""
import math
import numpy as np


def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch):
    """
    generate learning rate array

    The schedule rises linearly from `lr_init` to `lr_max` over the warmup
    steps, then follows a half cosine from `lr_max` down to `lr_end`.
    Negative values are clamped to 0.

    Args:
       global_step(int): index of the first step to return; the schedule is
           computed for all steps and sliced from this position
       lr_init(float): init learning rate
       lr_end(float): end learning rate
       lr_max(float): max learning rate
       warmup_epochs(int): number of warmup epochs
       total_epochs(int): total epoch of training
       steps_per_epoch(int): steps of one epoch

    Returns:
       np.array, float32 learning rate for each step from `global_step` on
    """
    lr_each_step = []
    total_steps = steps_per_epoch * total_epochs
    warmup_steps = steps_per_epoch * warmup_epochs
    for i in range(total_steps):
        if i < warmup_steps:
            lr = lr_init + (lr_max - lr_init) * i / warmup_steps
        else:
            lr = lr_end + \
                 (lr_max - lr_end) * \
                 (1. + math.cos(math.pi * (i - warmup_steps) / (total_steps - warmup_steps))) / 2.
        if lr < 0.0:
            lr = 0.0
        lr_each_step.append(lr)

    lr_each_step = np.array(lr_each_step).astype(np.float32)
    return lr_each_step[global_step:]


# diff --git a/mindspore/model_zoo/mobilenetv3/src/mobilenetV3.py b/mindspore/model_zoo/mobilenetv3/src/mobilenetV3.py
# new file mode 100644
# index 0000000000..820e60493f
# --- /dev/null
# +++ b/mindspore/model_zoo/mobilenetv3/src/mobilenetV3.py
# @@ -0,0 +1,390 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""MobileNetV3 model define"""
from functools import partial
import numpy as np
import mindspore.nn as nn
from mindspore.ops import operations as P
from mindspore import Tensor


__all__ = ['mobilenet_v3_large',
           'mobilenet_v3_small']


def _make_divisible(x, divisor=8):
    """Round x up to the nearest multiple of divisor and return it as int."""
    return int(np.ceil(x * 1. / divisor) * divisor)


class Activation(nn.Cell):
    """
    Activation definition.

    Args:
        act_func(string): activation name, one of 'relu', 'relu6',
            'hsigmoid'/'hard_sigmoid', 'hswish'/'hard_swish'.

    Returns:
         Tensor, output tensor.

    Raises:
        NotImplementedError: if `act_func` is not one of the names above.
    """

    def __init__(self, act_func):
        super(Activation, self).__init__()
        if act_func == 'relu':
            self.act = nn.ReLU()
        elif act_func == 'relu6':
            self.act = nn.ReLU6()
        elif act_func in ('hsigmoid', 'hard_sigmoid'):
            self.act = nn.HSigmoid()
        elif act_func in ('hswish', 'hard_swish'):
            self.act = nn.HSwish()
        else:
            raise NotImplementedError("unsupported activation: {}".format(act_func))

    def construct(self, x):
        return self.act(x)


class GlobalAvgPooling(nn.Cell):
    """
    Global avg pooling definition: mean over axes 2 and 3 of the input.

    Args:
        keep_dims (bool): keep the reduced axes with length 1. Default: False.

    Returns:
        Tensor, output tensor.

    Examples:
        >>> GlobalAvgPooling()
    """

    def __init__(self, keep_dims=False):
        super(GlobalAvgPooling, self).__init__()
        self.mean = P.ReduceMean(keep_dims=keep_dims)

    def construct(self, x):
        x = self.mean(x, (2, 3))
        return x


class SE(nn.Cell):
    """
    SE wrapper definition: global pooling, two 1x1 convolutions
    (relu then hsigmoid), and a multiply of the result with the input.

    Args:
        num_out (int): Output channel.
        ratio (int): middle output ratio.

    Returns:
        Tensor, output tensor.

    Examples:
        >>> SE(4)
    """

    def __init__(self, num_out, ratio=4):
        super(SE, self).__init__()
        num_mid = _make_divisible(num_out // ratio)
        self.pool = GlobalAvgPooling(keep_dims=True)
        self.conv1 = nn.Conv2d(in_channels=num_out, out_channels=num_mid,
                               kernel_size=1, has_bias=True, pad_mode='pad')
        self.act1 = Activation('relu')
        self.conv2 = nn.Conv2d(in_channels=num_mid, out_channels=num_out,
                               kernel_size=1, has_bias=True, pad_mode='pad')
        self.act2 = Activation('hsigmoid')
        self.mul = P.Mul()

    def construct(self, x):
        out = self.pool(x)
        out = self.conv1(out)
        out = self.act1(out)
        out = self.conv2(out)
        out = self.act2(out)
        out = self.mul(x, out)
        return out


class Unit(nn.Cell):
    """
    Unit wrapper definition: Conv2d (no bias) + BatchNorm2d + optional activation.

    Args:
        num_in (int): Input channel.
        num_out (int): Output channel.
        kernel_size (int): Input kernel size.
        stride (int): Stride size.
        padding (int): Padding number.
        num_groups (int): Output num group.
        use_act (bool): Used activation or not.
        act_type (string): Activation type.

    Returns:
        Tensor, output tensor.

    Examples:
        >>> Unit(3, 3)
    """

    def __init__(self, num_in, num_out, kernel_size=1, stride=1, padding=0, num_groups=1,
                 use_act=True, act_type='relu'):
        super(Unit, self).__init__()
        self.conv = nn.Conv2d(in_channels=num_in,
                              out_channels=num_out,
                              kernel_size=kernel_size,
                              stride=stride,
                              padding=padding,
                              group=num_groups,
                              has_bias=False,
                              pad_mode='pad')
        self.bn = nn.BatchNorm2d(num_out)
        self.use_act = use_act
        self.act = Activation(act_type) if use_act else None

    def construct(self, x):
        out = self.conv(x)
        out = self.bn(out)
        if self.use_act:
            out = self.act(out)
        return out


class ResUnit(nn.Cell):
    """
    ResUnit wrapper definition: optional 1x1 expand, depthwise conv,
    optional SE, 1x1 projection, and a residual add when the input and
    output channel counts match and stride is 1.

    Args:
        num_in (int): Input channel.
        num_mid (int): Middle channel.
        num_out (int): Output channel.
        kernel_size (int): Input kernel size, one of 1, 3, 5, 7.
        stride (int): Stride size.
        act_type (str): Activation type.
        use_se (bool): Use SE wrapper or not.

    Returns:
        Tensor, output tensor.

    Examples:
        >>> ResUnit(16, 3, 1, 1)
    """

    # "same"-style padding for each supported odd kernel size
    _PAD_FOR_KERNEL = {1: 0, 3: 1, 5: 2, 7: 3}

    def __init__(self, num_in, num_mid, num_out, kernel_size, stride=1, act_type='relu', use_se=False):
        super(ResUnit, self).__init__()
        self.use_se = use_se
        # NOTE(review): the expand conv is keyed on num_out != num_mid rather than
        # num_in != num_mid; both agree for the configs in this file - confirm intent.
        self.first_conv = (num_out != num_mid)
        self.use_short_cut_conv = True

        if self.first_conv:
            self.expand = Unit(num_in, num_mid, kernel_size=1,
                               stride=1, padding=0, act_type=act_type)
        else:
            self.expand = None
        self.conv1 = Unit(num_mid, num_mid, kernel_size=kernel_size, stride=stride,
                          padding=self._get_pad(kernel_size), act_type=act_type, num_groups=num_mid)
        if use_se:
            self.se = SE(num_mid)
        self.conv2 = Unit(num_mid, num_out, kernel_size=1, stride=1,
                          padding=0, act_type=act_type, use_act=False)
        if num_in != num_out or stride != 1:
            self.use_short_cut_conv = False
        self.add = P.TensorAdd() if self.use_short_cut_conv else None

    def construct(self, x):
        if self.first_conv:
            out = self.expand(x)
        else:
            out = x
        out = self.conv1(out)
        if self.use_se:
            out = self.se(out)
        out = self.conv2(out)
        if self.use_short_cut_conv:
            out = self.add(x, out)
        return out

    def _get_pad(self, kernel_size):
        """set the padding number"""
        try:
            return self._PAD_FOR_KERNEL[kernel_size]
        except (KeyError, TypeError):
            raise NotImplementedError("unsupported kernel size: {}".format(kernel_size))


class MobileNetV3(nn.Cell):
    """
    MobileNetV3 architecture.

    Args:
        model_cfgs (dict): model description with keys 'cfg' (list of
            [kernel, expand, out, use_se, activation, stride] rows),
            'cls_ch_squeeze' and 'cls_ch_expand'.
        num_classes (int): Output number classes.
        multiplier (float): Channels multiplier. Default is 1.
        final_drop (float): Dropout number; a Dropout cell is appended only when > 0.
        round_nearest (int): Scaled channel counts are rounded up to a multiple
            of this value. Default is 8.
    Returns:
        Tensor, output tensor.

    Examples:
        >>> MobileNetV3(num_classes=1000)
    """

    def __init__(self, model_cfgs, num_classes=1000, multiplier=1., final_drop=0., round_nearest=8):
        super(MobileNetV3, self).__init__()
        self.cfgs = model_cfgs['cfg']
        self.inplanes = 16
        self.features = []
        first_conv_in_channel = 3
        first_conv_out_channel = _make_divisible(multiplier * self.inplanes, round_nearest)
        # the first ResUnit consumes the stem output, so track the scaled width
        self.inplanes = first_conv_out_channel

        self.features.append(nn.Conv2d(in_channels=first_conv_in_channel,
                                       out_channels=first_conv_out_channel,
                                       kernel_size=3, padding=1, stride=2,
                                       has_bias=False, pad_mode='pad'))
        self.features.append(nn.BatchNorm2d(first_conv_out_channel))
        self.features.append(Activation('hswish'))
        for layer_cfg in self.cfgs:
            self.features.append(self._make_layer(kernel_size=layer_cfg[0],
                                                  exp_ch=_make_divisible(multiplier * layer_cfg[1], round_nearest),
                                                  out_channel=_make_divisible(multiplier * layer_cfg[2],
                                                                              round_nearest),
                                                  use_se=layer_cfg[3],
                                                  act_func=layer_cfg[4],
                                                  stride=layer_cfg[5]))
        output_channel = _make_divisible(multiplier * model_cfgs["cls_ch_squeeze"], round_nearest)
        self.features.append(nn.Conv2d(in_channels=_make_divisible(multiplier * self.cfgs[-1][2], round_nearest),
                                       out_channels=output_channel,
                                       kernel_size=1, padding=0, stride=1,
                                       has_bias=False, pad_mode='pad'))
        self.features.append(nn.BatchNorm2d(output_channel))
        self.features.append(Activation('hswish'))
        self.features.append(GlobalAvgPooling(keep_dims=True))
        self.features.append(nn.Conv2d(in_channels=output_channel,
                                       out_channels=model_cfgs['cls_ch_expand'],
                                       kernel_size=1, padding=0, stride=1,
                                       has_bias=False, pad_mode='pad'))
        self.features.append(Activation('hswish'))
        if final_drop > 0:
            # NOTE(review): the first positional argument of nn.Dropout may be a
            # keep probability in this MindSpore version - confirm that passing
            # final_drop (documented as the dropout number) is intended.
            self.features.append((nn.Dropout(final_drop)))

        # wrap the layer list in nn.SequentialCell
        self.features = nn.SequentialCell(self.features)
        self.output = nn.Conv2d(in_channels=model_cfgs['cls_ch_expand'],
                                out_channels=num_classes,
                                kernel_size=1, has_bias=True, pad_mode='pad')
        self.squeeze = P.Squeeze(axis=(2, 3))

        self._initialize_weights()

    def construct(self, x):
        x = self.features(x)
        x = self.output(x)
        x = self.squeeze(x)
        return x

    def _make_layer(self, kernel_size, exp_ch, out_channel, use_se, act_func, stride=1):
        """Create one ResUnit fed by the current `self.inplanes` and advance `self.inplanes`."""
        mid_planes = exp_ch
        out_planes = out_channel
        layer = ResUnit(self.inplanes, mid_planes, out_planes,
                        kernel_size, stride=stride, act_type=act_func, use_se=use_se)
        self.inplanes = out_planes
        return layer

    def _initialize_weights(self):
        """
        Initialize weights.

        Conv2d weights are drawn from N(0, sqrt(2 / (kh * kw * out_channels)))
        and Dense weights from N(0, 0.01); biases and BatchNorm beta are set to
        zero, BatchNorm gamma to one.

        Returns:
            None.

        Examples:
            >>> _initialize_weights()
        """
        for _, m in self.cells_and_names():
            if isinstance(m, (nn.Conv2d)):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.set_parameter_data(Tensor(np.random.normal(0, np.sqrt(2. / n),
                                                                    m.weight.data.shape()).astype("float32")))
                if m.bias is not None:
                    m.bias.set_parameter_data(
                        Tensor(np.zeros(m.bias.data.shape(), dtype="float32")))
            elif isinstance(m, nn.BatchNorm2d):
                m.gamma.set_parameter_data(
                    Tensor(np.ones(m.gamma.data.shape(), dtype="float32")))
                m.beta.set_parameter_data(
                    Tensor(np.zeros(m.beta.data.shape(), dtype="float32")))
            elif isinstance(m, nn.Dense):
                m.weight.set_parameter_data(Tensor(np.random.normal(
                    0, 0.01, m.weight.data.shape()).astype("float32")))
                if m.bias is not None:
                    m.bias.set_parameter_data(
                        Tensor(np.zeros(m.bias.data.shape(), dtype="float32")))


def mobilenet_v3(model_name, **kwargs):
    """
    Constructs a MobileNet V3 model

    Args:
        model_name (str): "large" or "small".
        **kwargs: forwarded to MobileNetV3 (num_classes, multiplier, ...).

    Returns:
        MobileNetV3, the network built from the selected configuration.

    Raises:
        KeyError: if `model_name` is not a known configuration.
    """
    model_cfgs = {
        "large": {
            "cfg": [
                # k, exp, c,  se,     nl,  s,
                [3, 16, 16, False, 'relu', 1],
                [3, 64, 24, False, 'relu', 2],
                [3, 72, 24, False, 'relu', 1],
                [5, 72, 40, True, 'relu', 2],
                [5, 120, 40, True, 'relu', 1],
                [5, 120, 40, True, 'relu', 1],
                [3, 240, 80, False, 'hswish', 2],
                [3, 200, 80, False, 'hswish', 1],
                [3, 184, 80, False, 'hswish', 1],
                [3, 184, 80, False, 'hswish', 1],
                [3, 480, 112, True, 'hswish', 1],
                [3, 672, 112, True, 'hswish', 1],
                [5, 672, 160, True, 'hswish', 2],
                [5, 960, 160, True, 'hswish', 1],
                [5, 960, 160, True, 'hswish', 1]],
            "cls_ch_squeeze": 960,
            "cls_ch_expand": 1280,
        },
        "small": {
            "cfg": [
                # k, exp, c,  se,     nl,  s,
                [3, 16, 16, True, 'relu', 2],
                [3, 72, 24, False, 'relu', 2],
                [3, 88, 24, False, 'relu', 1],
                [5, 96, 40, True, 'hswish', 2],
                [5, 240, 40, True, 'hswish', 1],
                [5, 240, 40, True, 'hswish', 1],
                [5, 120, 48, True, 'hswish', 1],
                [5, 144, 48, True, 'hswish', 1],
                [5, 288, 96, True, 'hswish', 2],
                [5, 576, 96, True, 'hswish', 1],
                [5, 576, 96, True, 'hswish', 1]],
            "cls_ch_squeeze": 576,
            "cls_ch_expand": 1280,
        }
    }
    if model_name not in model_cfgs:
        raise KeyError("unknown MobileNetV3 variant: {}".format(model_name))
    return MobileNetV3(model_cfgs[model_name], **kwargs)


mobilenet_v3_large = partial(mobilenet_v3, model_name="large")
mobilenet_v3_small = partial(mobilenet_v3, model_name="small")

# diff --git a/mindspore/model_zoo/mobilenetv3/train.py b/mindspore/model_zoo/mobilenetv3/train.py
# new file mode 100644
# index 0000000000..b11f1dc6e7
# --- /dev/null
# +++ b/mindspore/model_zoo/mobilenetv3/train.py
# @@ -0,0 +1,267 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_imagenet."""
import os
import time
import argparse
import random
import numpy as np
from mindspore import context
from mindspore import Tensor
from mindspore import nn
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.nn.optim.momentum import Momentum
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.nn.loss.loss import _Loss
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.common import dtype as mstype
from mindspore.train.model import Model, ParallelMode
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.serialization import load_checkpoint, load_param_into_net
import mindspore.dataset.engine as de
from mindspore.communication.management import init
from src.dataset import create_dataset
from src.lr_generator import get_lr
from src.config import config_gpu, config_ascend
from src.mobilenetV3 import mobilenet_v3_large

# fixed seeds for python, numpy and the dataset engine
random.seed(1)
np.random.seed(1)
de.config.set_seed(1)

parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
parser.add_argument('--platform', type=str, default=None, help='run platform')
args_opt = parser.parse_args()

if args_opt.platform == "Ascend":
    # DEVICE_ID, RANK_ID and RANK_SIZE must be set in the environment
    device_id = int(os.getenv('DEVICE_ID'))
    rank_id = int(os.getenv('RANK_ID'))
    rank_size = int(os.getenv('RANK_SIZE'))
    run_distribute = rank_size > 1
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=device_id, save_graphs=False)
elif args_opt.platform == "GPU":
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="GPU", save_graphs=False)
else:
    raise ValueError("Unsupport platform.")


class CrossEntropyWithLabelSmooth(_Loss):
    """
    CrossEntropyWith LabelSmooth.

    Labels are one-hot encoded with `1 - smooth_factor` on the target class
    and `smooth_factor / (num_classes - 1)` elsewhere, passed through
    SoftmaxCrossEntropyWithLogits and averaged over axis 0.

    Args:
        smooth_factor (float): smooth factor, default=0.
        num_classes (int): num classes

    Returns:
        None.

    Examples:
        >>> CrossEntropyWithLabelSmooth(smooth_factor=0., num_classes=1000)
    """

    def __init__(self, smooth_factor=0., num_classes=1000):
        super(CrossEntropyWithLabelSmooth, self).__init__()
        self.onehot = P.OneHot()
        self.on_value = Tensor(1.0 - smooth_factor, mstype.float32)
        self.off_value = Tensor(1.0 * smooth_factor /
                                (num_classes - 1), mstype.float32)
        self.ce = nn.SoftmaxCrossEntropyWithLogits()
        self.mean = P.ReduceMean(False)
        self.cast = P.Cast()

    def construct(self, logit, label):
        one_hot_label = self.onehot(self.cast(label, mstype.int32), F.shape(logit)[1],
                                    self.on_value, self.off_value)
        out_loss = self.ce(logit, one_hot_label)
        out_loss = self.mean(out_loss, 0)
        return out_loss


class Monitor(Callback):
    """
    Monitor loss and time.

    Prints per-step loss, running mean loss, step time and learning rate,
    and per-epoch total time, per-step time and mean loss.

    Args:
        lr_init (numpy array): train lr for every step; when None (or when the
            step index is past its end) the printed lr is nan.

    Returns:
        None

    Examples:
        >>> Monitor(lr_init=Tensor([0.05]*100).asnumpy())
    """

    def __init__(self, lr_init=None):
        super(Monitor, self).__init__()
        self.lr_init = lr_init
        self.lr_init_len = len(lr_init) if lr_init is not None else 0
        # created here so step_end is safe even before the first epoch_begin
        self.losses = []
        self.epoch_time = time.time()
        self.step_time = time.time()

    def epoch_begin(self, run_context):
        self.losses = []
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()

        epoch_mseconds = (time.time() - self.epoch_time) * 1000
        per_step_mseconds = epoch_mseconds / cb_params.batch_num
        print("epoch time: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}".format(epoch_mseconds,
                                                                                      per_step_mseconds,
                                                                                      np.mean(self.losses)))

    def step_begin(self, run_context):
        self.step_time = time.time()

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        step_mseconds = (time.time() - self.step_time) * 1000
        step_loss = cb_params.net_outputs

        if isinstance(step_loss, (tuple, list)) and isinstance(step_loss[0], Tensor):
            step_loss = step_loss[0]
        if isinstance(step_loss, Tensor):
            step_loss = np.mean(step_loss.asnumpy())

        self.losses.append(step_loss)
        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num

        lr_index = cb_params.cur_step_num - 1
        cur_lr = self.lr_init[lr_index] if lr_index < self.lr_init_len else float("nan")

        print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.3f}/{:5.3f}], time:[{:5.3f}], lr:[{:5.3f}]".format(
            cb_params.cur_epoch_num -
            1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss,
            np.mean(self.losses), step_mseconds, cur_lr))


if __name__ == '__main__':
    if args_opt.platform == "GPU":
        # train on gpu
        print("train args: ", args_opt, "\ncfg: ", config_gpu)

        # define net
        net = mobilenet_v3_large(num_classes=config_gpu.num_classes)
        # define loss
        if config_gpu.label_smooth > 0:
            loss = CrossEntropyWithLabelSmooth(
                smooth_factor=config_gpu.label_smooth, num_classes=config_gpu.num_classes)
        else:
            loss = SoftmaxCrossEntropyWithLogits(
                is_grad=False, sparse=True, reduction='mean')
        # define dataset
        epoch_size = config_gpu.epoch_size
        dataset = create_dataset(dataset_path=args_opt.dataset_path,
                                 do_train=True,
                                 config=config_gpu,
                                 platform=args_opt.platform,
                                 repeat_num=epoch_size,
                                 batch_size=config_gpu.batch_size)
        step_size = dataset.get_dataset_size()
        # resume
        if args_opt.pre_trained:
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)
        # define optimizer
        loss_scale = FixedLossScaleManager(
            config_gpu.loss_scale, drop_overflow_update=False)
        lr = Tensor(get_lr(global_step=0,
                           lr_init=0,
                           lr_end=0,
                           lr_max=config_gpu.lr,
                           warmup_epochs=config_gpu.warmup_epochs,
                           total_epochs=epoch_size,
                           steps_per_epoch=step_size))
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config_gpu.momentum,
                       config_gpu.weight_decay, config_gpu.loss_scale)
        # define model
        model = Model(net, loss_fn=loss, optimizer=opt,
                      loss_scale_manager=loss_scale)

        cb = [Monitor(lr_init=lr.asnumpy())]
        if config_gpu.save_checkpoint:
            config_ck = CheckpointConfig(save_checkpoint_steps=config_gpu.save_checkpoint_epochs * step_size,
                                         keep_checkpoint_max=config_gpu.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(
                prefix="mobilenet", directory=config_gpu.save_checkpoint_path, config=config_ck)
            cb += [ckpt_cb]
        # begin train
        model.train(epoch_size, dataset, callbacks=cb)
    elif args_opt.platform == "Ascend":
        # train on ascend
        print("train args: ", args_opt, "\ncfg: ", config_ascend,
              "\nparallel args: rank_id {}, device_id {}, rank_size {}".format(rank_id, device_id, rank_size))

        if run_distribute:
            context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL,
                                              parameter_broadcast=True, mirror_mean=True)
            auto_parallel_context().set_all_reduce_fusion_split_indices([140])
            init()

        epoch_size = config_ascend.epoch_size
        net = mobilenet_v3_large(num_classes=config_ascend.num_classes)
        # run in float16, except Dense cells which are switched back to float32
        net.to_float(mstype.float16)
        for _, cell in net.cells_and_names():
            if isinstance(cell, nn.Dense):
                cell.to_float(mstype.float32)
        if config_ascend.label_smooth > 0:
            # use the Ascend config here; the bare name `config` is not defined in this script
            loss = CrossEntropyWithLabelSmooth(
                smooth_factor=config_ascend.label_smooth, num_classes=config_ascend.num_classes)
        else:
            loss = SoftmaxCrossEntropyWithLogits(
                is_grad=False, sparse=True, reduction='mean')
        dataset = create_dataset(dataset_path=args_opt.dataset_path,
                                 do_train=True,
                                 config=config_ascend,
                                 platform=args_opt.platform,
                                 repeat_num=epoch_size,
                                 batch_size=config_ascend.batch_size)
        step_size = dataset.get_dataset_size()
        if args_opt.pre_trained:
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)

        loss_scale = FixedLossScaleManager(
            config_ascend.loss_scale, drop_overflow_update=False)
        lr = Tensor(get_lr(global_step=0,
                           lr_init=0,
                           lr_end=0,
                           lr_max=config_ascend.lr,
                           warmup_epochs=config_ascend.warmup_epochs,
                           total_epochs=epoch_size,
                           steps_per_epoch=step_size))
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config_ascend.momentum,
                       config_ascend.weight_decay, config_ascend.loss_scale)

        model = Model(net, loss_fn=loss, optimizer=opt,
                      loss_scale_manager=loss_scale)

        # only rank 0 prints progress and writes checkpoints
        cb = None
        if rank_id == 0:
            cb = [Monitor(lr_init=lr.asnumpy())]
            if config_ascend.save_checkpoint:
                config_ck = CheckpointConfig(save_checkpoint_steps=config_ascend.save_checkpoint_epochs * step_size,
                                             keep_checkpoint_max=config_ascend.keep_checkpoint_max)
                ckpt_cb = ModelCheckpoint(
                    prefix="mobilenet", directory=config_ascend.save_checkpoint_path, config=config_ck)
                cb += [ckpt_cb]
        model.train(epoch_size, dataset, callbacks=cb)
    else:
        raise ValueError("Unsupport platform.")