| @@ -81,9 +81,10 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil | |||
| ├─config.py # parameter configuration | |||
| ├─dataset.py # data preprocessing | |||
| ├─Xception.py # network definition | |||
| ├─CrossEntropySmooth.py # Customized CrossEntropy loss function | |||
| ├─loss.py # Customized CrossEntropy loss function | |||
| └─lr_generator.py # learning rate generator | |||
| ├─train.py # train net | |||
| ├─export.py # export net | |||
| └─eval.py # eval net | |||
| ``` | |||
| @@ -110,7 +111,6 @@ Major parameters in train.py and config.py are: | |||
| 'lr_init': 0.00004 # initiate learning rate | |||
| 'lr_max': 0.4 # max bound of learning rate | |||
| 'lr_end': 0.00004 # min bound of learning rate | |||
| "weight_init": 'xavier_uniform' # Weight initialization mode | |||
| ``` | |||
| ## [Training process](#contents) | |||
| @@ -149,13 +149,13 @@ sh scripts/run_standalone_train.sh DEVICE_ID DATA_PATH | |||
| ### Result | |||
| Training result will be stored in the example path. Checkpoints will be stored at `. /model_0` by default, and training log will be redirected to `log.txt` like followings. | |||
| Training result will be stored in the example path. Checkpoints will be stored at `. /ckpt_0` by default, and training log will be redirected to `log.txt` like followings. | |||
| ``` shell | |||
| epoch: [ 0/250], step:[ 1250/ 1251], loss:[4.761/5.613], time:[529.305], lr:[0.400] | |||
| epoch time: 1128662.862, per step time: 902.209, avg loss: 5.609 | |||
| epoch: [ 1/250], step:[ 1250/ 1251], loss:[4.164/4.318], time:[503.708], lr:[0.398] | |||
| epoch time: 889163.081, per step time: 710.762, avg loss: 4.312 | |||
| epoch: 1 step: 1251, loss is 4.8427444 | |||
| epoch time: 701242.350 ms, per step time: 560.545 ms | |||
| epoch: 2 step: 1251, loss is 4.0637593 | |||
| epoch time: 598591.422 ms, per step time: 478.490ms | |||
| ``` | |||
| ## [Eval process](#contents) | |||
| @@ -199,18 +199,19 @@ result: {'Loss': 1.7797744848789312, 'Top_1_Acc': 0.7985777243589743, 'Top_5_Acc | |||
| | -------------------------- | ---------------------------------------------- | | |||
| | Model Version | Xception | | |||
| | Resource | HUAWEI CLOUD Modelarts | | |||
| | uploaded Date | 11/15/2020 | | |||
| | MindSpore Version | 1.0.0 | | |||
| | uploaded Date | 12/10/2020 | | |||
| | MindSpore Version | 1.1.0 | | |||
| | Dataset | 1200k images | | |||
| | Batch_size | 128 | | |||
| | Training Parameters | src/config.py | | |||
| | Optimizer | Momentum | | |||
| | Loss Function | CrossEntropySmooth | | |||
| | Loss | 1.78 | | |||
| | Accuracy (8p) | Top1[79.9%] Top5[94.9%] | | |||
| | Total time (8p) | 63h | | |||
| | Accuracy (8p) | Top1[79.8%] Top5[94.8%] | | |||
| | Per step time (8p) | 479 ms/step | | |||
| | Total time (8p) | 42h | | |||
| | Params (M) | 180M | | |||
| | Scripts | [Xception script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/Xception) | | |||
| | Scripts | [Xception script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/xception) | | |||
| #### Inference Performance | |||
| @@ -231,4 +232,4 @@ In `dataset.py`, we set the seed inside `create_dataset` function. We also use r | |||
| # [ModelZoo Homepage](#contents) | |||
| Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo). | |||
| Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo). | |||
| @@ -27,13 +27,10 @@ echo "avg_core_per_rank" $avg_core_per_rank | |||
| echo "core_gap" $core_gap | |||
| for((i=0;i<RANK_SIZE;i++)) | |||
| do | |||
| start=`expr $i \* $avg_core_per_rank` | |||
| export DEVICE_ID=$i | |||
| export RANK_ID=$i | |||
| export DEPLOY_MODE=0 | |||
| export GE_USE_STATIC_MEMORY=1 | |||
| end=`expr $start \+ $core_gap` | |||
| cmdopt=$start"-"$end | |||
| rm -rf train_parallel$i | |||
| mkdir ./train_parallel$i | |||
| @@ -42,7 +39,7 @@ do | |||
| echo "start training for rank $i, device $DEVICE_ID" | |||
| env > env.log | |||
| taskset -c $cmdopt python ../train.py \ | |||
| python ../train.py \ | |||
| --is_distributed \ | |||
| --device_target=Ascend \ | |||
| --dataset_path=$DATA_DIR > log.txt 2>&1 & | |||
| @@ -18,8 +18,14 @@ export DEVICE_ID=$1 | |||
| DATA_DIR=$2 | |||
| PATH_CHECKPOINT=$3 | |||
| python ./eval.py \ | |||
| rm -rf eval_output | |||
| mkdir ./eval_output | |||
| cd ./eval_output || exit | |||
| echo "start evaluating model..." | |||
| python ../eval.py \ | |||
| --device_target=Ascend \ | |||
| --device_id=$DEVICE_ID \ | |||
| --checkpoint_path=$PATH_CHECKPOINT \ | |||
| --dataset_path=$DATA_DIR > eval.log 2>&1 & | |||
| cd ../ | |||
| @@ -16,7 +16,13 @@ | |||
| export DEVICE_ID=$1 | |||
| DATA_DIR=$2 | |||
| python ./train.py \ | |||
| rm -rf train_standalone | |||
| mkdir ./train_standalone | |||
| cd ./train_standalone || exit | |||
| echo "start training standalone on device $DEVICE_ID" | |||
| python ../train.py \ | |||
| --device_target=Ascend \ | |||
| --dataset_path=$DATA_DIR > log.txt 2>&1 & | |||
| cd ../ | |||
| @@ -15,15 +15,14 @@ | |||
| """Xception.""" | |||
| import mindspore.nn as nn | |||
| import mindspore.ops.operations as P | |||
| from src.config import config | |||
| class SeparableConv2d(nn.Cell): | |||
| def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0): | |||
| super(SeparableConv2d, self).__init__() | |||
| self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size, stride, group=in_channels, pad_mode='pad', | |||
| padding=padding, weight_init=config.weight_init) | |||
| padding=padding, weight_init='xavier_uniform') | |||
| self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, pad_mode='valid', | |||
| weight_init=config.weight_init) | |||
| weight_init='xavier_uniform') | |||
| def construct(self, x): | |||
| x = self.conv1(x) | |||
| @@ -37,7 +36,7 @@ class Block(nn.Cell): | |||
| if out_filters != in_filters or strides != 1: | |||
| self.skip = nn.Conv2d(in_filters, out_filters, 1, stride=strides, pad_mode='valid', has_bias=False, | |||
| weight_init=config.weight_init) | |||
| weight_init='xavier_uniform') | |||
| self.skipbn = nn.BatchNorm2d(out_filters, momentum=0.9) | |||
| else: | |||
| self.skip = None | |||
| @@ -96,10 +95,10 @@ class Xception(nn.Cell): | |||
| """ | |||
| super(Xception, self).__init__() | |||
| self.num_classes = num_classes | |||
| self.conv1 = nn.Conv2d(3, 32, 3, 2, pad_mode='valid', weight_init=config.weight_init) | |||
| self.conv1 = nn.Conv2d(3, 32, 3, 2, pad_mode='valid', weight_init='xavier_uniform') | |||
| self.bn1 = nn.BatchNorm2d(32, momentum=0.9) | |||
| self.relu = nn.ReLU() | |||
| self.conv2 = nn.Conv2d(32, 64, 3, pad_mode='valid', weight_init=config.weight_init) | |||
| self.conv2 = nn.Conv2d(32, 64, 3, pad_mode='valid', weight_init='xavier_uniform') | |||
| self.bn2 = nn.BatchNorm2d(64, momentum=0.9) | |||
| # Entry flow | |||
| @@ -36,6 +36,5 @@ config = ed({ | |||
| "label_smooth_factor": 0.1, | |||
| "lr_init": 0.00004, | |||
| "lr_max": 0.4, | |||
| "lr_end": 0.00004, | |||
| "weight_init": 'xavier_uniform' | |||
| "lr_end": 0.00004 | |||
| }) | |||
| @@ -17,7 +17,7 @@ import math | |||
| import numpy as np | |||
| def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode): | |||
| def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode, global_step=0): | |||
| """ | |||
| generate learning rate array | |||
| @@ -82,6 +82,6 @@ def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch | |||
| lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps) | |||
| lr_each_step.append(lr) | |||
| lr_each_step = np.array(lr_each_step).astype(np.float32) | |||
| lr_each_step = np.array(lr_each_step[global_step:]).astype(np.float32) | |||
| return lr_each_step | |||
| @@ -59,7 +59,8 @@ if __name__ == '__main__': | |||
| else: | |||
| rank = 0 | |||
| group_size = 1 | |||
| context.set_context(device_id=0) | |||
| if os.getenv('DEVICE_ID', "not_set").isdigit(): | |||
| context.set_context(device_id=int(os.getenv('DEVICE_ID'))) | |||
| # define network | |||
| net = xception(class_num=config.class_num) | |||
| @@ -88,7 +89,8 @@ if __name__ == '__main__': | |||
| warmup_epochs=config.warmup_epochs, | |||
| total_epochs=config.epoch_size, | |||
| steps_per_epoch=step_size, | |||
| lr_decay_mode=config.lr_decay_mode)) | |||
| lr_decay_mode=config.lr_decay_mode, | |||
| global_step=config.finish_epoch * step_size)) | |||
| # define optimization | |||
| opt = Momentum(net.trainable_params(), lr, config.momentum, config.weight_decay, config.loss_scale) | |||
| @@ -100,7 +102,7 @@ if __name__ == '__main__': | |||
| # define callbacks | |||
| cb = [TimeMonitor(), LossMonitor()] | |||
| if config.save_checkpoint: | |||
| save_ckpt_path = os.path.join(config.save_checkpoint_path, 'model_' + str(rank) + '/') | |||
| save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(rank) + '/') | |||
| config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size, | |||
| keep_checkpoint_max=config.keep_checkpoint_max) | |||
| ckpt_cb = ModelCheckpoint(f"Xception-rank{rank}", directory=save_ckpt_path, config=config_ck) | |||