@@ -65,32 +65,34 @@ Parameters for both training and evaluation can be set in src/config.py.
 ## Running the example
-### Train
+### Running on Ascend
-#### Usage
+#### Train
+##### Usage
 ```
 # distributed training
-Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 # standalone training
-Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+Usage: bash run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 ```
-#### Launch
+##### Launch
 ```
 # distribute training example
-./run_distribute_train.sh rank_table.json ~/dataset_path
+bash run_distribute_train.sh rank_table.json ~/cifar-10-batches-bin
 # standalone training example
-./run_standalone_train.sh ~/dataset_path
+bash run_standalone_train.sh ~/cifar-10-batches-bin
 ```
-> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
+> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/zh-CN/master/advanced_use/distributed_training_ascend.html).
-#### Result
+##### Result
 Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log.
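The rank_table.json passed to run_distribute_train.sh describes which Ascend devices take part in the job. As a rough orientation, the sketch below writes a minimal single-server table. The field names follow the v1.0 rank-table layout as best recalled here, and every literal value (server_id, the device_ip addresses) is a placeholder that must be replaced with your host's real NIC configuration; the linked tutorial remains the authoritative reference.

```python
# Hedged sketch: generate a minimal single-server rank_table.json.
# All addresses below are placeholders, not working values.
import json

def write_rank_table(path="rank_table.json", device_num=8):
    devices = [{"device_id": str(i), "device_ip": f"192.98.92.{i + 1}", "rank_id": str(i)}
               for i in range(device_num)]
    table = {
        "version": "1.0",
        "server_count": "1",
        "server_list": [{"server_id": "10.155.111.140",  # placeholder host address
                         "device": devices,
                         "host_nic_ip": "reserve"}],
        "status": "completed",
    }
    with open(path, "w") as f:
        json.dump(table, f, indent=4)

write_rank_table()
```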
@@ -103,20 +105,20 @@ epoch: 1 step: 603, loss is 2.305666
 ...
 ```
-### Evaluation
+#### Evaluation
-#### Usage
+##### Usage
 ```
 # evaluation
-Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
+Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ```
-#### Launch
+##### Launch
 ```
 # evaluation example
-./run_eval.sh ~/cifar-10-batches-bin ~/alexnet/train/alexnet-1.591.ckpt
+bash run_eval.sh ~/cifar-10-verify-bin ~/resnet50/train/alexnet-1.591.ckpt
 ```
 > checkpoint can be produced in training process.
@@ -125,11 +127,11 @@ Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ### Running on GPU
 ```
 # distributed training example
-./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+bash run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 # standalone training example
-./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+bash run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 # infer example
-./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
+bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ```
@@ -57,14 +57,13 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=$DEVICE_NUM
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
+export RANK_TABLE_FILE=$PATH1
 start_id=0
-for((i=start_id; i<DEVICE_NUM + start_id; i++))
+for((i=0; i<${DEVICE_NUM}; i++))
 do
-    export DEVICE_ID=$i
-    export RANK_ID=$((i - start_id))
+    export DEVICE_ID=$((i + start_id))
+    export RANK_ID=$i
     rm -rf ./train_parallel$i
     mkdir ./train_parallel$i
     cp ../*.py ./train_parallel$i
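The loop rewrite above does not change the exported values: for any given start_id, both versions export the same DEVICE_ID and RANK_ID pair. What does change is the working-directory suffix, which now tracks the rank rather than the device index. A small sketch makes the mapping explicit (start_id=2 is an arbitrary illustrative offset; the script defaults it to 0, where the two schemes coincide exactly):

```python
# Sketch: the env and working directory each loop iteration produces,
# before and after the rewrite. start_id=2 is only for illustration.
DEVICE_NUM = 8
start_id = 2

for rank in range(DEVICE_NUM):
    i_old = rank + start_id  # the old loop variable ran from start_id upward
    old = {"DEVICE_ID": i_old, "RANK_ID": i_old - start_id, "dir": f"train_parallel{i_old}"}
    new = {"DEVICE_ID": rank + start_id, "RANK_ID": rank, "dir": f"train_parallel{rank}"}
    # DEVICE_ID and RANK_ID are identical; only the directory suffix differs.
    assert old["DEVICE_ID"] == new["DEVICE_ID"] and old["RANK_ID"] == new["RANK_ID"]
    print(old, new)
```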
@@ -19,9 +19,7 @@ network config setting, will be used in train.py
 from easydict import EasyDict as edict
 cfg = edict({
-{% if dataset=='MNIST' %}
-    'num_classes': 10,
-{% elif dataset=='Cifar10' %}
+{% if dataset=='Cifar10' %}
     'num_classes': 10,
 {% elif dataset=='ImageNet' %}
     'num_classes': 1001,
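These config files are Jinja templates expanded by the model generator, so dropping the MNIST branch means this template can no longer emit an MNIST config. A minimal sketch of the rendering step, with the template text abbreviated from the hunk above (the real generator's invocation may differ):

```python
# Minimal sketch: render the conditional num_classes block for one dataset.
# Requires the jinja2 package; template text abbreviated from the diff above.
from jinja2 import Template

snippet = Template(
    "cfg = edict({\n"
    "{% if dataset=='Cifar10' %}"
    "    'num_classes': 10,\n"
    "{% elif dataset=='ImageNet' %}"
    "    'num_classes': 1001,\n"
    "{% endif %}"
    "})\n"
)
print(snippet.render(dataset="Cifar10"))
# cfg = edict({
#     'num_classes': 10,
# })
```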
@@ -50,32 +50,34 @@ Parameters for both training and evaluation can be set in src/config.py.
 ## Running the example
-### Train
+### Running on Ascend
-#### Usage
+#### Train
+##### Usage
 ```
 # distributed training
-Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 # standalone training
-Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+Usage: bash run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 ```
-#### Launch
+##### Launch
 ```
 # distribute training example
-./run_distribute_train.sh rank_table.json ~/MNIST_data
+bash run_distribute_train.sh rank_table.json ~/MNIST_data
 # standalone training example
-./run_standalone_train.sh ~/MNIST_data
+bash run_standalone_train.sh ~/MNIST_data
 ```
-> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
+> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/zh-CN/master/advanced_use/distributed_training_ascend.html).
-#### Result
+##### Result
 Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log.
@@ -88,20 +90,20 @@ epoch: 1 step: 603, loss is 2.305666
 ...
 ```
-### Evaluation
+#### Evaluation
-#### Usage
+##### Usage
 ```
 # evaluation
-Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
+Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ```
-#### Launch
+##### Launch
 ```
 # evaluation example
-./run_eval.sh ~/MNIST_data ~/lenet/train_parallel0/ckpt_0/checkpoint_lenet-2_937.ckpt
+bash run_eval.sh ~/MNIST_data ~/lenet/train_parallel0/ckpt_0/checkpoint_lenet-2_937.ckpt
 ```
 > checkpoint can be produced in training process.
@@ -110,11 +112,11 @@ Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ### Running on GPU
 ```
 # distributed training example
-./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+bash run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 # standalone training example
-./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+bash run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 # infer example
-./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
+bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ```
@@ -60,13 +60,11 @@ export DEVICE_NUM=8
 export RANK_SIZE=$DEVICE_NUM
 export RANK_TABLE_FILE=$PATH1
 export SERVER_ID=0
-rank_start=$((DEVICE_NUM * SERVER_ID))
-for((i=0; i<DEVICE_NUM; i++))
+start_id=0
+for((i=0; i<${DEVICE_NUM}; i++))
 do
-    export DEVICE_ID=$i
-    export RANK_ID=$((rank_start + i))
+    export DEVICE_ID=$((i + start_id))
+    export RANK_ID=$i
     rm -rf ./train_parallel$i
     mkdir ./train_parallel$i
     cp ../*.py ./train_parallel$i
@@ -66,32 +66,34 @@ Parameters for both training and evaluation can be set in src/config.py.
 ## Running the example
-### Train
+### Running on Ascend
-#### Usage
+#### Train
+##### Usage
 ```
 # distributed training
-Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 # standalone training
-Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+Usage: bash run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 ```
-#### Launch
+##### Launch
 ```
 # distribute training example
-./run_distribute_train.sh rank_table.json ~/dataset_path
+bash run_distribute_train.sh rank_table.json ~/cifar-10-batches-bin
 # standalone training example
-./run_standalone_train.sh ~/dataset_path
+bash run_standalone_train.sh ~/cifar-10-batches-bin
 ```
-> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
+> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/zh-CN/master/advanced_use/distributed_training_ascend.html).
-#### Result
+##### Result
 Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log.
@@ -104,20 +106,20 @@ epoch: 1 step: 603, loss is 2.305666
 ...
 ```
-### Evaluation
+#### Evaluation
-#### Usage
+##### Usage
 ```
 # evaluation
-Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
+Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ```
-#### Launch
+##### Launch
 ```
 # evaluation example
-./run_eval.sh ~/cifar-10-batches-bin ~/resnet50/train/alexnet-1.591.ckpt
+bash run_eval.sh ~/cifar-10-verify-bin ~/resnet50/train/alexnet-1.591.ckpt
 ```
 > checkpoint can be produced in training process.
@@ -126,11 +128,11 @@ Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ### Running on GPU
 ```
 # distributed training example
-./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+bash run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 # standalone training example
-./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+bash run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 # infer example
-./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
+bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ```
@@ -22,7 +22,6 @@ from mindspore import context
 from mindspore import dataset as de
 from mindspore.train.model import Model
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
-from src.crossentropy import CrossEntropy
 parser = argparse.ArgumentParser(description='Image classification')
@@ -67,9 +66,14 @@ if __name__ == '__main__':
     # define loss, model
 {% if dataset=='ImageNet' %}
+{% if loss=='SoftmaxCrossEntropyWithLogits' %}
     if not cfg.use_label_smooth:
         cfg.label_smooth_factor = 0.0
-    loss = CrossEntropy(smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
+    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean',
+                                            smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
+{% elif loss=='SoftmaxCrossEntropyExpand' %}
+    loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
+{% endif %}
 {% else %}
 {% if loss=='SoftmaxCrossEntropyWithLogits' %}
     loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
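The deleted src/crossentropy.py presumably held a hand-rolled label-smoothing cross entropy, which this change folds into nn.SoftmaxCrossEntropyWithLogits via its smooth_factor and num_classes arguments. For reference, a framework-free NumPy sketch of what label-smoothed cross entropy computes (assumed shapes: logits [N, C], integer labels [N]):

```python
# Reference sketch of label-smoothed cross entropy in NumPy; this mirrors
# what a smooth_factor argument typically computes, independent of MindSpore.
import numpy as np

def smoothed_cross_entropy(logits, labels, smooth_factor=0.1):
    n, c = logits.shape
    # Smoothed one-hot targets: 1 - smooth_factor on the true class,
    # smooth_factor / (c - 1) spread over the remaining classes.
    on, off = 1.0 - smooth_factor, smooth_factor / (c - 1)
    targets = np.full((n, c), off)
    targets[np.arange(n), labels] = on
    # log-softmax with the usual max shift for numerical stability
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -(targets * log_probs).sum(axis=1).mean()

loss = smoothed_cross_entropy(np.random.randn(4, 10), np.array([0, 1, 2, 3]))
```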
@@ -57,14 +57,13 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=$DEVICE_NUM
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
+export RANK_TABLE_FILE=$PATH1
 start_id=0
-for((i=start_id; i<DEVICE_NUM + start_id; i++))
+for((i=0; i<${DEVICE_NUM}; i++))
 do
-    export DEVICE_ID=$i
-    export RANK_ID=$((i - start_id))
+    export DEVICE_ID=$((i + start_id))
+    export RANK_ID=$i
     rm -rf ./train_parallel$i
     mkdir ./train_parallel$i
     cp ../*.py ./train_parallel$i
@@ -72,5 +72,5 @@ if [ $# == 2 ]
 then
     mpirun --allow-run-as-root -n $RANK_SIZE \
     python train.py --run_distribute=True \
-    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH3 &> log &
+    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
 fi
@@ -18,9 +18,7 @@ network config setting, will be used in train.py and eval.py
 from easydict import EasyDict as ed
 cfg = ed({
-{% if dataset=='MNIST' %}
-    'num_classes': 10,
-{% elif dataset=='Cifar10' %}
+{% if dataset=='Cifar10' %}
     'num_classes': 10,
 {% elif dataset=='ImageNet' %}
     'num_classes': 1001,
@@ -48,7 +46,7 @@ cfg = ed({
 {% if dataset=='ImageNet' %}
     "warmup_epochs": 0,
     "lr_decay_mode": "cosine",
-{% else %}
+{% elif dataset=='Cifar10' %}
     "warmup_epochs": 5,
     "lr_decay_mode": "poly",
 {% endif %}
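warmup_epochs and lr_decay_mode are consumed by src/lr_generator.py; "cosine" and "poly" name the decay curve applied after warmup. A rough sketch of the two shapes, with a linear warmup prepended (the generator's exact formulas, such as the poly power and the end learning rate, may differ):

```python
# Rough sketch of the two decay curves named in the config. The real
# src/lr_generator.py may use a different poly power or end LR; this
# only illustrates the shapes.
import math

def lr_at_step(step, total_steps, warmup_steps, base_lr, mode):
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps  # linear warmup
    t = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    if mode == "cosine":
        return base_lr * 0.5 * (1 + math.cos(math.pi * t))
    if mode == "poly":
        return base_lr * (1 - t) ** 2  # the power 2 is an assumption
    raise ValueError(mode)

schedule = [lr_at_step(s, 100, 10, 0.1, "cosine") for s in range(100)]
```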
@@ -29,7 +29,6 @@ from mindspore.communication.management import init, get_rank, get_group_size
 import mindspore.nn as nn
 import mindspore.common.initializer as weight_init
 from src.lr_generator import get_lr, warmup_cosine_annealing_lr
-from src.crossentropy import CrossEntropy
 parser = argparse.ArgumentParser(description='Image classification')
 parser.add_argument('--run_distribute', type=bool, default=False, help='Run distribute')
@@ -122,9 +121,14 @@ if __name__ == '__main__':
     # define loss, model
     if target == "Ascend":
 {% if dataset=='ImageNet' %}
+{% if loss=='SoftmaxCrossEntropyWithLogits' %}
         if not cfg.use_label_smooth:
             cfg.label_smooth_factor = 0.0
-        loss = CrossEntropy(smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
+        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean',
+                                                smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
+{% elif loss=='SoftmaxCrossEntropyExpand' %}
+        loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
+{% endif %}
 {% else %}
 {% if loss=='SoftmaxCrossEntropyWithLogits' %}
         loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
@@ -137,6 +141,16 @@ if __name__ == '__main__':
                       amp_level="O2", keep_batchnorm_fp32=False)
     else:
         # GPU target
+{% if dataset=='ImageNet' %}
+{% if loss=='SoftmaxCrossEntropyWithLogits' %}
+        if not cfg.use_label_smooth:
+            cfg.label_smooth_factor = 0.0
+        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, is_grad=False, reduction='mean',
+                                                smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
+{% elif loss=='SoftmaxCrossEntropyExpand' %}
+        loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
+{% endif %}
+{% else %}
 {% if loss=='SoftmaxCrossEntropyWithLogits' %}
         loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, is_grad=False, reduction='mean')
 {% elif loss=='SoftmaxCrossEntropyExpand' %}
@@ -147,6 +161,7 @@ if __name__ == '__main__':
 {% else %}
         opt = nn.{{optimizer}}(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr)
 {% endif %}
+{% endif %}
         model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
     # define callbacks
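The added {% endif %} rebalances the dataset-level conditional opened in the GPU branch; the Model call itself is unchanged. For context, a minimal self-contained sketch of the same wiring with a toy network (the layers and shapes are hypothetical; the API names match the imports these scripts already use):

```python
# Minimal sketch of tying a net, loss, and optimizer together with Model.
# Toy network and shapes are hypothetical, not the template's real model.
import mindspore.nn as nn
from mindspore.train.model import Model

net = nn.SequentialCell([nn.Flatten(), nn.Dense(32 * 32 * 3, 10)])
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
# model.train(epoch, train_dataset, callbacks=...) would then start training.
```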