Merge pull request !6534 from linqingke/masstags/v1.0.0
@@ -18,7 +18,7 @@
- [Model Description](#model-description)
- [Performance](#performance)
- [Training accuracy results](#training-accuracy-results)
- [Training performance results](#yraining-performance-results)
- [Training performance results](#training-performance-results)
- [Description of Random Situation](#description-of-random-situation)
- [ModelZoo Homepage](#modelzoo-homepage)
@@ -80,13 +80,13 @@ After installing MindSpore via the official website, you can start training and
```python
# run training example
python train.py --data_dir /PATH/TO/DATASET --is_distributed 0> train.log 2>&1 &
python train.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/PRETRAINED_CKPT --is_distributed 0 > train.log 2>&1 &
# run distributed training example
sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET
sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/PRETRAINED_CKPT
# run evaluation example
python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT> eval.log 2>&1 &
python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT > eval.log 2>&1 &
OR
sh scripts/run_distribute_eval.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/CHECKPOINT
```
@@ -168,7 +168,7 @@ You can modify the training behaviour through the various flags in the `train.py
- running on Ascend
```
python train.py --data_dir /PATH/TO/DATASET --is_distributed 0 > train.log 2>&1 &
python train.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/PRETRAINED_CKPT --is_distributed 0 > train.log 2>&1 &
```
The python command above runs in the background; the log and model checkpoint are generated in `output/202x-xx-xx_time_xx_xx_xx/`. The loss values are reported as follows:
@@ -190,7 +190,7 @@ You can modify the training behaviour through the various flags in the `train.py
- running on Ascend
```
sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET
sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/PRETRAINED_CKPT
```
The above shell script runs distributed training in the background. You can view the log and model checkpoint under `train[X]/output/202x-xx-xx_time_xx_xx_xx/`. The loss values are reported as follows:
@@ -217,7 +217,7 @@ You can modify the training behaviour through the various flags in the `train.py
Run the command below for evaluation.
```
python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT> eval.log 2>&1 &
python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT > eval.log 2>&1 &
OR
sh scripts/run_distribute_eval.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/CHECKPOINT
```
@@ -16,8 +16,8 @@
echo "=============================================================================================================="
| echo "Please run the scipt as: " | |||
| echo "sh scipts/run_distribute_train.sh DEVICE_NUM RANK_TABLE_FILE DATASET" | |||
| echo "for example: sh scipts/run_distribute_train.sh 8 /data/hccl.json /path/to/dataset" | |||
| echo "sh scripts/run_distribute_train.sh DEVICE_NUM RANK_TABLE_FILE DATASET CKPT_FILE" | |||
| echo "for example: sh scripts/run_distribute_train.sh 8 /data/hccl.json /path/to/dataset ckpt_file" | |||
| echo "It is better to use absolute path." | |||
| echo "=================================================================================================================" | |||
@@ -26,6 +26,7 @@ echo "After running the script, the network runs in the background. The log will
export RANK_SIZE=$1
export RANK_TABLE_FILE=$2
DATASET=$3
CKPT_FILE=$4
for((i=0;i<RANK_SIZE;i++))
do
@@ -38,8 +39,12 @@ do
    export RANK_ID=$i
    echo "start training for rank $i, device $DEVICE_ID"
    env > env.log
    python train.py \
        --data_dir=$DATASET > log.txt 2>&1 &
    if [ -f $CKPT_FILE ]
    then
        python train.py --data_dir=$DATASET --pretrained=$CKPT_FILE > log.txt 2>&1 &
    else
        python train.py --data_dir=$DATASET > log.txt 2>&1 &
    fi
    cd ../
done
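The launcher now accepts an optional fourth argument, `CKPT_FILE`, and only appends `--pretrained` when that file actually exists, so the same script covers both from-scratch training and fine-tuning. This diff does not show how `train.py` consumes the flag; a minimal sketch, assuming an argparse option named `--pretrained` and standard MindSpore checkpoint loading:

```python
from mindspore.train.serialization import load_checkpoint, load_param_into_net

# Illustrative only: `args.pretrained` and `network` are assumed names,
# matching the --pretrained flag passed by the launcher above.
if args.pretrained:
    param_dict = load_checkpoint(args.pretrained)   # read the .ckpt file
    load_param_into_net(network, param_dict)        # copy weights into the model
```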
@@ -37,7 +37,7 @@ config = ed({
    "label_smooth_factor": 0.1,
    "log_interval": 100,
    "ckpt_interval": 2000,
    "ckpt_interval": 50000,
    "ckpt_path": 'outputs/',
    "is_save_on_master": 1,
@@ -41,7 +41,6 @@ fi
python ${current_exec_path}/src/generate_hccn_file.py
ulimit -u unlimited
export DEVICE_NUM=4
export RANK_SIZE=4
export RANK_TABLE_FILE=${current_exec_path}/rank_table_4p.json
@@ -30,7 +30,7 @@ config = ed({
    'NECK_OUT_CHANNEL': 256,
    # dataset for train
    "TRAIN_ROOT_DIR": '/autotest/lqk/modelzoo/psenet/ic15/',
    "TRAIN_ROOT_DIR": 'psenet/ic15/',
    "TRAIN_IS_TRANSFORM": True,
    "TRAIN_LONG_SIZE": 640,
    "TRAIN_DATASET_SIZE": 1000,
@@ -43,7 +43,7 @@ config = ed({
    "TRAIN_MODEL_SAVE_PATH": './checkpoints/',
    # dataset for test
    "TEST_ROOT_DIR": '/autotest/lqk/modelzoo/psenet/ic15/',
    "TEST_ROOT_DIR": 'psenet/ic15/',
    "TEST_DATASET_SIZE": 500,
    "TEST_BUFFER_SIZE": 4,
    "TEST_DROP_REMAINDER": False,
@@ -16,6 +16,7 @@
import os
import random
import math
import cv2
import pyclipper
import numpy as np
@@ -298,13 +299,40 @@ def IC15_TEST_Generator():
    yield img, img_resized, img_name
def train_dataset_creator():
class DistributedSampler():
    def __init__(self, dataset, rank, group_size, shuffle=True, seed=0):
        self.dataset = dataset
        self.rank = rank
        self.group_size = group_size
        self.dataset_len = len(self.dataset)
        self.num_samplers = int(math.ceil(self.dataset_len * 1.0 / self.group_size))
        self.total_size = self.num_samplers * self.group_size
        self.shuffle = shuffle
        self.seed = seed
    def __iter__(self):
        if self.shuffle:
            self.seed = (self.seed + 1) & 0xffffffff
            np.random.seed(self.seed)
            indices = np.random.permutation(self.dataset_len).tolist()
        else:
            indices = list(range(self.dataset_len))
        indices += indices[:(self.total_size - len(indices))]
        indices = indices[self.rank::self.group_size]
        return iter(indices)
    def __len__(self):
        return self.num_samplers
def train_dataset_creator(rank, group_size, shuffle=True):
    cv2.setNumThreads(0)
    dataset = TrainDataset()
    ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8)
    #ds = ds.repeat(config.TRAIN_REPEAT_NUM)
    sampler = DistributedSampler(dataset, rank, group_size, shuffle)
    ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8,
                             sampler=sampler)
    ds = ds.repeat(1)
    ds = ds.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER)
    ds = ds.shuffle(buffer_size=config.TRAIN_BUFFER_SIZE)
    return ds
def test_dataset_creator():
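The new `DistributedSampler` shuffles the index list with a per-epoch seed, pads it to a multiple of `group_size`, and then deals indices round-robin with the stride slice `indices[rank::group_size]`, so every device receives an equally sized, non-overlapping shard. A minimal standalone sketch of that partitioning (plain Python, illustrative values only):

```python
import numpy as np

def shard_indices(dataset_len, rank, group_size, seed=0):
    """Return the indices that `rank` would draw from a dataset of `dataset_len` samples."""
    np.random.seed(seed)                                   # same permutation on every rank
    indices = np.random.permutation(dataset_len).tolist()
    num_samples = int(np.ceil(dataset_len / group_size))   # per-rank sample count
    total_size = num_samples * group_size
    indices += indices[:total_size - len(indices)]         # pad so shards are equal-sized
    return indices[rank::group_size]                       # round-robin shard for this rank

# 10 samples across 4 ranks: each rank receives 3 indices (2 padded duplicates overall).
for r in range(4):
    print(r, shard_indices(10, r, 4))
```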
@@ -54,7 +54,7 @@ def train():
    rank_id = get_rank()
    # dataset/network/criterion/optim
    ds = train_dataset_creator()
    ds = train_dataset_creator(args.device_id, args.device_num)
    step_size = ds.get_dataset_size()
    print('Create dataset done!')
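`train_dataset_creator` now requires the shard index and shard count, taken here from the launch arguments (`args.device_id`, `args.device_num`). An equivalent way to obtain the same two values, shown only as an illustration (this PR uses the launch arguments instead), is to query the communication layer:

```python
from mindspore.communication.management import init, get_rank, get_group_size

# Illustrative alternative: derive the shard parameters from the collective
# communication layer rather than from the command-line arguments.
init()
ds = train_dataset_creator(get_rank(), get_group_size())
```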
@@ -25,7 +25,7 @@ from mindspore.nn import Momentum
from mindspore.nn.optim import Adam, Lamb
from mindspore.train.model import Model
from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor
from mindspore import context, Parameter
from mindspore.context import ParallelMode
from mindspore.communication import management as MultiAscend
@@ -216,11 +216,13 @@ def _build_training_pipeline(config: TransformerConfig,
                                 scale_update_cell=scale_manager.get_update_cell())
    net_with_grads.set_train(True)
    model = Model(net_with_grads)
    time_cb = TimeMonitor(data_size=dataset.get_dataset_size())
    ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps,
                                   keep_checkpoint_max=config.keep_ckpt_max)
    rank_size = os.getenv('RANK_SIZE')
    callbacks = []
    callbacks.append(time_cb)
    if rank_size is not None and int(rank_size) > 1:
        loss_monitor = LossCallBack(config, rank_id=MultiAscend.get_rank())
        callbacks.append(loss_monitor)
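With `TimeMonitor` prepended to the callback list, per-step and per-epoch timing is reported alongside the loss and checkpoint callbacks. A sketch of how the assembled list is typically handed to `Model.train`; the epoch count and sink-mode flag below are placeholders, not values from this PR:

```python
# Placeholder hand-off of the callbacks assembled above; `config.epochs`
# is an assumed name, not a field shown in this diff.
model.train(config.epochs,
            dataset,
            callbacks=callbacks,          # [TimeMonitor, LossCallBack, ModelCheckpoint, ...]
            dataset_sink_mode=True)
```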