Merge pull request !6534 from linqingke/masstags/v1.0.0
| @@ -18,7 +18,7 @@ | |||||
| - [Model Description](#model-description) | - [Model Description](#model-description) | ||||
| - [Performance](#performance) | - [Performance](#performance) | ||||
| - [Training accuracy results](#training-accuracy-results) | - [Training accuracy results](#training-accuracy-results) | ||||
| - [Training performance results](#yraining-performance-results) | |||||
| - [Training performance results](#training-performance-results) | |||||
| - [Description of Random Situation](#description-of-random-situation) | - [Description of Random Situation](#description-of-random-situation) | ||||
| - [ModelZoo Homepage](#modelzoo-homepage) | - [ModelZoo Homepage](#modelzoo-homepage) | ||||
| @@ -80,13 +80,13 @@ After installing MindSpore via the official website, you can start training and | |||||
| ```python | ```python | ||||
| # run training example | # run training example | ||||
| python train.py --data_dir /PATH/TO/DATASET --is_distributed 0> train.log 2>&1 & | |||||
| python train.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/PRETRAINED_CKPT --is_distributed 0 > train.log 2>&1 & | |||||
| # run distributed training example | # run distributed training example | ||||
| sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET | |||||
| sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/PRETRAINED_CKPT | |||||
| # run evaluation example | # run evaluation example | ||||
| python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT> eval.log 2>&1 & | |||||
| python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT > eval.log 2>&1 & | |||||
| OR | OR | ||||
| sh scripts/run_distribute_eval.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/CHECKPOINT | sh scripts/run_distribute_eval.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/CHECKPOINT | ||||
| ``` | ``` | ||||
| @@ -168,7 +168,7 @@ You can modify the training behaviour through the various flags in the `train.py | |||||
| - running on Ascend | - running on Ascend | ||||
| ``` | ``` | ||||
| python train.py --data_dir /PATH/TO/DATASET --is_distributed 0 > train.log 2>&1 & | |||||
| python train.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/PRETRAINED_CKPT --is_distributed 0 > train.log 2>&1 & | |||||
| ``` | ``` | ||||
| The python command above will run in the background. The log and model checkpoint will be generated in `output/202x-xx-xx_time_xx_xx_xx/`. The loss values will be reported as follows: | The python command above will run in the background. The log and model checkpoint will be generated in `output/202x-xx-xx_time_xx_xx_xx/`. The loss values will be reported as follows: | ||||
| @@ -190,7 +190,7 @@ You can modify the training behaviour through the various flags in the `train.py | |||||
| - running on Ascend | - running on Ascend | ||||
| ``` | ``` | ||||
| sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET | |||||
| sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/PRETRAINED_CKPT | |||||
| ``` | ``` | ||||
| The above shell script will run distributed training in the background. You can view the log and model checkpoints under `train[X]/output/202x-xx-xx_time_xx_xx_xx/`. The loss values will be reported as follows: | The above shell script will run distributed training in the background. You can view the log and model checkpoints under `train[X]/output/202x-xx-xx_time_xx_xx_xx/`. The loss values will be reported as follows: | ||||
| @@ -217,7 +217,7 @@ You can modify the training behaviour through the various flags in the `train.py | |||||
| running the command below for evaluation. | running the command below for evaluation. | ||||
| ``` | ``` | ||||
| python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT> eval.log 2>&1 & | |||||
| python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT > eval.log 2>&1 & | |||||
| OR | OR | ||||
| sh scripts/run_distribute_eval.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/CHECKPOINT | sh scripts/run_distribute_eval.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/CHECKPOINT | ||||
| ``` | ``` | ||||
| @@ -16,8 +16,8 @@ | |||||
| echo "==============================================================================================================" | echo "==============================================================================================================" | ||||
| echo "Please run the scipt as: " | echo "Please run the scipt as: " | ||||
| echo "sh scipts/run_distribute_train.sh DEVICE_NUM RANK_TABLE_FILE DATASET" | |||||
| echo "for example: sh scipts/run_distribute_train.sh 8 /data/hccl.json /path/to/dataset" | |||||
| echo "sh scripts/run_distribute_train.sh DEVICE_NUM RANK_TABLE_FILE DATASET CKPT_FILE" | |||||
| echo "for example: sh scripts/run_distribute_train.sh 8 /data/hccl.json /path/to/dataset ckpt_file" | |||||
| echo "It is better to use absolute path." | echo "It is better to use absolute path." | ||||
| echo "=================================================================================================================" | echo "=================================================================================================================" | ||||
| @@ -26,6 +26,7 @@ echo "After running the scipt, the network runs in the background. The log will | |||||
| export RANK_SIZE=$1 | export RANK_SIZE=$1 | ||||
| export RANK_TABLE_FILE=$2 | export RANK_TABLE_FILE=$2 | ||||
| DATASET=$3 | DATASET=$3 | ||||
| CKPT_FILE=$4 | |||||
| for((i=0;i<RANK_SIZE;i++)) | for((i=0;i<RANK_SIZE;i++)) | ||||
| do | do | ||||
| @@ -38,8 +39,12 @@ do | |||||
| export RANK_ID=$i | export RANK_ID=$i | ||||
| echo "start training for rank $i, device $DEVICE_ID" | echo "start training for rank $i, device $DEVICE_ID" | ||||
| env > env.log | env > env.log | ||||
| python train.py \ | |||||
| --data_dir=$DATASET > log.txt 2>&1 & | |||||
# Forward --pretrained only when CKPT_FILE names an existing regular file.
# The expansions are quoted on purpose: with an empty or unset CKPT_FILE the
# unquoted form collapses to `[ -f ]`, a one-argument test that is always
# true, which would launch training with an empty --pretrained= value.
if [ -f "$CKPT_FILE" ]
then
    python train.py --data_dir="$DATASET" --pretrained="$CKPT_FILE" > log.txt 2>&1 &
else
    python train.py --data_dir="$DATASET" > log.txt 2>&1 &
fi
| cd ../ | cd ../ | ||||
| done | done | ||||
| @@ -37,7 +37,7 @@ config = ed({ | |||||
| "label_smooth_factor": 0.1, | "label_smooth_factor": 0.1, | ||||
| "log_interval": 100, | "log_interval": 100, | ||||
| "ckpt_interval": 2000, | |||||
| "ckpt_interval": 50000, | |||||
| "ckpt_path": 'outputs/', | "ckpt_path": 'outputs/', | ||||
| "is_save_on_master": 1, | "is_save_on_master": 1, | ||||
| @@ -41,7 +41,6 @@ fi | |||||
| python ${current_exec_path}/src/generate_hccn_file.py | python ${current_exec_path}/src/generate_hccn_file.py | ||||
| ulimit -u unlimited | |||||
| export DEVICE_NUM=4 | export DEVICE_NUM=4 | ||||
| export RANK_SIZE=4 | export RANK_SIZE=4 | ||||
| export RANK_TABLE_FILE=${current_exec_path}/rank_table_4p.json | export RANK_TABLE_FILE=${current_exec_path}/rank_table_4p.json | ||||
| @@ -30,7 +30,7 @@ config = ed({ | |||||
| 'NECK_OUT_CHANNEL': 256, | 'NECK_OUT_CHANNEL': 256, | ||||
| # dataset for train | # dataset for train | ||||
| "TRAIN_ROOT_DIR": '/autotest/lqk/modelzoo/psenet/ic15/', | |||||
| "TRAIN_ROOT_DIR": 'psenet/ic15/', | |||||
| "TRAIN_IS_TRANSFORM": True, | "TRAIN_IS_TRANSFORM": True, | ||||
| "TRAIN_LONG_SIZE": 640, | "TRAIN_LONG_SIZE": 640, | ||||
| "TRAIN_DATASET_SIZE": 1000, | "TRAIN_DATASET_SIZE": 1000, | ||||
| @@ -43,7 +43,7 @@ config = ed({ | |||||
| "TRAIN_MODEL_SAVE_PATH": './checkpoints/', | "TRAIN_MODEL_SAVE_PATH": './checkpoints/', | ||||
| # dataset for test | # dataset for test | ||||
| "TEST_ROOT_DIR": '/autotest/lqk/modelzoo/psenet/ic15/', | |||||
| "TEST_ROOT_DIR": 'psenet/ic15/', | |||||
| "TEST_DATASET_SIZE": 500, | "TEST_DATASET_SIZE": 500, | ||||
| "TEST_BUFFER_SIZE": 4, | "TEST_BUFFER_SIZE": 4, | ||||
| "TEST_DROP_REMAINDER": False, | "TEST_DROP_REMAINDER": False, | ||||
| @@ -16,6 +16,7 @@ | |||||
| import os | import os | ||||
| import random | import random | ||||
| import math | |||||
| import cv2 | import cv2 | ||||
| import pyclipper | import pyclipper | ||||
| import numpy as np | import numpy as np | ||||
| @@ -298,13 +299,40 @@ def IC15_TEST_Generator(): | |||||
| yield img, img_resized, img_name | yield img, img_resized, img_name | ||||
| def train_dataset_creator(): | |||||
class DistributedSampler():
    """Index sampler that hands each rank a strided slice of the dataset.

    The index list is padded (by repeating indices from its start) up to a
    multiple of ``group_size`` so that every rank yields exactly
    ``len(sampler)`` indices.

    Args:
        dataset: Any object supporting ``len()``; only its length is used here.
        rank: Position of this worker in the group, used as the slice offset.
        group_size: Number of workers sharing the dataset, used as the stride.
        shuffle: When True, a fresh permutation is drawn on every iteration.
        seed: Base seed; it is incremented before each shuffled iteration.
    """

    def __init__(self, dataset, rank, group_size, shuffle=True, seed=0):
        self.dataset = dataset
        self.rank = rank
        self.group_size = group_size
        self.dataset_len = len(self.dataset)
        # Per-rank sample count, rounded up so no sample is dropped.
        self.num_samplers = int(math.ceil(self.dataset_len * 1.0 / self.group_size))
        self.total_size = self.num_samplers * self.group_size
        self.shuffle = shuffle
        self.seed = seed

    def __iter__(self):
        """Return an iterator over this rank's indices for one pass."""
        if self.shuffle:
            # Advance the seed on every pass; samplers built with the same
            # seed therefore draw the same permutation on the same pass.
            self.seed = (self.seed + 1) & 0xffffffff
            # NOTE: this reseeds NumPy's global RNG, as the original code did.
            np.random.seed(self.seed)
            indices = np.random.permutation(self.dataset_len).tolist()
        else:
            # dataset_len is already an int; calling len() on it raised
            # TypeError in the original non-shuffle path.
            indices = list(range(self.dataset_len))

        # Pad up to total_size by cycling from the start. Cycling (rather than
        # a single slice) keeps the padding complete even when the dataset is
        # smaller than the number of missing entries.
        shortfall = self.total_size - len(indices)
        if shortfall > 0:
            repeats = -(-shortfall // len(indices))
            indices += (indices * repeats)[:shortfall]

        return iter(indices[self.rank::self.group_size])

    def __len__(self):
        """Return the number of indices each rank yields per pass."""
        return self.num_samplers
| def train_dataset_creator(rank, group_size, shuffle=True): | |||||
| cv2.setNumThreads(0) | cv2.setNumThreads(0) | ||||
| dataset = TrainDataset() | dataset = TrainDataset() | ||||
| ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8) | |||||
| #ds = ds.repeat(config.TRAIN_REPEAT_NUM) | |||||
| sampler = DistributedSampler(dataset, rank, group_size, shuffle) | |||||
| ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8, | |||||
| sampler=sampler) | |||||
| ds = ds.repeat(1) | |||||
| ds = ds.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER) | ds = ds.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER) | ||||
| ds = ds.shuffle(buffer_size=config.TRAIN_BUFFER_SIZE) | |||||
| return ds | return ds | ||||
| def test_dataset_creator(): | def test_dataset_creator(): | ||||
| @@ -54,7 +54,7 @@ def train(): | |||||
| rank_id = get_rank() | rank_id = get_rank() | ||||
| # dataset/network/criterion/optim | # dataset/network/criterion/optim | ||||
| ds = train_dataset_creator() | |||||
| ds = train_dataset_creator(args.device_id, args.device_num) | |||||
| step_size = ds.get_dataset_size() | step_size = ds.get_dataset_size() | ||||
| print('Create dataset done!') | print('Create dataset done!') | ||||
| @@ -25,7 +25,7 @@ from mindspore.nn import Momentum | |||||
| from mindspore.nn.optim import Adam, Lamb | from mindspore.nn.optim import Adam, Lamb | ||||
| from mindspore.train.model import Model | from mindspore.train.model import Model | ||||
| from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager | from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager | ||||
| from mindspore.train.callback import CheckpointConfig, ModelCheckpoint | |||||
| from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor | |||||
| from mindspore import context, Parameter | from mindspore import context, Parameter | ||||
| from mindspore.context import ParallelMode | from mindspore.context import ParallelMode | ||||
| from mindspore.communication import management as MultiAscend | from mindspore.communication import management as MultiAscend | ||||
| @@ -216,11 +216,13 @@ def _build_training_pipeline(config: TransformerConfig, | |||||
| scale_update_cell=scale_manager.get_update_cell()) | scale_update_cell=scale_manager.get_update_cell()) | ||||
| net_with_grads.set_train(True) | net_with_grads.set_train(True) | ||||
| model = Model(net_with_grads) | model = Model(net_with_grads) | ||||
| time_cb = TimeMonitor(data_size=dataset.get_dataset_size()) | |||||
| ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps, | ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps, | ||||
| keep_checkpoint_max=config.keep_ckpt_max) | keep_checkpoint_max=config.keep_ckpt_max) | ||||
| rank_size = os.getenv('RANK_SIZE') | rank_size = os.getenv('RANK_SIZE') | ||||
| callbacks = [] | callbacks = [] | ||||
| callbacks.append(time_cb) | |||||
| if rank_size is not None and int(rank_size) > 1: | if rank_size is not None and int(rank_size) > 1: | ||||
| loss_monitor = LossCallBack(config, rank_id=MultiAscend.get_rank()) | loss_monitor = LossCallBack(config, rank_id=MultiAscend.get_rank()) | ||||
| callbacks.append(loss_monitor) | callbacks.append(loss_monitor) | ||||