1. Fix the random seed for the trainer and initialize it on the first line of __init__.
2. Add a regression test for fixed-seed training.
3. Change the dataset id 'dureader_robust_qg' to 'DuReader_robust-QG'.
4. Change some datasets from Hugging Face datasets.load_dataset to MsDataset.load.
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10029509
Target branch: master
@@ -75,6 +75,7 @@ class EpochBasedTrainer(BaseTrainer):
             this preprocessing action will be executed every time the dataset's __getitem__ is called.
         optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler]`, *optional*): A tuple
             containing the optimizer and the scheduler to use.
+        seed (int): The optional random seed for torch, cuda, numpy and random.
         max_epochs: (int, optional): Total training epochs.
     """
@@ -93,8 +94,11 @@ class EpochBasedTrainer(BaseTrainer):
                  torch.optim.lr_scheduler._LRScheduler] = (None,
                                                            None),
                  model_revision: Optional[str] = DEFAULT_MODEL_REVISION,
+                 seed: int = 42,
                  **kwargs):
+        self._seed = seed
+        set_random_seed(self._seed)
         if isinstance(model, str):
             if os.path.exists(model):
                 self.model_dir = model if os.path.isdir(
@@ -213,9 +217,6 @@ class EpochBasedTrainer(BaseTrainer):
         self.use_fp16 = kwargs.get('use_fp16', False)
 
-        # TODO @wenmeng.zwm add seed init fn
-        self._seed = 0
-
         if kwargs.get('launcher', None) is not None:
             init_dist(kwargs['launcher'])
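For reference, a minimal sketch of how the new argument is meant to be used when building a trainer. The model id, trainer name and seed value are copied from the regression test further down; the work_dir and the Trainers import path are illustrative assumptions, not part of this diff:

    from modelscope.metainfo import Trainers
    from modelscope.trainers import build_trainer

    # seed is forwarded to EpochBasedTrainer, which now calls
    # set_random_seed(seed) on the first line of __init__, before the
    # model and dataloaders are created.
    kwargs = dict(
        model='damo/nlp_structbert_backbone_base_std',  # from the test below
        work_dir='./tmp_work_dir',                      # illustrative path
        seed=42)
    trainer = build_trainer(name=Trainers.nlp_base_trainer, default_args=kwargs)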
@@ -133,6 +133,7 @@ class RegressTool:
                         compare_fn=None,
                         ignore_keys=None,
                         compare_random=True,
+                        reset_dropout=True,
                         lazy_stop_callback=None):
        """Monitor a pytorch module's backward data and cfg data within a step of the optimizer.
 
@@ -151,6 +152,7 @@ class RegressTool:
        @param compare_fn: A custom fn used to compare the results manually.
        @param ignore_keys: The keys to ignore of the named_parameters.
        @param compare_random: If to compare random setttings, default True.
+       @param reset_dropout: Reset all dropout modules to 0.0.
        @param lazy_stop_callback: A callback passed in, when the moniting is over, this callback will be called.
 
        >>> def compare_fn(v1, v2, key, type):
@@ -202,6 +204,18 @@ class RegressTool:
            trainer,
            '_seed') else trainer.seed if hasattr(trainer, 'seed') else None
 
+        if reset_dropout:
+            with torch.no_grad():
+
+                def reinit_dropout(_module):
+                    for name, submodule in _module.named_children():
+                        if isinstance(submodule, torch.nn.Dropout):
+                            setattr(_module, name, torch.nn.Dropout(0.))
+                        else:
+                            reinit_dropout(submodule)
+
+                reinit_dropout(module)
+
        if level == 'strict':
            hack_forward(module, file_name, io_json)
        intercept_module(module, io_json)
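Resetting dropout matters because the regression tool compares forward/backward tensors across runs: with a non-zero dropout probability the output depends on the RNG state at every call, so any difference in how random numbers are consumed shows up as a spurious regression; zeroing dropout removes that source of noise. A standalone sketch of the same recursion on a toy module (not the library code):

    import torch

    net = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Dropout(0.5))

    def reinit_dropout(module):
        # Recursively replace every Dropout child with Dropout(0.), as the patch does.
        for name, child in module.named_children():
            if isinstance(child, torch.nn.Dropout):
                setattr(module, name, torch.nn.Dropout(0.))
            else:
                reinit_dropout(child)

    reinit_dropout(net)
    net.train()
    x = torch.randn(2, 4)
    assert torch.equal(net(x), net(x))  # deterministic once dropout is disabled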
@@ -186,6 +186,7 @@ def set_random_seed(seed):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
+       torch.cuda.manual_seed_all(seed)
    else:
        raise ValueError(
            f'Random seed should be positive, current seed is {seed}')
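torch.cuda.manual_seed_all seeds every visible GPU rather than only the current device, so multi-GPU runs also start from the same state. A standalone illustration of what the combined seeding buys (plain torch, not the library code):

    import random

    import numpy as np
    import torch

    def seed_everything(seed):
        # Mirrors what set_random_seed covers after this change.
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    seed_everything(42)
    w1 = torch.nn.Linear(8, 8).weight.detach().clone()
    seed_everything(42)
    w2 = torch.nn.Linear(8, 8).weight.detach().clone()
    assert torch.equal(w1, w2)  # identical initialization on every run

The new file in the next hunk is a Git LFS pointer; presumably it holds the stored baseline that the new regression test reads.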
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2df2a5f3cdfc6dded52d31a8e97d9a9c41a803cb6d46dee709c51872eda37b21
+size 151830
@@ -10,11 +10,14 @@ from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
 from modelscope.trainers import build_trainer
 from modelscope.trainers.hooks import Hook
-from modelscope.trainers.nlp_trainer import NlpEpochBasedTrainer
+from modelscope.trainers.nlp_trainer import (EpochBasedTrainer,
+                                             NlpEpochBasedTrainer)
 from modelscope.trainers.optimizer.child_tuning_adamw_optimizer import \
     calculate_fisher
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.data_utils import to_device
+from modelscope.utils.regress_test_utils import MsRegressTool
+from modelscope.utils.test_utils import test_level
 
 
 class TestFinetuneSequenceClassification(unittest.TestCase):
@@ -28,11 +31,76 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
         self.tmp_dir = tempfile.TemporaryDirectory().name
         if not os.path.exists(self.tmp_dir):
             os.makedirs(self.tmp_dir)
+        self.regress_tool = MsRegressTool(baseline=False)
 
     def tearDown(self):
         shutil.rmtree(self.tmp_dir)
         super().tearDown()
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_trainer_repeatable(self):
+        import torch # noqa
+
+        def cfg_modify_fn(cfg):
+            cfg.task = 'nli'
+            cfg['preprocessor'] = {'type': 'nli-tokenizer'}
+            cfg.train.optimizer.lr = 2e-5
+            cfg['dataset'] = {
+                'train': {
+                    'labels': [
+                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
+                        '11', '12', '13', '14'
+                    ],
+                    'first_sequence':
+                    'sentence',
+                    'label':
+                    'label',
+                }
+            }
+            cfg.train.max_epochs = 5
+            cfg.train.lr_scheduler = {
+                'type': 'LinearLR',
+                'start_factor': 1.0,
+                'end_factor': 0.0,
+                'total_iters':
+                int(len(dataset['train']) / 32) * cfg.train.max_epochs,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            cfg.train.hooks = [{
+                'type': 'CheckpointHook',
+                'interval': 1
+            }, {
+                'type': 'TextLoggerHook',
+                'interval': 1
+            }, {
+                'type': 'IterTimerHook'
+            }, {
+                'type': 'EvaluationHook',
+                'by_epoch': False,
+                'interval': 100
+            }]
+            return cfg
+
+        dataset = MsDataset.load('clue', subset_name='tnews')
+
+        kwargs = dict(
+            model='damo/nlp_structbert_backbone_base_std',
+            train_dataset=dataset['train'],
+            eval_dataset=dataset['validation'],
+            work_dir=self.tmp_dir,
+            seed=42,
+            cfg_modify_fn=cfg_modify_fn)
+
+        os.environ['LOCAL_RANK'] = '0'
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        with self.regress_tool.monitor_ms_train(
+                trainer, 'sbert-base-tnews', level='strict'):
+            trainer.train()
+
     def finetune(self,
                  model_id,
                  train_dataset,
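The new test wraps training in monitor_ms_train with level='strict', so forward/backward data are intercepted (hack_forward / intercept_module in the RegressTool change above) and checked against a stored baseline named 'sbert-base-tnews'. How that baseline is produced is not shown in this diff; a plausible reading of MsRegressTool(baseline=False) is that flipping the flag records a new baseline instead of comparing, roughly:

    # Hypothetical baseline regeneration; the exact semantics of the
    # `baseline` flag are assumed, not shown in this diff.
    regress_tool = MsRegressTool(baseline=True)
    with regress_tool.monitor_ms_train(trainer, 'sbert-base-tnews', level='strict'):
        trainer.train()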
@@ -54,7 +122,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
         for i in range(self.epoch_num):
-            self.assertIn(f'epoch_{i+1}.pth', results_files)
+            self.assertIn(f'epoch_{i + 1}.pth', results_files)
 
         output_files = os.listdir(
             os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR))
@@ -118,11 +186,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
             }]
             return cfg
 
-        from datasets import load_dataset
-        from datasets import DownloadConfig
-        dc = DownloadConfig()
-        dc.local_files_only = True
-        dataset = load_dataset('clue', 'afqmc', download_config=dc)
+        dataset = MsDataset.load('clue', subset_name='afqmc')
         self.finetune(
             model_id='damo/nlp_structbert_backbone_base_std',
             train_dataset=dataset['train'],
@@ -182,11 +246,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
             }]
             return cfg
 
-        from datasets import load_dataset
-        from datasets import DownloadConfig
-        dc = DownloadConfig()
-        dc.local_files_only = True
-        dataset = load_dataset('clue', 'tnews', download_config=dc)
+        dataset = MsDataset.load('clue', subset_name='tnews')
 
         self.finetune(
             model_id='damo/nlp_structbert_backbone_base_std',
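Switching to MsDataset keeps the call sites close to the old datasets.load_dataset usage: the loaded object is indexed by split name, and, as the text-generation test below shows, it can be converted back to a Hugging Face dataset when the hf API is needed. A short sketch using only calls that appear in this diff:

    from modelscope.msdatasets import MsDataset

    dataset = MsDataset.load('clue', subset_name='afqmc')
    train_split = dataset['train']          # splits are indexed like a DatasetDict
    hf_train = train_split.to_hf_dataset()  # convert when the hf Dataset API is needed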
@@ -129,7 +129,7 @@ class TestFinetuneTextGeneration(unittest.TestCase):
     @unittest.skip
     def test_finetune_cnndm(self):
         from modelscope.msdatasets import MsDataset
-        dataset_dict = MsDataset.load('dureader_robust_qg')
+        dataset_dict = MsDataset.load('DuReader_robust-QG')
         train_dataset = dataset_dict['train'].to_hf_dataset() \
             .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
         eval_dataset = dataset_dict['validation'].to_hf_dataset() \