* fix msdataset
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9436292
@@ -57,6 +57,9 @@ class MsDataset:
     def __getitem__(self, key):
         return self._hf_ds[key]

+    def __len__(self):
+        return len(self._hf_ds)
+
     @classmethod
     def from_hf_dataset(cls,
                         hf_ds: Union[Dataset, DatasetDict],
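Note: with `__len__` delegating to the wrapped Hugging Face dataset, `len()` now works on an `MsDataset` directly, which both the trainer changes and the `MsIterableDataset` sharding below rely on. A minimal sketch of the resulting behavior (the three-row dataset is illustrative):

    from datasets import Dataset
    from modelscope.msdatasets import MsDataset

    ms_ds = MsDataset.from_hf_dataset(
        Dataset.from_dict({'text': ['a', 'b', 'c'], 'label': [0, 1, 0]}))
    assert len(ms_ds) == 3          # delegates to len(self._hf_ds)
    assert ms_ds[0]['label'] == 0   # __getitem__ already delegated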
@@ -223,6 +226,7 @@ class MsDataset:
                 retained_columns.append(k)

         import torch
+        import math

         class MsIterableDataset(torch.utils.data.IterableDataset):
@@ -230,8 +234,23 @@ class MsDataset:
                 super(MsIterableDataset).__init__()
                 self.dataset = dataset

+            def __len__(self):
+                return len(self.dataset)
+
             def __iter__(self):
-                for item_dict in self.dataset:
+                worker_info = torch.utils.data.get_worker_info()
+                if worker_info is None:  # single-process data loading
+                    iter_start = 0
+                    iter_end = len(self.dataset)
+                else:  # in a worker process
+                    per_worker = math.ceil(
+                        len(self.dataset) / float(worker_info.num_workers))
+                    worker_id = worker_info.id
+                    iter_start = worker_id * per_worker
+                    iter_end = min(iter_start + per_worker, len(self.dataset))
+                for idx in range(iter_start, iter_end):
+                    item_dict = self.dataset[idx]
                     res = {
                         k: np.array(item_dict[k])
                         for k in columns if k in retained_columns
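Note: the rewritten `__iter__` is the standard worker-sharding recipe from the PyTorch `IterableDataset` documentation: each `DataLoader` worker computes its own `[iter_start, iter_end)` slice, so no record is duplicated or dropped across workers. A self-contained sketch of the same pattern, independent of `MsDataset` (the toy dataset is illustrative):

    import math
    import torch

    class RangeIterableDataset(torch.utils.data.IterableDataset):
        # Toy dataset yielding 0..n-1, sharded across DataLoader workers.

        def __init__(self, n):
            self.n = n

        def __len__(self):
            return self.n

        def __iter__(self):
            worker_info = torch.utils.data.get_worker_info()
            if worker_info is None:  # single-process loading: take everything
                iter_start, iter_end = 0, self.n
            else:  # each worker takes one contiguous chunk
                per_worker = math.ceil(self.n / float(worker_info.num_workers))
                iter_start = worker_info.id * per_worker
                iter_end = min(iter_start + per_worker, self.n)
            return iter(range(iter_start, iter_end))

    # Worker 0 yields 0-4 and worker 1 yields 5-9: each element exactly once.
    # (Spawn-based multiprocessing needs an `if __name__ == '__main__':` guard.)
    loader = torch.utils.data.DataLoader(RangeIterableDataset(10), num_workers=2)
    assert sorted(int(x) for x in loader) == list(range(10))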
@@ -273,7 +292,8 @@ class MsDataset:
                 'The function to_torch_dataset requires pytorch to be installed'
             )
         if preprocessors is not None:
-            return self.to_torch_dataset_with_processors(preprocessors)
+            return self.to_torch_dataset_with_processors(
+                preprocessors, columns=columns)
         else:
             self._hf_ds.reset_format()
             self._hf_ds.set_format(
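Note: the bug fixed here is that `to_torch_dataset` silently dropped its `columns` argument on the preprocessor path. A hypothetical call after the fix (`ms_ds`, `my_preprocessor` and the column names are placeholders):

    pt_dataset = ms_ds.to_torch_dataset(
        preprocessors=my_preprocessor,   # per-example preprocessing
        columns=['premise', 'label'])    # now forwarded instead of ignored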
@@ -21,7 +21,6 @@ from modelscope.models.base_torch import TorchModel
 from modelscope.msdatasets.ms_dataset import MsDataset
 from modelscope.preprocessors import build_preprocessor
 from modelscope.preprocessors.base import Preprocessor
-from modelscope.task_datasets import TorchTaskDataset, build_task_dataset
 from modelscope.trainers.hooks.builder import HOOKS
 from modelscope.trainers.hooks.priority import Priority, get_priority
 from modelscope.trainers.lrscheduler.builder import build_lr_scheduler
@@ -49,7 +48,7 @@ class EpochBasedTrainer(BaseTrainer):
             or a model id. If model is None, build_model method will be called.
         data_collator (`Callable`, *optional*):
             The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`.
-        train_dataset (`torch.utils.data.Dataset` or `torch.utils.data.IterableDataset`, *optional*):
+        train_dataset (`MsDataset`, *optional*):
             The dataset to use for training.

             Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a
@@ -117,10 +116,10 @@ class EpochBasedTrainer(BaseTrainer):
         # TODO how to fill device option?
         self.device = int(
             os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None
-        self.train_dataset = self.to_task_dataset(
-            train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor)
-        self.eval_dataset = self.to_task_dataset(
-            eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor)
+        self.train_dataset = train_dataset.to_torch_dataset(
+            preprocessors=self.preprocessor) if train_dataset else None
+        self.eval_dataset = eval_dataset.to_torch_dataset(
+            preprocessors=self.preprocessor) if eval_dataset else None
         self.data_collator = data_collator if data_collator is not None else torch_default_data_collator
         self.metrics = self.get_metrics()
         self.optimizers = optimizers
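Note: with this change the trainer no longer routes datasets through `to_task_dataset` (removed below); it expects `MsDataset` inputs, converts them once at construction, and leaves `None` untouched for train-only or eval-only runs. A hedged sketch of the new calling convention (`trainer_name`, the model id and the dataset id are placeholders):

    from modelscope.msdatasets import MsDataset
    from modelscope.trainers import build_trainer

    trainer_name = 'EpochBasedTrainer'  # placeholder registry key
    train_ds = MsDataset.load('some_dataset', namespace='some_ns', split='train')
    kwargs = dict(
        model='damo/some-model-id',  # or a Model instance
        train_dataset=train_ds,      # MsDataset; converted via to_torch_dataset()
        eval_dataset=None,           # stays None, no conversion attempted
        max_epochs=1)
    trainer = build_trainer(trainer_name, kwargs)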
@@ -179,38 +178,6 @@ class EpochBasedTrainer(BaseTrainer):
         """int: Maximum training iterations."""
         return self._max_epochs * len(self.data_loader)

-    def to_task_dataset(self,
-                        datasets: Tuple[Dataset, List[Dataset]],
-                        mode: str,
-                        preprocessor: Optional[Preprocessor] = None):
-        """Build the task specific dataset processor for this trainer.
-
-        Returns: The task dataset processor for the task. If no result for the very model-type and task,
-        the default TaskDataset will be returned.
-        """
-        try:
-            if not datasets:
-                return datasets
-            if isinstance(datasets, TorchTaskDataset):
-                return datasets
-            task_dataset = build_task_dataset(
-                ConfigDict({
-                    **self.cfg.model,
-                    'mode': mode,
-                    'preprocessor': preprocessor,
-                    'datasets': datasets,
-                }), getattr(self.cfg, 'task', None))
-            return task_dataset
-        except Exception:
-            if isinstance(datasets, (List, Tuple)) or preprocessor is not None:
-                return TorchTaskDataset(
-                    datasets,
-                    mode=mode,
-                    preprocessor=preprocessor,
-                    **(self.cfg.model if hasattr(self.cfg, 'model') else {}))
-            else:
-                return datasets
-
     def build_preprocessor(self) -> Preprocessor:
         """Build the preprocessor.
@@ -448,9 +415,7 @@ class EpochBasedTrainer(BaseTrainer):
             )
         torch_dataset = dataset.to_torch_dataset(
             preprocessors=self.preprocessor, )
-        dataset = self.to_task_dataset(
-            torch_dataset, mode, preprocessor=self.preprocessor)
-        return dataset
+        return torch_dataset

     def create_optimizer_and_scheduler(self):
         """ Create optimizer and lr scheduler
@@ -60,6 +60,8 @@ def torch_default_data_collator(features):
             ) and v is not None and not isinstance(v, str):
                 if isinstance(v, torch.Tensor):
                     batch[k] = torch.stack([f[k] for f in features])
+                elif isinstance(v, list):
+                    batch[k] = torch.stack([d for f in features for d in f[k]])
                 else:
                     batch[k] = torch.tensor([f[k] for f in features])
         elif isinstance(first, tuple):
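Note: the new `elif` handles features whose values are lists of tensors; it flattens across examples before stacking, so list elements are concatenated along the batch dimension rather than nested. A minimal sketch of the two branches with toy tensors:

    import torch

    features = [{'feat': [torch.ones(2), torch.zeros(2)]},
                {'feat': [torch.ones(2), torch.zeros(2)]}]
    v = features[0]['feat']
    if isinstance(v, torch.Tensor):
        batch = torch.stack([f['feat'] for f in features])
    elif isinstance(v, list):
        # 2 examples x 2 list entries -> one stacked (4, 2) batch
        batch = torch.stack([d for f in features for d in f['feat']])
    assert batch.shape == (4, 2)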
@@ -4,8 +4,12 @@
 import os
 import unittest

+import numpy as np
+from datasets import Dataset
 from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE

+from modelscope.msdatasets import MsDataset
+
 TEST_LEVEL = 2
 TEST_LEVEL_STR = 'TEST_LEVEL'
@@ -33,3 +37,8 @@ def require_torch(test_case):
 def set_test_level(level: int):
     global TEST_LEVEL
     TEST_LEVEL = level
+
+
+def create_dummy_test_dataset(feat, label, num):
+    return MsDataset.from_hf_dataset(
+        Dataset.from_dict(dict(feat=[feat] * num, label=[label] * num)))
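Note: this helper repeats one `(feat, label)` pair `num` times and wraps the result as an `MsDataset`, so every hand-rolled `DummyDataset` in the hook and trainer tests below collapses to a one-liner. Rough expected behavior:

    import numpy as np

    dummy = create_dummy_test_dataset(
        np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)
    assert len(dummy) == 20   # via the new MsDataset.__len__
    assert 'feat' in dummy[0]  # the same feat repeated in every row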
@@ -34,20 +34,15 @@ class MsDatasetTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ds_basic(self):
-        ms_ds_full = MsDataset.load('squad', namespace='damotest')
-        ms_ds_full_hf = hfdata.load_dataset('squad')
-        ms_ds_train = MsDataset.load(
-            'squad', namespace='damotest', split='train')
-        ms_ds_train_hf = hfdata.load_dataset('squad', split='train')
-        ms_image_train = MsDataset.from_hf_dataset(
-            hfdata.load_dataset('beans', split='train'))
-        self.assertEqual(ms_ds_full['train'][0], ms_ds_full_hf['train'][0])
-        self.assertEqual(ms_ds_full['validation'][0],
-                         ms_ds_full_hf['validation'][0])
-        self.assertEqual(ms_ds_train[0], ms_ds_train_hf[0])
-        print(next(iter(ms_ds_full['train'])))
-        print(next(iter(ms_ds_train)))
-        print(next(iter(ms_image_train)))
+        ms_ds_full = MsDataset.load(
+            'xcopa', subset_name='translation-et', namespace='damotest')
+        ms_ds = MsDataset.load(
+            'xcopa',
+            subset_name='translation-et',
+            namespace='damotest',
+            split='test')
+        print(next(iter(ms_ds_full['test'])))
+        print(next(iter(ms_ds)))

     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     @require_torch
@@ -56,10 +51,13 @@ class MsDatasetTest(unittest.TestCase):
         nlp_model = Model.from_pretrained(model_id)
         preprocessor = SequenceClassificationPreprocessor(
             nlp_model.model_dir,
-            first_sequence='context',
+            first_sequence='premise',
             second_sequence=None)
         ms_ds_train = MsDataset.load(
-            'squad', namespace='damotest', split='train')
+            'xcopa',
+            subset_name='translation-et',
+            namespace='damotest',
+            split='test')
         pt_dataset = ms_ds_train.to_torch_dataset(preprocessors=preprocessor)
         import torch
         dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
@@ -74,10 +72,13 @@ class MsDatasetTest(unittest.TestCase):
         nlp_model = Model.from_pretrained(model_id)
         preprocessor = SequenceClassificationPreprocessor(
             nlp_model.model_dir,
-            first_sequence='context',
+            first_sequence='premise',
             second_sequence=None)
         ms_ds_train = MsDataset.load(
-            'squad', namespace='damotest', split='train')
+            'xcopa',
+            subset_name='translation-et',
+            namespace='damotest',
+            split='test')
         tf_dataset = ms_ds_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
@@ -89,10 +90,9 @@ class MsDatasetTest(unittest.TestCase):
     @require_torch
     def test_to_torch_dataset_img(self):
         ms_image_train = MsDataset.load(
-            'beans', namespace='damotest', split='train')
+            'fixtures_image_utils', namespace='damotest', split='test')
         pt_dataset = ms_image_train.to_torch_dataset(
-            preprocessors=ImgPreprocessor(
-                image_path='image_file_path', label='labels'))
+            preprocessors=ImgPreprocessor(image_path='file'))
         import torch
         dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
         print(next(iter(dataloader)))
@@ -103,13 +103,13 @@ class MsDatasetTest(unittest.TestCase):
         import tensorflow as tf
         tf.compat.v1.enable_eager_execution()
         ms_image_train = MsDataset.load(
-            'beans', namespace='damotest', split='train')
+            'fixtures_image_utils', namespace='damotest', split='test')
         tf_dataset = ms_image_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
-            preprocessors=ImgPreprocessor(image_path='image_file_path'),
+            preprocessors=ImgPreprocessor(image_path='file'),
             drop_remainder=True,
-            label_cols='labels')
+        )
         print(next(iter(tf_dataset)))
@@ -64,10 +64,13 @@ class ImageMattingTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_modelscope_dataset(self):
         dataset = MsDataset.load(
-            'beans', namespace='damotest', split='train', target='image')
+            'fixtures_image_utils',
+            namespace='damotest',
+            split='test',
+            target='file')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         result = img_matting(dataset)
-        for i in range(10):
+        for i in range(2):
             cv2.imwrite(f'result_{i}.png', next(result)[OutputKeys.OUTPUT_IMG])
         print(
             f'Output written to dir: {osp.dirname(osp.abspath("result_0.png"))}'
@@ -51,11 +51,11 @@ class SequenceClassificationTest(unittest.TestCase):
             task=Tasks.text_classification, model=self.model_id)
         result = text_classification(
             MsDataset.load(
-                'glue',
-                subset_name='sst2',
-                split='train',
-                target='sentence',
-                hub=Hubs.huggingface))
+                'xcopa',
+                subset_name='translation-et',
+                namespace='damotest',
+                split='test',
+                target='premise'))
         self.printDataset(result)

     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -63,28 +63,11 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(task=Tasks.text_classification)
         result = text_classification(
             MsDataset.load(
-                'glue',
-                subset_name='sst2',
-                split='train',
-                target='sentence',
-                hub=Hubs.huggingface))
-        self.printDataset(result)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_with_dataset(self):
-        model = Model.from_pretrained(self.model_id)
-        preprocessor = SequenceClassificationPreprocessor(
-            model.model_dir, first_sequence='sentence', second_sequence=None)
-        text_classification = pipeline(
-            Tasks.text_classification, model=model, preprocessor=preprocessor)
-        # loaded from huggingface dataset
-        dataset = MsDataset.load(
-            'glue',
-            subset_name='sst2',
-            split='train',
-            target='sentence',
-            hub=Hubs.huggingface)
-        result = text_classification(dataset)
+                'xcopa',
+                subset_name='translation-et',
+                namespace='damotest',
+                split='test',
+                target='premise'))
         self.printDataset(result)

     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@@ -92,11 +75,11 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(task=Tasks.text_classification)
         # loaded from modelscope dataset
         dataset = MsDataset.load(
-            'squad',
+            'xcopa',
+            subset_name='translation-et',
             namespace='damotest',
-            split='train',
-            target='context',
-            hub=Hubs.modelscope)
+            split='test',
+            target='premise')
         result = text_classification(dataset)
         self.printDataset(result)
@@ -4,24 +4,18 @@ import os
 import shutil
 import tempfile
 import unittest
-from abc import ABCMeta

 import json
+import numpy as np
 import torch
 from torch import nn
-from torch.utils.data import Dataset

 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile
+from modelscope.utils.test_utils import create_dummy_test_dataset


-class DummyDataset(Dataset, metaclass=ABCMeta):
-
-    def __len__(self):
-        return 20
-
-    def __getitem__(self, idx):
-        return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, )))
+dummy_dataset = create_dummy_test_dataset(
+    np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)


 class DummyModel(nn.Module):
@@ -84,7 +78,7 @@ class TensorboardHookTest(unittest.TestCase):
             cfg_file=config_path,
             model=DummyModel(),
             data_collator=None,
-            train_dataset=DummyDataset(),
+            train_dataset=dummy_dataset,
             max_epochs=2)

         trainer = build_trainer(trainer_name, kwargs)
@@ -3,24 +3,18 @@ import os
 import shutil
 import tempfile
 import unittest
-from abc import ABCMeta

 import json
+import numpy as np
 import torch
 from torch import nn
-from torch.utils.data import Dataset

 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile
+from modelscope.utils.test_utils import create_dummy_test_dataset


-class DummyDataset(Dataset, metaclass=ABCMeta):
-
-    def __len__(self):
-        return 20
-
-    def __getitem__(self, idx):
-        return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, )))
+dummy_dataset = create_dummy_test_dataset(
+    np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)


 class DummyModel(nn.Module):
@@ -94,7 +88,7 @@ class CheckpointHookTest(unittest.TestCase):
             cfg_file=config_path,
             model=DummyModel(),
             data_collator=None,
-            train_dataset=DummyDataset(),
+            train_dataset=dummy_dataset,
             max_epochs=2)

         trainer = build_trainer(trainer_name, kwargs)
@@ -3,17 +3,17 @@ import os
 import shutil
 import tempfile
 import unittest
-from abc import ABCMeta

 import json
+import numpy as np
 import torch
 from torch import nn
-from torch.utils.data import Dataset

 from modelscope.metrics.builder import METRICS, MetricKeys
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile
 from modelscope.utils.registry import default_group
+from modelscope.utils.test_utils import create_dummy_test_dataset

 _global_iter = 0
@@ -32,13 +32,8 @@ class DummyMetric:
         return {MetricKeys.ACCURACY: self._fake_acc_by_epoch[_global_iter]}


-class DummyDataset(Dataset, metaclass=ABCMeta):
-
-    def __len__(self):
-        return 20
-
-    def __getitem__(self, idx):
-        return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, )))
+dummy_dataset = create_dummy_test_dataset(
+    np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)


 class DummyModel(nn.Module):
@@ -115,8 +110,8 @@ class EvaluationHookTest(unittest.TestCase):
             cfg_file=config_path,
             model=DummyModel(),
             data_collator=None,
-            train_dataset=DummyDataset(),
-            eval_dataset=DummyDataset(),
+            train_dataset=dummy_dataset,
+            eval_dataset=dummy_dataset,
             max_epochs=3)

         trainer = build_trainer(trainer_name, kwargs)
@@ -177,8 +172,8 @@ class EvaluationHookTest(unittest.TestCase):
             cfg_file=config_path,
             model=DummyModel(),
             data_collator=None,
-            train_dataset=DummyDataset(),
-            eval_dataset=DummyDataset(),
+            train_dataset=dummy_dataset,
+            eval_dataset=dummy_dataset,
             max_epochs=3)

         trainer = build_trainer(trainer_name, kwargs)
@@ -3,28 +3,20 @@ import os
 import shutil
 import tempfile
 import unittest
-from abc import ABCMeta

 import json
+import numpy as np
 import torch
 from torch import nn
 from torch.optim import SGD
 from torch.optim.lr_scheduler import MultiStepLR
-from torch.utils.data import Dataset

 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages
+from modelscope.utils.test_utils import create_dummy_test_dataset


-class DummyDataset(Dataset, metaclass=ABCMeta):
-    """Base Dataset
-    """
-
-    def __len__(self):
-        return 10
-
-    def __getitem__(self, idx):
-        return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, )))
+dummy_dataset = create_dummy_test_dataset(
+    np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 10)


 class DummyModel(nn.Module):
@@ -77,7 +69,7 @@ class LrSchedulerHookTest(unittest.TestCase):
         kwargs = dict(
             cfg_file=config_path,
             model=model,
-            train_dataset=DummyDataset(),
+            train_dataset=dummy_dataset,
             optimizers=(optimizer, lr_scheduler),
             max_epochs=5)
@@ -148,7 +140,7 @@ class LrSchedulerHookTest(unittest.TestCase):
         kwargs = dict(
             cfg_file=config_path,
             model=model,
-            train_dataset=DummyDataset(),
+            train_dataset=dummy_dataset,
             # optimizers=(optimmizer, lr_scheduler),
             max_epochs=7)
@@ -3,28 +3,20 @@ import os
 import shutil
 import tempfile
 import unittest
-from abc import ABCMeta

 import json
+import numpy as np
 import torch
 from torch import nn
 from torch.optim import SGD
 from torch.optim.lr_scheduler import MultiStepLR
-from torch.utils.data import Dataset

 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import ModelFile, TrainerStages
+from modelscope.utils.test_utils import create_dummy_test_dataset


-class DummyDataset(Dataset, metaclass=ABCMeta):
-    """Base Dataset
-    """
-
-    def __len__(self):
-        return 10
-
-    def __getitem__(self, idx):
-        return dict(feat=torch.rand((2, 2)), label=torch.randint(0, 2, (1, )))
+dummy_dataset = create_dummy_test_dataset(
+    np.random.random(size=(2, 2)), np.random.randint(0, 2, (1, )), 10)


 class DummyModel(nn.Module):
@@ -76,7 +68,7 @@ class OptimizerHookTest(unittest.TestCase):
         kwargs = dict(
             cfg_file=config_path,
             model=model,
-            train_dataset=DummyDataset(),
+            train_dataset=dummy_dataset,
             optimizers=(optimizer, lr_scheduler),
             max_epochs=2)
@@ -140,7 +132,7 @@ class TorchAMPOptimizerHookTest(unittest.TestCase):
         kwargs = dict(
             cfg_file=config_path,
             model=model,
-            train_dataset=DummyDataset(),
+            train_dataset=dummy_dataset,
             optimizers=(optimizer, lr_scheduler),
             max_epochs=2,
             use_fp16=True)
@@ -3,28 +3,20 @@ import os
 import shutil
 import tempfile
 import unittest
-from abc import ABCMeta

 import json
+import numpy as np
 import torch
 from torch import nn
 from torch.optim import SGD
 from torch.optim.lr_scheduler import MultiStepLR
-from torch.utils.data import Dataset

 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages
+from modelscope.utils.test_utils import create_dummy_test_dataset


-class DummyDataset(Dataset, metaclass=ABCMeta):
-    """Base Dataset
-    """
-
-    def __len__(self):
-        return 10
-
-    def __getitem__(self, idx):
-        return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, )))
+dummy_dataset = create_dummy_test_dataset(
+    np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 10)


 class DummyModel(nn.Module):
| class DummyModel(nn.Module): | class DummyModel(nn.Module): | ||||
| @@ -80,7 +72,7 @@ class IterTimerHookTest(unittest.TestCase): | |||||
| kwargs = dict( | kwargs = dict( | ||||
| cfg_file=config_path, | cfg_file=config_path, | ||||
| model=model, | model=model, | ||||
| train_dataset=DummyDataset(), | |||||
| train_dataset=dummy_dataset, | |||||
| optimizers=(optimizer, lr_scheduler), | optimizers=(optimizer, lr_scheduler), | ||||
| max_epochs=5) | max_epochs=5) | ||||
@@ -6,27 +6,31 @@ import unittest
 from abc import ABCMeta

 import json
+import numpy as np
 import torch
+from datasets import Dataset
 from torch import nn
 from torch.optim import SGD
 from torch.optim.lr_scheduler import StepLR
-from torch.utils.data import Dataset

 from modelscope.metrics.builder import MetricKeys
+from modelscope.msdatasets import MsDataset
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
-from modelscope.utils.test_utils import test_level
+from modelscope.utils.test_utils import create_dummy_test_dataset, test_level


-class DummyDataset(Dataset, metaclass=ABCMeta):
-    """Base Dataset
-    """
-
-    def __len__(self):
-        return 20
-
-    def __getitem__(self, idx):
-        return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, )))
+class DummyMetric:
+
+    def __call__(self, ground_truth, predict_results):
+        return {'accuracy': 0.5}
+
+
+dummy_dataset_small = create_dummy_test_dataset(
+    np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)
+
+dummy_dataset_big = create_dummy_test_dataset(
+    np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 40)


 class DummyModel(nn.Module):
@@ -116,8 +120,8 @@ class TrainerTest(unittest.TestCase):
             cfg_file=config_path,
             model=DummyModel(),
             data_collator=None,
-            train_dataset=DummyDataset(),
-            eval_dataset=DummyDataset(),
+            train_dataset=dummy_dataset_small,
+            eval_dataset=dummy_dataset_small,
             max_epochs=3)

         trainer = build_trainer(trainer_name, kwargs)
@@ -174,8 +178,8 @@ class TrainerTest(unittest.TestCase):
             cfg_file=config_path,
             model=model,
             data_collator=None,
-            train_dataset=DummyDataset(),
-            eval_dataset=DummyDataset(),
+            train_dataset=dummy_dataset_small,
+            eval_dataset=dummy_dataset_small,
             optimizers=(optimmizer, lr_scheduler),
             max_epochs=3)
@@ -212,13 +216,6 @@ class TrainerTest(unittest.TestCase):
                 }
             }

-        class _DummyDataset(DummyDataset):
-            """Base Dataset
-            """
-
-            def __len__(self):
-                return 40
-
         config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION)
         with open(config_path, 'w') as f:
             json.dump(json_cfg, f)
@@ -231,8 +228,8 @@ class TrainerTest(unittest.TestCase):
             cfg_file=config_path,
             model=model,
             data_collator=None,
-            train_dataset=_DummyDataset(),
-            eval_dataset=DummyDataset(),
+            train_dataset=dummy_dataset_big,
+            eval_dataset=dummy_dataset_small,
             optimizers=(optimmizer, lr_scheduler),
             max_epochs=3)
@@ -34,13 +34,7 @@ class TestTrainerWithNlp(unittest.TestCase):
             'label': [0, 1, 1]
         }
         dataset = Dataset.from_dict(dataset_dict)

-        class MsDatasetDummy(MsDataset):
-
-            def __len__(self):
-                return len(self._hf_ds)
-
-        self.dataset = MsDatasetDummy(dataset)
+        self.dataset = MsDataset.from_hf_dataset(dataset)

     def tearDown(self):
         shutil.rmtree(self.tmp_dir)
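Note: with `__len__` defined on `MsDataset` itself (first hunk of this review), the ad-hoc `MsDatasetDummy` subclass is redundant; `MsDataset.from_hf_dataset(dataset)` now provides the same behavior.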