
[to #43299989] Fix msdataset

* fix msdataset
  Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9436292
* fix msdataset
master
feiwu.yfw, 3 years ago
commit 2c3875c0e1
15 changed files with 137 additions and 205 deletions
  1. modelscope/msdatasets/ms_dataset.py (+22, -2)
  2. modelscope/trainers/trainer.py (+6, -41)
  3. modelscope/utils/tensor_utils.py (+2, -0)
  4. modelscope/utils/test_utils.py (+9, -0)
  5. tests/msdatasets/test_ms_dataset.py (+24, -24)
  6. tests/pipelines/test_image_matting.py (+5, -2)
  7. tests/pipelines/test_text_classification.py (+14, -31)
  8. tests/trainers/hooks/logger/test_tensorboard_hook.py (+5, -11)
  9. tests/trainers/hooks/test_checkpoint_hook.py (+5, -11)
  10. tests/trainers/hooks/test_evaluation_hook.py (+8, -13)
  11. tests/trainers/hooks/test_lr_scheduler_hook.py (+6, -14)
  12. tests/trainers/hooks/test_optimizer_hook.py (+6, -14)
  13. tests/trainers/hooks/test_timer_hook.py (+5, -13)
  14. tests/trainers/test_trainer.py (+19, -22)
  15. tests/trainers/test_trainer_with_nlp.py (+1, -7)

modelscope/msdatasets/ms_dataset.py (+22, -2)

@@ -57,6 +57,9 @@ class MsDataset:
def __getitem__(self, key):
return self._hf_ds[key]

def __len__(self):
return len(self._hf_ds)

@classmethod
def from_hf_dataset(cls,
hf_ds: Union[Dataset, DatasetDict],
@@ -223,6 +226,7 @@ class MsDataset:
retained_columns.append(k)

import torch
import math

class MsIterableDataset(torch.utils.data.IterableDataset):

@@ -230,8 +234,23 @@ class MsDataset:
super(MsIterableDataset).__init__()
self.dataset = dataset

def __len__(self):
return len(self.dataset)

def __iter__(self):
for item_dict in self.dataset:
worker_info = torch.utils.data.get_worker_info()
if worker_info is None: # single-process data loading
iter_start = 0
iter_end = len(self.dataset)
else: # in a worker process
per_worker = math.ceil(
len(self.dataset) / float(worker_info.num_workers))
worker_id = worker_info.id
iter_start = worker_id * per_worker
iter_end = min(iter_start + per_worker, len(self.dataset))

for idx in range(iter_start, iter_end):
item_dict = self.dataset[idx]
res = {
k: np.array(item_dict[k])
for k in columns if k in retained_columns
@@ -273,7 +292,8 @@ class MsDataset:
'The function to_torch_dataset requires pytorch to be installed'
)
if preprocessors is not None:
return self.to_torch_dataset_with_processors(preprocessors)
return self.to_torch_dataset_with_processors(
preprocessors, columns=columns)
else:
self._hf_ds.reset_format()
self._hf_ds.set_format(
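
The new __iter__ shards the indexable dataset across DataLoader workers, so multi-worker loading no longer yields every sample once per worker. A minimal, self-contained sketch of the same pattern (class and variable names here are illustrative, not part of the MsDataset API):

import math
import torch
from torch.utils.data import DataLoader, IterableDataset, get_worker_info

class ShardedIterable(IterableDataset):
    """Standalone re-implementation of the worker-sharding pattern added above."""

    def __init__(self, data):
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        info = get_worker_info()
        if info is None:  # single-process loading: iterate over everything
            start, end = 0, len(self.data)
        else:  # each worker gets a contiguous, non-overlapping slice
            per_worker = math.ceil(len(self.data) / float(info.num_workers))
            start = info.id * per_worker
            end = min(start + per_worker, len(self.data))
        for idx in range(start, end):
            yield self.data[idx]

if __name__ == '__main__':
    # With num_workers=2 and 10 items, worker 0 produces indices 0-4 and worker 1 produces 5-9,
    # so each sample is emitted exactly once instead of num_workers times.
    loader = DataLoader(ShardedIterable(list(range(10))), num_workers=2)
    print([batch.item() for batch in loader])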


modelscope/trainers/trainer.py (+6, -41)

@@ -21,7 +21,6 @@ from modelscope.models.base_torch import TorchModel
from modelscope.msdatasets.ms_dataset import MsDataset
from modelscope.preprocessors import build_preprocessor
from modelscope.preprocessors.base import Preprocessor
from modelscope.task_datasets import TorchTaskDataset, build_task_dataset
from modelscope.trainers.hooks.builder import HOOKS
from modelscope.trainers.hooks.priority import Priority, get_priority
from modelscope.trainers.lrscheduler.builder import build_lr_scheduler
@@ -49,7 +48,7 @@ class EpochBasedTrainer(BaseTrainer):
or a model id. If model is None, build_model method will be called.
data_collator (`Callable`, *optional*):
The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`.
train_dataset (`torch.utils.data.Dataset` or `torch.utils.data.IterableDataset`, *optional*):
train_dataset (`MsDataset`, *optional*):
The dataset to use for training.

Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a
@@ -117,10 +116,10 @@ class EpochBasedTrainer(BaseTrainer):
# TODO how to fill device option?
self.device = int(
os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None
self.train_dataset = self.to_task_dataset(
train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor)
self.eval_dataset = self.to_task_dataset(
eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor)
self.train_dataset = train_dataset.to_torch_dataset(
preprocessors=self.preprocessor) if train_dataset else None
self.eval_dataset = eval_dataset.to_torch_dataset(
preprocessors=self.preprocessor) if eval_dataset else None
self.data_collator = data_collator if data_collator is not None else torch_default_data_collator
self.metrics = self.get_metrics()
self.optimizers = optimizers
@@ -179,38 +178,6 @@ class EpochBasedTrainer(BaseTrainer):
"""int: Maximum training iterations."""
return self._max_epochs * len(self.data_loader)

def to_task_dataset(self,
datasets: Tuple[Dataset, List[Dataset]],
mode: str,
preprocessor: Optional[Preprocessor] = None):
"""Build the task specific dataset processor for this trainer.

Returns: The task dataset processor for the task. If no result for the very model-type and task,
the default TaskDataset will be returned.
"""
try:
if not datasets:
return datasets
if isinstance(datasets, TorchTaskDataset):
return datasets
task_dataset = build_task_dataset(
ConfigDict({
**self.cfg.model,
'mode': mode,
'preprocessor': preprocessor,
'datasets': datasets,
}), getattr(self.cfg, 'task', None))
return task_dataset
except Exception:
if isinstance(datasets, (List, Tuple)) or preprocessor is not None:
return TorchTaskDataset(
datasets,
mode=mode,
preprocessor=preprocessor,
**(self.cfg.model if hasattr(self.cfg, 'model') else {}))
else:
return datasets

def build_preprocessor(self) -> Preprocessor:
"""Build the preprocessor.

@@ -448,9 +415,7 @@ class EpochBasedTrainer(BaseTrainer):
)
torch_dataset = dataset.to_torch_dataset(
preprocessors=self.preprocessor, )
dataset = self.to_task_dataset(
torch_dataset, mode, preprocessor=self.preprocessor)
return dataset
return torch_dataset

def create_optimizer_and_scheduler(self):
""" Create optimizer and lr scheduler


modelscope/utils/tensor_utils.py (+2, -0)

@@ -60,6 +60,8 @@ def torch_default_data_collator(features):
) and v is not None and not isinstance(v, str):
if isinstance(v, torch.Tensor):
batch[k] = torch.stack([f[k] for f in features])
elif isinstance(v, list):
batch[k] = torch.stack([d for f in features for d in f[k]])
else:
batch[k] = torch.tensor([f[k] for f in features])
elif isinstance(first, tuple):
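
The new elif isinstance(v, list) branch lets the default collator handle samples whose values are lists of tensors by flattening across the batch before stacking. A small standalone illustration of what that branch produces (feature names are made up):

import torch

features = [
    {'feat': [torch.zeros(3), torch.ones(3)]},                 # sample 0 carries two tensors
    {'feat': [torch.full((3,), 2.0), torch.full((3,), 3.0)]},  # sample 1 carries two tensors
]
# Equivalent of the added branch: flatten the per-sample lists, then stack into one batch tensor.
batch_feat = torch.stack([t for f in features for t in f['feat']])
print(batch_feat.shape)  # torch.Size([4, 3])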


modelscope/utils/test_utils.py (+9, -0)

@@ -4,8 +4,12 @@
import os
import unittest

import numpy as np
from datasets import Dataset
from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE

from modelscope.msdatasets import MsDataset

TEST_LEVEL = 2
TEST_LEVEL_STR = 'TEST_LEVEL'

@@ -33,3 +37,8 @@ def require_torch(test_case):
def set_test_level(level: int):
global TEST_LEVEL
TEST_LEVEL = level


def create_dummy_test_dataset(feat, label, num):
return MsDataset.from_hf_dataset(
Dataset.from_dict(dict(feat=[feat] * num, label=[label] * num)))

tests/msdatasets/test_ms_dataset.py (+24, -24)

@@ -34,20 +34,15 @@ class MsDatasetTest(unittest.TestCase):

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_ds_basic(self):
ms_ds_full = MsDataset.load('squad', namespace='damotest')
ms_ds_full_hf = hfdata.load_dataset('squad')
ms_ds_train = MsDataset.load(
'squad', namespace='damotest', split='train')
ms_ds_train_hf = hfdata.load_dataset('squad', split='train')
ms_image_train = MsDataset.from_hf_dataset(
hfdata.load_dataset('beans', split='train'))
self.assertEqual(ms_ds_full['train'][0], ms_ds_full_hf['train'][0])
self.assertEqual(ms_ds_full['validation'][0],
ms_ds_full_hf['validation'][0])
self.assertEqual(ms_ds_train[0], ms_ds_train_hf[0])
print(next(iter(ms_ds_full['train'])))
print(next(iter(ms_ds_train)))
print(next(iter(ms_image_train)))
ms_ds_full = MsDataset.load(
'xcopa', subset_name='translation-et', namespace='damotest')
ms_ds = MsDataset.load(
'xcopa',
subset_name='translation-et',
namespace='damotest',
split='test')
print(next(iter(ms_ds_full['test'])))
print(next(iter(ms_ds)))

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@require_torch
@@ -56,10 +51,13 @@ class MsDatasetTest(unittest.TestCase):
nlp_model = Model.from_pretrained(model_id)
preprocessor = SequenceClassificationPreprocessor(
nlp_model.model_dir,
first_sequence='context',
first_sequence='premise',
second_sequence=None)
ms_ds_train = MsDataset.load(
'squad', namespace='damotest', split='train')
'xcopa',
subset_name='translation-et',
namespace='damotest',
split='test')
pt_dataset = ms_ds_train.to_torch_dataset(preprocessors=preprocessor)
import torch
dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
@@ -74,10 +72,13 @@ class MsDatasetTest(unittest.TestCase):
nlp_model = Model.from_pretrained(model_id)
preprocessor = SequenceClassificationPreprocessor(
nlp_model.model_dir,
first_sequence='context',
first_sequence='premise',
second_sequence=None)
ms_ds_train = MsDataset.load(
'squad', namespace='damotest', split='train')
'xcopa',
subset_name='translation-et',
namespace='damotest',
split='test')
tf_dataset = ms_ds_train.to_tf_dataset(
batch_size=5,
shuffle=True,
@@ -89,10 +90,9 @@ class MsDatasetTest(unittest.TestCase):
@require_torch
def test_to_torch_dataset_img(self):
ms_image_train = MsDataset.load(
'beans', namespace='damotest', split='train')
'fixtures_image_utils', namespace='damotest', split='test')
pt_dataset = ms_image_train.to_torch_dataset(
preprocessors=ImgPreprocessor(
image_path='image_file_path', label='labels'))
preprocessors=ImgPreprocessor(image_path='file'))
import torch
dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
print(next(iter(dataloader)))
@@ -103,13 +103,13 @@ class MsDatasetTest(unittest.TestCase):
import tensorflow as tf
tf.compat.v1.enable_eager_execution()
ms_image_train = MsDataset.load(
'beans', namespace='damotest', split='train')
'fixtures_image_utils', namespace='damotest', split='test')
tf_dataset = ms_image_train.to_tf_dataset(
batch_size=5,
shuffle=True,
preprocessors=ImgPreprocessor(image_path='image_file_path'),
preprocessors=ImgPreprocessor(image_path='file'),
drop_remainder=True,
label_cols='labels')
)
print(next(iter(tf_dataset)))




tests/pipelines/test_image_matting.py (+5, -2)

@@ -64,10 +64,13 @@ class ImageMattingTest(unittest.TestCase):
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_with_modelscope_dataset(self):
dataset = MsDataset.load(
'beans', namespace='damotest', split='train', target='image')
'fixtures_image_utils',
namespace='damotest',
split='test',
target='file')
img_matting = pipeline(Tasks.image_matting, model=self.model_id)
result = img_matting(dataset)
for i in range(10):
for i in range(2):
cv2.imwrite(f'result_{i}.png', next(result)[OutputKeys.OUTPUT_IMG])
print(
f'Output written to dir: {osp.dirname(osp.abspath("result_0.png"))}'


tests/pipelines/test_text_classification.py (+14, -31)

@@ -51,11 +51,11 @@ class SequenceClassificationTest(unittest.TestCase):
task=Tasks.text_classification, model=self.model_id)
result = text_classification(
MsDataset.load(
'glue',
subset_name='sst2',
split='train',
target='sentence',
hub=Hubs.huggingface))
'xcopa',
subset_name='translation-et',
namespace='damotest',
split='test',
target='premise'))
self.printDataset(result)

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -63,28 +63,11 @@ class SequenceClassificationTest(unittest.TestCase):
text_classification = pipeline(task=Tasks.text_classification)
result = text_classification(
MsDataset.load(
'glue',
subset_name='sst2',
split='train',
target='sentence',
hub=Hubs.huggingface))
self.printDataset(result)

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_with_dataset(self):
model = Model.from_pretrained(self.model_id)
preprocessor = SequenceClassificationPreprocessor(
model.model_dir, first_sequence='sentence', second_sequence=None)
text_classification = pipeline(
Tasks.text_classification, model=model, preprocessor=preprocessor)
# loaded from huggingface dataset
dataset = MsDataset.load(
'glue',
subset_name='sst2',
split='train',
target='sentence',
hub=Hubs.huggingface)
result = text_classification(dataset)
'xcopa',
subset_name='translation-et',
namespace='damotest',
split='test',
target='premise'))
self.printDataset(result)

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@@ -92,11 +75,11 @@ class SequenceClassificationTest(unittest.TestCase):
text_classification = pipeline(task=Tasks.text_classification)
# loaded from modelscope dataset
dataset = MsDataset.load(
'squad',
'xcopa',
subset_name='translation-et',
namespace='damotest',
split='train',
target='context',
hub=Hubs.modelscope)
split='test',
target='premise')
result = text_classification(dataset)
self.printDataset(result)



tests/trainers/hooks/logger/test_tensorboard_hook.py (+5, -11)

@@ -4,24 +4,18 @@ import os
import shutil
import tempfile
import unittest
from abc import ABCMeta

import json
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset

from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModelFile
from modelscope.utils.test_utils import create_dummy_test_dataset


class DummyDataset(Dataset, metaclass=ABCMeta):

def __len__(self):
return 20

def __getitem__(self, idx):
return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, )))
dummy_dataset = create_dummy_test_dataset(
np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)


class DummyModel(nn.Module):
@@ -84,7 +78,7 @@ class TensorboardHookTest(unittest.TestCase):
cfg_file=config_path,
model=DummyModel(),
data_collator=None,
train_dataset=DummyDataset(),
train_dataset=dummy_dataset,
max_epochs=2)

trainer = build_trainer(trainer_name, kwargs)


tests/trainers/hooks/test_checkpoint_hook.py (+5, -11)

@@ -3,24 +3,18 @@ import os
import shutil
import tempfile
import unittest
from abc import ABCMeta

import json
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset

from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModelFile
from modelscope.utils.test_utils import create_dummy_test_dataset


class DummyDataset(Dataset, metaclass=ABCMeta):

def __len__(self):
return 20

def __getitem__(self, idx):
return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, )))
dummy_dataset = create_dummy_test_dataset(
np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)


class DummyModel(nn.Module):
@@ -94,7 +88,7 @@ class CheckpointHookTest(unittest.TestCase):
cfg_file=config_path,
model=DummyModel(),
data_collator=None,
train_dataset=DummyDataset(),
train_dataset=dummy_dataset,
max_epochs=2)

trainer = build_trainer(trainer_name, kwargs)


tests/trainers/hooks/test_evaluation_hook.py (+8, -13)

@@ -3,17 +3,17 @@ import os
import shutil
import tempfile
import unittest
from abc import ABCMeta

import json
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset

from modelscope.metrics.builder import METRICS, MetricKeys
from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModelFile
from modelscope.utils.registry import default_group
from modelscope.utils.test_utils import create_dummy_test_dataset

_global_iter = 0

@@ -32,13 +32,8 @@ class DummyMetric:
return {MetricKeys.ACCURACY: self._fake_acc_by_epoch[_global_iter]}


class DummyDataset(Dataset, metaclass=ABCMeta):

def __len__(self):
return 20

def __getitem__(self, idx):
return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, )))
dummy_dataset = create_dummy_test_dataset(
np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)


class DummyModel(nn.Module):
@@ -115,8 +110,8 @@ class EvaluationHookTest(unittest.TestCase):
cfg_file=config_path,
model=DummyModel(),
data_collator=None,
train_dataset=DummyDataset(),
eval_dataset=DummyDataset(),
train_dataset=dummy_dataset,
eval_dataset=dummy_dataset,
max_epochs=3)

trainer = build_trainer(trainer_name, kwargs)
@@ -177,8 +172,8 @@ class EvaluationHookTest(unittest.TestCase):
cfg_file=config_path,
model=DummyModel(),
data_collator=None,
train_dataset=DummyDataset(),
eval_dataset=DummyDataset(),
train_dataset=dummy_dataset,
eval_dataset=dummy_dataset,
max_epochs=3)

trainer = build_trainer(trainer_name, kwargs)


tests/trainers/hooks/test_lr_scheduler_hook.py (+6, -14)

@@ -3,28 +3,20 @@ import os
import shutil
import tempfile
import unittest
from abc import ABCMeta

import json
import numpy as np
import torch
from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import Dataset

from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages
from modelscope.utils.test_utils import create_dummy_test_dataset


class DummyDataset(Dataset, metaclass=ABCMeta):
"""Base Dataset
"""

def __len__(self):
return 10

def __getitem__(self, idx):
return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, )))
dummy_dataset = create_dummy_test_dataset(
np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 10)


class DummyModel(nn.Module):
@@ -77,7 +69,7 @@ class LrSchedulerHookTest(unittest.TestCase):
kwargs = dict(
cfg_file=config_path,
model=model,
train_dataset=DummyDataset(),
train_dataset=dummy_dataset,
optimizers=(optimizer, lr_scheduler),
max_epochs=5)

@@ -148,7 +140,7 @@ class LrSchedulerHookTest(unittest.TestCase):
kwargs = dict(
cfg_file=config_path,
model=model,
train_dataset=DummyDataset(),
train_dataset=dummy_dataset,
# optimizers=(optimmizer, lr_scheduler),
max_epochs=7)



tests/trainers/hooks/test_optimizer_hook.py (+6, -14)

@@ -3,28 +3,20 @@ import os
import shutil
import tempfile
import unittest
from abc import ABCMeta

import json
import numpy as np
import torch
from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import Dataset

from modelscope.trainers import build_trainer
from modelscope.utils.constant import ModelFile, TrainerStages
from modelscope.utils.test_utils import create_dummy_test_dataset


class DummyDataset(Dataset, metaclass=ABCMeta):
"""Base Dataset
"""

def __len__(self):
return 10

def __getitem__(self, idx):
return dict(feat=torch.rand((2, 2)), label=torch.randint(0, 2, (1, )))
dummy_dataset = create_dummy_test_dataset(
np.random.random(size=(2, 2)), np.random.randint(0, 2, (1, )), 10)


class DummyModel(nn.Module):
@@ -76,7 +68,7 @@ class OptimizerHookTest(unittest.TestCase):
kwargs = dict(
cfg_file=config_path,
model=model,
train_dataset=DummyDataset(),
train_dataset=dummy_dataset,
optimizers=(optimizer, lr_scheduler),
max_epochs=2)

@@ -140,7 +132,7 @@ class TorchAMPOptimizerHookTest(unittest.TestCase):
kwargs = dict(
cfg_file=config_path,
model=model,
train_dataset=DummyDataset(),
train_dataset=dummy_dataset,
optimizers=(optimizer, lr_scheduler),
max_epochs=2,
use_fp16=True)


tests/trainers/hooks/test_timer_hook.py (+5, -13)

@@ -3,28 +3,20 @@ import os
import shutil
import tempfile
import unittest
from abc import ABCMeta

import json
import numpy as np
import torch
from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import Dataset

from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages
from modelscope.utils.test_utils import create_dummy_test_dataset


class DummyDataset(Dataset, metaclass=ABCMeta):
"""Base Dataset
"""

def __len__(self):
return 10

def __getitem__(self, idx):
return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, )))
dummy_dataset = create_dummy_test_dataset(
np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 10)


class DummyModel(nn.Module):
@@ -80,7 +72,7 @@ class IterTimerHookTest(unittest.TestCase):
kwargs = dict(
cfg_file=config_path,
model=model,
train_dataset=DummyDataset(),
train_dataset=dummy_dataset,
optimizers=(optimizer, lr_scheduler),
max_epochs=5)



tests/trainers/test_trainer.py (+19, -22)

@@ -6,27 +6,31 @@ import unittest
from abc import ABCMeta

import json
import numpy as np
import torch
from datasets import Dataset
from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset

from modelscope.metrics.builder import MetricKeys
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
from modelscope.utils.test_utils import test_level
from modelscope.utils.test_utils import create_dummy_test_dataset, test_level


class DummyDataset(Dataset, metaclass=ABCMeta):
"""Base Dataset
"""
class DummyMetric:

def __len__(self):
return 20
def __call__(self, ground_truth, predict_results):
return {'accuracy': 0.5}

def __getitem__(self, idx):
return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, )))

dummy_dataset_small = create_dummy_test_dataset(
np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)

dummy_dataset_big = create_dummy_test_dataset(
np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 40)


class DummyModel(nn.Module):
@@ -116,8 +120,8 @@ class TrainerTest(unittest.TestCase):
cfg_file=config_path,
model=DummyModel(),
data_collator=None,
train_dataset=DummyDataset(),
eval_dataset=DummyDataset(),
train_dataset=dummy_dataset_small,
eval_dataset=dummy_dataset_small,
max_epochs=3)

trainer = build_trainer(trainer_name, kwargs)
@@ -174,8 +178,8 @@ class TrainerTest(unittest.TestCase):
cfg_file=config_path,
model=model,
data_collator=None,
train_dataset=DummyDataset(),
eval_dataset=DummyDataset(),
train_dataset=dummy_dataset_small,
eval_dataset=dummy_dataset_small,
optimizers=(optimmizer, lr_scheduler),
max_epochs=3)

@@ -212,13 +216,6 @@ class TrainerTest(unittest.TestCase):
}
}

class _DummyDataset(DummyDataset):
"""Base Dataset
"""

def __len__(self):
return 40

config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION)
with open(config_path, 'w') as f:
json.dump(json_cfg, f)
@@ -231,8 +228,8 @@ class TrainerTest(unittest.TestCase):
cfg_file=config_path,
model=model,
data_collator=None,
train_dataset=_DummyDataset(),
eval_dataset=DummyDataset(),
train_dataset=dummy_dataset_big,
eval_dataset=dummy_dataset_small,
optimizers=(optimmizer, lr_scheduler),
max_epochs=3)



tests/trainers/test_trainer_with_nlp.py (+1, -7)

@@ -34,13 +34,7 @@ class TestTrainerWithNlp(unittest.TestCase):
'label': [0, 1, 1]
}
dataset = Dataset.from_dict(dataset_dict)

class MsDatasetDummy(MsDataset):

def __len__(self):
return len(self._hf_ds)

self.dataset = MsDatasetDummy(dataset)
self.dataset = MsDataset.from_hf_dataset(dataset)

def tearDown(self):
shutil.rmtree(self.tmp_dir)
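
Because MsDataset now defines __len__ (first hunk, delegating to the wrapped hf dataset), the MsDatasetDummy workaround is no longer needed and MsDataset.from_hf_dataset(dataset) can be passed to the trainer directly. A quick sketch of the equivalent check, reusing the test's dataset contents:

from datasets import Dataset
from modelscope.msdatasets import MsDataset

ds = MsDataset.from_hf_dataset(
    Dataset.from_dict({'text': ['a', 'b', 'c'], 'label': [0, 1, 1]}))
assert len(ds) == 3  # __len__ now returns len(self._hf_ds)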

