diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index d906a80d..09bff2c1 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -362,8 +362,10 @@ class HubApi: dataset_name: str, namespace: str, revision: Optional[str] = DEFAULT_DATASET_REVISION): - return f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ - f'Revision={revision}&FilePath={file_name}' + if file_name.endswith('.csv'): + file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ + f'Revision={revision}&FilePath={file_name}' + return file_name def get_dataset_access_config( self, diff --git a/modelscope/models/cv/image_instance_segmentation/__init__.py b/modelscope/models/cv/image_instance_segmentation/__init__.py index 4706f8f8..8ccfef4b 100644 --- a/modelscope/models/cv/image_instance_segmentation/__init__.py +++ b/modelscope/models/cv/image_instance_segmentation/__init__.py @@ -7,13 +7,11 @@ if TYPE_CHECKING: from .cascade_mask_rcnn_swin import CascadeMaskRCNNSwin from .model import CascadeMaskRCNNSwinModel from .postprocess_utils import get_img_ins_seg_result - from .datasets import ImageInstanceSegmentationCocoDataset else: _import_structure = { 'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'], 'model': ['CascadeMaskRCNNSwinModel'], 'postprocess_utils': ['get_img_ins_seg_result'], - 'datasets': ['ImageInstanceSegmentationCocoDataset'] } import sys diff --git a/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py b/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py index 93c71b46..cca1432f 100644 --- a/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py +++ b/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py @@ -1,2 +1 @@ -from .dataset import ImageInstanceSegmentationCocoDataset from .transforms import build_preprocess_transform diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 1e84dd8a..6e4486dd 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -13,9 +13,12 @@ from datasets.utils.file_utils import (is_relative_path, relative_to_absolute_path) from modelscope.msdatasets.config import MS_DATASETS_CACHE +from modelscope.utils.config import ConfigDict from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DatasetFormations, DownloadMode, Hubs) from modelscope.utils.logger import get_logger +from .task_datasets.builder import build_task_dataset +from .utils.dataset_builder import ExternalDataset from .utils.dataset_utils import (get_dataset_files, get_target_dataset_structure, load_dataset_builder) @@ -67,9 +70,16 @@ class MsDataset: def __len__(self): return len(self._hf_ds) + @property + def config_kwargs(self): + if isinstance(self._hf_ds, ExternalDataset): + return self._hf_ds.config_kwargs + else: + return None + @classmethod def from_hf_dataset(cls, - hf_ds: Union[Dataset, DatasetDict], + hf_ds: Union[Dataset, DatasetDict, ExternalDataset], target: str = None) -> Union[dict, 'MsDataset']: if isinstance(hf_ds, Dataset): return cls(hf_ds, target) @@ -77,6 +87,8 @@ class MsDataset: if len(hf_ds.keys()) == 1: return cls(next(iter(hf_ds.values())), target) return {k: cls(v, target) for k, v in hf_ds.items()} + elif isinstance(hf_ds, ExternalDataset): + return cls(hf_ds) else: raise TypeError( f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}' @@ -96,7 +108,8 @@ class MsDataset: Mapping[str, Union[str, Sequence[str]]]]] = None, download_mode: 
Optional[DownloadMode] = DownloadMode. - REUSE_DATASET_IF_EXISTS + REUSE_DATASET_IF_EXISTS, + **config_kwargs, ) -> Union[dict, 'MsDataset']: """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. Args: @@ -113,6 +126,7 @@ class MsDataset: hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope download_mode (DownloadMode or str, optional): How to treat existing datasets. default DownloadMode.REUSE_DATASET_IF_EXISTS + **config_kwargs (additional keyword arguments): Keyword arguments to be passed Returns: MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset. @@ -128,7 +142,8 @@ class MsDataset: split=split, data_dir=data_dir, data_files=data_files, - download_mode=download_mode.value) + download_mode=download_mode.value, + **config_kwargs) return MsDataset.from_hf_dataset(dataset, target=target) elif hub == Hubs.modelscope: return MsDataset._load_ms_dataset( @@ -140,22 +155,22 @@ class MsDataset: split=split, data_dir=data_dir, data_files=data_files, - download_mode=download_mode) + download_mode=download_mode, + **config_kwargs) @staticmethod - def _load_ms_dataset( - dataset_name: Union[str, list], - namespace: Optional[str] = None, - target: Optional[str] = None, - version: Optional[str] = DEFAULT_DATASET_REVISION, - subset_name: Optional[str] = None, - split: Optional[str] = None, - data_dir: Optional[str] = None, - data_files: Optional[Union[str, Sequence[str], - Mapping[str, Union[str, - Sequence[str]]]]] = None, - download_mode: Optional[DownloadMode] = None - ) -> Union[dict, 'MsDataset']: + def _load_ms_dataset(dataset_name: Union[str, list], + namespace: Optional[str] = None, + target: Optional[str] = None, + version: Optional[str] = DEFAULT_DATASET_REVISION, + subset_name: Optional[str] = None, + split: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[ + str, Sequence[str], + Mapping[str, Union[str, Sequence[str]]]]] = None, + download_mode: Optional[DownloadMode] = None, + **config_kwargs) -> Union[dict, 'MsDataset']: if isinstance(dataset_name, str): dataset_formation = DatasetFormations.native if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ @@ -184,7 +199,8 @@ class MsDataset: data_dir=data_dir, data_files=data_files, cache_dir=MS_DATASETS_CACHE, - download_mode=download_mode.value) + download_mode=download_mode.value, + **config_kwargs) else: dataset = MsDataset._load_from_ms( dataset_name, @@ -195,7 +211,7 @@ class MsDataset: subset_name=subset_name, split=split, download_mode=download_mode, - ) + **config_kwargs) elif isinstance(dataset_name, list): if target is None: target = 'target' @@ -206,16 +222,15 @@ class MsDataset: return MsDataset.from_hf_dataset(dataset, target=target) @staticmethod - def _load_from_ms( - dataset_name: str, - dataset_files: dict, - download_dir: str, - namespace: Optional[str] = None, - version: Optional[str] = DEFAULT_DATASET_REVISION, - subset_name: Optional[str] = None, - split: Optional[str] = None, - download_mode: Optional[DownloadMode] = None, - ) -> Union[Dataset, DatasetDict]: + def _load_from_ms(dataset_name: str, + dataset_files: dict, + download_dir: str, + namespace: Optional[str] = None, + version: Optional[str] = DEFAULT_DATASET_REVISION, + subset_name: Optional[str] = None, + split: Optional[str] = None, + download_mode: Optional[DownloadMode] = None, + **config_kwargs) -> Union[Dataset, DatasetDict]: for json_path in dataset_files['.json']: if 
json_path.endswith(f'{dataset_name}.json'): with open(json_path, encoding='utf-8') as dataset_json_file: @@ -226,7 +241,6 @@ class MsDataset: meta_map, file_map = get_dataset_files(target_dataset_structure, dataset_name, namespace, version) - builder = load_dataset_builder( dataset_name, subset_name, @@ -235,7 +249,8 @@ class MsDataset: zip_data_files=file_map, cache_dir=MS_DATASETS_CACHE, version=version, - split=list(target_dataset_structure.keys())) + split=list(target_dataset_structure.keys()), + **config_kwargs) download_config = DownloadConfig( cache_dir=download_dir, @@ -253,7 +268,6 @@ class MsDataset: data_dir=download_dir, ) builder.download_and_prepare( - download_config=download_config, dl_manager=dl_manager, download_mode=download_mode.value, try_from_hf_gcs=False) @@ -338,6 +352,8 @@ class MsDataset: self, columns: Union[str, List[str]] = None, preprocessors: Union[Callable, List[Callable]] = None, + task_name: str = None, + task_data_config: ConfigDict = None, **format_kwargs, ): """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to @@ -350,6 +366,8 @@ class MsDataset: columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None, the output fields of processors will also be added. + task_name (str, default None): task name, refer to :obj:`Tasks` for more details + task_data_config (ConfigDict, default None): config dict for model object. format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`. Returns: @@ -360,6 +378,10 @@ class MsDataset: raise ImportError( 'The function to_torch_dataset requires pytorch to be installed' ) + if isinstance(self._hf_ds, ExternalDataset): + task_data_config.update({'preprocessor': preprocessors}) + return build_task_dataset(task_data_config, task_name, + self._hf_ds.config_kwargs) if preprocessors is not None: return self.to_torch_dataset_with_processors( preprocessors, columns=columns) diff --git a/modelscope/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py similarity index 80% rename from modelscope/task_datasets/__init__.py rename to modelscope/msdatasets/task_datasets/__init__.py index 93e01cb5..c80f8cd5 100644 --- a/modelscope/task_datasets/__init__.py +++ b/modelscope/msdatasets/task_datasets/__init__.py @@ -8,6 +8,7 @@ if TYPE_CHECKING: from .builder import TASK_DATASETS, build_task_dataset from .torch_base_dataset import TorchTaskDataset from .veco_dataset import VecoDataset + from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset else: _import_structure = { @@ -15,6 +16,8 @@ else: 'builder': ['TASK_DATASETS', 'build_task_dataset'], 'torch_base_dataset': ['TorchTaskDataset'], 'veco_dataset': ['VecoDataset'], + 'image_instance_segmentation_coco_dataset': + ['ImageInstanceSegmentationCocoDataset'] } import sys diff --git a/modelscope/task_datasets/base.py b/modelscope/msdatasets/task_datasets/base.py similarity index 100% rename from modelscope/task_datasets/base.py rename to modelscope/msdatasets/task_datasets/base.py diff --git a/modelscope/task_datasets/builder.py b/modelscope/msdatasets/task_datasets/builder.py similarity index 100% rename from modelscope/task_datasets/builder.py rename to modelscope/msdatasets/task_datasets/builder.py diff --git a/modelscope/models/cv/image_instance_segmentation/datasets/dataset.py 
b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py similarity index 90% rename from modelscope/models/cv/image_instance_segmentation/datasets/dataset.py rename to modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py index d9e1b348..04c8e142 100644 --- a/modelscope/models/cv/image_instance_segmentation/datasets/dataset.py +++ b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py @@ -2,14 +2,32 @@ import os.path as osp import numpy as np from pycocotools.coco import COCO -from torch.utils.data import Dataset - -class ImageInstanceSegmentationCocoDataset(Dataset): +from modelscope.metainfo import Models +from modelscope.utils.constant import Tasks +from .builder import TASK_DATASETS +from .torch_base_dataset import TorchTaskDataset + +DATASET_STRUCTURE = { + 'train': { + 'annotation': 'annotations/instances_train.json', + 'images': 'images/train' + }, + 'validation': { + 'annotation': 'annotations/instances_val.json', + 'images': 'images/val' + } +} + + +@TASK_DATASETS.register_module( + module_name=Models.cascade_mask_rcnn_swin, + group_key=Tasks.image_segmentation) +class ImageInstanceSegmentationCocoDataset(TorchTaskDataset): """Coco-style dataset for image instance segmentation. Args: - ann_file (str): Annotation file path. + split_config (dict): Annotation file path. {"train":"xxxxx"} classes (Sequence[str], optional): Specify classes to load. If is None, ``cls.CLASSES`` will be used. Default: None. data_root (str, optional): Data root for ``ann_file``, @@ -37,30 +55,27 @@ class ImageInstanceSegmentationCocoDataset(Dataset): 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') def __init__(self, - ann_file, + split_config: dict, + preprocessor=None, classes=None, - data_root=None, - img_prefix='', seg_prefix=None, test_mode=False, - filter_empty_gt=True): - self.ann_file = ann_file - self.data_root = data_root - self.img_prefix = img_prefix + filter_empty_gt=True, + **kwargs): + self.data_root = next(iter(split_config.values())) + self.split = next(iter(split_config.keys())) + self.preprocessor = preprocessor + + self.ann_file = osp.join(self.data_root, + DATASET_STRUCTURE[self.split]['annotation']) + + self.img_prefix = osp.join(self.data_root, + DATASET_STRUCTURE[self.split]['images']) self.seg_prefix = seg_prefix self.test_mode = test_mode self.filter_empty_gt = filter_empty_gt self.CLASSES = self.get_classes(classes) - # join paths if data_root is specified - if self.data_root is not None: - if not osp.isabs(self.ann_file): - self.ann_file = osp.join(self.data_root, self.ann_file) - if not (self.img_prefix is None or osp.isabs(self.img_prefix)): - self.img_prefix = osp.join(self.data_root, self.img_prefix) - if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)): - self.seg_prefix = osp.join(self.data_root, self.seg_prefix) - # load annotations self.data_infos = self.load_annotations(self.ann_file) @@ -71,8 +86,6 @@ class ImageInstanceSegmentationCocoDataset(Dataset): # set group flag for the sampler self._set_group_flag() - self.preprocessor = None - def __len__(self): """Total number of samples of data.""" return len(self.data_infos) @@ -326,7 +339,3 @@ class ImageInstanceSegmentationCocoDataset(Dataset): raise ValueError(f'Unsupported type {type(classes)} of classes.') return class_names - - def to_torch_dataset(self, preprocessors=None): - self.preprocessor = preprocessors - return self diff --git a/modelscope/task_datasets/torch_base_dataset.py 
b/modelscope/msdatasets/task_datasets/torch_base_dataset.py similarity index 100% rename from modelscope/task_datasets/torch_base_dataset.py rename to modelscope/msdatasets/task_datasets/torch_base_dataset.py diff --git a/modelscope/task_datasets/veco_dataset.py b/modelscope/msdatasets/task_datasets/veco_dataset.py similarity index 100% rename from modelscope/task_datasets/veco_dataset.py rename to modelscope/msdatasets/task_datasets/veco_dataset.py diff --git a/modelscope/msdatasets/utils/dataset_builder.py b/modelscope/msdatasets/utils/dataset_builder.py index 2b4bad07..85489c58 100644 --- a/modelscope/msdatasets/utils/dataset_builder.py +++ b/modelscope/msdatasets/utils/dataset_builder.py @@ -8,6 +8,7 @@ from datasets.info import DatasetInfo from datasets.packaged_modules import csv from datasets.utils.filelock import FileLock +from modelscope.utils.constant import DownloadMode from modelscope.utils.logger import get_logger logger = get_logger() @@ -26,11 +27,11 @@ class MsCsvDatasetBuilder(csv.Csv): zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, **config_kwargs, ): + self.namespace = namespace super().__init__( cache_dir=cache_dir, name=subset_name, hash=hash, - namespace=namespace, data_files=meta_data_files, **config_kwargs) @@ -56,6 +57,25 @@ class MsCsvDatasetBuilder(csv.Csv): os.rmdir(self._cache_dir) self.zip_data_files = zip_data_files + def _relative_data_dir(self, with_version=True, with_hash=True) -> str: + """Relative path of this dataset in cache_dir: + Will be: + self.name/self.config.version/self.hash/ + or if a namespace has been specified: + self.namespace___self.name/self.config.version/self.hash/ + """ + builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}' + builder_config = self.config + hash = self.hash + if builder_config: + builder_data_dir = os.path.join(builder_data_dir, self.config_id) + if with_version: + builder_data_dir = os.path.join(builder_data_dir, + str(self.config.version)) + if with_hash and hash and isinstance(hash, str): + builder_data_dir = os.path.join(builder_data_dir, hash) + return builder_data_dir + def _build_cache_dir(self): builder_data_dir = os.path.join( self._cache_dir_root, @@ -77,8 +97,15 @@ class MsCsvDatasetBuilder(csv.Csv): datasets.SplitGenerator( name=split_name, gen_kwargs={ - 'files': dl_manager.iter_files(files), - 'base_dir': zip_data_files.get(split_name) + 'files': + dl_manager.iter_files(files), + 'base_dir': + os.path.join( + zip_data_files.get(split_name), + os.path.splitext( + self.zip_data_files.get(split_name))[0]) + if self.zip_data_files.get(split_name) else + zip_data_files.get(split_name) })) return splits @@ -111,3 +138,65 @@ class MsCsvDatasetBuilder(csv.Csv): logger.error( f"Failed to read file '{file}' with error {type(e)}: {e}") raise + + +class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder): + + def __init__( + self, + dataset_name: str, + cache_dir: str, + namespace: str, + subset_name: str, + hash: str, + meta_data_files: Mapping[str, Union[str, Sequence[str]]], + zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, + **config_kwargs, + ): + self.name = dataset_name + self.subset_name = subset_name + self.namespace = namespace + self.hash = hash + self.data_files = meta_data_files + self.zip_data_files = zip_data_files + self.split_path_dict = None + self.config = None + self._cache_dir_root = os.path.expanduser(cache_dir) + self._cache_dir = self._build_cache_dir() + self._config_kwargs = config_kwargs + + def download_and_prepare(self, 
download_mode, dl_manager, + **download_kwargs): + # Prevent parallel disk operations + lock_path = os.path.join( + self._cache_dir_root, + self._cache_dir.replace(os.sep, '_') + '.lock') + with FileLock(lock_path): + data_exists = os.path.exists(self._cache_dir) + if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS: + logger.warning( + f'Reusing dataset {self.name} ({self._cache_dir})') + return + logger.info(f'Generating dataset {self.name} ({self._cache_dir})') + self._download_and_prepare(dl_manager=dl_manager) + + def _download_and_prepare(self, dl_manager): + split_path_dict = dl_manager.download_and_extract(self.zip_data_files) + self.split_path_dict = { + k: os.path.join(v, + os.path.splitext(self.zip_data_files[k])[0]) + for k, v in split_path_dict.items() + } + + def as_dataset(self): + return ExternalDataset(self.split_path_dict, self._config_kwargs) + + +class ExternalDataset(object): + + def __init__(self, split_path_dict, config_kwargs): + config_kwargs.update({'split_config': split_path_dict}) + self.config_kwargs = config_kwargs + + def __len__(self): + return len(self.config_kwargs['split_config']) diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py index ff7cd8b1..09556d84 100644 --- a/modelscope/msdatasets/utils/dataset_utils.py +++ b/modelscope/msdatasets/utils/dataset_utils.py @@ -6,7 +6,7 @@ from datasets.builder import DatasetBuilder from modelscope.utils.constant import DEFAULT_DATASET_REVISION from modelscope.utils.logger import get_logger -from .dataset_builder import MsCsvDatasetBuilder +from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder logger = get_logger() @@ -87,7 +87,7 @@ def get_dataset_files(subset_split_into: dict, modelscope_api = HubApi() for split, info in subset_split_into.items(): meta_map[split] = modelscope_api.get_dataset_file_url( - info['meta'], dataset_name, namespace, revision) + info.get('meta', ''), dataset_name, namespace, revision) if info.get('file'): file_map[split] = info['file'] return meta_map, file_map @@ -99,15 +99,32 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str, zip_data_files: Mapping[str, Union[str, Sequence[str]]], cache_dir: str, version: Optional[Union[str]], - split: Sequence[str]) -> DatasetBuilder: + split: Sequence[str], + **config_kwargs) -> DatasetBuilder: sub_dir = os.path.join(version, '_'.join(split)) - builder_instance = MsCsvDatasetBuilder( - dataset_name=dataset_name, - namespace=namespace, - cache_dir=cache_dir, - subset_name=subset_name, - meta_data_files=meta_data_files, - zip_data_files=zip_data_files, - hash=sub_dir) + meta_data_file = next(iter(meta_data_files.values())) + if not meta_data_file: + builder_instance = TaskSpecificDatasetBuilder( + dataset_name=dataset_name, + namespace=namespace, + cache_dir=cache_dir, + subset_name=subset_name, + meta_data_files=meta_data_files, + zip_data_files=zip_data_files, + hash=sub_dir, + **config_kwargs) + elif meta_data_file.endswith('.csv'): + builder_instance = MsCsvDatasetBuilder( + dataset_name=dataset_name, + namespace=namespace, + cache_dir=cache_dir, + subset_name=subset_name, + meta_data_files=meta_data_files, + zip_data_files=zip_data_files, + hash=sub_dir) + else: + raise NotImplementedError( + f'Dataset mete file extensions "{os.path.splitext(meta_data_file)[-1]}" is not implemented yet' + ) return builder_instance diff --git a/modelscope/trainers/cv/image_instance_segmentation_trainer.py 
b/modelscope/trainers/cv/image_instance_segmentation_trainer.py index e7632147..2e2415dc 100644 --- a/modelscope/trainers/cv/image_instance_segmentation_trainer.py +++ b/modelscope/trainers/cv/image_instance_segmentation_trainer.py @@ -22,7 +22,3 @@ class ImageInstanceSegmentationTrainer(EpochBasedTrainer): def prediction_step(self, model, inputs): pass - - def to_task_dataset(self, datasets, mode, preprocessor=None): - # wait for dataset interface to become stable... - return datasets.to_torch_dataset(preprocessor) diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index 9922d374..3692b486 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -202,7 +202,7 @@ class VecoTrainer(NlpEpochBasedTrainer): """Veco evaluates the datasets one by one. """ - from modelscope.task_datasets import VecoDataset + from modelscope.msdatasets.task_datasets import VecoDataset self.model.eval() self._mode = ModeKeys.EVAL metric_values = {} diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 544c0d9e..0916495c 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -21,11 +21,12 @@ from modelscope.metainfo import Trainers from modelscope.metrics import build_metric, task_default_metrics from modelscope.models.base import Model, TorchModel from modelscope.msdatasets.ms_dataset import MsDataset +from modelscope.msdatasets.task_datasets.builder import build_task_dataset +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import build_preprocessor from modelscope.preprocessors.common import Compose -from modelscope.task_datasets.builder import build_task_dataset -from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset from modelscope.trainers.hooks.builder import HOOKS from modelscope.trainers.hooks.priority import Priority, get_priority from modelscope.trainers.lrscheduler.builder import build_lr_scheduler @@ -288,14 +289,21 @@ class EpochBasedTrainer(BaseTrainer): if isinstance(datasets, TorchTaskDataset): return datasets elif isinstance(datasets, MsDataset): - datasets = datasets.to_torch_dataset( + cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \ + else ConfigDict(type=None, mode=mode) + return datasets.to_torch_dataset( + task_data_config=cfg, + task_name=self.cfg.task, preprocessors=preprocessor) - return datasets elif isinstance(datasets, List) and isinstance( datasets[0], MsDataset): + cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \ + else ConfigDict(type=None, mode=mode) datasets = [ - d.to_torch_dataset(preprocessor=preprocessor) - for d in datasets + d.to_torch_dataset( + task_data_config=cfg, + task_name=self.cfg.task, + preprocessors=preprocessor) for d in datasets ] cfg = ConfigDict( type=self.cfg.task, mode=mode, datasets=datasets) @@ -585,8 +593,13 @@ class EpochBasedTrainer(BaseTrainer): subset_name=data_cfg.subset_name if hasattr( data_cfg, 'subset_name') else None, hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, + **data_cfg, ) - torch_dataset = dataset.to_torch_dataset(preprocessors=preprocessor) + cfg = ConfigDict(type=self.cfg.model.type, mode=mode) + torch_dataset = dataset.to_torch_dataset( + task_data_config=cfg, + task_name=self.cfg.task, + preprocessors=self.preprocessor) dataset = 
self.to_task_dataset(torch_dataset, mode) return dataset diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py index a86dfbc2..ef6517fa 100644 --- a/modelscope/utils/ast_utils.py +++ b/modelscope/utils/ast_utils.py @@ -30,8 +30,8 @@ MODELSCOPE_PATH = '/'.join(os.path.dirname(__file__).split('/')[:-1]) REGISTER_MODULE = 'register_module' IGNORED_PACKAGES = ['modelscope', '.'] SCAN_SUB_FOLDERS = [ - 'models', 'metrics', 'pipelines', 'preprocessors', 'task_datasets', - 'trainers' + 'models', 'metrics', 'pipelines', 'preprocessors', + 'msdatasets/task_datasets', 'trainers' ] INDEXER_FILE = 'ast_indexer' DECORATOR_KEY = 'decorators' diff --git a/requirements/runtime.txt b/requirements/runtime.txt index ce18dcea..e2b78f06 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,6 +1,5 @@ addict -#version above 2.1.0 introduces backward-compatability issue which is being resolved -datasets==2.1.0 +datasets easydict einops filelock>=3.3.0 diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index 0894ce3d..f9118353 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -4,6 +4,7 @@ from modelscope.models import Model from modelscope.msdatasets import MsDataset from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.preprocessors.base import Preprocessor +from modelscope.utils.constant import DownloadMode from modelscope.utils.test_utils import require_tf, require_torch, test_level @@ -30,6 +31,16 @@ class ImgPreprocessor(Preprocessor): class MsDatasetTest(unittest.TestCase): + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_coco(self): + ms_ds_train = MsDataset.load( + 'pets_small', + namespace='modelscope', + split='train', + download_mode=DownloadMode.FORCE_REDOWNLOAD, + classes=('1', '2')) + print(ms_ds_train._hf_ds.config_kwargs) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ms_csv_basic(self): ms_ds_train = MsDataset.load( diff --git a/tests/taskdataset/test_veco_dataset.py b/tests/taskdataset/test_veco_dataset.py index fc59750d..76da1681 100644 --- a/tests/taskdataset/test_veco_dataset.py +++ b/tests/taskdataset/test_veco_dataset.py @@ -2,7 +2,7 @@ import unittest -from modelscope.task_datasets.veco_dataset import VecoDataset +from modelscope.msdatasets.task_datasets.veco_dataset import VecoDataset from modelscope.utils.test_utils import test_level diff --git a/tests/trainers/test_image_instance_segmentation_trainer.py b/tests/trainers/test_image_instance_segmentation_trainer.py index 35d0378f..c8557ff5 100644 --- a/tests/trainers/test_image_instance_segmentation_trainer.py +++ b/tests/trainers/test_image_instance_segmentation_trainer.py @@ -8,10 +8,13 @@ from functools import partial from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Trainers -from modelscope.models.cv.image_instance_segmentation import ( - CascadeMaskRCNNSwinModel, ImageInstanceSegmentationCocoDataset) +from modelscope.models.cv.image_instance_segmentation import \ + CascadeMaskRCNNSwinModel +from modelscope.msdatasets import MsDataset +from modelscope.msdatasets.task_datasets import \ + ImageInstanceSegmentationCocoDataset from modelscope.trainers import build_trainer -from modelscope.utils.config import Config +from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import ModelFile from modelscope.utils.test_utils import 
test_level @@ -27,34 +30,47 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase): config_path = os.path.join(cache_path, ModelFile.CONFIGURATION) cfg = Config.from_file(config_path) - data_root = cfg.dataset.data_root - classes = tuple(cfg.dataset.classes) max_epochs = cfg.train.max_epochs samples_per_gpu = cfg.train.dataloader.batch_size_per_gpu - - if data_root is None: + try: + train_data_cfg = cfg.dataset.train + val_data_cfg = cfg.dataset.val + except Exception: + train_data_cfg = None + val_data_cfg = None + if train_data_cfg is None: # use default toy data - dataset_path = os.path.join(cache_path, 'toydata.zip') - with zipfile.ZipFile(dataset_path, 'r') as zipf: - zipf.extractall(cache_path) - data_root = cache_path + '/toydata/' - classes = ('Cat', 'Dog') - - self.train_dataset = ImageInstanceSegmentationCocoDataset( - data_root + 'annotations/instances_train.json', - classes=classes, - data_root=data_root, - img_prefix=data_root + 'images/train/', - seg_prefix=None, - test_mode=False) - - self.eval_dataset = ImageInstanceSegmentationCocoDataset( - data_root + 'annotations/instances_val.json', - classes=classes, - data_root=data_root, - img_prefix=data_root + 'images/val/', - seg_prefix=None, - test_mode=True) + train_data_cfg = ConfigDict( + name='pets_small', + split='train', + classes=('Cat', 'Dog'), + test_mode=False) + if val_data_cfg is None: + val_data_cfg = ConfigDict( + name='pets_small', + split='validation', + classes=('Cat', 'Dog'), + test_mode=True) + + self.train_dataset = MsDataset.load( + dataset_name=train_data_cfg.name, + split=train_data_cfg.split, + classes=train_data_cfg.classes, + test_mode=train_data_cfg.test_mode) + assert self.train_dataset.config_kwargs[ + 'classes'] == train_data_cfg.classes + assert next( + iter(self.train_dataset.config_kwargs['split_config'].values())) + + self.eval_dataset = MsDataset.load( + dataset_name=val_data_cfg.name, + split=val_data_cfg.split, + classes=val_data_cfg.classes, + test_mode=val_data_cfg.test_mode) + assert self.eval_dataset.config_kwargs[ + 'classes'] == val_data_cfg.classes + assert next( + iter(self.eval_dataset.config_kwargs['split_config'].values())) from mmcv.parallel import collate
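
For reviewers, a minimal usage sketch of the loading path this patch introduces (not part of the diff itself). The dataset name, namespace, and classes are taken from the tests above; `Models.cascade_mask_rcnn_swin` and `Tasks.image_segmentation` mirror the `register_module` decorator added in `image_instance_segmentation_coco_dataset.py`; the preprocessor is left as `None` for brevity, so treat this as an illustration of the intended flow rather than a tested snippet:

```python
# Sketch of the new task-specific dataset flow (assumptions noted inline).
from modelscope.metainfo import Models
from modelscope.msdatasets import MsDataset
from modelscope.utils.config import ConfigDict
from modelscope.utils.constant import DownloadMode, Tasks

# Extra keyword arguments (e.g. classes) are forwarded through **config_kwargs
# down to the builder; for datasets without a CSV meta file this yields an
# ExternalDataset and the kwargs resurface on MsDataset.config_kwargs.
train_ds = MsDataset.load(
    'pets_small',                 # dataset/namespace as used in the tests above
    namespace='modelscope',
    split='train',
    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    classes=('Cat', 'Dog'))
print(train_ds.config_kwargs)     # includes 'classes' and the extracted 'split_config'

# to_torch_dataset now builds the registered TASK_DATASETS module for
# ExternalDataset-backed data instead of wrapping a plain torch dataset.
task_cfg = ConfigDict(type=Models.cascade_mask_rcnn_swin, mode='train')
torch_ds = train_ds.to_torch_dataset(
    task_name=Tasks.image_segmentation,
    task_data_config=task_cfg,
    preprocessors=None)           # a real preprocessor would normally go here
```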