msdataset add coco dataset unify taskdataset and ms dataset fix hf datasetsmaster
| @@ -362,8 +362,10 @@ class HubApi: | |||||
| dataset_name: str, | dataset_name: str, | ||||
| namespace: str, | namespace: str, | ||||
| revision: Optional[str] = DEFAULT_DATASET_REVISION): | revision: Optional[str] = DEFAULT_DATASET_REVISION): | ||||
| return f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ | |||||
| f'Revision={revision}&FilePath={file_name}' | |||||
| if file_name.endswith('.csv'): | |||||
| file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ | |||||
| f'Revision={revision}&FilePath={file_name}' | |||||
| return file_name | |||||
| def get_dataset_access_config( | def get_dataset_access_config( | ||||
| self, | self, | ||||
| @@ -7,13 +7,11 @@ if TYPE_CHECKING: | |||||
| from .cascade_mask_rcnn_swin import CascadeMaskRCNNSwin | from .cascade_mask_rcnn_swin import CascadeMaskRCNNSwin | ||||
| from .model import CascadeMaskRCNNSwinModel | from .model import CascadeMaskRCNNSwinModel | ||||
| from .postprocess_utils import get_img_ins_seg_result | from .postprocess_utils import get_img_ins_seg_result | ||||
| from .datasets import ImageInstanceSegmentationCocoDataset | |||||
| else: | else: | ||||
| _import_structure = { | _import_structure = { | ||||
| 'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'], | 'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'], | ||||
| 'model': ['CascadeMaskRCNNSwinModel'], | 'model': ['CascadeMaskRCNNSwinModel'], | ||||
| 'postprocess_utils': ['get_img_ins_seg_result'], | 'postprocess_utils': ['get_img_ins_seg_result'], | ||||
| 'datasets': ['ImageInstanceSegmentationCocoDataset'] | |||||
| } | } | ||||
| import sys | import sys | ||||
| @@ -1,2 +1 @@ | |||||
| from .dataset import ImageInstanceSegmentationCocoDataset | |||||
| from .transforms import build_preprocess_transform | from .transforms import build_preprocess_transform | ||||
| @@ -13,9 +13,12 @@ from datasets.utils.file_utils import (is_relative_path, | |||||
| relative_to_absolute_path) | relative_to_absolute_path) | ||||
| from modelscope.msdatasets.config import MS_DATASETS_CACHE | from modelscope.msdatasets.config import MS_DATASETS_CACHE | ||||
| from modelscope.utils.config import ConfigDict | |||||
| from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, | from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, | ||||
| DatasetFormations, DownloadMode, Hubs) | DatasetFormations, DownloadMode, Hubs) | ||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| from .task_datasets.builder import build_task_dataset | |||||
| from .utils.dataset_builder import ExternalDataset | |||||
| from .utils.dataset_utils import (get_dataset_files, | from .utils.dataset_utils import (get_dataset_files, | ||||
| get_target_dataset_structure, | get_target_dataset_structure, | ||||
| load_dataset_builder) | load_dataset_builder) | ||||
| @@ -67,9 +70,16 @@ class MsDataset: | |||||
| def __len__(self): | def __len__(self): | ||||
| return len(self._hf_ds) | return len(self._hf_ds) | ||||
| @property | |||||
| def config_kwargs(self): | |||||
| if isinstance(self._hf_ds, ExternalDataset): | |||||
| return self._hf_ds.config_kwargs | |||||
| else: | |||||
| return None | |||||
| @classmethod | @classmethod | ||||
| def from_hf_dataset(cls, | def from_hf_dataset(cls, | ||||
| hf_ds: Union[Dataset, DatasetDict], | |||||
| hf_ds: Union[Dataset, DatasetDict, ExternalDataset], | |||||
| target: str = None) -> Union[dict, 'MsDataset']: | target: str = None) -> Union[dict, 'MsDataset']: | ||||
| if isinstance(hf_ds, Dataset): | if isinstance(hf_ds, Dataset): | ||||
| return cls(hf_ds, target) | return cls(hf_ds, target) | ||||
| @@ -77,6 +87,8 @@ class MsDataset: | |||||
| if len(hf_ds.keys()) == 1: | if len(hf_ds.keys()) == 1: | ||||
| return cls(next(iter(hf_ds.values())), target) | return cls(next(iter(hf_ds.values())), target) | ||||
| return {k: cls(v, target) for k, v in hf_ds.items()} | return {k: cls(v, target) for k, v in hf_ds.items()} | ||||
| elif isinstance(hf_ds, ExternalDataset): | |||||
| return cls(hf_ds) | |||||
| else: | else: | ||||
| raise TypeError( | raise TypeError( | ||||
| f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}' | f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}' | ||||
| @@ -96,7 +108,8 @@ class MsDataset: | |||||
| Mapping[str, Union[str, | Mapping[str, Union[str, | ||||
| Sequence[str]]]]] = None, | Sequence[str]]]]] = None, | ||||
| download_mode: Optional[DownloadMode] = DownloadMode. | download_mode: Optional[DownloadMode] = DownloadMode. | ||||
| REUSE_DATASET_IF_EXISTS | |||||
| REUSE_DATASET_IF_EXISTS, | |||||
| **config_kwargs, | |||||
| ) -> Union[dict, 'MsDataset']: | ) -> Union[dict, 'MsDataset']: | ||||
| """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. | """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. | ||||
| Args: | Args: | ||||
| @@ -113,6 +126,7 @@ class MsDataset: | |||||
| hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope | hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope | ||||
| download_mode (DownloadMode or str, optional): How to treat existing datasets. default | download_mode (DownloadMode or str, optional): How to treat existing datasets. default | ||||
| DownloadMode.REUSE_DATASET_IF_EXISTS | DownloadMode.REUSE_DATASET_IF_EXISTS | ||||
| **config_kwargs (additional keyword arguments): Keyword arguments to be passed through to the underlying dataset loader/builder. | |||||
| Returns: | Returns: | ||||
| MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset. | MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset. | ||||
| @@ -128,7 +142,8 @@ class MsDataset: | |||||
| split=split, | split=split, | ||||
| data_dir=data_dir, | data_dir=data_dir, | ||||
| data_files=data_files, | data_files=data_files, | ||||
| download_mode=download_mode.value) | |||||
| download_mode=download_mode.value, | |||||
| **config_kwargs) | |||||
| return MsDataset.from_hf_dataset(dataset, target=target) | return MsDataset.from_hf_dataset(dataset, target=target) | ||||
| elif hub == Hubs.modelscope: | elif hub == Hubs.modelscope: | ||||
| return MsDataset._load_ms_dataset( | return MsDataset._load_ms_dataset( | ||||
| @@ -140,22 +155,22 @@ class MsDataset: | |||||
| split=split, | split=split, | ||||
| data_dir=data_dir, | data_dir=data_dir, | ||||
| data_files=data_files, | data_files=data_files, | ||||
| download_mode=download_mode) | |||||
| download_mode=download_mode, | |||||
| **config_kwargs) | |||||
| @staticmethod | @staticmethod | ||||
| def _load_ms_dataset( | |||||
| dataset_name: Union[str, list], | |||||
| namespace: Optional[str] = None, | |||||
| target: Optional[str] = None, | |||||
| version: Optional[str] = DEFAULT_DATASET_REVISION, | |||||
| subset_name: Optional[str] = None, | |||||
| split: Optional[str] = None, | |||||
| data_dir: Optional[str] = None, | |||||
| data_files: Optional[Union[str, Sequence[str], | |||||
| Mapping[str, Union[str, | |||||
| Sequence[str]]]]] = None, | |||||
| download_mode: Optional[DownloadMode] = None | |||||
| ) -> Union[dict, 'MsDataset']: | |||||
| def _load_ms_dataset(dataset_name: Union[str, list], | |||||
| namespace: Optional[str] = None, | |||||
| target: Optional[str] = None, | |||||
| version: Optional[str] = DEFAULT_DATASET_REVISION, | |||||
| subset_name: Optional[str] = None, | |||||
| split: Optional[str] = None, | |||||
| data_dir: Optional[str] = None, | |||||
| data_files: Optional[Union[ | |||||
| str, Sequence[str], | |||||
| Mapping[str, Union[str, Sequence[str]]]]] = None, | |||||
| download_mode: Optional[DownloadMode] = None, | |||||
| **config_kwargs) -> Union[dict, 'MsDataset']: | |||||
| if isinstance(dataset_name, str): | if isinstance(dataset_name, str): | ||||
| dataset_formation = DatasetFormations.native | dataset_formation = DatasetFormations.native | ||||
| if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ | if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ | ||||
| @@ -184,7 +199,8 @@ class MsDataset: | |||||
| data_dir=data_dir, | data_dir=data_dir, | ||||
| data_files=data_files, | data_files=data_files, | ||||
| cache_dir=MS_DATASETS_CACHE, | cache_dir=MS_DATASETS_CACHE, | ||||
| download_mode=download_mode.value) | |||||
| download_mode=download_mode.value, | |||||
| **config_kwargs) | |||||
| else: | else: | ||||
| dataset = MsDataset._load_from_ms( | dataset = MsDataset._load_from_ms( | ||||
| dataset_name, | dataset_name, | ||||
| @@ -195,7 +211,7 @@ class MsDataset: | |||||
| subset_name=subset_name, | subset_name=subset_name, | ||||
| split=split, | split=split, | ||||
| download_mode=download_mode, | download_mode=download_mode, | ||||
| ) | |||||
| **config_kwargs) | |||||
| elif isinstance(dataset_name, list): | elif isinstance(dataset_name, list): | ||||
| if target is None: | if target is None: | ||||
| target = 'target' | target = 'target' | ||||
| @@ -206,16 +222,15 @@ class MsDataset: | |||||
| return MsDataset.from_hf_dataset(dataset, target=target) | return MsDataset.from_hf_dataset(dataset, target=target) | ||||
| @staticmethod | @staticmethod | ||||
| def _load_from_ms( | |||||
| dataset_name: str, | |||||
| dataset_files: dict, | |||||
| download_dir: str, | |||||
| namespace: Optional[str] = None, | |||||
| version: Optional[str] = DEFAULT_DATASET_REVISION, | |||||
| subset_name: Optional[str] = None, | |||||
| split: Optional[str] = None, | |||||
| download_mode: Optional[DownloadMode] = None, | |||||
| ) -> Union[Dataset, DatasetDict]: | |||||
| def _load_from_ms(dataset_name: str, | |||||
| dataset_files: dict, | |||||
| download_dir: str, | |||||
| namespace: Optional[str] = None, | |||||
| version: Optional[str] = DEFAULT_DATASET_REVISION, | |||||
| subset_name: Optional[str] = None, | |||||
| split: Optional[str] = None, | |||||
| download_mode: Optional[DownloadMode] = None, | |||||
| **config_kwargs) -> Union[Dataset, DatasetDict]: | |||||
| for json_path in dataset_files['.json']: | for json_path in dataset_files['.json']: | ||||
| if json_path.endswith(f'{dataset_name}.json'): | if json_path.endswith(f'{dataset_name}.json'): | ||||
| with open(json_path, encoding='utf-8') as dataset_json_file: | with open(json_path, encoding='utf-8') as dataset_json_file: | ||||
| @@ -226,7 +241,6 @@ class MsDataset: | |||||
| meta_map, file_map = get_dataset_files(target_dataset_structure, | meta_map, file_map = get_dataset_files(target_dataset_structure, | ||||
| dataset_name, namespace, | dataset_name, namespace, | ||||
| version) | version) | ||||
| builder = load_dataset_builder( | builder = load_dataset_builder( | ||||
| dataset_name, | dataset_name, | ||||
| subset_name, | subset_name, | ||||
| @@ -235,7 +249,8 @@ class MsDataset: | |||||
| zip_data_files=file_map, | zip_data_files=file_map, | ||||
| cache_dir=MS_DATASETS_CACHE, | cache_dir=MS_DATASETS_CACHE, | ||||
| version=version, | version=version, | ||||
| split=list(target_dataset_structure.keys())) | |||||
| split=list(target_dataset_structure.keys()), | |||||
| **config_kwargs) | |||||
| download_config = DownloadConfig( | download_config = DownloadConfig( | ||||
| cache_dir=download_dir, | cache_dir=download_dir, | ||||
| @@ -253,7 +268,6 @@ class MsDataset: | |||||
| data_dir=download_dir, | data_dir=download_dir, | ||||
| ) | ) | ||||
| builder.download_and_prepare( | builder.download_and_prepare( | ||||
| download_config=download_config, | |||||
| dl_manager=dl_manager, | dl_manager=dl_manager, | ||||
| download_mode=download_mode.value, | download_mode=download_mode.value, | ||||
| try_from_hf_gcs=False) | try_from_hf_gcs=False) | ||||
| @@ -338,6 +352,8 @@ class MsDataset: | |||||
| self, | self, | ||||
| columns: Union[str, List[str]] = None, | columns: Union[str, List[str]] = None, | ||||
| preprocessors: Union[Callable, List[Callable]] = None, | preprocessors: Union[Callable, List[Callable]] = None, | ||||
| task_name: str = None, | |||||
| task_data_config: ConfigDict = None, | |||||
| **format_kwargs, | **format_kwargs, | ||||
| ): | ): | ||||
| """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to | """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to | ||||
| @@ -350,6 +366,8 @@ class MsDataset: | |||||
| columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the | columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the | ||||
| preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None, | preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None, | ||||
| the output fields of processors will also be added. | the output fields of processors will also be added. | ||||
| task_name (str, default None): task name, refer to :obj:`Tasks` for more details | |||||
| task_data_config (ConfigDict, default None): config dict used to build the task-specific dataset; only used when the underlying dataset is an `ExternalDataset`. | |||||
| format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`. | format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`. | ||||
| Returns: | Returns: | ||||
| @@ -360,6 +378,10 @@ class MsDataset: | |||||
| raise ImportError( | raise ImportError( | ||||
| 'The function to_torch_dataset requires pytorch to be installed' | 'The function to_torch_dataset requires pytorch to be installed' | ||||
| ) | ) | ||||
| if isinstance(self._hf_ds, ExternalDataset): | |||||
| task_data_config.update({'preprocessor': preprocessors}) | |||||
| return build_task_dataset(task_data_config, task_name, | |||||
| self._hf_ds.config_kwargs) | |||||
| if preprocessors is not None: | if preprocessors is not None: | ||||
| return self.to_torch_dataset_with_processors( | return self.to_torch_dataset_with_processors( | ||||
| preprocessors, columns=columns) | preprocessors, columns=columns) | ||||
| @@ -8,6 +8,7 @@ if TYPE_CHECKING: | |||||
| from .builder import TASK_DATASETS, build_task_dataset | from .builder import TASK_DATASETS, build_task_dataset | ||||
| from .torch_base_dataset import TorchTaskDataset | from .torch_base_dataset import TorchTaskDataset | ||||
| from .veco_dataset import VecoDataset | from .veco_dataset import VecoDataset | ||||
| from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset | |||||
| else: | else: | ||||
| _import_structure = { | _import_structure = { | ||||
| @@ -15,6 +16,8 @@ else: | |||||
| 'builder': ['TASK_DATASETS', 'build_task_dataset'], | 'builder': ['TASK_DATASETS', 'build_task_dataset'], | ||||
| 'torch_base_dataset': ['TorchTaskDataset'], | 'torch_base_dataset': ['TorchTaskDataset'], | ||||
| 'veco_dataset': ['VecoDataset'], | 'veco_dataset': ['VecoDataset'], | ||||
| 'image_instance_segmentation_coco_dataset': | |||||
| ['ImageInstanceSegmentationCocoDataset'] | |||||
| } | } | ||||
| import sys | import sys | ||||
| @@ -2,14 +2,32 @@ import os.path as osp | |||||
| import numpy as np | import numpy as np | ||||
| from pycocotools.coco import COCO | from pycocotools.coco import COCO | ||||
| from torch.utils.data import Dataset | |||||
| class ImageInstanceSegmentationCocoDataset(Dataset): | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.utils.constant import Tasks | |||||
| from .builder import TASK_DATASETS | |||||
| from .torch_base_dataset import TorchTaskDataset | |||||
| DATASET_STRUCTURE = { | |||||
| 'train': { | |||||
| 'annotation': 'annotations/instances_train.json', | |||||
| 'images': 'images/train' | |||||
| }, | |||||
| 'validation': { | |||||
| 'annotation': 'annotations/instances_val.json', | |||||
| 'images': 'images/val' | |||||
| } | |||||
| } | |||||
| @TASK_DATASETS.register_module( | |||||
| module_name=Models.cascade_mask_rcnn_swin, | |||||
| group_key=Tasks.image_segmentation) | |||||
| class ImageInstanceSegmentationCocoDataset(TorchTaskDataset): | |||||
| """Coco-style dataset for image instance segmentation. | """Coco-style dataset for image instance segmentation. | ||||
| Args: | Args: | ||||
| ann_file (str): Annotation file path. | |||||
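| split_config (dict): Mapping from split name to that split's data root directory, e.g. {"train": "xxxxx"}. The annotation file and image directory are resolved relative to this root. | |||||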
| classes (Sequence[str], optional): Specify classes to load. | classes (Sequence[str], optional): Specify classes to load. | ||||
| If is None, ``cls.CLASSES`` will be used. Default: None. | If is None, ``cls.CLASSES`` will be used. Default: None. | ||||
| data_root (str, optional): Data root for ``ann_file``, | data_root (str, optional): Data root for ``ann_file``, | ||||
| @@ -37,30 +55,27 @@ class ImageInstanceSegmentationCocoDataset(Dataset): | |||||
| 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') | 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') | ||||
| def __init__(self, | def __init__(self, | ||||
| ann_file, | |||||
| split_config: dict, | |||||
| preprocessor=None, | |||||
| classes=None, | classes=None, | ||||
| data_root=None, | |||||
| img_prefix='', | |||||
| seg_prefix=None, | seg_prefix=None, | ||||
| test_mode=False, | test_mode=False, | ||||
| filter_empty_gt=True): | |||||
| self.ann_file = ann_file | |||||
| self.data_root = data_root | |||||
| self.img_prefix = img_prefix | |||||
| filter_empty_gt=True, | |||||
| **kwargs): | |||||
| self.data_root = next(iter(split_config.values())) | |||||
| self.split = next(iter(split_config.keys())) | |||||
| self.preprocessor = preprocessor | |||||
| self.ann_file = osp.join(self.data_root, | |||||
| DATASET_STRUCTURE[self.split]['annotation']) | |||||
| self.img_prefix = osp.join(self.data_root, | |||||
| DATASET_STRUCTURE[self.split]['images']) | |||||
| self.seg_prefix = seg_prefix | self.seg_prefix = seg_prefix | ||||
| self.test_mode = test_mode | self.test_mode = test_mode | ||||
| self.filter_empty_gt = filter_empty_gt | self.filter_empty_gt = filter_empty_gt | ||||
| self.CLASSES = self.get_classes(classes) | self.CLASSES = self.get_classes(classes) | ||||
| # join paths if data_root is specified | |||||
| if self.data_root is not None: | |||||
| if not osp.isabs(self.ann_file): | |||||
| self.ann_file = osp.join(self.data_root, self.ann_file) | |||||
| if not (self.img_prefix is None or osp.isabs(self.img_prefix)): | |||||
| self.img_prefix = osp.join(self.data_root, self.img_prefix) | |||||
| if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)): | |||||
| self.seg_prefix = osp.join(self.data_root, self.seg_prefix) | |||||
| # load annotations | # load annotations | ||||
| self.data_infos = self.load_annotations(self.ann_file) | self.data_infos = self.load_annotations(self.ann_file) | ||||
| @@ -71,8 +86,6 @@ class ImageInstanceSegmentationCocoDataset(Dataset): | |||||
| # set group flag for the sampler | # set group flag for the sampler | ||||
| self._set_group_flag() | self._set_group_flag() | ||||
| self.preprocessor = None | |||||
| def __len__(self): | def __len__(self): | ||||
| """Total number of samples of data.""" | """Total number of samples of data.""" | ||||
| return len(self.data_infos) | return len(self.data_infos) | ||||
| @@ -326,7 +339,3 @@ class ImageInstanceSegmentationCocoDataset(Dataset): | |||||
| raise ValueError(f'Unsupported type {type(classes)} of classes.') | raise ValueError(f'Unsupported type {type(classes)} of classes.') | ||||
| return class_names | return class_names | ||||
| def to_torch_dataset(self, preprocessors=None): | |||||
| self.preprocessor = preprocessors | |||||
| return self | |||||
| @@ -8,6 +8,7 @@ from datasets.info import DatasetInfo | |||||
| from datasets.packaged_modules import csv | from datasets.packaged_modules import csv | ||||
| from datasets.utils.filelock import FileLock | from datasets.utils.filelock import FileLock | ||||
| from modelscope.utils.constant import DownloadMode | |||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| logger = get_logger() | logger = get_logger() | ||||
| @@ -26,11 +27,11 @@ class MsCsvDatasetBuilder(csv.Csv): | |||||
| zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, | zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, | ||||
| **config_kwargs, | **config_kwargs, | ||||
| ): | ): | ||||
| self.namespace = namespace | |||||
| super().__init__( | super().__init__( | ||||
| cache_dir=cache_dir, | cache_dir=cache_dir, | ||||
| name=subset_name, | name=subset_name, | ||||
| hash=hash, | hash=hash, | ||||
| namespace=namespace, | |||||
| data_files=meta_data_files, | data_files=meta_data_files, | ||||
| **config_kwargs) | **config_kwargs) | ||||
| @@ -56,6 +57,25 @@ class MsCsvDatasetBuilder(csv.Csv): | |||||
| os.rmdir(self._cache_dir) | os.rmdir(self._cache_dir) | ||||
| self.zip_data_files = zip_data_files | self.zip_data_files = zip_data_files | ||||
| def _relative_data_dir(self, with_version=True, with_hash=True) -> str: | |||||
| """Relative path of this dataset in cache_dir: | |||||
| Will be: | |||||
| self.name/self.config.version/self.hash/ | |||||
| or if a namespace has been specified: | |||||
| self.namespace___self.name/self.config.version/self.hash/ | |||||
| """ | |||||
| builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}' | |||||
| builder_config = self.config | |||||
| hash = self.hash | |||||
| if builder_config: | |||||
| builder_data_dir = os.path.join(builder_data_dir, self.config_id) | |||||
| if with_version: | |||||
| builder_data_dir = os.path.join(builder_data_dir, | |||||
| str(self.config.version)) | |||||
| if with_hash and hash and isinstance(hash, str): | |||||
| builder_data_dir = os.path.join(builder_data_dir, hash) | |||||
| return builder_data_dir | |||||
| def _build_cache_dir(self): | def _build_cache_dir(self): | ||||
| builder_data_dir = os.path.join( | builder_data_dir = os.path.join( | ||||
| self._cache_dir_root, | self._cache_dir_root, | ||||
| @@ -77,8 +97,15 @@ class MsCsvDatasetBuilder(csv.Csv): | |||||
| datasets.SplitGenerator( | datasets.SplitGenerator( | ||||
| name=split_name, | name=split_name, | ||||
| gen_kwargs={ | gen_kwargs={ | ||||
| 'files': dl_manager.iter_files(files), | |||||
| 'base_dir': zip_data_files.get(split_name) | |||||
| 'files': | |||||
| dl_manager.iter_files(files), | |||||
| 'base_dir': | |||||
| os.path.join( | |||||
| zip_data_files.get(split_name), | |||||
| os.path.splitext( | |||||
| self.zip_data_files.get(split_name))[0]) | |||||
| if self.zip_data_files.get(split_name) else | |||||
| zip_data_files.get(split_name) | |||||
| })) | })) | ||||
| return splits | return splits | ||||
| @@ -111,3 +138,65 @@ class MsCsvDatasetBuilder(csv.Csv): | |||||
| logger.error( | logger.error( | ||||
| f"Failed to read file '{file}' with error {type(e)}: {e}") | f"Failed to read file '{file}' with error {type(e)}: {e}") | ||||
| raise | raise | ||||
| class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder): | |||||
| def __init__( | |||||
| self, | |||||
| dataset_name: str, | |||||
| cache_dir: str, | |||||
| namespace: str, | |||||
| subset_name: str, | |||||
| hash: str, | |||||
| meta_data_files: Mapping[str, Union[str, Sequence[str]]], | |||||
| zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, | |||||
| **config_kwargs, | |||||
| ): | |||||
| self.name = dataset_name | |||||
| self.subset_name = subset_name | |||||
| self.namespace = namespace | |||||
| self.hash = hash | |||||
| self.data_files = meta_data_files | |||||
| self.zip_data_files = zip_data_files | |||||
| self.split_path_dict = None | |||||
| self.config = None | |||||
| self._cache_dir_root = os.path.expanduser(cache_dir) | |||||
| self._cache_dir = self._build_cache_dir() | |||||
| self._config_kwargs = config_kwargs | |||||
| def download_and_prepare(self, download_mode, dl_manager, | |||||
| **download_kwargs): | |||||
| # Prevent parallel disk operations | |||||
| lock_path = os.path.join( | |||||
| self._cache_dir_root, | |||||
| self._cache_dir.replace(os.sep, '_') + '.lock') | |||||
| with FileLock(lock_path): | |||||
| data_exists = os.path.exists(self._cache_dir) | |||||
| if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS: | |||||
| logger.warning( | |||||
| f'Reusing dataset {self.name} ({self._cache_dir})') | |||||
| return | |||||
| logger.info(f'Generating dataset {self.name} ({self._cache_dir})') | |||||
| self._download_and_prepare(dl_manager=dl_manager) | |||||
| def _download_and_prepare(self, dl_manager): | |||||
| split_path_dict = dl_manager.download_and_extract(self.zip_data_files) | |||||
| self.split_path_dict = { | |||||
| k: os.path.join(v, | |||||
| os.path.splitext(self.zip_data_files[k])[0]) | |||||
| for k, v in split_path_dict.items() | |||||
| } | |||||
| def as_dataset(self): | |||||
| return ExternalDataset(self.split_path_dict, self._config_kwargs) | |||||
| class ExternalDataset(object): | |||||
| def __init__(self, split_path_dict, config_kwargs): | |||||
| config_kwargs.update({'split_config': split_path_dict}) | |||||
| self.config_kwargs = config_kwargs | |||||
| def __len__(self): | |||||
| return len(self.config_kwargs['split_config']) | |||||
| @@ -6,7 +6,7 @@ from datasets.builder import DatasetBuilder | |||||
| from modelscope.utils.constant import DEFAULT_DATASET_REVISION | from modelscope.utils.constant import DEFAULT_DATASET_REVISION | ||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| from .dataset_builder import MsCsvDatasetBuilder | |||||
| from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder | |||||
| logger = get_logger() | logger = get_logger() | ||||
| @@ -87,7 +87,7 @@ def get_dataset_files(subset_split_into: dict, | |||||
| modelscope_api = HubApi() | modelscope_api = HubApi() | ||||
| for split, info in subset_split_into.items(): | for split, info in subset_split_into.items(): | ||||
| meta_map[split] = modelscope_api.get_dataset_file_url( | meta_map[split] = modelscope_api.get_dataset_file_url( | ||||
| info['meta'], dataset_name, namespace, revision) | |||||
| info.get('meta', ''), dataset_name, namespace, revision) | |||||
| if info.get('file'): | if info.get('file'): | ||||
| file_map[split] = info['file'] | file_map[split] = info['file'] | ||||
| return meta_map, file_map | return meta_map, file_map | ||||
| @@ -99,15 +99,32 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str, | |||||
| zip_data_files: Mapping[str, Union[str, | zip_data_files: Mapping[str, Union[str, | ||||
| Sequence[str]]], | Sequence[str]]], | ||||
| cache_dir: str, version: Optional[Union[str]], | cache_dir: str, version: Optional[Union[str]], | ||||
| split: Sequence[str]) -> DatasetBuilder: | |||||
| split: Sequence[str], | |||||
| **config_kwargs) -> DatasetBuilder: | |||||
| sub_dir = os.path.join(version, '_'.join(split)) | sub_dir = os.path.join(version, '_'.join(split)) | ||||
| builder_instance = MsCsvDatasetBuilder( | |||||
| dataset_name=dataset_name, | |||||
| namespace=namespace, | |||||
| cache_dir=cache_dir, | |||||
| subset_name=subset_name, | |||||
| meta_data_files=meta_data_files, | |||||
| zip_data_files=zip_data_files, | |||||
| hash=sub_dir) | |||||
| meta_data_file = next(iter(meta_data_files.values())) | |||||
| if not meta_data_file: | |||||
| builder_instance = TaskSpecificDatasetBuilder( | |||||
| dataset_name=dataset_name, | |||||
| namespace=namespace, | |||||
| cache_dir=cache_dir, | |||||
| subset_name=subset_name, | |||||
| meta_data_files=meta_data_files, | |||||
| zip_data_files=zip_data_files, | |||||
| hash=sub_dir, | |||||
| **config_kwargs) | |||||
| elif meta_data_file.endswith('.csv'): | |||||
| builder_instance = MsCsvDatasetBuilder( | |||||
| dataset_name=dataset_name, | |||||
| namespace=namespace, | |||||
| cache_dir=cache_dir, | |||||
| subset_name=subset_name, | |||||
| meta_data_files=meta_data_files, | |||||
| zip_data_files=zip_data_files, | |||||
| hash=sub_dir) | |||||
| else: | |||||
| raise NotImplementedError( | |||||
| f'Dataset mete file extensions "{os.path.splitext(meta_data_file)[-1]}" is not implemented yet' | |||||
| ) | |||||
| return builder_instance | return builder_instance | ||||
| @@ -22,7 +22,3 @@ class ImageInstanceSegmentationTrainer(EpochBasedTrainer): | |||||
| def prediction_step(self, model, inputs): | def prediction_step(self, model, inputs): | ||||
| pass | pass | ||||
| def to_task_dataset(self, datasets, mode, preprocessor=None): | |||||
| # wait for dataset interface to become stable... | |||||
| return datasets.to_torch_dataset(preprocessor) | |||||
| @@ -202,7 +202,7 @@ class VecoTrainer(NlpEpochBasedTrainer): | |||||
| """Veco evaluates the datasets one by one. | """Veco evaluates the datasets one by one. | ||||
| """ | """ | ||||
| from modelscope.task_datasets import VecoDataset | |||||
| from modelscope.msdatasets.task_datasets import VecoDataset | |||||
| self.model.eval() | self.model.eval() | ||||
| self._mode = ModeKeys.EVAL | self._mode = ModeKeys.EVAL | ||||
| metric_values = {} | metric_values = {} | ||||
| @@ -21,11 +21,12 @@ from modelscope.metainfo import Trainers | |||||
| from modelscope.metrics import build_metric, task_default_metrics | from modelscope.metrics import build_metric, task_default_metrics | ||||
| from modelscope.models.base import Model, TorchModel | from modelscope.models.base import Model, TorchModel | ||||
| from modelscope.msdatasets.ms_dataset import MsDataset | from modelscope.msdatasets.ms_dataset import MsDataset | ||||
| from modelscope.msdatasets.task_datasets.builder import build_task_dataset | |||||
| from modelscope.msdatasets.task_datasets.torch_base_dataset import \ | |||||
| TorchTaskDataset | |||||
| from modelscope.preprocessors.base import Preprocessor | from modelscope.preprocessors.base import Preprocessor | ||||
| from modelscope.preprocessors.builder import build_preprocessor | from modelscope.preprocessors.builder import build_preprocessor | ||||
| from modelscope.preprocessors.common import Compose | from modelscope.preprocessors.common import Compose | ||||
| from modelscope.task_datasets.builder import build_task_dataset | |||||
| from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset | |||||
| from modelscope.trainers.hooks.builder import HOOKS | from modelscope.trainers.hooks.builder import HOOKS | ||||
| from modelscope.trainers.hooks.priority import Priority, get_priority | from modelscope.trainers.hooks.priority import Priority, get_priority | ||||
| from modelscope.trainers.lrscheduler.builder import build_lr_scheduler | from modelscope.trainers.lrscheduler.builder import build_lr_scheduler | ||||
| @@ -288,14 +289,21 @@ class EpochBasedTrainer(BaseTrainer): | |||||
| if isinstance(datasets, TorchTaskDataset): | if isinstance(datasets, TorchTaskDataset): | ||||
| return datasets | return datasets | ||||
| elif isinstance(datasets, MsDataset): | elif isinstance(datasets, MsDataset): | ||||
| datasets = datasets.to_torch_dataset( | |||||
| cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \ | |||||
| else ConfigDict(type=None, mode=mode) | |||||
| return datasets.to_torch_dataset( | |||||
| task_data_config=cfg, | |||||
| task_name=self.cfg.task, | |||||
| preprocessors=preprocessor) | preprocessors=preprocessor) | ||||
| return datasets | |||||
| elif isinstance(datasets, List) and isinstance( | elif isinstance(datasets, List) and isinstance( | ||||
| datasets[0], MsDataset): | datasets[0], MsDataset): | ||||
| cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \ | |||||
| else ConfigDict(type=None, mode=mode) | |||||
| datasets = [ | datasets = [ | ||||
| d.to_torch_dataset(preprocessor=preprocessor) | |||||
| for d in datasets | |||||
| d.to_torch_dataset( | |||||
| task_data_config=cfg, | |||||
| task_name=self.cfg.task, | |||||
| preprocessors=preprocessor) for d in datasets | |||||
| ] | ] | ||||
| cfg = ConfigDict( | cfg = ConfigDict( | ||||
| type=self.cfg.task, mode=mode, datasets=datasets) | type=self.cfg.task, mode=mode, datasets=datasets) | ||||
| @@ -585,8 +593,13 @@ class EpochBasedTrainer(BaseTrainer): | |||||
| subset_name=data_cfg.subset_name if hasattr( | subset_name=data_cfg.subset_name if hasattr( | ||||
| data_cfg, 'subset_name') else None, | data_cfg, 'subset_name') else None, | ||||
| hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, | hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, | ||||
| **data_cfg, | |||||
| ) | ) | ||||
| torch_dataset = dataset.to_torch_dataset(preprocessors=preprocessor) | |||||
| cfg = ConfigDict(type=self.cfg.model.type, mode=mode) | |||||
| torch_dataset = dataset.to_torch_dataset( | |||||
| task_data_config=cfg, | |||||
| task_name=self.cfg.task, | |||||
| preprocessors=self.preprocessor) | |||||
| dataset = self.to_task_dataset(torch_dataset, mode) | dataset = self.to_task_dataset(torch_dataset, mode) | ||||
| return dataset | return dataset | ||||
| @@ -30,8 +30,8 @@ MODELSCOPE_PATH = '/'.join(os.path.dirname(__file__).split('/')[:-1]) | |||||
| REGISTER_MODULE = 'register_module' | REGISTER_MODULE = 'register_module' | ||||
| IGNORED_PACKAGES = ['modelscope', '.'] | IGNORED_PACKAGES = ['modelscope', '.'] | ||||
| SCAN_SUB_FOLDERS = [ | SCAN_SUB_FOLDERS = [ | ||||
| 'models', 'metrics', 'pipelines', 'preprocessors', 'task_datasets', | |||||
| 'trainers' | |||||
| 'models', 'metrics', 'pipelines', 'preprocessors', | |||||
| 'msdatasets/task_datasets', 'trainers' | |||||
| ] | ] | ||||
| INDEXER_FILE = 'ast_indexer' | INDEXER_FILE = 'ast_indexer' | ||||
| DECORATOR_KEY = 'decorators' | DECORATOR_KEY = 'decorators' | ||||
| @@ -1,6 +1,5 @@ | |||||
| addict | addict | ||||
| #version above 2.1.0 introduces backward-compatibility issue which is being resolved | |||||
| datasets==2.1.0 | |||||
| datasets | |||||
| easydict | easydict | ||||
| einops | einops | ||||
| filelock>=3.3.0 | filelock>=3.3.0 | ||||
| @@ -4,6 +4,7 @@ from modelscope.models import Model | |||||
| from modelscope.msdatasets import MsDataset | from modelscope.msdatasets import MsDataset | ||||
| from modelscope.preprocessors import SequenceClassificationPreprocessor | from modelscope.preprocessors import SequenceClassificationPreprocessor | ||||
| from modelscope.preprocessors.base import Preprocessor | from modelscope.preprocessors.base import Preprocessor | ||||
| from modelscope.utils.constant import DownloadMode | |||||
| from modelscope.utils.test_utils import require_tf, require_torch, test_level | from modelscope.utils.test_utils import require_tf, require_torch, test_level | ||||
| @@ -30,6 +31,16 @@ class ImgPreprocessor(Preprocessor): | |||||
| class MsDatasetTest(unittest.TestCase): | class MsDatasetTest(unittest.TestCase): | ||||
| @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') | |||||
| def test_coco(self): | |||||
| ms_ds_train = MsDataset.load( | |||||
| 'pets_small', | |||||
| namespace='modelscope', | |||||
| split='train', | |||||
| download_mode=DownloadMode.FORCE_REDOWNLOAD, | |||||
| classes=('1', '2')) | |||||
| print(ms_ds_train._hf_ds.config_kwargs) | |||||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | ||||
| def test_ms_csv_basic(self): | def test_ms_csv_basic(self): | ||||
| ms_ds_train = MsDataset.load( | ms_ds_train = MsDataset.load( | ||||
| @@ -2,7 +2,7 @@ | |||||
| import unittest | import unittest | ||||
| from modelscope.task_datasets.veco_dataset import VecoDataset | |||||
| from modelscope.msdatasets.task_datasets.veco_dataset import VecoDataset | |||||
| from modelscope.utils.test_utils import test_level | from modelscope.utils.test_utils import test_level | ||||
| @@ -8,10 +8,13 @@ from functools import partial | |||||
| from modelscope.hub.snapshot_download import snapshot_download | from modelscope.hub.snapshot_download import snapshot_download | ||||
| from modelscope.metainfo import Trainers | from modelscope.metainfo import Trainers | ||||
| from modelscope.models.cv.image_instance_segmentation import ( | |||||
| CascadeMaskRCNNSwinModel, ImageInstanceSegmentationCocoDataset) | |||||
| from modelscope.models.cv.image_instance_segmentation import \ | |||||
| CascadeMaskRCNNSwinModel | |||||
| from modelscope.msdatasets import MsDataset | |||||
| from modelscope.msdatasets.task_datasets import \ | |||||
| ImageInstanceSegmentationCocoDataset | |||||
| from modelscope.trainers import build_trainer | from modelscope.trainers import build_trainer | ||||
| from modelscope.utils.config import Config | |||||
| from modelscope.utils.config import Config, ConfigDict | |||||
| from modelscope.utils.constant import ModelFile | from modelscope.utils.constant import ModelFile | ||||
| from modelscope.utils.test_utils import test_level | from modelscope.utils.test_utils import test_level | ||||
| @@ -27,34 +30,47 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase): | |||||
| config_path = os.path.join(cache_path, ModelFile.CONFIGURATION) | config_path = os.path.join(cache_path, ModelFile.CONFIGURATION) | ||||
| cfg = Config.from_file(config_path) | cfg = Config.from_file(config_path) | ||||
| data_root = cfg.dataset.data_root | |||||
| classes = tuple(cfg.dataset.classes) | |||||
| max_epochs = cfg.train.max_epochs | max_epochs = cfg.train.max_epochs | ||||
| samples_per_gpu = cfg.train.dataloader.batch_size_per_gpu | samples_per_gpu = cfg.train.dataloader.batch_size_per_gpu | ||||
| if data_root is None: | |||||
| try: | |||||
| train_data_cfg = cfg.dataset.train | |||||
| val_data_cfg = cfg.dataset.val | |||||
| except Exception: | |||||
| train_data_cfg = None | |||||
| val_data_cfg = None | |||||
| if train_data_cfg is None: | |||||
| # use default toy data | # use default toy data | ||||
| dataset_path = os.path.join(cache_path, 'toydata.zip') | |||||
| with zipfile.ZipFile(dataset_path, 'r') as zipf: | |||||
| zipf.extractall(cache_path) | |||||
| data_root = cache_path + '/toydata/' | |||||
| classes = ('Cat', 'Dog') | |||||
| self.train_dataset = ImageInstanceSegmentationCocoDataset( | |||||
| data_root + 'annotations/instances_train.json', | |||||
| classes=classes, | |||||
| data_root=data_root, | |||||
| img_prefix=data_root + 'images/train/', | |||||
| seg_prefix=None, | |||||
| test_mode=False) | |||||
| self.eval_dataset = ImageInstanceSegmentationCocoDataset( | |||||
| data_root + 'annotations/instances_val.json', | |||||
| classes=classes, | |||||
| data_root=data_root, | |||||
| img_prefix=data_root + 'images/val/', | |||||
| seg_prefix=None, | |||||
| test_mode=True) | |||||
| train_data_cfg = ConfigDict( | |||||
| name='pets_small', | |||||
| split='train', | |||||
| classes=('Cat', 'Dog'), | |||||
| test_mode=False) | |||||
| if val_data_cfg is None: | |||||
| val_data_cfg = ConfigDict( | |||||
| name='pets_small', | |||||
| split='validation', | |||||
| classes=('Cat', 'Dog'), | |||||
| test_mode=True) | |||||
| self.train_dataset = MsDataset.load( | |||||
| dataset_name=train_data_cfg.name, | |||||
| split=train_data_cfg.split, | |||||
| classes=train_data_cfg.classes, | |||||
| test_mode=train_data_cfg.test_mode) | |||||
| assert self.train_dataset.config_kwargs[ | |||||
| 'classes'] == train_data_cfg.classes | |||||
| assert next( | |||||
| iter(self.train_dataset.config_kwargs['split_config'].values())) | |||||
| self.eval_dataset = MsDataset.load( | |||||
| dataset_name=val_data_cfg.name, | |||||
| split=val_data_cfg.split, | |||||
| classes=val_data_cfg.classes, | |||||
| test_mode=val_data_cfg.test_mode) | |||||
| assert self.eval_dataset.config_kwargs[ | |||||
| 'classes'] == val_data_cfg.classes | |||||
| assert next( | |||||
| iter(self.eval_dataset.config_kwargs['split_config'].values())) | |||||
| from mmcv.parallel import collate | from mmcv.parallel import collate | ||||