* 修复了 zip 文件在不同打包模式下返回路径错误的问题。
* 修复了替换数据集文件后重新下载时校验失败的问题。
* 修复了 dataset oss 文件在 REUSE 模式下重复下载的问题。
* 修复了 csv 数据集的 meta json 文件中某个 split 的 meta 和 file 字段都为 '' 时加载所有 split 失败的问题。
* 修复了不同版本 datasets 路径不一致的问题。

master
| @@ -59,10 +59,13 @@ class ImageInstanceSegmentationCocoDataset(TorchTaskDataset): | |||
| preprocessor=None, | |||
| classes=None, | |||
| seg_prefix=None, | |||
| folder_name=None, | |||
| test_mode=False, | |||
| filter_empty_gt=True, | |||
| **kwargs): | |||
| self.data_root = next(iter(split_config.values())) | |||
| data_root = next(iter(split_config.values())) | |||
| self.data_root = osp.join(data_root, | |||
| folder_name) if folder_name else data_root | |||
| self.split = next(iter(split_config.keys())) | |||
| self.preprocessor = preprocessor | |||
| @@ -8,7 +8,7 @@ from datasets.info import DatasetInfo | |||
| from datasets.packaged_modules import csv | |||
| from datasets.utils.filelock import FileLock | |||
| from modelscope.utils.constant import DownloadMode | |||
| from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| @@ -27,7 +27,6 @@ class MsCsvDatasetBuilder(csv.Csv): | |||
| zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, | |||
| **config_kwargs, | |||
| ): | |||
| self.namespace = namespace | |||
| super().__init__( | |||
| cache_dir=cache_dir, | |||
| name=subset_name, | |||
| @@ -37,7 +36,7 @@ class MsCsvDatasetBuilder(csv.Csv): | |||
| self.name = dataset_name | |||
| self.info.builder_name = self.name | |||
| self._cache_dir = self._build_cache_dir() | |||
| self._cache_dir = self._build_cache_dir(namespace=namespace) | |||
| lock_path = os.path.join( | |||
| self._cache_dir_root, | |||
| self._cache_dir.replace(os.sep, '_') + '.lock') | |||
| @@ -48,7 +47,6 @@ class MsCsvDatasetBuilder(csv.Csv): | |||
| logger.info( | |||
| f'Overwrite dataset info from restored data version, cache_dir is {self._cache_dir}' | |||
| ) | |||
| self.info = DatasetInfo.from_directory(self._cache_dir) | |||
| # dir exists but no data, remove the empty dir as data aren't available anymore | |||
| else: | |||
| logger.warning( | |||
| @@ -57,14 +55,17 @@ class MsCsvDatasetBuilder(csv.Csv): | |||
| os.rmdir(self._cache_dir) | |||
| self.zip_data_files = zip_data_files | |||
| def _relative_data_dir(self, with_version=True, with_hash=True) -> str: | |||
| def _relative_data_dir(self, | |||
| with_version=True, | |||
| with_hash=True, | |||
| namespace=DEFAULT_DATASET_NAMESPACE) -> str: | |||
| """Relative path of this dataset in cache_dir: | |||
| Will be: | |||
| self.name/self.config.version/self.hash/ | |||
| or if a namespace has been specified: | |||
| self.namespace___self.name/self.config.version/self.hash/ | |||
| """ | |||
| builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}' | |||
| builder_data_dir = self.name if namespace is None else f'{namespace}___{self.name}' | |||
| builder_config = self.config | |||
| hash = self.hash | |||
| if builder_config: | |||
| @@ -76,10 +77,11 @@ class MsCsvDatasetBuilder(csv.Csv): | |||
| builder_data_dir = os.path.join(builder_data_dir, hash) | |||
| return builder_data_dir | |||
| def _build_cache_dir(self): | |||
| def _build_cache_dir(self, namespace=DEFAULT_DATASET_NAMESPACE): | |||
| builder_data_dir = os.path.join( | |||
| self._cache_dir_root, | |||
| self._relative_data_dir(with_version=False, with_hash=True)) | |||
| self._relative_data_dir( | |||
| with_version=False, with_hash=True, namespace=namespace)) | |||
| return builder_data_dir | |||
| @@ -97,15 +99,8 @@ class MsCsvDatasetBuilder(csv.Csv): | |||
| datasets.SplitGenerator( | |||
| name=split_name, | |||
| gen_kwargs={ | |||
| 'files': | |||
| dl_manager.iter_files(files), | |||
| 'base_dir': | |||
| os.path.join( | |||
| zip_data_files.get(split_name), | |||
| os.path.splitext( | |||
| self.zip_data_files.get(split_name))[0]) | |||
| if self.zip_data_files.get(split_name) else | |||
| zip_data_files.get(split_name) | |||
| 'files': dl_manager.iter_files(files), | |||
| 'base_dir': zip_data_files.get(split_name) | |||
| })) | |||
| return splits | |||
| @@ -181,12 +176,8 @@ class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder): | |||
| self._download_and_prepare(dl_manager=dl_manager) | |||
| def _download_and_prepare(self, dl_manager): | |||
| split_path_dict = dl_manager.download_and_extract(self.zip_data_files) | |||
| self.split_path_dict = { | |||
| k: os.path.join(v, | |||
| os.path.splitext(self.zip_data_files[k])[0]) | |||
| for k, v in split_path_dict.items() | |||
| } | |||
| self.split_path_dict = dl_manager.download_and_extract( | |||
| self.zip_data_files) | |||
def as_dataset(self):
    """Build an ``ExternalDataset`` from this builder's stored state.

    Returns:
        The result of calling ``ExternalDataset`` with
        ``self.split_path_dict`` and ``self._config_kwargs``, in that order.
    """
    split_paths = self.split_path_dict
    builder_kwargs = self._config_kwargs
    return ExternalDataset(split_paths, builder_kwargs)
| @@ -11,6 +11,14 @@ from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder | |||
| logger = get_logger() | |||
def format_dataset_structure(dataset_structure):
    """Drop splits that declare neither a meta file nor a data file.

    Args:
        dataset_structure (dict): Maps a split name to that split's
            description. Each description is read through ``.get('meta')``
            and ``.get('file')``.

    Returns:
        dict: A new dict, in the original order, holding only the splits
        whose ``'meta'`` or ``'file'`` value is truthy. The input is not
        modified.
    """
    usable_splits = {}
    for split_name, split_info in dataset_structure.items():
        # A split entry that is None or empty carries no 'meta'/'file' at
        # all; treat it like a split whose fields are both '' instead of
        # crashing on ``None.get`` and failing the load of every split.
        if not split_info:
            continue
        if split_info.get('meta') or split_info.get('file'):
            usable_splits[split_name] = split_info
    return usable_splits
| def get_target_dataset_structure(dataset_structure: dict, | |||
| subset_name: Optional[str] = None, | |||
| split: Optional[str] = None): | |||
| @@ -56,7 +64,8 @@ def get_target_dataset_structure(dataset_structure: dict, | |||
| f'No subset_name specified, defaulting to the {target_subset_name}' | |||
| ) | |||
| # verify dataset split | |||
| target_dataset_structure = dataset_structure[target_subset_name] | |||
| target_dataset_structure = format_dataset_structure( | |||
| dataset_structure[target_subset_name]) | |||
| if split and split not in target_dataset_structure: | |||
| raise ValueError( | |||
| f'split {split} not found. Available: {target_dataset_structure.keys()}' | |||
| @@ -34,8 +34,8 @@ class DatasetDownloadManager(DownloadManager): | |||
| url_or_filename = str(url_or_filename) | |||
| if is_relative_path(url_or_filename): | |||
| # fetch oss files | |||
| return self.oss_utilities.download(url_or_filename, | |||
| self.download_config.cache_dir) | |||
| return self.oss_utilities.download( | |||
| url_or_filename, download_config=download_config) | |||
| else: | |||
| return cached_path( | |||
| url_or_filename, download_config=download_config) | |||
| @@ -24,7 +24,8 @@ class OssUtilities: | |||
| rate = int(100 * (float(consumed_bytes) / float(total_bytes))) | |||
| print('\r{0}% '.format(rate), end='', flush=True) | |||
| def download(self, oss_file_name, cache_dir): | |||
| def download(self, oss_file_name, download_config): | |||
| cache_dir = download_config.cache_dir | |||
| candidate_key = os.path.join(self.oss_dir, oss_file_name) | |||
| candidate_key_backup = os.path.join(self.oss_backup_dir, oss_file_name) | |||
| file_oss_key = candidate_key if self.bucket.object_exists( | |||
| @@ -32,8 +33,9 @@ class OssUtilities: | |||
| filename = hash_url_to_filename(file_oss_key, etag=None) | |||
| local_path = os.path.join(cache_dir, filename) | |||
| self.bucket.get_object_to_file( | |||
| file_oss_key, local_path, progress_callback=self._percentage) | |||
| if download_config.force_download or not os.path.exists(local_path): | |||
| self.bucket.get_object_to_file( | |||
| file_oss_key, local_path, progress_callback=self._percentage) | |||
| return local_path | |||
| def upload(self, oss_file_name: str, local_file_path: str) -> str: | |||
| @@ -37,9 +37,10 @@ class MsDatasetTest(unittest.TestCase): | |||
| 'pets_small', | |||
| namespace=DEFAULT_DATASET_NAMESPACE, | |||
| split='train', | |||
| download_mode=DownloadMode.FORCE_REDOWNLOAD, | |||
| classes=('1', '2')) | |||
| print(ms_ds_train._hf_ds.config_kwargs) | |||
| classes=('1', '2'), | |||
| folder_name='Pets') | |||
| print(ms_ds_train.config_kwargs) | |||
| assert next(iter(ms_ds_train.config_kwargs['split_config'].values())) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_ms_csv_basic(self): | |||
| @@ -44,18 +44,21 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase): | |||
| name='pets_small', | |||
| split='train', | |||
| classes=('Cat', 'Dog'), | |||
| folder_name='Pets', | |||
| test_mode=False) | |||
| if val_data_cfg is None: | |||
| val_data_cfg = ConfigDict( | |||
| name='pets_small', | |||
| split='validation', | |||
| classes=('Cat', 'Dog'), | |||
| folder_name='Pets', | |||
| test_mode=True) | |||
| self.train_dataset = MsDataset.load( | |||
| dataset_name=train_data_cfg.name, | |||
| split=train_data_cfg.split, | |||
| classes=train_data_cfg.classes, | |||
| folder_name=train_data_cfg.folder_name, | |||
| test_mode=train_data_cfg.test_mode) | |||
| assert self.train_dataset.config_kwargs[ | |||
| 'classes'] == train_data_cfg.classes | |||
| @@ -66,6 +69,7 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase): | |||
| dataset_name=val_data_cfg.name, | |||
| split=val_data_cfg.split, | |||
| classes=val_data_cfg.classes, | |||
| folder_name=val_data_cfg.folder_name, | |||
| test_mode=val_data_cfg.test_mode) | |||
| assert self.eval_dataset.config_kwargs[ | |||
| 'classes'] == val_data_cfg.classes | |||