From 39485426e7c08aa1ab77fbd64639c289a156d796 Mon Sep 17 00:00:00 2001
From: "feiwu.yfw"
Date: Fri, 26 Aug 2022 22:41:13 +0800
Subject: [PATCH] [to #42322933]:fix msdataset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 修复了zip文件不同打包模式下返回路径错误问题。
* 修复了替换了数据集文件重新下载时校验失败问题。
* 修复dataset oss文件在 REUSE 模式下重复下载的问题。
* 修复了csv数据集的meta json文件中某个split的meta和file字段都为''时加载所有split失败的问题。
* 修复了不同版本datasets路径不一致的问题。
---
 ...mage_instance_segmentation_coco_dataset.py |  5 ++-
 .../msdatasets/utils/dataset_builder.py       | 37 +++++++------------
 modelscope/msdatasets/utils/dataset_utils.py  | 11 +++++-
 modelscope/msdatasets/utils/download_utils.py |  4 +-
 modelscope/msdatasets/utils/oss_utils.py      |  8 ++--
 tests/msdatasets/test_ms_dataset.py           |  7 ++--
 ...est_image_instance_segmentation_trainer.py |  4 ++
 7 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
index 04c8e142..a001fe36 100644
--- a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
+++ b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
@@ -59,10 +59,13 @@ class ImageInstanceSegmentationCocoDataset(TorchTaskDataset):
                  preprocessor=None,
                  classes=None,
                  seg_prefix=None,
+                 folder_name=None,
                  test_mode=False,
                  filter_empty_gt=True,
                  **kwargs):
-        self.data_root = next(iter(split_config.values()))
+        data_root = next(iter(split_config.values()))
+        self.data_root = osp.join(data_root,
+                                  folder_name) if folder_name else data_root
         self.split = next(iter(split_config.keys()))
         self.preprocessor = preprocessor

diff --git a/modelscope/msdatasets/utils/dataset_builder.py b/modelscope/msdatasets/utils/dataset_builder.py
index 85489c58..7180cb5b 100644
--- a/modelscope/msdatasets/utils/dataset_builder.py
+++ b/modelscope/msdatasets/utils/dataset_builder.py
@@ -8,7 +8,7 @@ from datasets.info import DatasetInfo
 from datasets.packaged_modules import csv
 from datasets.utils.filelock import FileLock

-from modelscope.utils.constant import DownloadMode
+from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode
 from modelscope.utils.logger import get_logger

 logger = get_logger()
@@ -27,7 +27,6 @@ class MsCsvDatasetBuilder(csv.Csv):
             zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None,
             **config_kwargs,
     ):
-        self.namespace = namespace
         super().__init__(
             cache_dir=cache_dir,
             name=subset_name,
@@ -37,7 +36,7 @@ class MsCsvDatasetBuilder(csv.Csv):

         self.name = dataset_name
         self.info.builder_name = self.name
-        self._cache_dir = self._build_cache_dir()
+        self._cache_dir = self._build_cache_dir(namespace=namespace)
         lock_path = os.path.join(
             self._cache_dir_root,
             self._cache_dir.replace(os.sep, '_') + '.lock')
@@ -48,7 +47,6 @@ class MsCsvDatasetBuilder(csv.Csv):
                     logger.info(
                         f'Overwrite dataset info from restored data version, cache_dir is {self._cache_dir}'
                     )
-                    self.info = DatasetInfo.from_directory(self._cache_dir)
                 # dir exists but no data, remove the empty dir as data aren't available anymore
                 else:
                     logger.warning(
@@ -57,14 +55,17 @@ class MsCsvDatasetBuilder(csv.Csv):
                     os.rmdir(self._cache_dir)
         self.zip_data_files = zip_data_files

-    def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
+    def _relative_data_dir(self,
+                           with_version=True,
+                           with_hash=True,
+                           namespace=DEFAULT_DATASET_NAMESPACE) -> str:
         """Relative path of this dataset in cache_dir:
         Will be:
             self.name/self.config.version/self.hash/
         or if a namespace has been specified:
             self.namespace___self.name/self.config.version/self.hash/
         """
-        builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}'
+        builder_data_dir = self.name if namespace is None else f'{namespace}___{self.name}'
         builder_config = self.config
         hash = self.hash
         if builder_config:
@@ -76,10 +77,11 @@ class MsCsvDatasetBuilder(csv.Csv):
             builder_data_dir = os.path.join(builder_data_dir, hash)
         return builder_data_dir

-    def _build_cache_dir(self):
+    def _build_cache_dir(self, namespace=DEFAULT_DATASET_NAMESPACE):
         builder_data_dir = os.path.join(
             self._cache_dir_root,
-            self._relative_data_dir(with_version=False, with_hash=True))
+            self._relative_data_dir(
+                with_version=False, with_hash=True, namespace=namespace))

         return builder_data_dir

@@ -97,15 +99,8 @@ class MsCsvDatasetBuilder(csv.Csv):
                 datasets.SplitGenerator(
                     name=split_name,
                     gen_kwargs={
-                        'files':
-                        dl_manager.iter_files(files),
-                        'base_dir':
-                        os.path.join(
-                            zip_data_files.get(split_name),
-                            os.path.splitext(
-                                self.zip_data_files.get(split_name))[0])
-                        if self.zip_data_files.get(split_name) else
-                        zip_data_files.get(split_name)
+                        'files': dl_manager.iter_files(files),
+                        'base_dir': zip_data_files.get(split_name)
                     }))
         return splits

@@ -181,12 +176,8 @@ class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder):
         self._download_and_prepare(dl_manager=dl_manager)

     def _download_and_prepare(self, dl_manager):
-        split_path_dict = dl_manager.download_and_extract(self.zip_data_files)
-        self.split_path_dict = {
-            k: os.path.join(v,
-                            os.path.splitext(self.zip_data_files[k])[0])
-            for k, v in split_path_dict.items()
-        }
+        self.split_path_dict = dl_manager.download_and_extract(
+            self.zip_data_files)

     def as_dataset(self):
         return ExternalDataset(self.split_path_dict, self._config_kwargs)
diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py
index 09556d84..08a6de84 100644
--- a/modelscope/msdatasets/utils/dataset_utils.py
+++ b/modelscope/msdatasets/utils/dataset_utils.py
@@ -11,6 +11,14 @@ from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder
 logger = get_logger()


+def format_dataset_structure(dataset_structure):
+    return {
+        k: v
+        for k, v in dataset_structure.items()
+        if (v.get('meta') or v.get('file'))
+    }
+
+
 def get_target_dataset_structure(dataset_structure: dict,
                                  subset_name: Optional[str] = None,
                                  split: Optional[str] = None):
@@ -56,7 +64,8 @@ def get_target_dataset_structure(dataset_structure: dict,
             f'No subset_name specified, defaulting to the {target_subset_name}'
         )
     # verify dataset split
-    target_dataset_structure = dataset_structure[target_subset_name]
+    target_dataset_structure = format_dataset_structure(
+        dataset_structure[target_subset_name])
     if split and split not in target_dataset_structure:
         raise ValueError(
             f'split {split} not found. Available: {target_dataset_structure.keys()}'
diff --git a/modelscope/msdatasets/utils/download_utils.py b/modelscope/msdatasets/utils/download_utils.py
index bc637f0e..eb1c99ef 100644
--- a/modelscope/msdatasets/utils/download_utils.py
+++ b/modelscope/msdatasets/utils/download_utils.py
@@ -34,8 +34,8 @@ class DatasetDownloadManager(DownloadManager):
         url_or_filename = str(url_or_filename)
         if is_relative_path(url_or_filename):
             # fetch oss files
-            return self.oss_utilities.download(url_or_filename,
-                                               self.download_config.cache_dir)
+            return self.oss_utilities.download(
+                url_or_filename, download_config=download_config)
         else:
             return cached_path(
                 url_or_filename, download_config=download_config)
diff --git a/modelscope/msdatasets/utils/oss_utils.py b/modelscope/msdatasets/utils/oss_utils.py
index 033c8b96..82d43bef 100644
--- a/modelscope/msdatasets/utils/oss_utils.py
+++ b/modelscope/msdatasets/utils/oss_utils.py
@@ -24,7 +24,8 @@ class OssUtilities:
             rate = int(100 * (float(consumed_bytes) / float(total_bytes)))
             print('\r{0}% '.format(rate), end='', flush=True)

-    def download(self, oss_file_name, cache_dir):
+    def download(self, oss_file_name, download_config):
+        cache_dir = download_config.cache_dir
         candidate_key = os.path.join(self.oss_dir, oss_file_name)
         candidate_key_backup = os.path.join(self.oss_backup_dir, oss_file_name)
         file_oss_key = candidate_key if self.bucket.object_exists(
@@ -32,8 +33,9 @@ class OssUtilities:
         filename = hash_url_to_filename(file_oss_key, etag=None)
         local_path = os.path.join(cache_dir, filename)

-        self.bucket.get_object_to_file(
-            file_oss_key, local_path, progress_callback=self._percentage)
+        if download_config.force_download or not os.path.exists(local_path):
+            self.bucket.get_object_to_file(
+                file_oss_key, local_path, progress_callback=self._percentage)
         return local_path

     def upload(self, oss_file_name: str, local_file_path: str) -> str:
diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py
index 0d8c8a4d..1d62d2d1 100644
--- a/tests/msdatasets/test_ms_dataset.py
+++ b/tests/msdatasets/test_ms_dataset.py
@@ -37,9 +37,10 @@ class MsDatasetTest(unittest.TestCase):
             'pets_small',
             namespace=DEFAULT_DATASET_NAMESPACE,
             split='train',
-            download_mode=DownloadMode.FORCE_REDOWNLOAD,
-            classes=('1', '2'))
-        print(ms_ds_train._hf_ds.config_kwargs)
+            classes=('1', '2'),
+            folder_name='Pets')
+        print(ms_ds_train.config_kwargs)
+        assert next(iter(ms_ds_train.config_kwargs['split_config'].values()))

     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ms_csv_basic(self):
diff --git a/tests/trainers/test_image_instance_segmentation_trainer.py b/tests/trainers/test_image_instance_segmentation_trainer.py
index c8557ff5..774f8fa8 100644
--- a/tests/trainers/test_image_instance_segmentation_trainer.py
+++ b/tests/trainers/test_image_instance_segmentation_trainer.py
@@ -44,18 +44,21 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase):
                 name='pets_small',
                 split='train',
                 classes=('Cat', 'Dog'),
+                folder_name='Pets',
                 test_mode=False)
         if val_data_cfg is None:
             val_data_cfg = ConfigDict(
                 name='pets_small',
                 split='validation',
                 classes=('Cat', 'Dog'),
+                folder_name='Pets',
                 test_mode=True)

         self.train_dataset = MsDataset.load(
             dataset_name=train_data_cfg.name,
             split=train_data_cfg.split,
             classes=train_data_cfg.classes,
+            folder_name=train_data_cfg.folder_name,
             test_mode=train_data_cfg.test_mode)
         assert self.train_dataset.config_kwargs[
             'classes'] == train_data_cfg.classes
@@ -66,6 +69,7 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase):
             dataset_name=val_data_cfg.name,
             split=val_data_cfg.split,
             classes=val_data_cfg.classes,
+            folder_name=val_data_cfg.folder_name,
             test_mode=val_data_cfg.test_mode)
         assert self.eval_dataset.config_kwargs[
             'classes'] == val_data_cfg.classes