Browse Source

[to #42322933]:fix msdataset

* 修复了zip文件在不同打包模式下返回路径错误的问题。
* 修复了数据集文件被替换后重新下载时校验失败的问题。
* 修复了dataset oss文件在 REUSE 模式下重复下载的问题。
* 修复了csv数据集的meta json文件中某个split的meta和file字段都为''时加载所有split失败的问题。
* 修复了不同版本datasets路径不一致的问题。
master
feiwu.yfw 3 years ago
parent
commit
39485426e7
7 changed files with 43 additions and 33 deletions
  1. +4
    -1
      modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
  2. +14
    -23
      modelscope/msdatasets/utils/dataset_builder.py
  3. +10
    -1
      modelscope/msdatasets/utils/dataset_utils.py
  4. +2
    -2
      modelscope/msdatasets/utils/download_utils.py
  5. +5
    -3
      modelscope/msdatasets/utils/oss_utils.py
  6. +4
    -3
      tests/msdatasets/test_ms_dataset.py
  7. +4
    -0
      tests/trainers/test_image_instance_segmentation_trainer.py

+ 4
- 1
modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py View File

@@ -59,10 +59,13 @@ class ImageInstanceSegmentationCocoDataset(TorchTaskDataset):
preprocessor=None,
classes=None,
seg_prefix=None,
folder_name=None,
test_mode=False,
filter_empty_gt=True,
**kwargs):
self.data_root = next(iter(split_config.values()))
data_root = next(iter(split_config.values()))
self.data_root = osp.join(data_root,
folder_name) if folder_name else data_root
self.split = next(iter(split_config.keys()))
self.preprocessor = preprocessor



+ 14
- 23
modelscope/msdatasets/utils/dataset_builder.py View File

@@ -8,7 +8,7 @@ from datasets.info import DatasetInfo
from datasets.packaged_modules import csv
from datasets.utils.filelock import FileLock

from modelscope.utils.constant import DownloadMode
from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode
from modelscope.utils.logger import get_logger

logger = get_logger()
@@ -27,7 +27,6 @@ class MsCsvDatasetBuilder(csv.Csv):
zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None,
**config_kwargs,
):
self.namespace = namespace
super().__init__(
cache_dir=cache_dir,
name=subset_name,
@@ -37,7 +36,7 @@ class MsCsvDatasetBuilder(csv.Csv):

self.name = dataset_name
self.info.builder_name = self.name
self._cache_dir = self._build_cache_dir()
self._cache_dir = self._build_cache_dir(namespace=namespace)
lock_path = os.path.join(
self._cache_dir_root,
self._cache_dir.replace(os.sep, '_') + '.lock')
@@ -48,7 +47,6 @@ class MsCsvDatasetBuilder(csv.Csv):
logger.info(
f'Overwrite dataset info from restored data version, cache_dir is {self._cache_dir}'
)
self.info = DatasetInfo.from_directory(self._cache_dir)
# dir exists but no data, remove the empty dir as data aren't available anymore
else:
logger.warning(
@@ -57,14 +55,17 @@ class MsCsvDatasetBuilder(csv.Csv):
os.rmdir(self._cache_dir)
self.zip_data_files = zip_data_files

def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
def _relative_data_dir(self,
with_version=True,
with_hash=True,
namespace=DEFAULT_DATASET_NAMESPACE) -> str:
"""Relative path of this dataset in cache_dir:
Will be:
self.name/self.config.version/self.hash/
or if a namespace has been specified:
self.namespace___self.name/self.config.version/self.hash/
"""
builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}'
builder_data_dir = self.name if namespace is None else f'{namespace}___{self.name}'
builder_config = self.config
hash = self.hash
if builder_config:
@@ -76,10 +77,11 @@ class MsCsvDatasetBuilder(csv.Csv):
builder_data_dir = os.path.join(builder_data_dir, hash)
return builder_data_dir

def _build_cache_dir(self):
def _build_cache_dir(self, namespace=DEFAULT_DATASET_NAMESPACE):
builder_data_dir = os.path.join(
self._cache_dir_root,
self._relative_data_dir(with_version=False, with_hash=True))
self._relative_data_dir(
with_version=False, with_hash=True, namespace=namespace))

return builder_data_dir

@@ -97,15 +99,8 @@ class MsCsvDatasetBuilder(csv.Csv):
datasets.SplitGenerator(
name=split_name,
gen_kwargs={
'files':
dl_manager.iter_files(files),
'base_dir':
os.path.join(
zip_data_files.get(split_name),
os.path.splitext(
self.zip_data_files.get(split_name))[0])
if self.zip_data_files.get(split_name) else
zip_data_files.get(split_name)
'files': dl_manager.iter_files(files),
'base_dir': zip_data_files.get(split_name)
}))
return splits

@@ -181,12 +176,8 @@ class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder):
self._download_and_prepare(dl_manager=dl_manager)

def _download_and_prepare(self, dl_manager):
split_path_dict = dl_manager.download_and_extract(self.zip_data_files)
self.split_path_dict = {
k: os.path.join(v,
os.path.splitext(self.zip_data_files[k])[0])
for k, v in split_path_dict.items()
}
self.split_path_dict = dl_manager.download_and_extract(
self.zip_data_files)

def as_dataset(self):
    """Build an ExternalDataset from the split paths and config kwargs.

    Returns:
        ExternalDataset: constructed from ``self.split_path_dict`` and
        ``self._config_kwargs``, in that positional order.
    """
    split_paths = self.split_path_dict
    config_kwargs = self._config_kwargs
    return ExternalDataset(split_paths, config_kwargs)


+ 10
- 1
modelscope/msdatasets/utils/dataset_utils.py View File

@@ -11,6 +11,14 @@ from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder
logger = get_logger()


def format_dataset_structure(dataset_structure):
    """Drop splits that reference neither a meta file nor a data file.

    Args:
        dataset_structure: Mapping of split name to that split's entry.
            Each entry is read with ``.get('meta')`` and ``.get('file')``.

    Returns:
        dict: A new dict holding only the splits whose entry has a truthy
        ``'meta'`` or ``'file'`` value. The input mapping is not modified.
    """
    return {
        split_name: split_info
        for split_name, split_info in dataset_structure.items()
        # An entry whose fields are both empty (e.g. '') points at no files,
        # and a None/empty entry describes nothing at all; skip both instead
        # of failing on ``None.get``.
        if split_info and (split_info.get('meta') or split_info.get('file'))
    }


def get_target_dataset_structure(dataset_structure: dict,
subset_name: Optional[str] = None,
split: Optional[str] = None):
@@ -56,7 +64,8 @@ def get_target_dataset_structure(dataset_structure: dict,
f'No subset_name specified, defaulting to the {target_subset_name}'
)
# verify dataset split
target_dataset_structure = dataset_structure[target_subset_name]
target_dataset_structure = format_dataset_structure(
dataset_structure[target_subset_name])
if split and split not in target_dataset_structure:
raise ValueError(
f'split {split} not found. Available: {target_dataset_structure.keys()}'


+ 2
- 2
modelscope/msdatasets/utils/download_utils.py View File

@@ -34,8 +34,8 @@ class DatasetDownloadManager(DownloadManager):
url_or_filename = str(url_or_filename)
if is_relative_path(url_or_filename):
# fetch oss files
return self.oss_utilities.download(url_or_filename,
self.download_config.cache_dir)
return self.oss_utilities.download(
url_or_filename, download_config=download_config)
else:
return cached_path(
url_or_filename, download_config=download_config)

+ 5
- 3
modelscope/msdatasets/utils/oss_utils.py View File

@@ -24,7 +24,8 @@ class OssUtilities:
rate = int(100 * (float(consumed_bytes) / float(total_bytes)))
print('\r{0}% '.format(rate), end='', flush=True)

def download(self, oss_file_name, cache_dir):
def download(self, oss_file_name, download_config):
cache_dir = download_config.cache_dir
candidate_key = os.path.join(self.oss_dir, oss_file_name)
candidate_key_backup = os.path.join(self.oss_backup_dir, oss_file_name)
file_oss_key = candidate_key if self.bucket.object_exists(
@@ -32,8 +33,9 @@ class OssUtilities:
filename = hash_url_to_filename(file_oss_key, etag=None)
local_path = os.path.join(cache_dir, filename)

self.bucket.get_object_to_file(
file_oss_key, local_path, progress_callback=self._percentage)
if download_config.force_download or not os.path.exists(local_path):
self.bucket.get_object_to_file(
file_oss_key, local_path, progress_callback=self._percentage)
return local_path

def upload(self, oss_file_name: str, local_file_path: str) -> str:


+ 4
- 3
tests/msdatasets/test_ms_dataset.py View File

@@ -37,9 +37,10 @@ class MsDatasetTest(unittest.TestCase):
'pets_small',
namespace=DEFAULT_DATASET_NAMESPACE,
split='train',
download_mode=DownloadMode.FORCE_REDOWNLOAD,
classes=('1', '2'))
print(ms_ds_train._hf_ds.config_kwargs)
classes=('1', '2'),
folder_name='Pets')
print(ms_ds_train.config_kwargs)
assert next(iter(ms_ds_train.config_kwargs['split_config'].values()))

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_ms_csv_basic(self):


+ 4
- 0
tests/trainers/test_image_instance_segmentation_trainer.py View File

@@ -44,18 +44,21 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase):
name='pets_small',
split='train',
classes=('Cat', 'Dog'),
folder_name='Pets',
test_mode=False)
if val_data_cfg is None:
val_data_cfg = ConfigDict(
name='pets_small',
split='validation',
classes=('Cat', 'Dog'),
folder_name='Pets',
test_mode=True)

self.train_dataset = MsDataset.load(
dataset_name=train_data_cfg.name,
split=train_data_cfg.split,
classes=train_data_cfg.classes,
folder_name=train_data_cfg.folder_name,
test_mode=train_data_cfg.test_mode)
assert self.train_dataset.config_kwargs[
'classes'] == train_data_cfg.classes
@@ -66,6 +69,7 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase):
dataset_name=val_data_cfg.name,
split=val_data_cfg.split,
classes=val_data_cfg.classes,
folder_name=val_data_cfg.folder_name,
test_mode=val_data_cfg.test_mode)
assert self.eval_dataset.config_kwargs[
'classes'] == val_data_cfg.classes


Loading…
Cancel
Save