@@ -26,18 +26,15 @@ from modelscope.utils.logger import get_logger
 from .errors import (InvalidParameter, NotExistError, RequestError,
                      datahub_raise_on_error, handle_http_post_error,
                      handle_http_response, is_ok, raise_on_error)
-from .utils.utils import (get_dataset_hub_endpoint, get_endpoint,
-                          model_id_to_group_owner_name)
+from .utils.utils import get_endpoint, model_id_to_group_owner_name
 
 logger = get_logger()
 
 
 class HubApi:
 
-    def __init__(self, endpoint=None, dataset_endpoint=None):
+    def __init__(self, endpoint=None):
         self.endpoint = endpoint if endpoint is not None else get_endpoint()
-        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint(
-        )
 
     def login(
             self,
@@ -288,7 +285,7 @@ class HubApi:
         return files
 
     def list_datasets(self):
-        path = f'{self.dataset_endpoint}/api/v1/datasets'
+        path = f'{self.endpoint}/api/v1/datasets'
        headers = None
        params = {}
        r = requests.get(path, params=params, headers=headers)
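With the dataset-specific endpoint removed, dataset listing goes through the same `HubApi` endpoint as every other request. A minimal usage sketch (not part of the patch; it assumes the `HubApi` import path shown elsewhere in this diff):

```python
from modelscope.hub.api import HubApi

api = HubApi()                    # falls back to get_endpoint() when no endpoint is passed
datasets = api.list_datasets()    # GET {endpoint}/api/v1/datasets
```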
@@ -315,13 +312,13 @@ class HubApi:
                 cache_dir):
             shutil.rmtree(cache_dir)
         os.makedirs(cache_dir, exist_ok=True)
-        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
         r = requests.get(datahub_url)
         resp = r.json()
         datahub_raise_on_error(datahub_url, resp)
         dataset_id = resp['Data']['Id']
         dataset_type = resp['Data']['Type']
-        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
         r = requests.get(datahub_url)
         resp = r.json()
         datahub_raise_on_error(datahub_url, resp)
@@ -339,7 +336,7 @@ class HubApi:
             file_path = file_info['Path']
             extension = os.path.splitext(file_path)[-1]
             if extension in dataset_meta_format:
-                datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
+                datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                               f'Revision={revision}&FilePath={file_path}'
                 r = requests.get(datahub_url)
                 r.raise_for_status()
@@ -363,7 +360,7 @@ class HubApi:
             namespace: str,
             revision: Optional[str] = DEFAULT_DATASET_REVISION):
         if file_name.endswith('.csv'):
-            file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
+            file_name = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                         f'Revision={revision}&FilePath={file_name}'
         return file_name
 
@@ -372,7 +369,7 @@ class HubApi:
             dataset_name: str,
             namespace: str,
             revision: Optional[str] = DEFAULT_DATASET_REVISION):
-        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
                       f'ststoken?Revision={revision}'
         return self.datahub_remote_call(datahub_url)
 
@@ -383,7 +380,7 @@ class HubApi:
             namespace: str,
             revision: Optional[str] = DEFAULT_DATASET_REVISION):
-        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
                       f'ststoken?Revision={revision}'
 
         cookies = requests.utils.dict_from_cookiejar(cookies)
@@ -392,6 +389,19 @@ class HubApi:
         raise_on_error(resp)
         return resp['Data']
 
+    def list_oss_dataset_objects(self, dataset_name, namespace, max_limit,
+                                 is_recursive, is_filter_dir, revision,
+                                 cookies):
+        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/tree/?' \
+              f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}'
+        cookies = requests.utils.dict_from_cookiejar(cookies)
+        resp = requests.get(url=url, cookies=cookies)
+        resp = resp.json()
+        raise_on_error(resp)
+        resp = resp['Data']
+        return resp
+
     def on_dataset_download(self, dataset_name: str, namespace: str) -> None:
         url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase'
         r = requests.post(url)
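A sketch of how the new `list_oss_dataset_objects` might be called directly. It mirrors its use in `list_dataset_objects` later in this patch, and the cookie retrieval via `check_cookies_upload_data` is taken from there; the dataset name and namespace are placeholders and a prior hub login is assumed:

```python
from modelscope.hub.api import HubApi
from modelscope.utils.constant import DEFAULT_DATASET_REVISION

api = HubApi()
cookies = api.check_cookies_upload_data(use_cookies=True)  # assumes you are logged in
objects = api.list_oss_dataset_objects(
    dataset_name='your-dataset-name',
    namespace='your-namespace',
    max_limit=50000,
    is_recursive=True,
    is_filter_dir=True,
    revision=DEFAULT_DATASET_REVISION,
    cookies=cookies)
# Each entry is a dict carrying at least a 'Key', e.g. 'train/images/001.png'.
```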
@@ -4,8 +4,7 @@ import hashlib
 import os
 from typing import Optional
 
-from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT,
-                                      DEFAULT_MODELSCOPE_DOMAIN,
+from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN,
                                       DEFAULT_MODELSCOPE_GROUP,
                                       MODEL_ID_SEPARATOR,
                                       MODELSCOPE_URL_SCHEME)
@@ -44,11 +43,6 @@ def get_endpoint():
     return MODELSCOPE_URL_SCHEME + modelscope_domain
 
 
-def get_dataset_hub_endpoint():
-    return os.environ.get('HUB_DATASET_ENDPOINT',
-                          DEFAULT_MODELSCOPE_DATA_ENDPOINT)
-
-
 def compute_hash(file_path):
     BUFFER_SIZE = 1024 * 64  # 64k buffer size
     sha256_hash = hashlib.sha256()
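Since `get_dataset_hub_endpoint()` (and its `HUB_DATASET_ENDPOINT` override) is gone, a non-default hub is now addressed through the single endpoint argument only. A sketch, with a hypothetical URL:

```python
from modelscope.hub.api import HubApi

api = HubApi(endpoint='https://hub.example.internal')  # hypothetical deployment URL
# Dataset routes (/api/v1/datasets/...) now resolve against the same endpoint
# as model requests after the change above.
```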
@@ -1,6 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import math
 import os
 from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional,
                     Sequence, Union)
 
@@ -17,19 +16,18 @@ from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)
 
 from modelscope.hub.repository import DatasetRepository
+from modelscope.msdatasets.task_datasets.builder import build_task_dataset
+from modelscope.msdatasets.utils.dataset_builder import ExternalDataset
+from modelscope.msdatasets.utils.dataset_utils import (
+    get_dataset_files, get_target_dataset_structure, load_dataset_builder)
+from modelscope.msdatasets.utils.download_utils import DatasetDownloadManager
+from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager
 from modelscope.utils.config import ConfigDict
 from modelscope.utils.config_ds import MS_DATASETS_CACHE
 from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE,
                                        DEFAULT_DATASET_REVISION,
                                        DatasetFormations, DownloadMode, Hubs)
 from modelscope.utils.logger import get_logger
-from .task_datasets.builder import build_task_dataset
-from .utils.dataset_builder import ExternalDataset
-from .utils.dataset_utils import (get_dataset_files,
-                                  get_target_dataset_structure,
-                                  load_dataset_builder)
-from .utils.download_utils import DatasetDownloadManager
-from .utils.upload_utils import DatasetUploadManager
 
 logger = get_logger()
@@ -234,7 +232,6 @@ class MsDataset:
             # dataset organized to be compatible with hf format
             if dataset_formation == DatasetFormations.hf_compatible:
                 dataset_name = dataset_scripts['.py'][0]
-                download_dataset = dataset_name
             else:
                 raise FileNotFoundError(
                     f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} "
@@ -270,7 +267,8 @@ class MsDataset:
             raise TypeError('path must be a str or a list, but got'
                             f' {type(dataset_name)}')
 
-        if download_dataset:
+        is_ci_test = os.getenv('CI_TEST') == 'True'
+        if download_dataset and not is_ci_test:
             try:
                 api.on_dataset_download(
                     dataset_name=download_dataset, namespace=namespace)
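A small sketch of the intent behind the `CI_TEST` guard; the variable name comes from this patch, while how it gets set is an assumption:

```python
import os

os.environ['CI_TEST'] = 'True'   # e.g. exported by the CI job before tests run

# With the guard above, MsDataset.load(...) still fetches data, but
# api.on_dataset_download(...) is not called, so CI runs do not inflate
# the public download counter.
```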
@@ -570,15 +568,26 @@ class MsDataset:
                local_file_path: str,
                dataset_name: str,
                namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
-               version: Optional[str] = DEFAULT_DATASET_REVISION) -> None:
-        """Upload dataset file to the ModelScope Hub. Please login to the ModelScope Hub first.
+               version: Optional[str] = DEFAULT_DATASET_REVISION,
+               num_processes: Optional[int] = None,
+               chunksize: Optional[int] = 1,
+               filter_hidden_files: Optional[bool] = True) -> None:
+        """Upload dataset file or directory to the ModelScope Hub. Please login to the ModelScope Hub first.
 
         Args:
-            object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip
-            local_file_path (str): Local file to upload
+            object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip or your-dataset-name
+            local_file_path (str): Local file or directory to upload
             dataset_name (str): Name of the dataset
             namespace(str, optional): Namespace of the dataset
             version: Optional[str]: Version of the dataset
+            num_processes: Optional[int]: The number of processes used for multi-process uploading.
+                This is only applicable when local_file_path is a directory, and we are uploading multiple files
+                inside the directory. When None is provided, the number returned by os.cpu_count() is used as default.
+            chunksize: Optional[int]: The chunksize of objects to upload.
+                For very long iterables, using a large value for chunksize can make the job complete much faster than
+                using the default value of 1. Available if local_file_path is a directory.
+            filter_hidden_files: Optional[bool]: Whether to filter hidden files.
+                Available if local_file_path is a directory.
 
         Returns:
             None
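A usage sketch for the extended `MsDataset.upload`; dataset name, namespace and paths are placeholders:

```python
from modelscope.msdatasets import MsDataset

# Single file, same behaviour as before.
MsDataset.upload(
    object_name='your-dataset-name.zip',
    local_file_path='/path/to/your-dataset-name.zip',
    dataset_name='your-dataset-name',
    namespace='your-namespace')

# New: upload a directory; every file below it becomes an object under 'train/'.
MsDataset.upload(
    object_name='train',
    local_file_path='/path/to/images/train',
    dataset_name='your-dataset-name',
    namespace='your-namespace',
    num_processes=4,          # None -> os.cpu_count()
    chunksize=1,
    filter_hidden_files=True)
```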
@@ -586,7 +595,20 @@ class MsDataset:
         """
         _upload_manager = DatasetUploadManager(
             dataset_name=dataset_name, namespace=namespace, version=version)
-        _upload_manager.upload(object_name, local_file_path)
+
+        if os.path.isfile(local_file_path):
+            _upload_manager.upload(
+                object_name=object_name, local_file_path=local_file_path)
+        elif os.path.isdir(local_file_path):
+            _upload_manager.upload_dir(
+                object_dir_name=object_name,
+                local_dir_path=local_file_path,
+                num_processes=num_processes,
+                chunksize=chunksize,
+                filter_hidden_files=filter_hidden_files)
+        else:
+            raise ValueError(
+                f'{local_file_path} is not a valid file path or directory')
 
     @staticmethod
     def clone_meta(dataset_work_dir: str,
@@ -6,7 +6,8 @@ from typing import Any, Mapping, Optional, Sequence, Union
 
 from datasets.builder import DatasetBuilder
 
-from modelscope.utils.constant import DEFAULT_DATASET_REVISION
+from modelscope.hub.api import HubApi
+from modelscope.utils.constant import DEFAULT_DATASET_REVISION, DownloadParams
 from modelscope.utils.logger import get_logger
 from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder
@@ -77,6 +78,81 @@ def get_target_dataset_structure(dataset_structure: dict,
     return target_subset_name, target_dataset_structure
 
 
+def list_dataset_objects(hub_api: HubApi, max_limit: int, is_recursive: bool,
+                         dataset_name: str, namespace: str,
+                         version: str) -> list:
+    """
+    List all objects for a specific dataset.
+
+    Args:
+        hub_api (class HubApi): HubApi instance.
+        max_limit (int): Max number of objects.
+        is_recursive (bool): Whether to list objects recursively.
+        dataset_name (str): Dataset name.
+        namespace (str): Namespace.
+        version (str): Dataset version.
+
+    Returns:
+        res (list): List of objects, i.e., ['train/images/001.png', 'train/images/002.png', 'val/images/001.png', ...]
+    """
+    res = []
+    cookies = hub_api.check_cookies_upload_data(use_cookies=True)
+    objects = hub_api.list_oss_dataset_objects(
+        dataset_name=dataset_name,
+        namespace=namespace,
+        max_limit=max_limit,
+        is_recursive=is_recursive,
+        is_filter_dir=True,
+        revision=version,
+        cookies=cookies)
+
+    for item in objects:
+        object_key = item.get('Key')
+        res.append(object_key)
+
+    return res
+
+
+def contains_dir(file_map) -> bool:
+    """
+    To check whether the input contains at least one directory.
+
+    Args:
+        file_map (dict): Structure of data files. e.g., {'train': 'train.zip', 'validation': 'val.zip'}
+
+    Returns:
+        True if the input contains at least one directory, False otherwise.
+    """
+    res = False
+    for k, v in file_map.items():
+        if isinstance(v, str) and not v.endswith('.zip'):
+            res = True
+            break
+    return res
+
+
+def get_split_objects_map(file_map, objects):
+    """
+    Get the map between dataset splits and oss objects.
+
+    Args:
+        file_map (dict): Structure of data files. e.g., {'train': 'train', 'validation': 'val'}, both train and val
+            are dirs.
+        objects (list): List of oss objects. e.g., ['train/001/1_123.png', 'train/001/1_124.png', 'val/003/3_38.png']
+
+    Returns:
+        A map of split-objects. e.g., {'train': ['train/001/1_123.png', 'train/001/1_124.png'],
+            'validation': ['val/003/3_38.png']}
+    """
+    res = {}
+    for k, v in file_map.items():
+        res[k] = []
+
+    for obj_key in objects:
+        for k, v in file_map.items():
+            if obj_key.startswith(v):
+                res[k].append(obj_key)
+
+    return res
+
+
 def get_dataset_files(subset_split_into: dict,
                       dataset_name: str,
                       namespace: str,
@@ -95,14 +171,24 @@ def get_dataset_files(subset_split_into: dict,
     meta_map = defaultdict(dict)
     file_map = defaultdict(dict)
     args_map = defaultdict(dict)
-    from modelscope.hub.api import HubApi
     modelscope_api = HubApi()
+    objects = list_dataset_objects(
+        hub_api=modelscope_api,
+        max_limit=DownloadParams.MAX_LIST_OBJECTS_NUM.value,
+        is_recursive=True,
+        dataset_name=dataset_name,
+        namespace=namespace,
+        version=revision)
+
     for split, info in subset_split_into.items():
         meta_map[split] = modelscope_api.get_dataset_file_url(
             info.get('meta', ''), dataset_name, namespace, revision)
         if info.get('file'):
             file_map[split] = info['file']
             args_map[split] = info.get('args')
+
+    if contains_dir(file_map):
+        file_map = get_split_objects_map(file_map, objects)
+
     return meta_map, file_map, args_map
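A small worked example (made-up paths, mirroring the docstrings above) of how the two helpers resolve a directory-style `file_map` into per-split object lists:

```python
file_map = {'train': 'train', 'validation': 'val'}   # values are dirs, not .zip files
objects = ['train/001/1_123.png', 'train/001/1_124.png', 'val/003/3_38.png']

contains_dir(file_map)
# -> True, because no value ends with '.zip'

get_split_objects_map(file_map, objects)
# -> {'train': ['train/001/1_123.png', 'train/001/1_124.png'],
#     'validation': ['val/003/3_38.png']}
```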
@@ -10,16 +10,14 @@ from .oss_utils import OssUtilities
 
 class DatasetDownloadManager(DownloadManager):
 
-    def __init__(
-        self,
-        dataset_name: str,
-        namespace: str,
-        version: str,
-        data_dir: Optional[str] = None,
-        download_config: Optional[DownloadConfig] = None,
-        base_path: Optional[str] = None,
-        record_checksums=True,
-    ):
+    def __init__(self,
+                 dataset_name: str,
+                 namespace: str,
+                 version: str,
+                 data_dir: Optional[str] = None,
+                 download_config: Optional[DownloadConfig] = None,
+                 base_path: Optional[str] = None,
+                 record_checksums=True):
         super().__init__(dataset_name, data_dir, download_config, base_path,
                          record_checksums)
         self._namespace = namespace
@@ -50,11 +50,16 @@ class OssUtilities:
                 progress_callback=self._percentage)
         return local_path
 
-    def upload(self, oss_object_name: str, local_file_path: str) -> str:
+    def upload(self, oss_object_name: str, local_file_path: str,
+               indicate_individual_progress: bool) -> str:
         retry_count = 0
         object_key = os.path.join(self.oss_dir, oss_object_name)
         resumable_store = oss2.ResumableStore(
             root=self.upload_resumable_tmp_store)
+        if indicate_individual_progress:
+            progress_callback = self._percentage
+        else:
+            progress_callback = None
 
         while True:
             try:
@@ -66,7 +71,7 @@ class OssUtilities:
                     store=resumable_store,
                     multipart_threshold=self.upload_multipart_threshold,
                     part_size=self.upload_part_size,
-                    progress_callback=self._percentage,
+                    progress_callback=progress_callback,
                     num_threads=self.upload_num_threads)
                 break
             except Exception:
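How the new flag is meant to be used by callers of `OssUtilities.upload` (an internal helper; `oss_utilities` below stands for an already-configured instance): a single-file upload keeps the per-file OSS progress callback, while directory uploads disable it so that only the aggregate tqdm bar added later in this patch is shown.

```python
# Single-file path: show byte-level progress for this one object.
oss_utilities.upload(
    oss_object_name='your-dataset-name.zip',
    local_file_path='/path/to/your-dataset-name.zip',
    indicate_individual_progress=True)

# Directory path: per-file progress off; upload_dir reports overall progress instead.
oss_utilities.upload(
    oss_object_name='train/001/1_1230.png',
    local_file_path='/local/images/train/001/1_1230.png',
    indicate_individual_progress=False)
```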
@@ -1,5 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import os
+from multiprocessing.dummy import Pool as ThreadPool
+
+from tqdm import tqdm
+
 from .oss_utils import OssUtilities
@@ -19,5 +24,38 @@ class DatasetUploadManager(object):
 
     def upload(self, object_name: str, local_file_path: str) -> str:
         object_key = self.oss_utilities.upload(
-            oss_object_name=object_name, local_file_path=local_file_path)
+            oss_object_name=object_name,
+            local_file_path=local_file_path,
+            indicate_individual_progress=True)
         return object_key
+
+    def upload_dir(self, object_dir_name: str, local_dir_path: str,
+                   num_processes: int, chunksize: int,
+                   filter_hidden_files: bool) -> int:
+
+        def run_upload(args):
+            self.oss_utilities.upload(
+                oss_object_name=args[0],
+                local_file_path=args[1],
+                indicate_individual_progress=False)
+
+        files_list = []
+        for root, dirs, files in os.walk(local_dir_path):
+            for file_name in files:
+                if filter_hidden_files and file_name.startswith('.'):
+                    continue
+                # Concatenate directory name and relative path into an oss object key. e.g., train/001/1_1230.png
+                object_name = os.path.join(
+                    object_dir_name,
+                    root.replace(local_dir_path, '', 1).strip('/'), file_name)
+                local_file_path = os.path.join(root, file_name)
+                files_list.append((object_name, local_file_path))
+
+        with ThreadPool(processes=num_processes) as pool:
+            result = list(
+                tqdm(
+                    pool.imap(run_upload, files_list, chunksize=chunksize),
+                    total=len(files_list)))
+
+        return len(result)
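For clarity, a worked example (assumed paths) of the object-key construction inside `upload_dir`. Note also that `multiprocessing.dummy.Pool` is a thread pool, so `num_processes` controls the number of uploader threads rather than separate OS processes.

```python
import os

object_dir_name = 'train'
local_dir_path = '/data/Pets/images/train'
root = '/data/Pets/images/train/001'      # a sub-directory visited by os.walk
file_name = '1_1230.png'

object_name = os.path.join(
    object_dir_name,
    root.replace(local_dir_path, '', 1).strip('/'), file_name)
print(object_name)   # -> train/001/1_1230.png
```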
@@ -227,6 +227,13 @@ class DownloadMode(enum.Enum):
     FORCE_REDOWNLOAD = 'force_redownload'
 
 
+class DownloadParams(enum.Enum):
+    """
+    Parameters for downloading datasets.
+    """
+    MAX_LIST_OBJECTS_NUM = 50000
+
+
 class DatasetFormations(enum.Enum):
     """ How a dataset is organized and interpreted
     """
@@ -6,9 +6,13 @@ import unittest
 import zipfile
 
 from modelscope.msdatasets import MsDataset
-from modelscope.utils.constant import ModelFile
+from modelscope.msdatasets.utils.dataset_utils import list_dataset_objects
+from modelscope.utils import logger as logging
+from modelscope.utils.constant import DEFAULT_DATASET_REVISION, ModelFile
 from modelscope.utils.test_utils import test_level
 
+logger = logging.get_logger(__name__)
+
 KEY_EXTRACTED = 'extracted'
@@ -39,7 +43,8 @@ class DatasetUploadTest(unittest.TestCase):
     def tearDown(self):
         os.chdir(self.old_dir)
         shutil.rmtree(self.temp_dir, ignore_errors=True)
-        print('The test dir successfully removed!')
+        logger.info(
+            f'Temporary directory {self.temp_dir} successfully removed!')
 
     @staticmethod
     def get_raw_downloaded_file_path(extracted_path):
@@ -68,6 +73,40 @@ class DatasetUploadTest(unittest.TestCase):
             dataset_name=self.dataset_name,
             namespace=self.namespace)
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_ds_upload_dir(self):
+        ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train')
+        config_train = ms_ds_train._hf_ds.config_kwargs
+        extracted_path_train = config_train.get('split_config').get('train')
+
+        MsDataset.upload(
+            object_name='train',
+            local_file_path=os.path.join(extracted_path_train,
+                                         'Pets/images/train'),
+            dataset_name=self.dataset_name,
+            namespace=self.namespace)
+        MsDataset.upload(
+            object_name='val',
+            local_file_path=os.path.join(extracted_path_train,
+                                         'Pets/images/val'),
+            dataset_name=self.dataset_name,
+            namespace=self.namespace)
+
+        objects = list_dataset_objects(
+            hub_api=self.api,
+            max_limit=-1,
+            is_recursive=True,
+            dataset_name=self.dataset_name,
+            namespace=self.namespace,
+            version=DEFAULT_DATASET_REVISION)
+
+        logger.info(f'{len(objects)} objects have been uploaded: {objects}')
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_ds_download_dir(self):
+        test_ds = MsDataset.load(self.dataset_name, self.namespace)
+        assert test_ds.config_kwargs['split_config'].values()
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ds_clone_meta(self):
         MsDataset.clone_meta(