1. MsDataset supports uploading data files (compressed archives)
2. MsDataset supports cloning and uploading dataset meta data
3. Downloading a dataset via MsDataset.load() now reports the download, so the dataset download count can be shown on the web page
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9831232
master
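For context, a minimal usage sketch of the download path touched by this change (dataset name, namespace and split are illustrative placeholders): when MsDataset.load() resolves a dataset from the hub, it now also notifies the hub via HubApi.on_dataset_download() so the download count can be displayed on the dataset's web page.

    from modelscope.msdatasets import MsDataset

    # Load a dataset from the ModelScope hub; the hub-side download counter
    # is increased once per load() of a remote dataset.
    ds_train = MsDataset.load('pets_small', namespace='modelscope', split='train')
    print(ds_train)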
@@ -32,6 +32,7 @@ do
 -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \
 -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \
 -e TEST_LEVEL=$TEST_LEVEL \
+-e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
 --workdir=$CODE_DIR_IN_CONTAINER \
 --net host \
 ${IMAGE_NAME}:${IMAGE_VERSION} \
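The new CI variable is consumed by the dataset-upload tests added later in this change; roughly, the tests read it as follows (a sketch mirroring the test setUp below):

    import os
    from modelscope.hub.api import HubApi

    token = os.getenv('TEST_UPLOAD_MS_TOKEN')  # must be set, otherwise the upload tests fail fast
    HubApi().login(token)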
@@ -1,7 +1,6 @@
 import os
 import pickle
 import shutil
-import subprocess
 from collections import defaultdict
 from http import HTTPStatus
 from http.cookiejar import CookieJar
@@ -16,8 +15,7 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA,
                                       API_RESPONSE_FIELD_MESSAGE,
                                       API_RESPONSE_FIELD_USERNAME,
                                       DEFAULT_CREDENTIALS_PATH)
-from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
-                                          HUB_DATASET_ENDPOINT)
+from modelscope.utils.config_ds import DOWNLOADED_DATASETS_PATH
 from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                        DEFAULT_MODEL_REVISION,
                                        DatasetFormations, DatasetMetaFormats,
@@ -26,7 +24,8 @@ from modelscope.utils.logger import get_logger
 from .errors import (InvalidParameter, NotExistError, RequestError,
                      datahub_raise_on_error, handle_http_response, is_ok,
                      raise_on_error)
-from .utils.utils import get_endpoint, model_id_to_group_owner_name
+from .utils.utils import (get_dataset_hub_endpoint, get_endpoint,
+                          model_id_to_group_owner_name)

 logger = get_logger()
@@ -35,7 +34,8 @@ class HubApi:

    def __init__(self, endpoint=None, dataset_endpoint=None):
         self.endpoint = endpoint if endpoint is not None else get_endpoint()
-        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else HUB_DATASET_ENDPOINT
+        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint(
+        )

     def login(
         self,
@@ -376,6 +376,27 @@ class HubApi:
                       f'ststoken?Revision={revision}'
         return self.datahub_remote_call(datahub_url)

+    def get_dataset_access_config_session(
+            self,
+            cookies: CookieJar,
+            dataset_name: str,
+            namespace: str,
+            revision: Optional[str] = DEFAULT_DATASET_REVISION):
+        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
+                      f'ststoken?Revision={revision}'
+
+        cookies = requests.utils.dict_from_cookiejar(cookies)
+        r = requests.get(url=datahub_url, cookies=cookies)
+        resp = r.json()
+        datahub_raise_on_error(datahub_url, resp)
+        return resp['Data']
+
+    def on_dataset_download(self, dataset_name: str, namespace: str) -> None:
+        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase'
+        r = requests.post(url)
+        r.raise_for_status()
+
     @staticmethod
     def datahub_remote_call(url):
         r = requests.get(url)
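A hedged sketch of how the new HubApi dataset methods are meant to be used (dataset name and namespace are placeholders; the normal entry point is MsDataset.upload() further down in this change):

    from modelscope.hub.api import HubApi

    api = HubApi()
    # Requires a prior api.login(<sdk-token>); check_cookies_upload_data validates the saved cookies.
    cookies = api.check_cookies_upload_data(use_cookies=True)
    # Session-scoped OSS access config for uploading files of a dataset revision.
    oss_config = api.get_dataset_access_config_session(
        cookies=cookies, dataset_name='your-dataset', namespace='your-namespace')
    # Increase the download counter shown on the dataset's web page.
    api.on_dataset_download(dataset_name='your-dataset', namespace='your-namespace')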
@@ -383,6 +404,9 @@ class HubApi:
         datahub_raise_on_error(url, resp)
         return resp['Data']

+    def check_cookies_upload_data(self, use_cookies) -> CookieJar:
+        return self._check_cookie(use_cookies=use_cookies)
+

 class ModelScopeConfig:
     path_credential = expanduser(DEFAULT_CREDENTIALS_PATH)
@@ -2,7 +2,8 @@ import os
 from typing import Optional

 from modelscope.hub.errors import GitError, InvalidParameter, NotLoginException
-from modelscope.utils.constant import DEFAULT_MODEL_REVISION
+from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
+                                       DEFAULT_MODEL_REVISION)
 from modelscope.utils.logger import get_logger
 from .api import ModelScopeConfig
 from .git import GitCommandWrapper
@@ -15,14 +16,12 @@ class Repository:
     """A local representation of the model git repository.
     """

-    def __init__(
-        self,
-        model_dir: str,
-        clone_from: str,
-        revision: Optional[str] = DEFAULT_MODEL_REVISION,
-        auth_token: Optional[str] = None,
-        git_path: Optional[str] = None,
-    ):
+    def __init__(self,
+                 model_dir: str,
+                 clone_from: str,
+                 revision: Optional[str] = DEFAULT_MODEL_REVISION,
+                 auth_token: Optional[str] = None,
+                 git_path: Optional[str] = None):
         """
         Instantiate a Repository object by cloning the remote ModelScopeHub repo

         Args:
@@ -86,6 +85,7 @@ class Repository:
              branch: Optional[str] = DEFAULT_MODEL_REVISION,
              force: bool = False):
         """Push local files to remote, this method will do.
+        git pull
         git add
         git commit
         git push
@@ -117,3 +117,105 @@ class Repository:
             url=url,
             local_branch=branch,
             remote_branch=branch)
+
+
+class DatasetRepository:
+    """A local representation of the dataset (metadata) git repository.
+    """
+
+    def __init__(self,
+                 repo_work_dir: str,
+                 dataset_id: str,
+                 revision: Optional[str] = DEFAULT_DATASET_REVISION,
+                 auth_token: Optional[str] = None,
+                 git_path: Optional[str] = None):
+        """
+        Instantiate a Dataset Repository object by cloning the remote ModelScope dataset repo
+
+        Args:
+            repo_work_dir(`str`):
+                The dataset repo root directory.
+            dataset_id:
+                dataset id in ModelScope from which git clone
+            revision(`Optional[str]`):
+                revision of the dataset you want to clone from. Can be any of a branch, tag or commit hash
+            auth_token(`Optional[str]`):
+                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
+                as the token is already saved when you login the first time, if None, we will use saved token.
+            git_path:(`Optional[str]`):
+                The git command line path, if None, we use 'git'
+        """
+        self.dataset_id = dataset_id
+        self.repo_work_dir = repo_work_dir
+        self.repo_base_dir = os.path.dirname(repo_work_dir)
+        self.repo_name = os.path.basename(repo_work_dir)
+        self.revision = revision
+        if auth_token:
+            self.auth_token = auth_token
+        else:
+            self.auth_token = ModelScopeConfig.get_token()
+        self.git_wrapper = GitCommandWrapper(git_path)
+        os.makedirs(self.repo_work_dir, exist_ok=True)
+        self.repo_url = self._get_repo_url(dataset_id=dataset_id)
+
+    def clone(self) -> str:
+        # check local repo dir, directory not empty.
+        if os.listdir(self.repo_work_dir):
+            remote_url = self._get_remote_url()
+            remote_url = self.git_wrapper.remove_token_from_url(remote_url)
+            # no need clone again
+            if remote_url and remote_url == self.repo_url:
+                return ''
+        logger.info('Cloning repo from {} '.format(self.repo_url))
+        self.git_wrapper.clone(self.repo_base_dir, self.auth_token,
+                               self.repo_url, self.repo_name, self.revision)
+        return self.repo_work_dir
+
+    def push(self,
+             commit_message: str,
+             branch: Optional[str] = DEFAULT_DATASET_REVISION,
+             force: bool = False):
+        """Push local files to remote, this method will do.
+        git pull
+        git add
+        git commit
+        git push
+
+        Args:
+            commit_message (str): commit message
+            branch (Optional[str], optional): which branch to push.
+            force (Optional[bool]): whether to use forced-push.
+        """
+        if commit_message is None or not isinstance(commit_message, str):
+            msg = 'commit_message must be provided!'
+            raise InvalidParameter(msg)
+
+        if not isinstance(force, bool):
+            raise InvalidParameter('force must be bool')
+
+        if not self.auth_token:
+            raise NotLoginException('Must login to push, please login first.')
+
+        self.git_wrapper.config_auth_token(self.repo_work_dir, self.auth_token)
+        self.git_wrapper.add_user_info(self.repo_base_dir, self.repo_name)
+        remote_url = self.git_wrapper.get_repo_remote_url(self.repo_work_dir)
+
+        self.git_wrapper.pull(self.repo_work_dir)
+        self.git_wrapper.add(self.repo_work_dir, all_files=True)
+        self.git_wrapper.commit(self.repo_work_dir, commit_message)
+        self.git_wrapper.push(
+            repo_dir=self.repo_work_dir,
+            token=self.auth_token,
+            url=remote_url,
+            local_branch=branch,
+            remote_branch=branch)
+
+    def _get_repo_url(self, dataset_id):
+        return f'{get_endpoint()}/datasets/{dataset_id}.git'
+
+    def _get_remote_url(self):
+        try:
+            remote = self.git_wrapper.get_repo_remote_url(self.repo_work_dir)
+        except GitError:
+            remote = None
+        return remote
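A short usage sketch of the new DatasetRepository (paths and dataset id are placeholders; the MsDataset.clone_meta()/upload_meta() methods added below wrap exactly this flow):

    from modelscope.hub.repository import DatasetRepository

    repo = DatasetRepository(
        repo_work_dir='/tmp/my_dataset_meta',        # local working directory for the meta repo
        dataset_id='your-namespace/your-dataset')    # hub dataset id
    repo.clone()                                     # skipped if the directory already holds this repo
    # ... edit the meta files locally ...
    repo.push(commit_message='Update dataset meta')  # pull, add, commit, push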
@@ -1,7 +1,8 @@
 import hashlib
 import os

-from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN,
+from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT,
+                                      DEFAULT_MODELSCOPE_DOMAIN,
                                       DEFAULT_MODELSCOPE_GROUP,
                                       MODEL_ID_SEPARATOR,
                                       MODELSCOPE_URL_SCHEME)
@@ -38,6 +39,11 @@ def get_endpoint():
     return MODELSCOPE_URL_SCHEME + modelscope_domain


+def get_dataset_hub_endpoint():
+    return os.environ.get('HUB_DATASET_ENDPOINT',
+                          DEFAULT_MODELSCOPE_DATA_ENDPOINT)
+
+
 def compute_hash(file_path):
     BUFFER_SIZE = 1024 * 64  # 64k buffer size
     sha256_hash = hashlib.sha256()
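With this helper, the dataset hub endpoint can be overridden per process through an environment variable; a small sketch (the URL is a placeholder, and the import path follows the `from .utils.utils import ...` usage in hub/api.py above):

    import os
    from modelscope.hub.utils.utils import get_dataset_hub_endpoint

    os.environ['HUB_DATASET_ENDPOINT'] = 'https://your-datahub.example.com'
    print(get_dataset_hub_endpoint())  # falls back to DEFAULT_MODELSCOPE_DATA_ENDPOINT when unset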
@@ -12,9 +12,11 @@ from datasets.utils.download_manager import DownloadConfig
 from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)

-from modelscope.msdatasets.config import MS_DATASETS_CACHE
+from modelscope.hub.repository import DatasetRepository
 from modelscope.utils.config import ConfigDict
-from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
+from modelscope.utils.config_ds import MS_DATASETS_CACHE
+from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE,
+                                       DEFAULT_DATASET_REVISION,
                                        DatasetFormations, DownloadMode, Hubs)
 from modelscope.utils.logger import get_logger

 from .task_datasets.builder import build_task_dataset
@@ -23,6 +25,7 @@ from .utils.dataset_utils import (get_dataset_files,
                                   get_target_dataset_structure,
                                   load_dataset_builder)
 from .utils.download_utils import DatasetDownloadManager
+from .utils.upload_utils import DatasetUploadManager

 logger = get_logger()
@@ -97,7 +100,7 @@ class MsDataset:
     @staticmethod
     def load(
         dataset_name: Union[str, list],
-        namespace: Optional[str] = 'modelscope',
+        namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
        target: Optional[str] = None,
        version: Optional[str] = DEFAULT_DATASET_REVISION,
        hub: Optional[Hubs] = Hubs.modelscope,
@@ -171,15 +174,17 @@
                                              Mapping[str, Union[str, Sequence[str]]]]] = None,
                      download_mode: Optional[DownloadMode] = None,
                      **config_kwargs) -> Union[dict, 'MsDataset']:
+        from modelscope.hub.api import HubApi
+        api = HubApi()
+        download_dataset = ''
         if isinstance(dataset_name, str):
+            download_dataset = dataset_name
             dataset_formation = DatasetFormations.native
             if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
                     (os.path.isfile(dataset_name) and dataset_name.endswith('.py')):
                 dataset_formation = DatasetFormations.hf_compatible
             elif is_relative_path(dataset_name) and dataset_name.count(
                     '/') == 0:
-                from modelscope.hub.api import HubApi
-                api = HubApi()
                 dataset_scripts, dataset_formation, download_dir = api.fetch_dataset_scripts(
                     dataset_name, namespace, download_mode, version)
                 # dataset organized to be compatible with hf format
@@ -219,6 +224,11 @@
         else:
             raise TypeError('path must be a str or a list, but got'
                             f' {type(dataset_name)}')
+
+        if download_dataset:
+            api.on_dataset_download(
+                dataset_name=download_dataset, namespace=namespace)
+
         return MsDataset.from_hf_dataset(dataset, target=target)

     @staticmethod
@@ -539,3 +549,100 @@
     def to_hf_dataset(self) -> Dataset:
         self._hf_ds.reset_format()
         return self._hf_ds
+
+    @staticmethod
+    def upload(object_name: str,
+               local_file_path: str,
+               dataset_name: str,
+               namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
+               version: Optional[str] = DEFAULT_DATASET_REVISION) -> None:
+        """Upload dataset file to the ModelScope Hub. Please login to the ModelScope Hub first.
+
+        Args:
+            object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip
+            local_file_path (str): Local file to upload
+            dataset_name (str): Name of the dataset
+            namespace(str, optional): Namespace of the dataset
+            version: Optional[str]: Version of the dataset
+
+        Returns:
+            None
+        """
+        from modelscope.hub.api import HubApi
+        _hub_api = HubApi()
+        cookies = _hub_api.check_cookies_upload_data(use_cookies=True)
+        _upload_manager = DatasetUploadManager(
+            dataset_name=dataset_name,
+            namespace=namespace,
+            version=version,
+            cookies=cookies)
+        _upload_manager.upload(object_name, local_file_path)
+
+    @staticmethod
+    def clone_meta(dataset_work_dir: str,
+                   dataset_id: str,
+                   revision: Optional[str] = DEFAULT_DATASET_REVISION,
+                   auth_token: Optional[str] = None,
+                   git_path: Optional[str] = None) -> None:
+        """Clone meta-file of dataset from the ModelScope Hub.
+
+        Args:
+            dataset_work_dir (str): Current git working directory.
+            dataset_id (str): Dataset id, It should be like your-namespace/your-dataset-name .
+            revision(`Optional[str]`):
+                revision of the model you want to clone from. Can be any of a branch, tag or commit hash
+            auth_token(`Optional[str]`):
+                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
+                as the token is already saved when you login the first time, if None, we will use saved token.
+            git_path:(`Optional[str]`):
+                The git command line path, if None, we use 'git'
+
+        Returns:
+            None
+        """
+        _repo = DatasetRepository(
+            repo_work_dir=dataset_work_dir,
+            dataset_id=dataset_id,
+            revision=revision,
+            auth_token=auth_token,
+            git_path=git_path)
+        clone_work_dir = _repo.clone()
+        if clone_work_dir:
+            logger.info('Already cloned repo to: {}'.format(clone_work_dir))
+        else:
+            logger.warning('The repo working dir is already ex.')
+
+    @staticmethod
+    def upload_meta(dataset_work_dir: str,
+                    dataset_id: str,
+                    commit_message: str,
+                    revision: Optional[str] = DEFAULT_DATASET_REVISION,
+                    auth_token: Optional[str] = None,
+                    git_path: Optional[str] = None,
+                    force: bool = False) -> None:
+        """Upload meta-file of dataset to the ModelScope Hub. Please clone the meta-data from the ModelScope Hub first.
+
+        Args:
+            dataset_work_dir (str): Current working directory.
+            dataset_id (str): Dataset id, It should be like your-namespace/your-dataset-name .
+            commit_message (str): Commit message.
+            revision(`Optional[str]`):
+                revision of the model you want to clone from. Can be any of a branch, tag or commit hash
+            auth_token(`Optional[str]`):
+                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
+                as the token is already saved when you login the first time, if None, we will use saved token.
+            git_path:(`Optional[str]`):
+                The git command line path, if None, we use 'git'
+            force (Optional[bool]): whether to use forced-push.
+
+        Returns:
+            None
+        """
+        _repo = DatasetRepository(
+            repo_work_dir=dataset_work_dir,
+            dataset_id=dataset_id,
+            revision=revision,
+            auth_token=auth_token,
+            git_path=git_path)
+        _repo.push(commit_message=commit_message, branch=revision, force=force)
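Taken together, the three new MsDataset static methods give the following user-facing flow (names and paths are placeholders; a prior HubApi().login() is assumed):

    from modelscope.msdatasets import MsDataset

    # 1. Upload a zipped data file into an existing dataset repo.
    MsDataset.upload(
        object_name='my_dataset.zip',
        local_file_path='/path/to/my_dataset.zip',
        dataset_name='my_dataset',
        namespace='your-namespace')

    # 2. Clone the dataset meta repo, edit it locally, then push the change back.
    MsDataset.clone_meta(
        dataset_work_dir='/tmp/my_dataset_meta',
        dataset_id='your-namespace/my_dataset')
    MsDataset.upload_meta(
        dataset_work_dir='/tmp/my_dataset_meta',
        dataset_id='your-namespace/my_dataset',
        commit_message='Update dataset card')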
@@ -1,6 +1,5 @@
 from __future__ import print_function
 import os
-import sys

 import oss2
 from datasets.utils.file_utils import hash_url_to_filename
@@ -19,6 +18,12 @@ class OssUtilities:
         self.oss_dir = oss_config['Dir']
         self.oss_backup_dir = oss_config['BackupDir']

+    @staticmethod
+    def _percentage(consumed_bytes, total_bytes):
+        if total_bytes:
+            rate = int(100 * (float(consumed_bytes) / float(total_bytes)))
+            print('\r{0}% '.format(rate), end='', flush=True)
+
     def download(self, oss_file_name, cache_dir):
         candidate_key = os.path.join(self.oss_dir, oss_file_name)
         candidate_key_backup = os.path.join(self.oss_backup_dir, oss_file_name)
@@ -27,11 +32,25 @@ class OssUtilities:
         filename = hash_url_to_filename(file_oss_key, etag=None)
         local_path = os.path.join(cache_dir, filename)

-        def percentage(consumed_bytes, total_bytes):
-            if total_bytes:
-                rate = int(100 * (float(consumed_bytes) / float(total_bytes)))
-                print('\r{0}% '.format(rate), end='', flush=True)
-
         self.bucket.get_object_to_file(
-            file_oss_key, local_path, progress_callback=percentage)
+            file_oss_key, local_path, progress_callback=self._percentage)
         return local_path
+
+    def upload(self, oss_file_name: str, local_file_path: str) -> str:
+        max_retries = 3
+        retry_count = 0
+        object_key = os.path.join(self.oss_dir, oss_file_name)
+        while True:
+            try:
+                retry_count += 1
+                self.bucket.put_object_from_file(
+                    object_key,
+                    local_file_path,
+                    progress_callback=self._percentage)
+                break
+            except Exception:
+                if retry_count >= max_retries:
+                    raise
+        return object_key
@@ -0,0 +1,23 @@
+from http.cookiejar import CookieJar
+
+from .oss_utils import OssUtilities
+
+
+class DatasetUploadManager(object):
+
+    def __init__(self, dataset_name: str, namespace: str, version: str,
+                 cookies: CookieJar):
+        from modelscope.hub.api import HubApi
+        api = HubApi()
+        oss_config = api.get_dataset_access_config_session(
+            cookies=cookies,
+            dataset_name=dataset_name,
+            namespace=namespace,
+            revision=version)
+        self.oss_utilities = OssUtilities(oss_config)
+
+    def upload(self, oss_file_name: str, local_file_path: str) -> str:
+        oss_object_key = self.oss_utilities.upload(
+            oss_file_name=oss_file_name, local_file_path=local_file_path)
+        return oss_object_key
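DatasetUploadManager is the glue between the hub session and OSS: it exchanges the login cookies for a session-scoped OSS config and then delegates to OssUtilities.upload(). A direct-use sketch (MsDataset.upload() is the intended entry point; names are placeholders):

    from modelscope.hub.api import HubApi
    from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager

    cookies = HubApi().check_cookies_upload_data(use_cookies=True)
    manager = DatasetUploadManager(
        dataset_name='my_dataset', namespace='your-namespace',
        version='master', cookies=cookies)
    object_key = manager.upload('my_dataset.zip', '/path/to/my_dataset.zip')
    print(object_key)  # the OSS object key, e.g. '<oss-dir>/my_dataset.zip'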
@@ -254,6 +254,7 @@ class Frameworks(object):

 DEFAULT_MODEL_REVISION = 'master'
 DEFAULT_DATASET_REVISION = 'master'
+DEFAULT_DATASET_NAMESPACE = 'modelscope'


 class ModeKeys:
@@ -0,0 +1,95 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+import zipfile
+
+from modelscope.msdatasets import MsDataset
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.test_utils import test_level
+
+KEY_EXTRACTED = 'extracted'
+
+
+class DatasetUploadTest(unittest.TestCase):
+
+    def setUp(self):
+        self.old_dir = os.getcwd()
+        self.dataset_name = 'small_coco_for_test'
+        self.dataset_file_name = self.dataset_name
+        self.prepared_dataset_name = 'pets_small'
+        self.token = os.getenv('TEST_UPLOAD_MS_TOKEN')
+        error_msg = 'The modelscope token can not be empty, please set env variable: TEST_UPLOAD_MS_TOKEN'
+        self.assertIsNotNone(self.token, msg=error_msg)
+        from modelscope.hub.api import HubApi
+        from modelscope.hub.api import ModelScopeConfig
+        self.api = HubApi()
+        self.api.login(self.token)
+        # get user info
+        self.namespace, _ = ModelScopeConfig.get_user_info()
+
+        self.temp_dir = tempfile.mkdtemp()
+        self.test_work_dir = os.path.join(self.temp_dir, self.dataset_name)
+        self.test_meta_dir = os.path.join(self.test_work_dir, 'meta')
+        if not os.path.exists(self.test_work_dir):
+            os.makedirs(self.test_work_dir)
+
+    def tearDown(self):
+        os.chdir(self.old_dir)
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+        print('The test dir successfully removed!')
+
+    @staticmethod
+    def get_raw_downloaded_file_path(extracted_path):
+        raw_downloaded_file_path = ''
+        raw_data_dir = os.path.abspath(
+            os.path.join(extracted_path, '../../..'))
+        for root, dirs, files in os.walk(raw_data_dir):
+            if KEY_EXTRACTED in dirs:
+                for file in files:
+                    curr_file_path = os.path.join(root, file)
+                    if zipfile.is_zipfile(curr_file_path):
+                        raw_downloaded_file_path = curr_file_path
+        return raw_downloaded_file_path
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_ds_upload(self):
+        # Get the prepared data from hub, using default modelscope namespace
+        ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train')
+        config_res = ms_ds_train._hf_ds.config_kwargs
+        extracted_path = config_res.get('split_config').get('train')
+        raw_zipfile_path = self.get_raw_downloaded_file_path(extracted_path)
+
+        MsDataset.upload(
+            object_name=self.dataset_file_name + '.zip',
+            local_file_path=raw_zipfile_path,
+            dataset_name=self.dataset_name,
+            namespace=self.namespace)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_ds_clone_meta(self):
+        MsDataset.clone_meta(
+            dataset_work_dir=self.test_meta_dir,
+            dataset_id=os.path.join(self.namespace, self.dataset_name))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_ds_upload_meta(self):
+        # Clone dataset meta repo first.
+        MsDataset.clone_meta(
+            dataset_work_dir=self.test_meta_dir,
+            dataset_id=os.path.join(self.namespace, self.dataset_name))
+
+        with open(os.path.join(self.test_meta_dir, ModelFile.README),
+                  'a') as f:
+            f.write('\nThis is a line for unit test.')
+
+        MsDataset.upload_meta(
+            dataset_work_dir=self.test_meta_dir,
+            dataset_id=os.path.join(self.namespace, self.dataset_name),
+            commit_message='Update for unit test.')
+
+
+if __name__ == '__main__':
+    unittest.main()
@@ -4,7 +4,7 @@ from modelscope.models import Model
 from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.preprocessors.base import Preprocessor
-from modelscope.utils.constant import DownloadMode
+from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode
 from modelscope.utils.test_utils import require_tf, require_torch, test_level
@@ -35,7 +35,7 @@ class MsDatasetTest(unittest.TestCase):
     def test_coco(self):
         ms_ds_train = MsDataset.load(
             'pets_small',
-            namespace='modelscope',
+            namespace=DEFAULT_DATASET_NAMESPACE,
             split='train',
             download_mode=DownloadMode.FORCE_REDOWNLOAD,
             classes=('1', '2'))