Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9233644 (target branch: master)

This change merges the standalone `MsApi` dataset client (modelscope/msdatasets/utils/ms_api.py, deleted at the end of this diff) into `HubApi`: the hub client gains a configurable `dataset_endpoint` plus `list_datasets` and `fetch_dataset_scripts` methods, the hard-coded dataset endpoint moves into hub/constants.py as `DEFAULT_MODELSCOPE_DATA_ENDPOINT`, and `MsDataset` switches over to `HubApi`.
@@ -1,6 +1,8 @@
 import os
 import pickle
+import shutil
 import subprocess
+from collections import defaultdict
 from http.cookiejar import CookieJar
 from os.path import expanduser
 from typing import List, Optional, Tuple, Union
@@ -8,8 +10,11 @@ from typing import List, Optional, Tuple, Union
 import requests
 
 from modelscope.utils.logger import get_logger
+from ..msdatasets.config import DOWNLOADED_DATASETS_PATH, HUB_DATASET_ENDPOINT
+from ..utils.constant import DownloadMode
 from .constants import MODELSCOPE_URL_SCHEME
-from .errors import InvalidParameter, NotExistError, is_ok, raise_on_error
+from .errors import (InvalidParameter, NotExistError, datahub_raise_on_error,
+                     is_ok, raise_on_error)
 from .utils.utils import (get_endpoint, get_gitlab_domain,
                           model_id_to_group_owner_name)
@@ -18,8 +23,9 @@ logger = get_logger()
 class HubApi:
 
-    def __init__(self, endpoint=None):
+    def __init__(self, endpoint=None, dataset_endpoint=None):
         self.endpoint = endpoint if endpoint is not None else get_endpoint()
+        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else HUB_DATASET_ENDPOINT
 
     def login(
         self,
@@ -241,6 +247,70 @@ class HubApi:
                 files.append(file)
         return files
 
+    def list_datasets(self):
+        path = f'{self.dataset_endpoint}/api/v1/datasets'
+        headers = None
+        params = {}
+        r = requests.get(path, params=params, headers=headers)
+        r.raise_for_status()
+        dataset_list = r.json()['Data']
+        return [x['Name'] for x in dataset_list]
+
+    def fetch_dataset_scripts(self,
+                              dataset_name: str,
+                              namespace: str,
+                              download_mode: Optional[DownloadMode],
+                              version: Optional[str] = 'master'):
+        if namespace is None:
+            raise ValueError(
+                f'Dataset from Hubs.modelscope should have a valid "namespace", but got {namespace}'
+            )
+        version = version or 'master'
+        cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
+                                 namespace, version)
+        download_mode = DownloadMode(download_mode
+                                     or DownloadMode.REUSE_DATASET_IF_EXISTS)
+        if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(
+                cache_dir):
+            shutil.rmtree(cache_dir)
+        os.makedirs(cache_dir, exist_ok=True)
+        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
+        r = requests.get(datahub_url)
+        resp = r.json()
+        datahub_raise_on_error(datahub_url, resp)
+        dataset_id = resp['Data']['Id']
+        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
+        r = requests.get(datahub_url)
+        resp = r.json()
+        datahub_raise_on_error(datahub_url, resp)
+        file_list = resp['Data']
+        if file_list is None:
+            raise NotExistError(
+                f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, '
+                f'version = {version}] does not exist')
+        file_list = file_list['Files']
+        local_paths = defaultdict(list)
+        for file_info in file_list:
+            file_path = file_info['Path']
+            if file_path.endswith('.py'):
+                datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \
+                              f'Revision={version}&Path={file_path}'
+                r = requests.get(datahub_url)
+                r.raise_for_status()
+                content = r.json()['Data']['Content']
+                local_path = os.path.join(cache_dir, file_path)
+                if os.path.exists(local_path):
+                    logger.warning(
+                        f"Reusing dataset {dataset_name}'s python file ({local_path})"
+                    )
+                    local_paths['py'].append(local_path)
+                    continue
+                with open(local_path, 'w') as f:
+                    f.writelines(content)
+                local_paths['py'].append(local_path)
+        return local_paths
+
 
 class ModelScopeConfig:
     path_credential = expanduser('~/.modelscope/credentials')
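
With the dataset endpoint now wired into `HubApi.__init__`, the dataset surface lives on the same client as the model APIs. A minimal usage sketch of the two new methods; the dataset name and the custom endpoint URL below are placeholders, while `'damo'` is the `DEFAULT_MODELSCOPE_GROUP` defined in hub/constants.py:

```python
from modelscope.hub.api import HubApi
from modelscope.utils.constant import DownloadMode

api = HubApi()  # dataset_endpoint defaults to HUB_DATASET_ENDPOINT
names = api.list_datasets()  # the 'Name' field of each dataset on the hub
print(names)

# Download a dataset's .py loading scripts into the local cache; they land
# under DOWNLOADED_DATASETS_PATH/<dataset_name>/<namespace>/<version>.
local_paths = api.fetch_dataset_scripts(
    dataset_name='some_dataset',  # placeholder name
    namespace='damo',
    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    version='master')
print(local_paths['py'])

# A non-default dataset hub can be injected, e.g. for a staging environment:
staging_api = HubApi(dataset_endpoint='http://staging.example.com:31752')
```

Two small review notes on `fetch_dataset_scripts`: `os.path.join(cache_dir, file_path)` assumes a flat repository layout, so a script nested in a subdirectory would fail at `open()` unless the parent directory is created first; and `f.write(content)` would be the idiomatic call, since `f.writelines` on a string iterates it character by character.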
@@ -1,6 +1,8 @@
 MODELSCOPE_URL_SCHEME = 'http://'
-DEFAULT_MODELSCOPE_DOMAIN = '47.94.223.21:31090'
+DEFAULT_MODELSCOPE_IP = '47.94.223.21'
+DEFAULT_MODELSCOPE_DOMAIN = DEFAULT_MODELSCOPE_IP + ':31090'
 DEFAULT_MODELSCOPE_GITLAB_DOMAIN = '101.201.119.157:31102'
+DEFAULT_MODELSCOPE_DATA_ENDPOINT = MODELSCOPE_URL_SCHEME + DEFAULT_MODELSCOPE_IP + ':31752'
 DEFAULT_MODELSCOPE_GROUP = 'damo'
 MODEL_ID_SEPARATOR = '/'
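
Evaluating the refactored constants confirms the behavior is preserved: `DEFAULT_MODELSCOPE_DOMAIN` still resolves to `'47.94.223.21:31090'`, and the new `DEFAULT_MODELSCOPE_DATA_ENDPOINT` resolves to `'http://47.94.223.21:31752'`, the value previously hard-coded as the `MS_HUB_ENDPOINT` default in msdatasets/config.py:

```python
from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT,
                                      DEFAULT_MODELSCOPE_DOMAIN)

# Derived values match the literals this refactor replaces.
assert DEFAULT_MODELSCOPE_DOMAIN == '47.94.223.21:31090'
assert DEFAULT_MODELSCOPE_DATA_ENDPOINT == 'http://47.94.223.21:31752'
```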
@@ -1,9 +1,7 @@
 import hashlib
-import logging
 import os
 import pickle
 import tempfile
-import time
 from shutil import move, rmtree
 
 from modelscope.utils.logger import get_logger
@@ -2,6 +2,8 @@ import os
 from pathlib import Path
 
 # Cache location
+from modelscope.hub.constants import DEFAULT_MODELSCOPE_DATA_ENDPOINT
+
 DEFAULT_CACHE_HOME = '~/.cache'
 CACHE_HOME = os.getenv('CACHE_HOME', DEFAULT_CACHE_HOME)
 DEFAULT_MS_CACHE_HOME = os.path.join(CACHE_HOME, 'modelscope/hub')
@@ -18,5 +20,5 @@ DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(MS_DATASETS_CACHE,
 DOWNLOADED_DATASETS_PATH = Path(
     os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH))
-MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT',
-                                 'http://47.94.223.21:31752')
+HUB_DATASET_ENDPOINT = os.environ.get('HUB_DATASET_ENDPOINT',
+                                      DEFAULT_MODELSCOPE_DATA_ENDPOINT)
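
The renamed variable keeps its environment override, so deployments can still point dataset downloads elsewhere. A minimal sketch (the URL is a placeholder); note the variable must be set before the config module is first imported:

```python
import os

# Must happen before modelscope.msdatasets.config is imported anywhere,
# since the value is read once at module import time.
os.environ['HUB_DATASET_ENDPOINT'] = 'http://localhost:31752'  # placeholder

from modelscope.msdatasets.config import HUB_DATASET_ENDPOINT
assert HUB_DATASET_ENDPOINT == 'http://localhost:31752'
```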
@@ -11,7 +11,6 @@ from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)
 from modelscope.msdatasets.config import MS_DATASETS_CACHE
-from modelscope.msdatasets.utils.ms_api import MsApi
 from modelscope.utils.constant import DownloadMode, Hubs
 from modelscope.utils.logger import get_logger
@@ -146,8 +145,9 @@ class MsDataset:
             use_hf = True
         elif is_relative_path(dataset_name) and dataset_name.count(
                 '/') == 0:
-            ms_api = MsApi()
-            dataset_scripts = ms_api.fetch_dataset_scripts(
+            from modelscope.hub.api import HubApi
+            api = HubApi()
+            dataset_scripts = api.fetch_dataset_scripts(
                 dataset_name, namespace, download_mode, version)
             if 'py' in dataset_scripts:  # dataset copied from hf datasets
                 dataset_name = dataset_scripts['py'][0]
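
For context, a hedged sketch of the call path this hunk changes; the exact `MsDataset.load` signature and the `MsDataset` import path are not shown in this diff, so the keyword arguments here are assumptions:

```python
from modelscope.msdatasets import MsDataset  # assumed import path
from modelscope.utils.constant import DownloadMode

# A bare dataset name (a relative path with no '/') now resolves its loading
# script through HubApi.fetch_dataset_scripts instead of the deleted MsApi.
ds = MsDataset.load(
    'some_dataset',  # placeholder; any relative name without '/'
    namespace='damo',
    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    version='master')
```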
@@ -1,84 +0,0 @@
-import os
-import shutil
-from collections import defaultdict
-from typing import Optional
-
-import requests
-
-from modelscope.hub.errors import NotExistError, datahub_raise_on_error
-from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
-                                          MS_HUB_ENDPOINT)
-from modelscope.utils.constant import DownloadMode
-from modelscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class MsApi:
-
-    def __init__(self, endpoint=MS_HUB_ENDPOINT):
-        self.endpoint = endpoint
-
-    def list_datasets(self):
-        path = f'{self.endpoint}/api/v1/datasets'
-        headers = None
-        params = {}
-        r = requests.get(path, params=params, headers=headers)
-        r.raise_for_status()
-        dataset_list = r.json()['Data']
-        return [x['Name'] for x in dataset_list]
-
-    def fetch_dataset_scripts(self,
-                              dataset_name: str,
-                              namespace: str,
-                              download_mode: Optional[DownloadMode],
-                              version: Optional[str] = 'master'):
-        if namespace is None:
-            raise ValueError(
-                f'Dataset from Hubs.modelscope should have a valid "namespace", but got {namespace}'
-            )
-        version = version or 'master'
-        cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
-                                 namespace, version)
-        download_mode = DownloadMode(download_mode
-                                     or DownloadMode.REUSE_DATASET_IF_EXISTS)
-        if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(
-                cache_dir):
-            shutil.rmtree(cache_dir)
-        os.makedirs(cache_dir, exist_ok=True)
-        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
-        r = requests.get(datahub_url)
-        resp = r.json()
-        datahub_raise_on_error(datahub_url, resp)
-        dataset_id = resp['Data']['Id']
-        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
-        r = requests.get(datahub_url)
-        resp = r.json()
-        datahub_raise_on_error(datahub_url, resp)
-        file_list = resp['Data']
-        if file_list is None:
-            raise NotExistError(
-                f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, '
-                f'version = {version}] does not exist')
-        file_list = file_list['Files']
-        local_paths = defaultdict(list)
-        for file_info in file_list:
-            file_path = file_info['Path']
-            if file_path.endswith('.py'):
-                datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \
-                              f'Revision={version}&Path={file_path}'
-                r = requests.get(datahub_url)
-                r.raise_for_status()
-                content = r.json()['Data']['Content']
-                local_path = os.path.join(cache_dir, file_path)
-                if os.path.exists(local_path):
-                    logger.warning(
-                        f"Reusing dataset {dataset_name}'s python file ({local_path})"
-                    )
-                    local_paths['py'].append(local_path)
-                    continue
-                with open(local_path, 'w') as f:
-                    f.writelines(content)
-                local_paths['py'].append(local_path)
-        return local_paths
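
Since ms_api.py is deleted wholesale and both of its methods now live on `HubApi` unchanged (apart from reading the endpoint from `HUB_DATASET_ENDPOINT`), any remaining callers outside this diff need only a one-line migration:

```python
# Before (removed in this change):
#   from modelscope.msdatasets.utils.ms_api import MsApi
#   api = MsApi()

# After:
from modelscope.hub.api import HubApi

api = HubApi()
datasets = api.list_datasets()  # same behavior as MsApi.list_datasets()
```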