Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9233644 (master)
@@ -1,6 +1,8 @@
 import os
 import pickle
+import shutil
 import subprocess
+from collections import defaultdict
 from http.cookiejar import CookieJar
 from os.path import expanduser
 from typing import List, Optional, Tuple, Union
@@ -8,8 +10,11 @@ from typing import List, Optional, Tuple, Union
 import requests
 from modelscope.utils.logger import get_logger
+from ..msdatasets.config import DOWNLOADED_DATASETS_PATH, HUB_DATASET_ENDPOINT
+from ..utils.constant import DownloadMode
 from .constants import MODELSCOPE_URL_SCHEME
-from .errors import InvalidParameter, NotExistError, is_ok, raise_on_error
+from .errors import (InvalidParameter, NotExistError, datahub_raise_on_error,
+                     is_ok, raise_on_error)
 from .utils.utils import (get_endpoint, get_gitlab_domain,
                           model_id_to_group_owner_name)
@@ -18,8 +23,9 @@ logger = get_logger()
 class HubApi:

-    def __init__(self, endpoint=None):
+    def __init__(self, endpoint=None, dataset_endpoint=None):
         self.endpoint = endpoint if endpoint is not None else get_endpoint()
+        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else HUB_DATASET_ENDPOINT

     def login(
         self,
@@ -241,6 +247,70 @@ class HubApi:
             files.append(file)
         return files
+
+    def list_datasets(self):
+        path = f'{self.dataset_endpoint}/api/v1/datasets'
+        headers = None
+        params = {}
+        r = requests.get(path, params=params, headers=headers)
+        r.raise_for_status()
+        dataset_list = r.json()['Data']
+        return [x['Name'] for x in dataset_list]
+
+    def fetch_dataset_scripts(self,
+                              dataset_name: str,
+                              namespace: str,
+                              download_mode: Optional[DownloadMode],
+                              version: Optional[str] = 'master'):
+        if namespace is None:
+            raise ValueError(
+                f'Dataset from Hubs.modelscope should have a valid "namespace", but got {namespace}'
+            )
+        version = version or 'master'
+        cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
+                                 namespace, version)
+        download_mode = DownloadMode(download_mode
+                                     or DownloadMode.REUSE_DATASET_IF_EXISTS)
+        if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(
+                cache_dir):
+            shutil.rmtree(cache_dir)
+        os.makedirs(cache_dir, exist_ok=True)
+        # Resolve the dataset id from its namespace and name.
+        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
+        r = requests.get(datahub_url)
+        resp = r.json()
+        datahub_raise_on_error(datahub_url, resp)
+        dataset_id = resp['Data']['Id']
+        # List the repository tree at the requested revision.
+        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
+        r = requests.get(datahub_url)
+        resp = r.json()
+        datahub_raise_on_error(datahub_url, resp)
+        file_list = resp['Data']
+        if file_list is None:
+            raise NotExistError(
+                f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, '
+                f'version = {version}] does not exist')
+        file_list = file_list['Files']
+        local_paths = defaultdict(list)
+        for file_info in file_list:
+            file_path = file_info['Path']
+            if file_path.endswith('.py'):
+                # Download each python loading script and cache it locally.
+                datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \
+                              f'Revision={version}&Path={file_path}'
+                r = requests.get(datahub_url)
+                r.raise_for_status()
+                content = r.json()['Data']['Content']
+                local_path = os.path.join(cache_dir, file_path)
+                if os.path.exists(local_path):
+                    logger.warning(
+                        f"Reusing dataset {dataset_name}'s python file ({local_path})"
+                    )
+                    local_paths['py'].append(local_path)
+                    continue
+                with open(local_path, 'w') as f:
+                    f.writelines(content)
+                local_paths['py'].append(local_path)
+        return local_paths

 class ModelScopeConfig:
     path_credential = expanduser('~/.modelscope/credentials')
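
For context while reviewing, a minimal usage sketch of the two new methods; the dataset name and namespace below are placeholders, not values this PR ships:

    from modelscope.hub.api import HubApi
    from modelscope.utils.constant import DownloadMode

    api = HubApi()  # dataset calls default to HUB_DATASET_ENDPOINT
    print(api.list_datasets())  # names of all datasets on the hub

    # Cache the dataset's .py loading scripts under DOWNLOADED_DATASETS_PATH.
    scripts = api.fetch_dataset_scripts(
        dataset_name='squad',  # placeholder
        namespace='damo',      # placeholder
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
        version='master')
    print(scripts['py'])  # local paths of the downloaded scripts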
@@ -1,6 +1,8 @@
 MODELSCOPE_URL_SCHEME = 'http://'
-DEFAULT_MODELSCOPE_DOMAIN = '47.94.223.21:31090'
+DEFAULT_MODELSCOPE_IP = '47.94.223.21'
+DEFAULT_MODELSCOPE_DOMAIN = DEFAULT_MODELSCOPE_IP + ':31090'
 DEFAULT_MODELSCOPE_GITLAB_DOMAIN = '101.201.119.157:31102'
+DEFAULT_MODELSCOPE_DATA_ENDPOINT = MODELSCOPE_URL_SCHEME + DEFAULT_MODELSCOPE_IP + ':31752'
 DEFAULT_MODELSCOPE_GROUP = 'damo'
 MODEL_ID_SEPARATOR = '/'
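
A quick sanity check on how the new constant composes (not part of the diff, just plain string concatenation):

    # MODELSCOPE_URL_SCHEME + DEFAULT_MODELSCOPE_IP + ':31752'
    from modelscope.hub.constants import DEFAULT_MODELSCOPE_DATA_ENDPOINT
    assert DEFAULT_MODELSCOPE_DATA_ENDPOINT == 'http://47.94.223.21:31752'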
@@ -1,9 +1,7 @@
 import hashlib
 import logging
 import os
 import pickle
 import tempfile
 import time
 from shutil import move, rmtree
 from modelscope.utils.logger import get_logger
@@ -2,6 +2,8 @@ import os
 from pathlib import Path

 # Cache location
+from modelscope.hub.constants import DEFAULT_MODELSCOPE_DATA_ENDPOINT
+
 DEFAULT_CACHE_HOME = '~/.cache'
 CACHE_HOME = os.getenv('CACHE_HOME', DEFAULT_CACHE_HOME)
 DEFAULT_MS_CACHE_HOME = os.path.join(CACHE_HOME, 'modelscope/hub')
@@ -18,5 +20,5 @@ DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(MS_DATASETS_CACHE,
 DOWNLOADED_DATASETS_PATH = Path(
     os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH))
-MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT',
-                                 'http://47.94.223.21:31752')
+HUB_DATASET_ENDPOINT = os.environ.get('HUB_DATASET_ENDPOINT',
+                                      DEFAULT_MODELSCOPE_DATA_ENDPOINT)
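
Since HUB_DATASET_ENDPOINT is read from the environment at import time, a deployment can point the dataset client elsewhere; a sketch with a placeholder URL:

    import os
    # Must be set before modelscope.msdatasets.config is first imported.
    os.environ['HUB_DATASET_ENDPOINT'] = 'http://datahub.example.com:31752'

    from modelscope.msdatasets.config import HUB_DATASET_ENDPOINT
    assert HUB_DATASET_ENDPOINT == 'http://datahub.example.com:31752'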
@@ -11,7 +11,6 @@ from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)
 from modelscope.msdatasets.config import MS_DATASETS_CACHE
-from modelscope.msdatasets.utils.ms_api import MsApi
 from modelscope.utils.constant import DownloadMode, Hubs
 from modelscope.utils.logger import get_logger
@@ -146,8 +145,9 @@ class MsDataset:
             use_hf = True
         elif is_relative_path(dataset_name) and dataset_name.count(
                 '/') == 0:
-            ms_api = MsApi()
-            dataset_scripts = ms_api.fetch_dataset_scripts(
+            from modelscope.hub.api import HubApi
+            api = HubApi()
+            dataset_scripts = api.fetch_dataset_scripts(
                 dataset_name, namespace, download_mode, version)
             if 'py' in dataset_scripts:  # dataset copied from hf datasets
                 dataset_name = dataset_scripts['py'][0]
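
This branch fires when a bare dataset name is passed. Assuming the surrounding method is MsDataset.load with these keyword arguments, a hypothetical call that now routes through HubApi.fetch_dataset_scripts:

    from modelscope.msdatasets import MsDataset
    from modelscope.utils.constant import DownloadMode

    # 'squad'/'damo' are placeholders for any hub dataset whose loading
    # script was copied from HF datasets.
    ds = MsDataset.load(
        'squad',
        namespace='damo',
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)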
@@ -1,84 +0,0 @@
-import os
-import shutil
-from collections import defaultdict
-from typing import Optional
-
-import requests
-
-from modelscope.hub.errors import NotExistError, datahub_raise_on_error
-from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
-                                          MS_HUB_ENDPOINT)
-from modelscope.utils.constant import DownloadMode
-from modelscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class MsApi:
-
-    def __init__(self, endpoint=MS_HUB_ENDPOINT):
-        self.endpoint = endpoint
-
-    def list_datasets(self):
-        path = f'{self.endpoint}/api/v1/datasets'
-        headers = None
-        params = {}
-        r = requests.get(path, params=params, headers=headers)
-        r.raise_for_status()
-        dataset_list = r.json()['Data']
-        return [x['Name'] for x in dataset_list]
-
-    def fetch_dataset_scripts(self,
-                              dataset_name: str,
-                              namespace: str,
-                              download_mode: Optional[DownloadMode],
-                              version: Optional[str] = 'master'):
-        if namespace is None:
-            raise ValueError(
-                f'Dataset from Hubs.modelscope should have a valid "namespace", but get {namespace}'
-            )
-        version = version or 'master'
-        cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
-                                 namespace, version)
-        download_mode = DownloadMode(download_mode
-                                     or DownloadMode.REUSE_DATASET_IF_EXISTS)
-        if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(
-                cache_dir):
-            shutil.rmtree(cache_dir)
-        os.makedirs(cache_dir, exist_ok=True)
-        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
-        r = requests.get(datahub_url)
-        resp = r.json()
-        datahub_raise_on_error(datahub_url, resp)
-        dataset_id = resp['Data']['Id']
-        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
-        r = requests.get(datahub_url)
-        resp = r.json()
-        datahub_raise_on_error(datahub_url, resp)
-        file_list = resp['Data']
-        if file_list is None:
-            raise NotExistError(
-                f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, '
-                f'version = {version}] dose not exist')
-        file_list = file_list['Files']
-        local_paths = defaultdict(list)
-        for file_info in file_list:
-            file_path = file_info['Path']
-            if file_path.endswith('.py'):
-                datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \
-                              f'Revision={version}&Path={file_path}'
-                r = requests.get(datahub_url)
-                r.raise_for_status()
-                content = r.json()['Data']['Content']
-                local_path = os.path.join(cache_dir, file_path)
-                if os.path.exists(local_path):
-                    logger.warning(
-                        f"Reusing dataset {dataset_name}'s python file ({local_path})"
-                    )
-                    local_paths['py'].append(local_path)
-                    continue
-                with open(local_path, 'w') as f:
-                    f.writelines(content)
-                local_paths['py'].append(local_path)
-        return local_paths
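
Callers of the removed helper migrate one-to-one onto HubApi; a sketch with placeholder arguments:

    from modelscope.hub.api import HubApi
    from modelscope.utils.constant import DownloadMode

    # Before (file deleted above):
    #   from modelscope.msdatasets.utils.ms_api import MsApi
    #   scripts = MsApi().fetch_dataset_scripts('squad', 'damo', None, 'master')

    # After: same signature, now served from the configurable dataset endpoint.
    scripts = HubApi().fetch_dataset_scripts(
        'squad', 'damo', DownloadMode.REUSE_DATASET_IF_EXISTS, 'master')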