diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index 6cfad54d..45e39133 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -1,6 +1,8 @@
 import os
 import pickle
+import shutil
 import subprocess
+from collections import defaultdict
 from http.cookiejar import CookieJar
 from os.path import expanduser
 from typing import List, Optional, Tuple, Union
@@ -8,8 +10,11 @@ from typing import List, Optional, Tuple, Union
 import requests
 
 from modelscope.utils.logger import get_logger
+from ..msdatasets.config import DOWNLOADED_DATASETS_PATH, HUB_DATASET_ENDPOINT
+from ..utils.constant import DownloadMode
 from .constants import MODELSCOPE_URL_SCHEME
-from .errors import InvalidParameter, NotExistError, is_ok, raise_on_error
+from .errors import (InvalidParameter, NotExistError, datahub_raise_on_error,
+                     is_ok, raise_on_error)
 from .utils.utils import (get_endpoint, get_gitlab_domain,
                           model_id_to_group_owner_name)
 
@@ -18,8 +23,9 @@ logger = get_logger()
 
 class HubApi:
 
-    def __init__(self, endpoint=None):
+    def __init__(self, endpoint=None, dataset_endpoint=None):
         self.endpoint = endpoint if endpoint is not None else get_endpoint()
+        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else HUB_DATASET_ENDPOINT
 
     def login(
         self,
@@ -241,6 +247,70 @@ class HubApi:
             files.append(file)
         return files
 
+    def list_datasets(self):
+        path = f'{self.dataset_endpoint}/api/v1/datasets'
+        headers = None
+        params = {}
+        r = requests.get(path, params=params, headers=headers)
+        r.raise_for_status()
+        dataset_list = r.json()['Data']
+        return [x['Name'] for x in dataset_list]
+
+    def fetch_dataset_scripts(self,
+                              dataset_name: str,
+                              namespace: str,
+                              download_mode: Optional[DownloadMode],
+                              version: Optional[str] = 'master'):
+        if namespace is None:
+            raise ValueError(
+                f'Dataset from Hubs.modelscope should have a valid "namespace", but got {namespace}'
+            )
+        version = version or 'master'
+        cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
+                                 namespace, version)
+        download_mode = DownloadMode(download_mode
+                                     or DownloadMode.REUSE_DATASET_IF_EXISTS)
+        if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(
+                cache_dir):
+            shutil.rmtree(cache_dir)
+        os.makedirs(cache_dir, exist_ok=True)
+        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
+        r = requests.get(datahub_url)
+        resp = r.json()
+        datahub_raise_on_error(datahub_url, resp)
+        dataset_id = resp['Data']['Id']
+        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
+        r = requests.get(datahub_url)
+        resp = r.json()
+        datahub_raise_on_error(datahub_url, resp)
+        file_list = resp['Data']
+        if file_list is None:
+            raise NotExistError(
+                f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, '
+                f'version = {version}] does not exist')
+
+        file_list = file_list['Files']
+        local_paths = defaultdict(list)
+        for file_info in file_list:
+            file_path = file_info['Path']
+            if file_path.endswith('.py'):
+                datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \
+                              f'Revision={version}&Path={file_path}'
+                r = requests.get(datahub_url)
+                r.raise_for_status()
+                content = r.json()['Data']['Content']
+                local_path = os.path.join(cache_dir, file_path)
+                if os.path.exists(local_path):
+                    logger.warning(
+                        f"Reusing dataset {dataset_name}'s python file ({local_path})"
+                    )
+                    local_paths['py'].append(local_path)
+                    continue
+                with open(local_path, 'w') as f:
+                    f.writelines(content)
+                local_paths['py'].append(local_path)
+        return local_paths
+
 
 class ModelScopeConfig:
     path_credential = expanduser('~/.modelscope/credentials')
diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py
index 0ee451c2..91c08786 100644
--- a/modelscope/hub/constants.py
+++ b/modelscope/hub/constants.py
@@ -1,6 +1,8 @@
 MODELSCOPE_URL_SCHEME = 'http://'
-DEFAULT_MODELSCOPE_DOMAIN = '47.94.223.21:31090'
+DEFAULT_MODELSCOPE_IP = '47.94.223.21'
+DEFAULT_MODELSCOPE_DOMAIN = DEFAULT_MODELSCOPE_IP + ':31090'
 DEFAULT_MODELSCOPE_GITLAB_DOMAIN = '101.201.119.157:31102'
+DEFAULT_MODELSCOPE_DATA_ENDPOINT = MODELSCOPE_URL_SCHEME + DEFAULT_MODELSCOPE_IP + ':31752'
 DEFAULT_MODELSCOPE_GROUP = 'damo'
 MODEL_ID_SEPARATOR = '/'
diff --git a/modelscope/hub/utils/caching.py b/modelscope/hub/utils/caching.py
index 7675e49b..fc30fa27 100644
--- a/modelscope/hub/utils/caching.py
+++ b/modelscope/hub/utils/caching.py
@@ -1,9 +1,7 @@
 import hashlib
-import logging
 import os
 import pickle
 import tempfile
-import time
 from shutil import move, rmtree
 
 from modelscope.utils.logger import get_logger
diff --git a/modelscope/msdatasets/config.py b/modelscope/msdatasets/config.py
index 22390ed7..0357e823 100644
--- a/modelscope/msdatasets/config.py
+++ b/modelscope/msdatasets/config.py
@@ -2,6 +2,8 @@ import os
 from pathlib import Path
 
 # Cache location
+from modelscope.hub.constants import DEFAULT_MODELSCOPE_DATA_ENDPOINT
+
 DEFAULT_CACHE_HOME = '~/.cache'
 CACHE_HOME = os.getenv('CACHE_HOME', DEFAULT_CACHE_HOME)
 DEFAULT_MS_CACHE_HOME = os.path.join(CACHE_HOME, 'modelscope/hub')
@@ -18,5 +20,5 @@ DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(MS_DATASETS_CACHE,
 DOWNLOADED_DATASETS_PATH = Path(
     os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH))
 
-MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT',
-                                 'http://47.94.223.21:31752')
+HUB_DATASET_ENDPOINT = os.environ.get('HUB_DATASET_ENDPOINT',
+                                      DEFAULT_MODELSCOPE_DATA_ENDPOINT)
diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index 90964b36..fa7d1bf2 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -11,7 +11,6 @@ from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)
 
 from modelscope.msdatasets.config import MS_DATASETS_CACHE
-from modelscope.msdatasets.utils.ms_api import MsApi
 from modelscope.utils.constant import DownloadMode, Hubs
 from modelscope.utils.logger import get_logger
 
@@ -146,8 +145,9 @@ class MsDataset:
                 use_hf = True
             elif is_relative_path(dataset_name) and dataset_name.count(
                     '/') == 0:
-                ms_api = MsApi()
-                dataset_scripts = ms_api.fetch_dataset_scripts(
+                from modelscope.hub.api import HubApi
+                api = HubApi()
+                dataset_scripts = api.fetch_dataset_scripts(
                     dataset_name, namespace, download_mode, version)
                 if 'py' in dataset_scripts:  # dataset copied from hf datasets
                     dataset_name = dataset_scripts['py'][0]
diff --git a/modelscope/msdatasets/utils/__init__.py b/modelscope/msdatasets/utils/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modelscope/msdatasets/utils/ms_api.py b/modelscope/msdatasets/utils/ms_api.py
deleted file mode 100644
index c9b49ca1..00000000
--- a/modelscope/msdatasets/utils/ms_api.py
+++ /dev/null
@@ -1,84 +0,0 @@
-import os
-import shutil
-from collections import defaultdict
-from typing import Optional
-
-import requests
-
-from modelscope.hub.errors import NotExistError, datahub_raise_on_error
-from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
-                                          MS_HUB_ENDPOINT)
-from modelscope.utils.constant import DownloadMode
-from modelscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class MsApi:
-
-    def __init__(self, endpoint=MS_HUB_ENDPOINT):
-        self.endpoint = endpoint
-
-    def list_datasets(self):
-        path = f'{self.endpoint}/api/v1/datasets'
-        headers = None
-        params = {}
-        r = requests.get(path, params=params, headers=headers)
-        r.raise_for_status()
-        dataset_list = r.json()['Data']
-        return [x['Name'] for x in dataset_list]
-
-    def fetch_dataset_scripts(self,
-                              dataset_name: str,
-                              namespace: str,
-                              download_mode: Optional[DownloadMode],
-                              version: Optional[str] = 'master'):
-        if namespace is None:
-            raise ValueError(
-                f'Dataset from Hubs.modelscope should have a valid "namespace", but get {namespace}'
-            )
-        version = version or 'master'
-        cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
-                                 namespace, version)
-        download_mode = DownloadMode(download_mode
-                                     or DownloadMode.REUSE_DATASET_IF_EXISTS)
-        if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(
-                cache_dir):
-            shutil.rmtree(cache_dir)
-        os.makedirs(cache_dir, exist_ok=True)
-        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
-        r = requests.get(datahub_url)
-        resp = r.json()
-        datahub_raise_on_error(datahub_url, resp)
-        dataset_id = resp['Data']['Id']
-        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
-        r = requests.get(datahub_url)
-        resp = r.json()
-        datahub_raise_on_error(datahub_url, resp)
-        file_list = resp['Data']
-        if file_list is None:
-            raise NotExistError(
-                f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, '
-                f'version = {version}] dose not exist')
-
-        file_list = file_list['Files']
-        local_paths = defaultdict(list)
-        for file_info in file_list:
-            file_path = file_info['Path']
-            if file_path.endswith('.py'):
-                datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \
-                              f'Revision={version}&Path={file_path}'
-                r = requests.get(datahub_url)
-                r.raise_for_status()
-                content = r.json()['Data']['Content']
-                local_path = os.path.join(cache_dir, file_path)
-                if os.path.exists(local_path):
-                    logger.warning(
-                        f"Reusing dataset {dataset_name}'s python file ({local_path})"
-                    )
-                    local_paths['py'].append(local_path)
-                    continue
-                with open(local_path, 'w') as f:
-                    f.writelines(content)
-                local_paths['py'].append(local_path)
-        return local_paths
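Reviewer note: with modelscope/msdatasets/utils/ms_api.py removed, dataset listing and
script fetching are consolidated on HubApi. A minimal usage sketch of the migrated call
path follows; the dataset name below is an illustrative placeholder, not a value taken
from this diff.

from modelscope.hub.api import HubApi
from modelscope.utils.constant import DownloadMode

# Both constructor arguments are optional: `endpoint` falls back to
# get_endpoint(), `dataset_endpoint` to HUB_DATASET_ENDPOINT (overridable
# via the HUB_DATASET_ENDPOINT environment variable).
api = HubApi()

print(api.list_datasets())  # names of all datasets on the hub

# Fetch the .py loading scripts of one dataset. Results are cached under
# DOWNLOADED_DATASETS_PATH/<dataset_name>/<namespace>/<version>/ and the
# return value maps 'py' to the local paths of the downloaded scripts.
scripts = api.fetch_dataset_scripts(
    dataset_name='my_dataset',  # illustrative placeholder
    namespace='damo',  # must not be None; 'damo' is DEFAULT_MODELSCOPE_GROUP
    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    version='master')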