yingda.chen 3 years ago
parent
commit
9c72c06e0b
7 changed files with 82 additions and 94 deletions
  1. +72
    -2
      modelscope/hub/api.py
  2. +3
    -1
      modelscope/hub/constants.py
  3. +0
    -2
      modelscope/hub/utils/caching.py
  4. +4
    -2
      modelscope/msdatasets/config.py
  5. +3
    -3
      modelscope/msdatasets/ms_dataset.py
  6. +0
    -0
      modelscope/msdatasets/utils/__init__.py
  7. +0
    -84
      modelscope/msdatasets/utils/ms_api.py

+ 72
- 2
modelscope/hub/api.py View File

@@ -1,6 +1,8 @@
import os import os
import pickle import pickle
import shutil
import subprocess import subprocess
from collections import defaultdict
from http.cookiejar import CookieJar from http.cookiejar import CookieJar
from os.path import expanduser from os.path import expanduser
from typing import List, Optional, Tuple, Union from typing import List, Optional, Tuple, Union
@@ -8,8 +10,11 @@ from typing import List, Optional, Tuple, Union
import requests import requests


from modelscope.utils.logger import get_logger from modelscope.utils.logger import get_logger
from ..msdatasets.config import DOWNLOADED_DATASETS_PATH, HUB_DATASET_ENDPOINT
from ..utils.constant import DownloadMode
from .constants import MODELSCOPE_URL_SCHEME from .constants import MODELSCOPE_URL_SCHEME
from .errors import InvalidParameter, NotExistError, is_ok, raise_on_error
from .errors import (InvalidParameter, NotExistError, datahub_raise_on_error,
is_ok, raise_on_error)
from .utils.utils import (get_endpoint, get_gitlab_domain, from .utils.utils import (get_endpoint, get_gitlab_domain,
model_id_to_group_owner_name) model_id_to_group_owner_name)


@@ -18,8 +23,9 @@ logger = get_logger()


class HubApi: class HubApi:


def __init__(self, endpoint=None):
def __init__(self, endpoint=None, dataset_endpoint=None):
self.endpoint = endpoint if endpoint is not None else get_endpoint() self.endpoint = endpoint if endpoint is not None else get_endpoint()
self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else HUB_DATASET_ENDPOINT


def login( def login(
self, self,
@@ -241,6 +247,70 @@ class HubApi:
files.append(file) files.append(file)
return files return files


def list_datasets(self):
path = f'{self.dataset_endpoint}/api/v1/datasets'
headers = None
params = {}
r = requests.get(path, params=params, headers=headers)
r.raise_for_status()
dataset_list = r.json()['Data']
return [x['Name'] for x in dataset_list]

def fetch_dataset_scripts(self,
dataset_name: str,
namespace: str,
download_mode: Optional[DownloadMode],
version: Optional[str] = 'master'):
if namespace is None:
raise ValueError(
f'Dataset from Hubs.modelscope should have a valid "namespace", but get {namespace}'
)
version = version or 'master'
cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
namespace, version)
download_mode = DownloadMode(download_mode
or DownloadMode.REUSE_DATASET_IF_EXISTS)
if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(
cache_dir):
shutil.rmtree(cache_dir)
os.makedirs(cache_dir, exist_ok=True)
datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
r = requests.get(datahub_url)
resp = r.json()
datahub_raise_on_error(datahub_url, resp)
dataset_id = resp['Data']['Id']
datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
r = requests.get(datahub_url)
resp = r.json()
datahub_raise_on_error(datahub_url, resp)
file_list = resp['Data']
if file_list is None:
raise NotExistError(
f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, '
f'version = {version}] dose not exist')

file_list = file_list['Files']
local_paths = defaultdict(list)
for file_info in file_list:
file_path = file_info['Path']
if file_path.endswith('.py'):
datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \
f'Revision={version}&Path={file_path}'
r = requests.get(datahub_url)
r.raise_for_status()
content = r.json()['Data']['Content']
local_path = os.path.join(cache_dir, file_path)
if os.path.exists(local_path):
logger.warning(
f"Reusing dataset {dataset_name}'s python file ({local_path})"
)
local_paths['py'].append(local_path)
continue
with open(local_path, 'w') as f:
f.writelines(content)
local_paths['py'].append(local_path)
return local_paths



class ModelScopeConfig: class ModelScopeConfig:
path_credential = expanduser('~/.modelscope/credentials') path_credential = expanduser('~/.modelscope/credentials')


+ 3
- 1
modelscope/hub/constants.py View File

@@ -1,6 +1,8 @@
MODELSCOPE_URL_SCHEME = 'http://' MODELSCOPE_URL_SCHEME = 'http://'
DEFAULT_MODELSCOPE_DOMAIN = '47.94.223.21:31090'
DEFAULT_MODELSCOPE_IP = '47.94.223.21'
DEFAULT_MODELSCOPE_DOMAIN = DEFAULT_MODELSCOPE_IP + ':31090'
DEFAULT_MODELSCOPE_GITLAB_DOMAIN = '101.201.119.157:31102' DEFAULT_MODELSCOPE_GITLAB_DOMAIN = '101.201.119.157:31102'
DEFAULT_MODELSCOPE_DATA_ENDPOINT = MODELSCOPE_URL_SCHEME + DEFAULT_MODELSCOPE_IP + ':31752'


DEFAULT_MODELSCOPE_GROUP = 'damo' DEFAULT_MODELSCOPE_GROUP = 'damo'
MODEL_ID_SEPARATOR = '/' MODEL_ID_SEPARATOR = '/'


+ 0
- 2
modelscope/hub/utils/caching.py View File

@@ -1,9 +1,7 @@
import hashlib import hashlib
import logging
import os import os
import pickle import pickle
import tempfile import tempfile
import time
from shutil import move, rmtree from shutil import move, rmtree


from modelscope.utils.logger import get_logger from modelscope.utils.logger import get_logger


+ 4
- 2
modelscope/msdatasets/config.py View File

@@ -2,6 +2,8 @@ import os
from pathlib import Path from pathlib import Path


# Cache location # Cache location
from modelscope.hub.constants import DEFAULT_MODELSCOPE_DATA_ENDPOINT

DEFAULT_CACHE_HOME = '~/.cache' DEFAULT_CACHE_HOME = '~/.cache'
CACHE_HOME = os.getenv('CACHE_HOME', DEFAULT_CACHE_HOME) CACHE_HOME = os.getenv('CACHE_HOME', DEFAULT_CACHE_HOME)
DEFAULT_MS_CACHE_HOME = os.path.join(CACHE_HOME, 'modelscope/hub') DEFAULT_MS_CACHE_HOME = os.path.join(CACHE_HOME, 'modelscope/hub')
@@ -18,5 +20,5 @@ DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(MS_DATASETS_CACHE,
DOWNLOADED_DATASETS_PATH = Path( DOWNLOADED_DATASETS_PATH = Path(
os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH)) os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH))


MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT',
'http://47.94.223.21:31752')
HUB_DATASET_ENDPOINT = os.environ.get('HUB_DATASET_ENDPOINT',
DEFAULT_MODELSCOPE_DATA_ENDPOINT)

+ 3
- 3
modelscope/msdatasets/ms_dataset.py View File

@@ -11,7 +11,6 @@ from datasets.utils.file_utils import (is_relative_path,
relative_to_absolute_path) relative_to_absolute_path)


from modelscope.msdatasets.config import MS_DATASETS_CACHE from modelscope.msdatasets.config import MS_DATASETS_CACHE
from modelscope.msdatasets.utils.ms_api import MsApi
from modelscope.utils.constant import DownloadMode, Hubs from modelscope.utils.constant import DownloadMode, Hubs
from modelscope.utils.logger import get_logger from modelscope.utils.logger import get_logger


@@ -146,8 +145,9 @@ class MsDataset:
use_hf = True use_hf = True
elif is_relative_path(dataset_name) and dataset_name.count( elif is_relative_path(dataset_name) and dataset_name.count(
'/') == 0: '/') == 0:
ms_api = MsApi()
dataset_scripts = ms_api.fetch_dataset_scripts(
from modelscope.hub.api import HubApi
api = HubApi()
dataset_scripts = api.fetch_dataset_scripts(
dataset_name, namespace, download_mode, version) dataset_name, namespace, download_mode, version)
if 'py' in dataset_scripts: # dataset copied from hf datasets if 'py' in dataset_scripts: # dataset copied from hf datasets
dataset_name = dataset_scripts['py'][0] dataset_name = dataset_scripts['py'][0]


+ 0
- 0
modelscope/msdatasets/utils/__init__.py View File


+ 0
- 84
modelscope/msdatasets/utils/ms_api.py View File

@@ -1,84 +0,0 @@
import os
import shutil
from collections import defaultdict
from typing import Optional

import requests

from modelscope.hub.errors import NotExistError, datahub_raise_on_error
from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
MS_HUB_ENDPOINT)
from modelscope.utils.constant import DownloadMode
from modelscope.utils.logger import get_logger

logger = get_logger()


class MsApi:
    """Minimal client for the modelscope dataset hub HTTP API."""

    def __init__(self, endpoint=MS_HUB_ENDPOINT):
        # Base URL of the dataset hub, e.g. 'http://host:port'.
        self.endpoint = endpoint

    def list_datasets(self):
        """Return the names of all datasets available on the dataset hub."""
        path = f'{self.endpoint}/api/v1/datasets'
        headers = None
        params = {}
        r = requests.get(path, params=params, headers=headers)
        r.raise_for_status()
        dataset_list = r.json()['Data']
        return [x['Name'] for x in dataset_list]

    def fetch_dataset_scripts(self,
                              dataset_name: str,
                              namespace: str,
                              download_mode: Optional[DownloadMode],
                              version: Optional[str] = 'master'):
        """Download a dataset's python loading scripts into the local cache.

        Args:
            dataset_name: Name of the dataset on the dataset hub.
            namespace: Owner (user or organization) of the dataset; required.
            download_mode: Cache policy; ``None`` falls back to
                ``DownloadMode.REUSE_DATASET_IF_EXISTS``.
            version: Repository revision to fetch; ``None`` falls back to
                ``'master'``.

        Returns:
            A dict mapping the key ``'py'`` to the list of local paths of
            the downloaded ``.py`` files.

        Raises:
            ValueError: If ``namespace`` is ``None``.
            NotExistError: If the dataset/revision has no file listing.
        """
        if namespace is None:
            raise ValueError(
                f'Dataset from Hubs.modelscope should have a valid "namespace", but get {namespace}'
            )
        version = version or 'master'
        cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
                                 namespace, version)
        download_mode = DownloadMode(download_mode
                                     or DownloadMode.REUSE_DATASET_IF_EXISTS)
        if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(
                cache_dir):
            shutil.rmtree(cache_dir)
        os.makedirs(cache_dir, exist_ok=True)
        # Resolve (namespace, dataset_name) to the hub's numeric dataset id.
        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
        r = requests.get(datahub_url)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp)
        dataset_id = resp['Data']['Id']
        # List the repository tree at the requested revision.
        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
        r = requests.get(datahub_url)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp)
        file_list = resp['Data']
        if file_list is None:
            raise NotExistError(
                f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, '
                f'version = {version}] does not exist')

        file_list = file_list['Files']
        local_paths = defaultdict(list)
        for file_info in file_list:
            file_path = file_info['Path']
            if file_path.endswith('.py'):
                datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \
                              f'Revision={version}&Path={file_path}'
                r = requests.get(datahub_url)
                r.raise_for_status()
                content = r.json()['Data']['Content']
                local_path = os.path.join(cache_dir, file_path)
                if os.path.exists(local_path):
                    logger.warning(
                        f"Reusing dataset {dataset_name}'s python file ({local_path})"
                    )
                    local_paths['py'].append(local_path)
                    continue
                # file_path may contain sub-directories; make sure the
                # parent directory exists before writing.
                os.makedirs(os.path.dirname(local_path), exist_ok=True)
                with open(local_path, 'w') as f:
                    # content is a single str; write() is correct here
                    # (writelines() would write it character by character).
                    f.write(content)
                local_paths['py'].append(local_path)
        return local_paths

Loading…
Cancel
Save