yingda.chen 3 years ago
commit 9c72c06e0b
7 changed files with 82 additions and 94 deletions
  1. +72 -2   modelscope/hub/api.py
  2. +3  -1   modelscope/hub/constants.py
  3. +0  -2   modelscope/hub/utils/caching.py
  4. +4  -2   modelscope/msdatasets/config.py
  5. +3  -3   modelscope/msdatasets/ms_dataset.py
  6. +0  -0   modelscope/msdatasets/utils/__init__.py
  7. +0  -84  modelscope/msdatasets/utils/ms_api.py

+72 -2  modelscope/hub/api.py

@@ -1,6 +1,8 @@
import os
import pickle
import shutil
import subprocess
from collections import defaultdict
from http.cookiejar import CookieJar
from os.path import expanduser
from typing import List, Optional, Tuple, Union
@@ -8,8 +10,11 @@ from typing import List, Optional, Tuple, Union
import requests

from modelscope.utils.logger import get_logger
from ..msdatasets.config import DOWNLOADED_DATASETS_PATH, HUB_DATASET_ENDPOINT
from ..utils.constant import DownloadMode
from .constants import MODELSCOPE_URL_SCHEME
from .errors import InvalidParameter, NotExistError, is_ok, raise_on_error
from .errors import (InvalidParameter, NotExistError, datahub_raise_on_error,
                     is_ok, raise_on_error)
from .utils.utils import (get_endpoint, get_gitlab_domain,
                          model_id_to_group_owner_name)

@@ -18,8 +23,9 @@ logger = get_logger()

class HubApi:

    def __init__(self, endpoint=None):
    def __init__(self, endpoint=None, dataset_endpoint=None):
        self.endpoint = endpoint if endpoint is not None else get_endpoint()
        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else HUB_DATASET_ENDPOINT

    def login(
        self,
@@ -241,6 +247,70 @@ class HubApi:
            files.append(file)
        return files

    def list_datasets(self):
        path = f'{self.dataset_endpoint}/api/v1/datasets'
        headers = None
        params = {}
        r = requests.get(path, params=params, headers=headers)
        r.raise_for_status()
        dataset_list = r.json()['Data']
        return [x['Name'] for x in dataset_list]

    def fetch_dataset_scripts(self,
                              dataset_name: str,
                              namespace: str,
                              download_mode: Optional[DownloadMode],
                              version: Optional[str] = 'master'):
        if namespace is None:
            raise ValueError(
                f'Dataset from Hubs.modelscope should have a valid "namespace", but got {namespace}'
            )
        version = version or 'master'
        cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
                                 namespace, version)
        download_mode = DownloadMode(download_mode
                                     or DownloadMode.REUSE_DATASET_IF_EXISTS)
        if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(
                cache_dir):
            shutil.rmtree(cache_dir)
        os.makedirs(cache_dir, exist_ok=True)
        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
        r = requests.get(datahub_url)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp)
        dataset_id = resp['Data']['Id']
        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
        r = requests.get(datahub_url)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp)
        file_list = resp['Data']
        if file_list is None:
            raise NotExistError(
                f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, '
                f'version = {version}] does not exist')

        file_list = file_list['Files']
        local_paths = defaultdict(list)
        for file_info in file_list:
            file_path = file_info['Path']
            if file_path.endswith('.py'):
                datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \
                              f'Revision={version}&Path={file_path}'
                r = requests.get(datahub_url)
                r.raise_for_status()
                content = r.json()['Data']['Content']
                local_path = os.path.join(cache_dir, file_path)
                if os.path.exists(local_path):
                    logger.warning(
                        f"Reusing dataset {dataset_name}'s python file ({local_path})"
                    )
                    local_paths['py'].append(local_path)
                    continue
                with open(local_path, 'w') as f:
                    f.writelines(content)
                local_paths['py'].append(local_path)
        return local_paths


class ModelScopeConfig:
    path_credential = expanduser('~/.modelscope/credentials')
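With the dataset calls folded into HubApi, listing datasets and fetching a dataset's loading scripts now go through a single client. A minimal usage sketch (endpoints fall back to the defaults above; the dataset name and namespace are placeholders, not values from this commit):

    from modelscope.hub.api import HubApi
    from modelscope.utils.constant import DownloadMode

    api = HubApi()  # defaults to get_endpoint() and HUB_DATASET_ENDPOINT

    # Names of all datasets visible on the hub.
    print(api.list_datasets())

    # Fetch the .py loading scripts of one dataset into the local cache.
    scripts = api.fetch_dataset_scripts(
        dataset_name='some_dataset',  # placeholder
        namespace='damo',             # placeholder
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
        version='master')
    print(scripts['py'])  # local paths of the downloaded scripts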


+3 -1  modelscope/hub/constants.py

@@ -1,6 +1,8 @@
MODELSCOPE_URL_SCHEME = 'http://'
DEFAULT_MODELSCOPE_DOMAIN = '47.94.223.21:31090'
DEFAULT_MODELSCOPE_IP = '47.94.223.21'
DEFAULT_MODELSCOPE_DOMAIN = DEFAULT_MODELSCOPE_IP + ':31090'
DEFAULT_MODELSCOPE_GITLAB_DOMAIN = '101.201.119.157:31102'
DEFAULT_MODELSCOPE_DATA_ENDPOINT = MODELSCOPE_URL_SCHEME + DEFAULT_MODELSCOPE_IP + ':31752'

DEFAULT_MODELSCOPE_GROUP = 'damo'
MODEL_ID_SEPARATOR = '/'
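The data endpoint is now composed from the shared URL scheme and hub IP instead of being hard-coded; a quick sanity check of that composition (a sketch, for illustration only):

    from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT,
                                          DEFAULT_MODELSCOPE_IP,
                                          MODELSCOPE_URL_SCHEME)

    # Scheme and IP are shared with the model hub; only the port differs.
    assert DEFAULT_MODELSCOPE_DATA_ENDPOINT == (
        MODELSCOPE_URL_SCHEME + DEFAULT_MODELSCOPE_IP + ':31752')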


+0 -2  modelscope/hub/utils/caching.py

@@ -1,9 +1,7 @@
import hashlib
import logging
import os
import pickle
import tempfile
import time
from shutil import move, rmtree

from modelscope.utils.logger import get_logger


+4 -2  modelscope/msdatasets/config.py

@@ -2,6 +2,8 @@ import os
from pathlib import Path

# Cache location
from modelscope.hub.constants import DEFAULT_MODELSCOPE_DATA_ENDPOINT

DEFAULT_CACHE_HOME = '~/.cache'
CACHE_HOME = os.getenv('CACHE_HOME', DEFAULT_CACHE_HOME)
DEFAULT_MS_CACHE_HOME = os.path.join(CACHE_HOME, 'modelscope/hub')
@@ -18,5 +20,5 @@ DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(MS_DATASETS_CACHE,
DOWNLOADED_DATASETS_PATH = Path(
    os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH))

MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT',
                                 'http://47.94.223.21:31752')
HUB_DATASET_ENDPOINT = os.environ.get('HUB_DATASET_ENDPOINT',
                                      DEFAULT_MODELSCOPE_DATA_ENDPOINT)
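MS_HUB_ENDPOINT is replaced by HUB_DATASET_ENDPOINT, which falls back to the constant from modelscope.hub.constants but can still be overridden through the environment. A sketch of a per-process override (the URL is illustrative):

    import os

    # Must be set before modelscope.msdatasets.config is first imported,
    # since the module reads the variable at import time.
    os.environ['HUB_DATASET_ENDPOINT'] = 'http://my-datahub.example.com:31752'

    from modelscope.msdatasets.config import HUB_DATASET_ENDPOINT
    print(HUB_DATASET_ENDPOINT)  # http://my-datahub.example.com:31752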

+3 -3  modelscope/msdatasets/ms_dataset.py

@@ -11,7 +11,6 @@ from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)

from modelscope.msdatasets.config import MS_DATASETS_CACHE
from modelscope.msdatasets.utils.ms_api import MsApi
from modelscope.utils.constant import DownloadMode, Hubs
from modelscope.utils.logger import get_logger

@@ -146,8 +145,9 @@ class MsDataset:
            use_hf = True
        elif is_relative_path(dataset_name) and dataset_name.count(
                '/') == 0:
            ms_api = MsApi()
            dataset_scripts = ms_api.fetch_dataset_scripts(
            from modelscope.hub.api import HubApi
            api = HubApi()
            dataset_scripts = api.fetch_dataset_scripts(
                dataset_name, namespace, download_mode, version)
            if 'py' in dataset_scripts:  # dataset copied from hf datasets
                dataset_name = dataset_scripts['py'][0]
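Call sites are otherwise unchanged: MsDataset.load still resolves a bare dataset name (one with no '/') through the hub, now via HubApi, which is imported inside the method, presumably to avoid a circular import between the hub and msdatasets packages. A usage sketch (the load signature is inferred from the variables in this hunk; name and namespace are placeholders):

    from modelscope.msdatasets.ms_dataset import MsDataset

    # A bare name takes the elif branch above and fetches the dataset's
    # .py loading script through HubApi.fetch_dataset_scripts.
    ds = MsDataset.load('some_dataset', namespace='damo', version='master')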


+0 -0  modelscope/msdatasets/utils/__init__.py


+0 -84  modelscope/msdatasets/utils/ms_api.py

@@ -1,84 +0,0 @@
import os
import shutil
from collections import defaultdict
from typing import Optional

import requests

from modelscope.hub.errors import NotExistError, datahub_raise_on_error
from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
                                          MS_HUB_ENDPOINT)
from modelscope.utils.constant import DownloadMode
from modelscope.utils.logger import get_logger

logger = get_logger()


class MsApi:

    def __init__(self, endpoint=MS_HUB_ENDPOINT):
        self.endpoint = endpoint

    def list_datasets(self):
        path = f'{self.endpoint}/api/v1/datasets'
        headers = None
        params = {}
        r = requests.get(path, params=params, headers=headers)
        r.raise_for_status()
        dataset_list = r.json()['Data']
        return [x['Name'] for x in dataset_list]

    def fetch_dataset_scripts(self,
                              dataset_name: str,
                              namespace: str,
                              download_mode: Optional[DownloadMode],
                              version: Optional[str] = 'master'):
        if namespace is None:
            raise ValueError(
                f'Dataset from Hubs.modelscope should have a valid "namespace", but get {namespace}'
            )
        version = version or 'master'
        cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
                                 namespace, version)
        download_mode = DownloadMode(download_mode
                                     or DownloadMode.REUSE_DATASET_IF_EXISTS)
        if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(
                cache_dir):
            shutil.rmtree(cache_dir)
        os.makedirs(cache_dir, exist_ok=True)
        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
        r = requests.get(datahub_url)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp)
        dataset_id = resp['Data']['Id']
        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
        r = requests.get(datahub_url)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp)
        file_list = resp['Data']
        if file_list is None:
            raise NotExistError(
                f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, '
                f'version = {version}] dose not exist')

        file_list = file_list['Files']
        local_paths = defaultdict(list)
        for file_info in file_list:
            file_path = file_info['Path']
            if file_path.endswith('.py'):
                datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \
                              f'Revision={version}&Path={file_path}'
                r = requests.get(datahub_url)
                r.raise_for_status()
                content = r.json()['Data']['Content']
                local_path = os.path.join(cache_dir, file_path)
                if os.path.exists(local_path):
                    logger.warning(
                        f"Reusing dataset {dataset_name}'s python file ({local_path})"
                    )
                    local_paths['py'].append(local_path)
                    continue
                with open(local_path, 'w') as f:
                    f.writelines(content)
                local_paths['py'].append(local_path)
        return local_paths
