xingjun.wxj 3 years ago
parent
commit
7cb72cc46e
6 changed files with 51 additions and 23 deletions
  1. +1
    -1
      modelscope/hub/api.py
  2. +1
    -1
      modelscope/hub/errors.py
  3. +10
    -6
      modelscope/hub/git.py
  4. +22
    -4
      modelscope/hub/repository.py
  5. +17
    -10
      modelscope/msdatasets/ms_dataset.py
  6. +0
    -1
      tests/msdatasets/test_dataset_upload.py

+ 1
- 1
modelscope/hub/api.py View File

@@ -389,7 +389,7 @@ class HubApi:
cookies = requests.utils.dict_from_cookiejar(cookies)
r = requests.get(url=datahub_url, cookies=cookies)
resp = r.json()
datahub_raise_on_error(datahub_url, resp)
raise_on_error(resp)
return resp['Data']

def on_dataset_download(self, dataset_name: str, namespace: str) -> None:


+ 1
- 1
modelscope/hub/errors.py View File

@@ -60,7 +60,7 @@ def raise_on_error(rsp):
Args:
rsp (_type_): The server response
"""
if rsp['Code'] == HTTPStatus.OK and rsp['Success']:
if rsp['Code'] == HTTPStatus.OK:
return True
else:
raise RequestError(rsp['Message'])


+ 10
- 6
modelscope/hub/git.py View File

@@ -51,12 +51,16 @@ class GitCommandWrapper(metaclass=Singleton):
response.check_returncode()
return response
except subprocess.CalledProcessError as error:
logger.error(
'There are error run git command, you may need to login first.'
)
raise GitError(
'stdout: %s, stderr: %s' %
(response.stdout.decode('utf8'), error.stderr.decode('utf8')))
if response.returncode == 1:
logger.info('Nothing to commit.')
return response
else:
logger.error(
'There are error run git command, you may need to login first.'
)
raise GitError('stdout: %s, stderr: %s' %
(response.stdout.decode('utf8'),
error.stderr.decode('utf8')))

def config_auth_token(self, repo_dir, auth_token):
url = self.get_repo_remote_url(repo_dir)


+ 22
- 4
modelscope/hub/repository.py View File

@@ -40,6 +40,11 @@ class Repository:
self.model_dir = model_dir
self.model_base_dir = os.path.dirname(model_dir)
self.model_repo_name = os.path.basename(model_dir)

if not revision:
err_msg = 'a non-default value of revision cannot be empty.'
raise InvalidParameter(err_msg)

if auth_token:
self.auth_token = auth_token
else:
@@ -145,10 +150,21 @@ class DatasetRepository:
The git command line path, if None, we use 'git'
"""
self.dataset_id = dataset_id
self.repo_work_dir = repo_work_dir
self.repo_base_dir = os.path.dirname(repo_work_dir)
self.repo_name = os.path.basename(repo_work_dir)
if not repo_work_dir or not isinstance(repo_work_dir, str):
err_msg = 'dataset_work_dir must be provided!'
raise InvalidParameter(err_msg)
self.repo_work_dir = repo_work_dir.rstrip('/')
if not self.repo_work_dir:
err_msg = 'dataset_work_dir can not be root dir!'
raise InvalidParameter(err_msg)
self.repo_base_dir = os.path.dirname(self.repo_work_dir)
self.repo_name = os.path.basename(self.repo_work_dir)

if not revision:
err_msg = 'a non-default value of revision cannot be empty.'
raise InvalidParameter(err_msg)
self.revision = revision

if auth_token:
self.auth_token = auth_token
else:
@@ -199,7 +215,9 @@ class DatasetRepository:
self.git_wrapper.config_auth_token(self.repo_work_dir, self.auth_token)
self.git_wrapper.add_user_info(self.repo_base_dir, self.repo_name)

remote_url = self.git_wrapper.get_repo_remote_url(self.repo_work_dir)
remote_url = self._get_remote_url()
remote_url = self.git_wrapper.remove_token_from_url(remote_url)

self.git_wrapper.pull(self.repo_work_dir)
self.git_wrapper.add(self.repo_work_dir, all_files=True)
self.git_wrapper.commit(self.repo_work_dir, commit_message)


+ 17
- 10
modelscope/msdatasets/ms_dataset.py View File

@@ -220,18 +220,23 @@ class MsDataset:
api = HubApi()
download_dataset = ''
if isinstance(dataset_name, str):
download_dataset = dataset_name
dataset_formation = DatasetFormations.native
if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
(os.path.isfile(dataset_name) and dataset_name.endswith('.py')):
if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(
dataset_name):
dataset_formation = DatasetFormations.hf_compatible
elif os.path.isfile(dataset_name) and dataset_name.endswith('.py'):
dataset_formation = DatasetFormations.hf_compatible
file_name = os.path.basename(dataset_name)
download_dataset = os.path.splitext(file_name)[0]
elif is_relative_path(dataset_name) and dataset_name.count(
'/') == 0:
download_dataset = dataset_name
dataset_scripts, dataset_formation, download_dir = api.fetch_dataset_scripts(
dataset_name, namespace, download_mode, version)
# dataset organized to be compatible with hf format
if dataset_formation == DatasetFormations.hf_compatible:
dataset_name = dataset_scripts['.py'][0]
download_dataset = dataset_name
else:
raise FileNotFoundError(
f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} "
@@ -268,8 +273,11 @@ class MsDataset:
f' {type(dataset_name)}')

if download_dataset:
api.on_dataset_download(
dataset_name=download_dataset, namespace=namespace)
try:
api.on_dataset_download(
dataset_name=download_dataset, namespace=namespace)
except Exception as e:
logger.error(e)

return MsDataset.from_hf_dataset(dataset, target=target)

@@ -587,7 +595,7 @@ class MsDataset:
"""Clone meta-file of dataset from the ModelScope Hub.
Args:
dataset_work_dir (str): Current git working directory.
dataset_id (str): Dataset id, It should be like your-namespace/your-dataset-name .
dataset_id (str): Dataset id, in the form of your-namespace/your-dataset-name .
revision(`Optional[str]`):
revision of the model you want to clone from. Can be any of a branch, tag or commit hash
auth_token(`Optional[str]`):
@@ -609,11 +617,11 @@ class MsDataset:
if clone_work_dir:
logger.info('Already cloned repo to: {}'.format(clone_work_dir))
else:
logger.warning('The repo working dir is already ex.')
logger.warning(
'Repo dir already exists: {}'.format(clone_work_dir))

@staticmethod
def upload_meta(dataset_work_dir: str,
dataset_id: str,
commit_message: str,
revision: Optional[str] = DEFAULT_DATASET_REVISION,
auth_token: Optional[str] = None,
@@ -623,7 +631,6 @@ class MsDataset:

Args:
dataset_work_dir (str): Current working directory.
dataset_id (str): Dataset id, It should be like your-namespace/your-dataset-name .
commit_message (str): Commit message.
revision(`Optional[str]`):
revision of the model you want to clone from. Can be any of a branch, tag or commit hash
@@ -640,7 +647,7 @@ class MsDataset:
"""
_repo = DatasetRepository(
repo_work_dir=dataset_work_dir,
dataset_id=dataset_id,
dataset_id='',
revision=revision,
auth_token=auth_token,
git_path=git_path)


+ 0
- 1
tests/msdatasets/test_dataset_upload.py View File

@@ -87,7 +87,6 @@ class DatasetUploadTest(unittest.TestCase):

MsDataset.upload_meta(
dataset_work_dir=self.test_meta_dir,
dataset_id=os.path.join(self.namespace, self.dataset_name),
commit_message='Update for unit test.')




Loading…
Cancel
Save