From 7cb72cc46e4d0fc5b7f92ab43ae27bdfcae788d2 Mon Sep 17 00:00:00 2001 From: "xingjun.wxj" Date: Wed, 14 Sep 2022 19:24:48 +0800 Subject: [PATCH] [to #42322933]MsDataset upload bugfix for 0830 version. CR link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10109035#tab=changes&file=8348e8153b2f4a6dbd52e471b4980542355408ed Please refer to aone links: 1. https://aone.alibaba-inc.com/v2/project/1162242/bug#viewIdentifier=b622c099e2199bc034401fbe&openWorkitemIdentifier=44889184 2. https://aone.alibaba-inc.com/v2/project/1162242/bug#viewIdentifier=b622c099e2199bc034401fbe&openWorkitemIdentifier=44858810 3. https://aone.alibaba-inc.com/v2/project/1162242/bug#viewIdentifier=b622c099e2199bc034401fbe&openWorkitemIdentifier=44857728 4. https://aone.alibaba-inc.com/v2/project/1162242/bug#viewIdentifier=b622c099e2199bc034401fbe&openWorkitemIdentifier=44658972 --- modelscope/hub/api.py | 2 +- modelscope/hub/errors.py | 2 +- modelscope/hub/git.py | 16 +++++++++------ modelscope/hub/repository.py | 26 ++++++++++++++++++++---- modelscope/msdatasets/ms_dataset.py | 27 ++++++++++++++++--------- tests/msdatasets/test_dataset_upload.py | 1 - 6 files changed, 51 insertions(+), 23 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 721f5637..85da6a31 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -389,7 +389,7 @@ class HubApi: cookies = requests.utils.dict_from_cookiejar(cookies) r = requests.get(url=datahub_url, cookies=cookies) resp = r.json() - datahub_raise_on_error(datahub_url, resp) + raise_on_error(resp) return resp['Data'] def on_dataset_download(self, dataset_name: str, namespace: str) -> None: diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index e9c008b0..284dbed4 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -60,7 +60,7 @@ def raise_on_error(rsp): Args: rsp (_type_): The server response """ - if rsp['Code'] == HTTPStatus.OK and rsp['Success']: + if rsp['Code'] == HTTPStatus.OK: return True else: raise RequestError(rsp['Message']) diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index 264cd59a..13e1910d 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -51,12 +51,16 @@ class GitCommandWrapper(metaclass=Singleton): response.check_returncode() return response except subprocess.CalledProcessError as error: - logger.error( - 'There are error run git command, you may need to login first.' - ) - raise GitError( - 'stdout: %s, stderr: %s' % - (response.stdout.decode('utf8'), error.stderr.decode('utf8'))) + if response.returncode == 1: + logger.info('Nothing to commit.') + return response + else: + logger.error( + 'There are error run git command, you may need to login first.' + ) + raise GitError('stdout: %s, stderr: %s' % + (response.stdout.decode('utf8'), + error.stderr.decode('utf8'))) def config_auth_token(self, repo_dir, auth_token): url = self.get_repo_remote_url(repo_dir) diff --git a/modelscope/hub/repository.py b/modelscope/hub/repository.py index 6f560f7a..8d5fd30b 100644 --- a/modelscope/hub/repository.py +++ b/modelscope/hub/repository.py @@ -40,6 +40,11 @@ class Repository: self.model_dir = model_dir self.model_base_dir = os.path.dirname(model_dir) self.model_repo_name = os.path.basename(model_dir) + + if not revision: + err_msg = 'a non-default value of revision cannot be empty.' + raise InvalidParameter(err_msg) + if auth_token: self.auth_token = auth_token else: @@ -145,10 +150,21 @@ class DatasetRepository: The git command line path, if None, we use 'git' """ self.dataset_id = dataset_id - self.repo_work_dir = repo_work_dir - self.repo_base_dir = os.path.dirname(repo_work_dir) - self.repo_name = os.path.basename(repo_work_dir) + if not repo_work_dir or not isinstance(repo_work_dir, str): + err_msg = 'dataset_work_dir must be provided!' + raise InvalidParameter(err_msg) + self.repo_work_dir = repo_work_dir.rstrip('/') + if not self.repo_work_dir: + err_msg = 'dataset_work_dir can not be root dir!' + raise InvalidParameter(err_msg) + self.repo_base_dir = os.path.dirname(self.repo_work_dir) + self.repo_name = os.path.basename(self.repo_work_dir) + + if not revision: + err_msg = 'a non-default value of revision cannot be empty.' + raise InvalidParameter(err_msg) self.revision = revision + if auth_token: self.auth_token = auth_token else: @@ -199,7 +215,9 @@ class DatasetRepository: self.git_wrapper.config_auth_token(self.repo_work_dir, self.auth_token) self.git_wrapper.add_user_info(self.repo_base_dir, self.repo_name) - remote_url = self.git_wrapper.get_repo_remote_url(self.repo_work_dir) + remote_url = self._get_remote_url() + remote_url = self.git_wrapper.remove_token_from_url(remote_url) + self.git_wrapper.pull(self.repo_work_dir) self.git_wrapper.add(self.repo_work_dir, all_files=True) self.git_wrapper.commit(self.repo_work_dir, commit_message) diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 691db4fe..a0203df9 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -220,18 +220,23 @@ class MsDataset: api = HubApi() download_dataset = '' if isinstance(dataset_name, str): - download_dataset = dataset_name dataset_formation = DatasetFormations.native - if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ - (os.path.isfile(dataset_name) and dataset_name.endswith('.py')): + if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir( + dataset_name): dataset_formation = DatasetFormations.hf_compatible + elif os.path.isfile(dataset_name) and dataset_name.endswith('.py'): + dataset_formation = DatasetFormations.hf_compatible + file_name = os.path.basename(dataset_name) + download_dataset = os.path.splitext(file_name)[0] elif is_relative_path(dataset_name) and dataset_name.count( '/') == 0: + download_dataset = dataset_name dataset_scripts, dataset_formation, download_dir = api.fetch_dataset_scripts( dataset_name, namespace, download_mode, version) # dataset organized to be compatible with hf format if dataset_formation == DatasetFormations.hf_compatible: dataset_name = dataset_scripts['.py'][0] + download_dataset = dataset_name else: raise FileNotFoundError( f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} " @@ -268,8 +273,11 @@ class MsDataset: f' {type(dataset_name)}') if download_dataset: - api.on_dataset_download( - dataset_name=download_dataset, namespace=namespace) + try: + api.on_dataset_download( + dataset_name=download_dataset, namespace=namespace) + except Exception as e: + logger.error(e) return MsDataset.from_hf_dataset(dataset, target=target) @@ -587,7 +595,7 @@ class MsDataset: """Clone meta-file of dataset from the ModelScope Hub. Args: dataset_work_dir (str): Current git working directory. - dataset_id (str): Dataset id, It should be like your-namespace/your-dataset-name . + dataset_id (str): Dataset id, in the form of your-namespace/your-dataset-name . revision(`Optional[str]`): revision of the model you want to clone from. Can be any of a branch, tag or commit hash auth_token(`Optional[str]`): @@ -609,11 +617,11 @@ class MsDataset: if clone_work_dir: logger.info('Already cloned repo to: {}'.format(clone_work_dir)) else: - logger.warning('The repo working dir is already ex.') + logger.warning( + 'Repo dir already exists: {}'.format(clone_work_dir)) @staticmethod def upload_meta(dataset_work_dir: str, - dataset_id: str, commit_message: str, revision: Optional[str] = DEFAULT_DATASET_REVISION, auth_token: Optional[str] = None, @@ -623,7 +631,6 @@ class MsDataset: Args: dataset_work_dir (str): Current working directory. - dataset_id (str): Dataset id, It should be like your-namespace/your-dataset-name . commit_message (str): Commit message. revision(`Optional[str]`): revision of the model you want to clone from. Can be any of a branch, tag or commit hash @@ -640,7 +647,7 @@ class MsDataset: """ _repo = DatasetRepository( repo_work_dir=dataset_work_dir, - dataset_id=dataset_id, + dataset_id='', revision=revision, auth_token=auth_token, git_path=git_path) diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py index 61b1c6a4..1179414d 100644 --- a/tests/msdatasets/test_dataset_upload.py +++ b/tests/msdatasets/test_dataset_upload.py @@ -87,7 +87,6 @@ class DatasetUploadTest(unittest.TestCase): MsDataset.upload_meta( dataset_work_dir=self.test_meta_dir, - dataset_id=os.path.join(self.namespace, self.dataset_name), commit_message='Update for unit test.')