CR link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10109035#tab=changes&file=8348e8153b2f4a6dbd52e471b4980542355408ed
Please refer to the following Aone links:
1. https://aone.alibaba-inc.com/v2/project/1162242/bug#viewIdentifier=b622c099e2199bc034401fbe&openWorkitemIdentifier=44889184
2. https://aone.alibaba-inc.com/v2/project/1162242/bug#viewIdentifier=b622c099e2199bc034401fbe&openWorkitemIdentifier=44858810
3. https://aone.alibaba-inc.com/v2/project/1162242/bug#viewIdentifier=b622c099e2199bc034401fbe&openWorkitemIdentifier=44857728
4. https://aone.alibaba-inc.com/v2/project/1162242/bug#viewIdentifier=b622c099e2199bc034401fbe&openWorkitemIdentifier=44658972
master
| @@ -389,7 +389,7 @@ class HubApi: | |||||
| cookies = requests.utils.dict_from_cookiejar(cookies) | cookies = requests.utils.dict_from_cookiejar(cookies) | ||||
| r = requests.get(url=datahub_url, cookies=cookies) | r = requests.get(url=datahub_url, cookies=cookies) | ||||
| resp = r.json() | resp = r.json() | ||||
| datahub_raise_on_error(datahub_url, resp) | |||||
| raise_on_error(resp) | |||||
| return resp['Data'] | return resp['Data'] | ||||
| def on_dataset_download(self, dataset_name: str, namespace: str) -> None: | def on_dataset_download(self, dataset_name: str, namespace: str) -> None: | ||||
| @@ -60,7 +60,7 @@ def raise_on_error(rsp): | |||||
| Args: | Args: | ||||
| rsp (_type_): The server response | rsp (_type_): The server response | ||||
| """ | """ | ||||
| if rsp['Code'] == HTTPStatus.OK and rsp['Success']: | |||||
| if rsp['Code'] == HTTPStatus.OK: | |||||
| return True | return True | ||||
| else: | else: | ||||
| raise RequestError(rsp['Message']) | raise RequestError(rsp['Message']) | ||||
| @@ -51,12 +51,16 @@ class GitCommandWrapper(metaclass=Singleton): | |||||
| response.check_returncode() | response.check_returncode() | ||||
| return response | return response | ||||
| except subprocess.CalledProcessError as error: | except subprocess.CalledProcessError as error: | ||||
| logger.error( | |||||
| 'There are error run git command, you may need to login first.' | |||||
| ) | |||||
| raise GitError( | |||||
| 'stdout: %s, stderr: %s' % | |||||
| (response.stdout.decode('utf8'), error.stderr.decode('utf8'))) | |||||
| if response.returncode == 1: | |||||
| logger.info('Nothing to commit.') | |||||
| return response | |||||
| else: | |||||
| logger.error( | |||||
| 'There are error run git command, you may need to login first.' | |||||
| ) | |||||
| raise GitError('stdout: %s, stderr: %s' % | |||||
| (response.stdout.decode('utf8'), | |||||
| error.stderr.decode('utf8'))) | |||||
| def config_auth_token(self, repo_dir, auth_token): | def config_auth_token(self, repo_dir, auth_token): | ||||
| url = self.get_repo_remote_url(repo_dir) | url = self.get_repo_remote_url(repo_dir) | ||||
| @@ -40,6 +40,11 @@ class Repository: | |||||
| self.model_dir = model_dir | self.model_dir = model_dir | ||||
| self.model_base_dir = os.path.dirname(model_dir) | self.model_base_dir = os.path.dirname(model_dir) | ||||
| self.model_repo_name = os.path.basename(model_dir) | self.model_repo_name = os.path.basename(model_dir) | ||||
| if not revision: | |||||
| err_msg = 'a non-default value of revision cannot be empty.' | |||||
| raise InvalidParameter(err_msg) | |||||
| if auth_token: | if auth_token: | ||||
| self.auth_token = auth_token | self.auth_token = auth_token | ||||
| else: | else: | ||||
| @@ -145,10 +150,21 @@ class DatasetRepository: | |||||
| The git command line path, if None, we use 'git' | The git command line path, if None, we use 'git' | ||||
| """ | """ | ||||
| self.dataset_id = dataset_id | self.dataset_id = dataset_id | ||||
| self.repo_work_dir = repo_work_dir | |||||
| self.repo_base_dir = os.path.dirname(repo_work_dir) | |||||
| self.repo_name = os.path.basename(repo_work_dir) | |||||
| if not repo_work_dir or not isinstance(repo_work_dir, str): | |||||
| err_msg = 'dataset_work_dir must be provided!' | |||||
| raise InvalidParameter(err_msg) | |||||
| self.repo_work_dir = repo_work_dir.rstrip('/') | |||||
| if not self.repo_work_dir: | |||||
| err_msg = 'dataset_work_dir can not be root dir!' | |||||
| raise InvalidParameter(err_msg) | |||||
| self.repo_base_dir = os.path.dirname(self.repo_work_dir) | |||||
| self.repo_name = os.path.basename(self.repo_work_dir) | |||||
| if not revision: | |||||
| err_msg = 'a non-default value of revision cannot be empty.' | |||||
| raise InvalidParameter(err_msg) | |||||
| self.revision = revision | self.revision = revision | ||||
| if auth_token: | if auth_token: | ||||
| self.auth_token = auth_token | self.auth_token = auth_token | ||||
| else: | else: | ||||
| @@ -199,7 +215,9 @@ class DatasetRepository: | |||||
| self.git_wrapper.config_auth_token(self.repo_work_dir, self.auth_token) | self.git_wrapper.config_auth_token(self.repo_work_dir, self.auth_token) | ||||
| self.git_wrapper.add_user_info(self.repo_base_dir, self.repo_name) | self.git_wrapper.add_user_info(self.repo_base_dir, self.repo_name) | ||||
| remote_url = self.git_wrapper.get_repo_remote_url(self.repo_work_dir) | |||||
| remote_url = self._get_remote_url() | |||||
| remote_url = self.git_wrapper.remove_token_from_url(remote_url) | |||||
| self.git_wrapper.pull(self.repo_work_dir) | self.git_wrapper.pull(self.repo_work_dir) | ||||
| self.git_wrapper.add(self.repo_work_dir, all_files=True) | self.git_wrapper.add(self.repo_work_dir, all_files=True) | ||||
| self.git_wrapper.commit(self.repo_work_dir, commit_message) | self.git_wrapper.commit(self.repo_work_dir, commit_message) | ||||
| @@ -220,18 +220,23 @@ class MsDataset: | |||||
| api = HubApi() | api = HubApi() | ||||
| download_dataset = '' | download_dataset = '' | ||||
| if isinstance(dataset_name, str): | if isinstance(dataset_name, str): | ||||
| download_dataset = dataset_name | |||||
| dataset_formation = DatasetFormations.native | dataset_formation = DatasetFormations.native | ||||
| if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ | |||||
| (os.path.isfile(dataset_name) and dataset_name.endswith('.py')): | |||||
| if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir( | |||||
| dataset_name): | |||||
| dataset_formation = DatasetFormations.hf_compatible | dataset_formation = DatasetFormations.hf_compatible | ||||
| elif os.path.isfile(dataset_name) and dataset_name.endswith('.py'): | |||||
| dataset_formation = DatasetFormations.hf_compatible | |||||
| file_name = os.path.basename(dataset_name) | |||||
| download_dataset = os.path.splitext(file_name)[0] | |||||
| elif is_relative_path(dataset_name) and dataset_name.count( | elif is_relative_path(dataset_name) and dataset_name.count( | ||||
| '/') == 0: | '/') == 0: | ||||
| download_dataset = dataset_name | |||||
| dataset_scripts, dataset_formation, download_dir = api.fetch_dataset_scripts( | dataset_scripts, dataset_formation, download_dir = api.fetch_dataset_scripts( | ||||
| dataset_name, namespace, download_mode, version) | dataset_name, namespace, download_mode, version) | ||||
| # dataset organized to be compatible with hf format | # dataset organized to be compatible with hf format | ||||
| if dataset_formation == DatasetFormations.hf_compatible: | if dataset_formation == DatasetFormations.hf_compatible: | ||||
| dataset_name = dataset_scripts['.py'][0] | dataset_name = dataset_scripts['.py'][0] | ||||
| download_dataset = dataset_name | |||||
| else: | else: | ||||
| raise FileNotFoundError( | raise FileNotFoundError( | ||||
| f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} " | f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} " | ||||
| @@ -268,8 +273,11 @@ class MsDataset: | |||||
| f' {type(dataset_name)}') | f' {type(dataset_name)}') | ||||
| if download_dataset: | if download_dataset: | ||||
| api.on_dataset_download( | |||||
| dataset_name=download_dataset, namespace=namespace) | |||||
| try: | |||||
| api.on_dataset_download( | |||||
| dataset_name=download_dataset, namespace=namespace) | |||||
| except Exception as e: | |||||
| logger.error(e) | |||||
| return MsDataset.from_hf_dataset(dataset, target=target) | return MsDataset.from_hf_dataset(dataset, target=target) | ||||
| @@ -587,7 +595,7 @@ class MsDataset: | |||||
| """Clone meta-file of dataset from the ModelScope Hub. | """Clone meta-file of dataset from the ModelScope Hub. | ||||
| Args: | Args: | ||||
| dataset_work_dir (str): Current git working directory. | dataset_work_dir (str): Current git working directory. | ||||
| dataset_id (str): Dataset id, It should be like your-namespace/your-dataset-name . | |||||
| dataset_id (str): Dataset id, in the form of your-namespace/your-dataset-name . | |||||
| revision(`Optional[str]`): | revision(`Optional[str]`): | ||||
| revision of the model you want to clone from. Can be any of a branch, tag or commit hash | revision of the model you want to clone from. Can be any of a branch, tag or commit hash | ||||
| auth_token(`Optional[str]`): | auth_token(`Optional[str]`): | ||||
| @@ -609,11 +617,11 @@ class MsDataset: | |||||
| if clone_work_dir: | if clone_work_dir: | ||||
| logger.info('Already cloned repo to: {}'.format(clone_work_dir)) | logger.info('Already cloned repo to: {}'.format(clone_work_dir)) | ||||
| else: | else: | ||||
| logger.warning('The repo working dir is already ex.') | |||||
| logger.warning( | |||||
| 'Repo dir already exists: {}'.format(clone_work_dir)) | |||||
| @staticmethod | @staticmethod | ||||
| def upload_meta(dataset_work_dir: str, | def upload_meta(dataset_work_dir: str, | ||||
| dataset_id: str, | |||||
| commit_message: str, | commit_message: str, | ||||
| revision: Optional[str] = DEFAULT_DATASET_REVISION, | revision: Optional[str] = DEFAULT_DATASET_REVISION, | ||||
| auth_token: Optional[str] = None, | auth_token: Optional[str] = None, | ||||
| @@ -623,7 +631,6 @@ class MsDataset: | |||||
| Args: | Args: | ||||
| dataset_work_dir (str): Current working directory. | dataset_work_dir (str): Current working directory. | ||||
| dataset_id (str): Dataset id, It should be like your-namespace/your-dataset-name . | |||||
| commit_message (str): Commit message. | commit_message (str): Commit message. | ||||
| revision(`Optional[str]`): | revision(`Optional[str]`): | ||||
| revision of the model you want to clone from. Can be any of a branch, tag or commit hash | revision of the model you want to clone from. Can be any of a branch, tag or commit hash | ||||
| @@ -640,7 +647,7 @@ class MsDataset: | |||||
| """ | """ | ||||
| _repo = DatasetRepository( | _repo = DatasetRepository( | ||||
| repo_work_dir=dataset_work_dir, | repo_work_dir=dataset_work_dir, | ||||
| dataset_id=dataset_id, | |||||
| dataset_id='', | |||||
| revision=revision, | revision=revision, | ||||
| auth_token=auth_token, | auth_token=auth_token, | ||||
| git_path=git_path) | git_path=git_path) | ||||
| @@ -87,7 +87,6 @@ class DatasetUploadTest(unittest.TestCase): | |||||
| MsDataset.upload_meta( | MsDataset.upload_meta( | ||||
| dataset_work_dir=self.test_meta_dir, | dataset_work_dir=self.test_meta_dir, | ||||
| dataset_id=os.path.join(self.namespace, self.dataset_name), | |||||
| commit_message='Update for unit test.') | commit_message='Update for unit test.') | ||||