From 3f972785648283c4c86f20806b270b28eb3149de Mon Sep 17 00:00:00 2001 From: "feiwu.yfw" Date: Thu, 1 Sep 2022 15:26:45 +0800 Subject: [PATCH] =?UTF-8?q?[to=20#42322933]=E6=95=B0=E6=8D=AE=E9=9B=86?= =?UTF-8?q?=E6=96=AD=E7=82=B9=E7=BB=AD=E4=BC=A0=E4=B8=8B=E8=BD=BD+?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=95=B0=E6=8D=AE=E9=9B=86=E5=91=BD=E5=90=8D?= =?UTF-8?q?=E5=AD=98=E5=9C=A8=E5=A4=A7=E5=86=99=E5=AD=97=E6=AF=8D=E5=AF=BC?= =?UTF-8?q?=E8=87=B4=E5=8A=A0=E8=BD=BD=E5=A4=B1=E8=B4=A5=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98=20=20=20=20=20=20=20=20=20Link:=20https://code.alibab?= =?UTF-8?q?a-inc.com/Ali-MaaS/MaaS-lib/codereview/9973942?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix msdataset dataset name * add resume download --- modelscope/msdatasets/utils/dataset_builder.py | 8 +++++--- modelscope/msdatasets/utils/oss_utils.py | 8 ++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/modelscope/msdatasets/utils/dataset_builder.py b/modelscope/msdatasets/utils/dataset_builder.py index 7180cb5b..825400c4 100644 --- a/modelscope/msdatasets/utils/dataset_builder.py +++ b/modelscope/msdatasets/utils/dataset_builder.py @@ -5,6 +5,7 @@ import datasets import pandas as pd import pyarrow as pa from datasets.info import DatasetInfo +from datasets.naming import camelcase_to_snakecase from datasets.packaged_modules import csv from datasets.utils.filelock import FileLock @@ -34,8 +35,8 @@ class MsCsvDatasetBuilder(csv.Csv): data_files=meta_data_files, **config_kwargs) - self.name = dataset_name - self.info.builder_name = self.name + self.name = camelcase_to_snakecase(dataset_name) + self.info.builder_name = dataset_name self._cache_dir = self._build_cache_dir(namespace=namespace) lock_path = os.path.join( self._cache_dir_root, @@ -65,7 +66,7 @@ class MsCsvDatasetBuilder(csv.Csv): or if a namespace has been specified: self.namespace___self.name/self.config.version/self.hash/ """ - builder_data_dir = self.name if namespace is None else f'{namespace}___{self.name}' + builder_data_dir = self.info.builder_name if namespace is None else f'{namespace}___{self.info.builder_name}' builder_config = self.config hash = self.hash if builder_config: @@ -156,6 +157,7 @@ class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder): self.zip_data_files = zip_data_files self.split_path_dict = None self.config = None + self.info = DatasetInfo.from_dict({'builder_name': dataset_name}) self._cache_dir_root = os.path.expanduser(cache_dir) self._cache_dir = self._build_cache_dir() self._config_kwargs = config_kwargs diff --git a/modelscope/msdatasets/utils/oss_utils.py b/modelscope/msdatasets/utils/oss_utils.py index 82d43bef..63a1cf77 100644 --- a/modelscope/msdatasets/utils/oss_utils.py +++ b/modelscope/msdatasets/utils/oss_utils.py @@ -34,8 +34,12 @@ class OssUtilities: local_path = os.path.join(cache_dir, filename) if download_config.force_download or not os.path.exists(local_path): - self.bucket.get_object_to_file( - file_oss_key, local_path, progress_callback=self._percentage) + oss2.resumable_download( + self.bucket, + file_oss_key, + local_path, + multiget_threshold=0, + progress_callback=self._percentage) return local_path def upload(self, oss_file_name: str, local_file_path: str) -> str: