* merge pydataset into the repo
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/8955999
master
@@ -95,12 +95,13 @@ print(f'Output written to {osp.abspath("result.png")}')
```
In addition, the pipeline interface can also take a Dataset as input; the code above can equally be written as
```python
import cv2
import os.path as osp

from maas_lib.pipelines import pipeline
from maas_lib.utils.constant import Tasks
-from ali_maas_datasets import PyDataset
+from pydatasets import PyDataset

# Build a PyDataset from image urls; a local folder can also be used here via input_location = '/dir/to/images'
input_location = [
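The hunk above is truncated by the diff view. As a minimal, self-contained sketch of the same pattern (not part of this PR; the urls and column name are placeholders), this is what `PyDataset.load` does with a url list:

```python
# A minimal sketch, not part of this PR: the urls below are placeholders.
from pydatasets import PyDataset

input_location = [
    'https://example.com/image1.png',
    'https://example.com/image2.png',
]
# load() wraps the list into a single-column Hugging Face dataset;
# target='image' makes iteration yield the bare url strings
dataset = PyDataset.load(input_location, target='image')
for url in dataset:
    print(url)  # a pipeline would consume each input here instead
```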
@@ -4,8 +4,8 @@ import os.path as osp
from abc import ABC, abstractmethod
from typing import Any, Dict, Generator, List, Tuple, Union

-from ali_maas_datasets import PyDataset
from maas_hub.snapshot_download import snapshot_download
+from pydatasets import PyDataset

from maas_lib.models import Model
from maas_lib.pipelines import util
@@ -0,0 +1 @@
+from .py_dataset import PyDataset
@@ -0,0 +1,126 @@
import logging
from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence,
                    Union)

from datasets import Dataset, load_dataset

from maas_lib.utils.logger import get_logger

logger = get_logger()
class PyDataset:
    """A PyDataset backed by Hugging Face datasets."""

    _hf_ds = None  # the underlying datasets.Dataset (or DatasetDict of splits)

    def __init__(self, hf_ds: Dataset):
        self._hf_ds = hf_ds
        self.target = None  # optional column name to yield during iteration

    def __iter__(self):
        if isinstance(self._hf_ds, Dataset):
            for item in self._hf_ds:
                if self.target is not None:
                    yield item[self.target]
                else:
                    yield item
        else:
            # a DatasetDict: iterate over every split in turn
            for ds in self._hf_ds.values():
                for item in ds:
                    if self.target is not None:
                        yield item[self.target]
                    else:
                        yield item
    @classmethod
    def from_hf_dataset(cls,
                        hf_ds: Dataset,
                        target: Optional[str] = None) -> 'PyDataset':
        dataset = cls(hf_ds)
        dataset.target = target
        return dataset

    @staticmethod
    def load(
        path: Union[str, list],
        target: Optional[str] = None,
        version: Optional[str] = None,
        name: Optional[str] = None,
        split: Optional[str] = None,
        data_dir: Optional[str] = None,
        data_files: Optional[Union[str, Sequence[str],
                                   Mapping[str, Union[str,
                                                      Sequence[str]]]]] = None
    ) -> 'PyDataset':
        """Load a PyDataset from the MaaS Hub, the Hugging Face Hub, urls,
        or a local dataset.

        Args:
            path (str or list): Path or name of the dataset, or a list of inputs.
            target (str, optional): Name of the column to output.
            version (str, optional): Version of the dataset script to load.
            name (str, optional): Name of the dataset configuration (subset).
            split (str, optional): Which split of the data to load.
            data_dir (str, optional): The data_dir of the dataset configuration.
            data_files (str or Sequence or Mapping, optional): Path(s) to
                source data file(s).

        Returns:
            pydataset (obj:`PyDataset`): A PyDataset object for the dataset.
        """
        if isinstance(path, str):
            dataset = load_dataset(
                path,
                name=name,
                revision=version,
                split=split,
                data_dir=data_dir,
                data_files=data_files)
        elif isinstance(path, list):
            if target is None:
                target = 'target'
            # put the whole list under one column keyed by `target`;
            # note: {target: [p] for p in path} would keep only the last item
            dataset = Dataset.from_dict({target: path})
        else:
            raise TypeError('path must be a str or a list, but got'
                            f' {type(path)}')
        return PyDataset.from_hf_dataset(dataset, target=target)
    def to_torch_dataset(
        self,
        columns: Union[str, List[str]] = None,
        output_all_columns: bool = False,
        **format_kwargs,
    ):
        self._hf_ds.reset_format()
        # format_kwargs must be splatted through to set_format; passing it
        # as a keyword named `format_kwargs` would nest it one level too deep
        self._hf_ds.set_format(
            type='torch',
            columns=columns,
            output_all_columns=output_all_columns,
            **format_kwargs)
        return self._hf_ds

    def to_tf_dataset(
        self,
        columns: Union[str, List[str]],
        batch_size: int,
        shuffle: bool,
        collate_fn: Callable,
        drop_remainder: bool = None,
        collate_fn_args: Dict[str, Any] = None,
        label_cols: Union[str, List[str]] = None,
        dummy_labels: bool = False,
        prefetch: bool = True,
    ):
        self._hf_ds.reset_format()
        return self._hf_ds.to_tf_dataset(
            columns,
            batch_size,
            shuffle,
            collate_fn,
            drop_remainder=drop_remainder,
            collate_fn_args=collate_fn_args,
            label_cols=label_cols,
            dummy_labels=dummy_labels,
            prefetch=prefetch)

    def to_hf_dataset(self) -> Dataset:
        self._hf_ds.reset_format()
        return self._hf_ds
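
For quick orientation, a hedged usage sketch of the class above (not part of the diff; the column names and values are illustrative, and the torch call assumes torch is installed):

```python
# Usage sketch for PyDataset (illustrative only, not part of the PR).
from datasets import Dataset

from pydatasets import PyDataset

# from a list: load() wraps the items in a default 'target' column
ds = PyDataset.load(['a.png', 'b.png'])
print(list(ds))  # ['a.png', 'b.png'] -- iteration yields the bare column

# from an in-memory Hugging Face dataset, yielding only column 'x'
hf = Dataset.from_dict({'x': [1, 2, 3], 'y': [4, 5, 6]})
ds2 = PyDataset.from_hf_dataset(hf, target='x')
print(list(ds2))  # [1, 2, 3]

# expose selected columns to PyTorch as tensors (assumes torch is installed)
torch_view = ds2.to_torch_dataset(columns=['x', 'y'])
print(torch_view[0])  # {'x': tensor(1), 'y': tensor(4)}
```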
@@ -1,3 +1,2 @@
http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/maas/maas_lib-0.1.1-py3-none-any.whl
https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl
-https://mit-dataset.oss-cn-beijing.aliyuncs.com/release/ali_maas_datasets-0.0.1.dev0-py3-none-any.whl
@@ -1,6 +1,6 @@
addict
+datasets
https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl
-https://mit-dataset.oss-cn-beijing.aliyuncs.com/release/ali_maas_datasets-0.0.1.dev0-py3-none-any.whl
numpy
opencv-python-headless
Pillow
@@ -6,7 +6,7 @@ import tempfile
import unittest

import cv2
-from ali_maas_datasets import PyDataset
+from pydatasets import PyDataset

from maas_lib.fileio import File
from maas_lib.pipelines import pipeline, util
@@ -5,7 +5,7 @@ import unittest
import zipfile
from pathlib import Path

-from ali_maas_datasets import PyDataset
+from pydatasets import PyDataset

from maas_lib.fileio import File
from maas_lib.models import Model
@@ -0,0 +1,43 @@
import unittest

import datasets as hfdata

from pydatasets import PyDataset


class PyDatasetTest(unittest.TestCase):

    def setUp(self):
        # ds1 initialized from in-memory json
        self.json_data = {
            'dummy': [{
                'a': i,
                'x': i * 10,
                'c': i * 100
            } for i in range(1, 11)]
        }
        hfds1 = hfdata.Dataset.from_dict(self.json_data)
        self.ds1 = PyDataset.from_hf_dataset(hfds1)
        # ds2 initialized from the Hugging Face hub
        hfds2 = hfdata.load_dataset(
            'glue', 'mrpc', revision='2.0.0', split='train')
        self.ds2 = PyDataset.from_hf_dataset(hfds2)

    def tearDown(self):
        pass

    def test_to_hf_dataset(self):
        hfds = self.ds1.to_hf_dataset()
        hfds1 = hfdata.Dataset.from_dict(self.json_data)
        self.assertEqual(hfds.data, hfds1.data)
        # simple map function
        hfds = hfds.map(lambda e: {'new_feature': e['dummy']['a']})
        self.assertEqual(len(hfds['new_feature']), 10)
        hfds2 = self.ds2.to_hf_dataset()
        self.assertTrue(hfds2[0]['sentence1'].startswith('Amrozi'))


if __name__ == '__main__':
    unittest.main()
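
One path the new test file does not cover is list loading; a hypothetical companion test (not in this PR) would pin down the iteration contract:

```python
# Hypothetical companion test, not part of this PR.
import unittest

from pydatasets import PyDataset


class PyDatasetLoadTest(unittest.TestCase):

    def test_load_from_list(self):
        # list inputs are wrapped in a default 'target' column and
        # iteration yields the bare values, in order
        urls = ['http://a/1.png', 'http://a/2.png']
        ds = PyDataset.load(urls)
        self.assertEqual(list(ds), urls)


if __name__ == '__main__':
    unittest.main()
```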