From 235880f300dada0fc4596ee36214caba47e2aa11 Mon Sep 17 00:00:00 2001
From: "feiwu.yfw"
Date: Wed, 8 Jun 2022 18:29:39 +0800
Subject: [PATCH] [to #42339763] merge pydataset into maas-lib

* merge pydataset into the repo

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/8955999
---
 docs/source/quick_start.md                  |   3 +-
 maas_lib/pipelines/base.py                  |   2 +-
 pydatasets/__init__.py                      |   1 +
 pydatasets/py_dataset.py                    | 126 ++++++++++++++++++++
 requirements/maas.txt                       |   1 -
 requirements/runtime.txt                    |   2 +-
 tests/pipelines/test_image_matting.py       |   2 +-
 tests/pipelines/test_text_classification.py |   2 +-
 tests/pydataset/__init__.py                 |   0
 tests/pydataset/test_py_dataset.py          |  43 +++++++
 10 files changed, 176 insertions(+), 6 deletions(-)
 create mode 100644 pydatasets/__init__.py
 create mode 100644 pydatasets/py_dataset.py
 create mode 100644 tests/pydataset/__init__.py
 create mode 100644 tests/pydataset/test_py_dataset.py

diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index 3c961097..4a76f690 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -95,12 +95,13 @@ print(f'Output written to {osp.abspath("result.png")}')
 ```
 
 In addition, the pipeline interface can also accept a Dataset as input, so the code above can equivalently be written as
+
 ```python
 import cv2
 import os.path as osp
 from maas_lib.pipelines import pipeline
 from maas_lib.utils.constant import Tasks
-from ali_maas_datasets import PyDataset
+from pydatasets import PyDataset
 
 # Build a PyDataset from image urls; a local folder can also be used via input_location = '/dir/to/images'
 input_location = [
diff --git a/maas_lib/pipelines/base.py b/maas_lib/pipelines/base.py
index 3b1103f6..5e387c62 100644
--- a/maas_lib/pipelines/base.py
+++ b/maas_lib/pipelines/base.py
@@ -4,8 +4,8 @@ import os.path as osp
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Generator, List, Tuple, Union
 
-from ali_maas_datasets import PyDataset
 from maas_hub.snapshot_download import snapshot_download
+from pydatasets import PyDataset
 
 from maas_lib.models import Model
 from maas_lib.pipelines import util
diff --git a/pydatasets/__init__.py b/pydatasets/__init__.py
new file mode 100644
index 00000000..a1ed1d93
--- /dev/null
+++ b/pydatasets/__init__.py
@@ -0,0 +1 @@
+from .py_dataset import PyDataset
diff --git a/pydatasets/py_dataset.py b/pydatasets/py_dataset.py
new file mode 100644
index 00000000..2e9a378f
--- /dev/null
+++ b/pydatasets/py_dataset.py
@@ -0,0 +1,126 @@
+import logging
+from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence,
+                    Union)
+
+from datasets import Dataset, load_dataset
+
+from maas_lib.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class PyDataset:
+    """A PyDataset backed by Hugging Face datasets."""
+
+    _hf_ds = None  # holds the underlying HuggingFace Dataset
+
+    def __init__(self, hf_ds: Dataset):
+        self._hf_ds = hf_ds
+        self.target = None
+
+    def __iter__(self):
+        if isinstance(self._hf_ds, Dataset):
+            for item in self._hf_ds:
+                if self.target is not None:
+                    yield item[self.target]
+                else:
+                    yield item
+        else:
+            for ds in self._hf_ds.values():
+                for item in ds:
+                    if self.target is not None:
+                        yield item[self.target]
+                    else:
+                        yield item
+
+    @classmethod
+    def from_hf_dataset(cls,
+                        hf_ds: Dataset,
+                        target: str = None) -> 'PyDataset':
+        dataset = cls(hf_ds)
+        dataset.target = target
+        return dataset
+
+    @staticmethod
+    def load(
+        path: Union[str, list],
+        target: Optional[str] = None,
+        version: Optional[str] = None,
+        name: Optional[str] = None,
+        split: Optional[str] = None,
+        data_dir: Optional[str] = None,
+        data_files: Optional[Union[str, Sequence[str],
+                                   Mapping[str, Union[str,
+                                                      Sequence[str]]]]] = None
+    ) -> 'PyDataset':
+        """Load a pydataset from the MaaS Hub, the Hugging Face Hub, urls,
+        or a local dataset.
+
+        Args:
+            path (str): Path or name of the dataset.
+            target (str, optional): Name of the column to output.
+            version (str, optional): Version of the dataset script to load.
+            name (str, optional): Defines the subset_name of the dataset.
+            data_dir (str, optional): Defines the data_dir of the dataset
+                configuration.
+            data_files (str or Sequence or Mapping, optional): Path(s) to
+                source data file(s).
+            split (str, optional): Which split of the data to load.
+
+        Returns:
+            pydataset (:obj:`PyDataset`): PyDataset object for the specified
+                dataset.
+        """
+        if isinstance(path, str):
+            dataset = load_dataset(
+                path,
+                name=name,
+                revision=version,
+                split=split,
+                data_dir=data_dir,
+                data_files=data_files)
+        elif isinstance(path, list):
+            if target is None:
+                target = 'target'
+            dataset = Dataset.from_dict({target: path})
+        else:
+            raise TypeError('path must be a str or a list, but got'
+                            f' {type(path)}')
+        return PyDataset.from_hf_dataset(dataset, target=target)
+
+    def to_torch_dataset(
+        self,
+        columns: Union[str, List[str]] = None,
+        output_all_columns: bool = False,
+        **format_kwargs,
+    ):
+        self._hf_ds.reset_format()
+        self._hf_ds.set_format(
+            type='torch',
+            columns=columns,
+            output_all_columns=output_all_columns,
+            **format_kwargs)
+        return self._hf_ds
+
+    def to_tf_dataset(
+        self,
+        columns: Union[str, List[str]],
+        batch_size: int,
+        shuffle: bool,
+        collate_fn: Callable,
+        drop_remainder: bool = None,
+        collate_fn_args: Dict[str, Any] = None,
+        label_cols: Union[str, List[str]] = None,
+        dummy_labels: bool = False,
+        prefetch: bool = True,
+    ):
+        self._hf_ds.reset_format()
+        return self._hf_ds.to_tf_dataset(
+            columns,
+            batch_size,
+            shuffle,
+            collate_fn,
+            drop_remainder=drop_remainder,
+            collate_fn_args=collate_fn_args,
+            label_cols=label_cols,
+            dummy_labels=dummy_labels,
+            prefetch=prefetch)
+
+    def to_hf_dataset(self) -> Dataset:
+        self._hf_ds.reset_format()
+        return self._hf_ds
diff --git a/requirements/maas.txt b/requirements/maas.txt
index 66b9aeca..3b64c375 100644
--- a/requirements/maas.txt
+++ b/requirements/maas.txt
@@ -1,3 +1,2 @@
 http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/maas/maas_lib-0.1.1-py3-none-any.whl
 https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl
-https://mit-dataset.oss-cn-beijing.aliyuncs.com/release/ali_maas_datasets-0.0.1.dev0-py3-none-any.whl
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
index 5d24e660..b57358fc 100644
--- a/requirements/runtime.txt
+++ b/requirements/runtime.txt
@@ -1,6 +1,6 @@
 addict
+datasets
 https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl
-https://mit-dataset.oss-cn-beijing.aliyuncs.com/release/ali_maas_datasets-0.0.1.dev0-py3-none-any.whl
 numpy
 opencv-python-headless
 Pillow
diff --git a/tests/pipelines/test_image_matting.py b/tests/pipelines/test_image_matting.py
index 1713b34e..4fb475bb 100644
--- a/tests/pipelines/test_image_matting.py
+++ b/tests/pipelines/test_image_matting.py
@@ -6,7 +6,7 @@ import tempfile
 import unittest
 
 import cv2
-from ali_maas_datasets import PyDataset
+from pydatasets import PyDataset
 
 from maas_lib.fileio import File
 from maas_lib.pipelines import pipeline, util
diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py
index cbdd8964..2db7e67f 100644
--- a/tests/pipelines/test_text_classification.py
+++ b/tests/pipelines/test_text_classification.py
@@ -5,7 +5,7 @@ import unittest
 import zipfile
 from pathlib import Path
 
-from ali_maas_datasets import PyDataset
+from pydatasets import PyDataset
 
 from maas_lib.fileio import File
 from maas_lib.models import Model
diff --git a/tests/pydataset/__init__.py b/tests/pydataset/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/pydataset/test_py_dataset.py b/tests/pydataset/test_py_dataset.py
new file mode 100644
index 00000000..f6bdb8e9
--- /dev/null
+++ b/tests/pydataset/test_py_dataset.py
@@ -0,0 +1,43 @@
+import unittest
+
+import datasets as hfdata
+from pydatasets import PyDataset
+
+
+class PyDatasetTest(unittest.TestCase):
+
+    def setUp(self):
+        # ds1 initialized from in-memory json
+        self.json_data = {
+            'dummy': [{
+                'a': i,
+                'x': i * 10,
+                'c': i * 100
+            } for i in range(1, 11)]
+        }
+        hfds1 = hfdata.Dataset.from_dict(self.json_data)
+        self.ds1 = PyDataset.from_hf_dataset(hfds1)
+
+        # ds2 initialized from the Hugging Face hub
+        hfds2 = hfdata.load_dataset(
+            'glue', 'mrpc', revision='2.0.0', split='train')
+        self.ds2 = PyDataset.from_hf_dataset(hfds2)
+
+    def tearDown(self):
+        pass
+
+    def test_to_hf_dataset(self):
+        hfds = self.ds1.to_hf_dataset()
+        hfds1 = hfdata.Dataset.from_dict(self.json_data)
+        self.assertEqual(hfds.data, hfds1.data)
+
+        # simple map function
+        hfds = hfds.map(lambda e: {'new_feature': e['dummy']['a']})
+        self.assertEqual(len(hfds['new_feature']), 10)
+
+        hfds2 = self.ds2.to_hf_dataset()
+        self.assertTrue(hfds2[0]['sentence1'].startswith('Amrozi'))
+
+
+if __name__ == '__main__':
+    unittest.main()
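
For reviewers who want to try the new package locally, here is a minimal usage sketch of the `PyDataset` API added in this patch. It is not part of the change itself: it assumes `pydatasets` and its `datasets` dependency are installed (plus `torch` for the last call), the image URLs are hypothetical placeholders, and the `glue`/`mrpc` arguments simply mirror the unit test above.

```python
from pydatasets import PyDataset

# Build a PyDataset from a list of inputs (e.g. image urls). The values are
# stored in a single column named by `target`; iterating yields that column.
ds = PyDataset.load(
    ['http://example.com/image1.png', 'http://example.com/image2.png'],
    target='image')
for sample in ds:
    print(sample)  # prints each image url

# Load a hub dataset by script name, as tests/pydataset/test_py_dataset.py does,
# then convert it to the underlying Hugging Face Dataset or a torch-formatted view.
mrpc = PyDataset.load('glue', name='mrpc', version='2.0.0', split='train')
hf_ds = mrpc.to_hf_dataset()                         # plain datasets.Dataset
torch_ds = mrpc.to_torch_dataset(columns=['label'])  # requires torch installed
```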