
[to #42339763] merge pydataset into maas-lib

* merge pydataset into the repo
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/8955999
master
feiwu.yfw yingda.chen 3 years ago
parent commit 235880f300
10 changed files with 176 additions and 6 deletions
  1. docs/source/quick_start.md (+2, -1)
  2. maas_lib/pipelines/base.py (+1, -1)
  3. pydatasets/__init__.py (+1, -0)
  4. pydatasets/py_dataset.py (+126, -0)
  5. requirements/maas.txt (+0, -1)
  6. requirements/runtime.txt (+1, -1)
  7. tests/pipelines/test_image_matting.py (+1, -1)
  8. tests/pipelines/test_text_classification.py (+1, -1)
  9. tests/pydataset/__init__.py (+0, -0)
  10. tests/pydataset/test_py_dataset.py (+43, -0)

docs/source/quick_start.md (+2, -1)

@@ -95,12 +95,13 @@ print(f'Output written to {osp.abspath("result.png")}')
```

In addition, the pipeline interface can also accept a Dataset as input; the code above can equivalently be written as:

```python
import cv2
import os.path as osp
from maas_lib.pipelines import pipeline
from maas_lib.utils.constant import Tasks
-from ali_maas_datasets import PyDataset
+from pydatasets import PyDataset


# Build a PyDataset from image URLs; a local folder can also be used via input_location = '/dir/to/images'
input_location = [
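# The list above is truncated by the diff view. What follows is a hypothetical
# completion for illustration only, not part of this hunk: the URL is a
# placeholder and the pipeline's model selection is omitted.
input_location = ['https://example.com/images/person.png']

# Wrap the URLs in a PyDataset; 'image' becomes the single column name.
dataset = PyDataset.load(input_location, target='image')

# The pipeline consumes the PyDataset directly, yielding one result per item.
img_matting = pipeline(Tasks.image_matting)
for result in img_matting(dataset):
    print(type(result))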


maas_lib/pipelines/base.py (+1, -1)

@@ -4,8 +4,8 @@ import os.path as osp
from abc import ABC, abstractmethod
from typing import Any, Dict, Generator, List, Tuple, Union

-from ali_maas_datasets import PyDataset
from maas_hub.snapshot_download import snapshot_download
+from pydatasets import PyDataset

from maas_lib.models import Model
from maas_lib.pipelines import util


pydatasets/__init__.py (+1, -0)

@@ -0,0 +1 @@
from .py_dataset import PyDataset

pydatasets/py_dataset.py (+126, -0)

@@ -0,0 +1,126 @@
from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence,
                    Union)

from datasets import Dataset, load_dataset

from maas_lib.utils.logger import get_logger

logger = get_logger()


class PyDataset:
    """A PyDataset backed by Hugging Face datasets."""

    _hf_ds = None  # the underlying Hugging Face Dataset (or DatasetDict)

    def __init__(self, hf_ds: Dataset):
        self._hf_ds = hf_ds
        self.target = None

    def __iter__(self):
        if isinstance(self._hf_ds, Dataset):
            # single split: iterate the underlying dataset directly
            for item in self._hf_ds:
                if self.target is not None:
                    yield item[self.target]
                else:
                    yield item
        else:
            # DatasetDict: iterate every split in turn
            for ds in self._hf_ds.values():
                for item in ds:
                    if self.target is not None:
                        yield item[self.target]
                    else:
                        yield item

    @classmethod
    def from_hf_dataset(cls,
                        hf_ds: Dataset,
                        target: Optional[str] = None) -> 'PyDataset':
        dataset = cls(hf_ds)
        dataset.target = target
        return dataset

    @staticmethod
    def load(
        path: Union[str, list],
        target: Optional[str] = None,
        version: Optional[str] = None,
        name: Optional[str] = None,
        split: Optional[str] = None,
        data_dir: Optional[str] = None,
        data_files: Optional[Union[str, Sequence[str],
                                   Mapping[str, Union[str,
                                                      Sequence[str]]]]] = None
    ) -> 'PyDataset':
        """Load a PyDataset from the MaaS Hub, Hugging Face Hub, urls, or a local dataset.

        Args:
            path (str or list): Path or name of the dataset, or a list of data items.
            target (str, optional): Name of the column to output.
            version (str, optional): Version of the dataset script to load.
            name (str, optional): Name of the subset of the dataset.
            split (str, optional): Which split of the data to load.
            data_dir (str, optional): data_dir of the dataset configuration.
            data_files (str or Sequence or Mapping, optional): Path(s) to
                source data file(s).

        Returns:
            pydataset (obj:`PyDataset`): PyDataset object for a certain dataset.
        """
        if isinstance(path, str):
            dataset = load_dataset(
                path,
                name=name,
                revision=version,
                split=split,
                data_dir=data_dir,
                data_files=data_files)
        elif isinstance(path, list):
            if target is None:
                target = 'target'
            # build a single-column dataset whose column holds the given items
            dataset = Dataset.from_dict({target: path})
        else:
            raise TypeError('path must be a str or a list, but got'
                            f' {type(path)}')
        return PyDataset.from_hf_dataset(dataset, target=target)

    def to_torch_dataset(
        self,
        columns: Union[str, List[str]] = None,
        output_all_columns: bool = False,
        **format_kwargs,
    ):
        self._hf_ds.reset_format()
        self._hf_ds.set_format(
            type='torch',
            columns=columns,
            output_all_columns=output_all_columns,
            **format_kwargs)
        return self._hf_ds

    def to_tf_dataset(
        self,
        columns: Union[str, List[str]],
        batch_size: int,
        shuffle: bool,
        collate_fn: Callable,
        drop_remainder: bool = None,
        collate_fn_args: Dict[str, Any] = None,
        label_cols: Union[str, List[str]] = None,
        dummy_labels: bool = False,
        prefetch: bool = True,
    ):
        self._hf_ds.reset_format()
        return self._hf_ds.to_tf_dataset(
            columns,
            batch_size,
            shuffle,
            collate_fn,
            drop_remainder=drop_remainder,
            collate_fn_args=collate_fn_args,
            label_cols=label_cols,
            dummy_labels=dummy_labels,
            prefetch=prefetch)

    def to_hf_dataset(self) -> Dataset:
        self._hf_ds.reset_format()
        return self._hf_ds
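A minimal usage sketch of the class above, with assumptions flagged: the glue/mrpc identifiers mirror the new tests later in this PR, the URLs are placeholders, and the hub cases need network access.

```python
from pydatasets import PyDataset

# Hub case: target selects the column that iteration yields.
ds = PyDataset.load(
    'glue', name='mrpc', version='2.0.0', split='train', target='sentence1')
print(next(iter(ds)))  # first 'sentence1' string of the train split

# List case: the list becomes a one-column Dataset; without an explicit
# target the column name defaults to 'target'.
urls = PyDataset.load(['http://a/1.png', 'http://b/2.png'])  # placeholder URLs
print(list(urls))  # ['http://a/1.png', 'http://b/2.png']

# Torch interop: the selected columns come back as torch tensors.
torch_ds = PyDataset.load(
    'glue', name='mrpc', split='train').to_torch_dataset(columns=['label'])
print(torch_ds[0]['label'])
```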

requirements/maas.txt (+0, -1)

@@ -1,3 +1,2 @@
http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/maas/maas_lib-0.1.1-py3-none-any.whl
https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl
-https://mit-dataset.oss-cn-beijing.aliyuncs.com/release/ali_maas_datasets-0.0.1.dev0-py3-none-any.whl

requirements/runtime.txt (+1, -1)

@@ -1,6 +1,6 @@
addict
+datasets
https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl
-https://mit-dataset.oss-cn-beijing.aliyuncs.com/release/ali_maas_datasets-0.0.1.dev0-py3-none-any.whl
numpy
opencv-python-headless
Pillow


tests/pipelines/test_image_matting.py (+1, -1)

@@ -6,7 +6,7 @@ import tempfile
import unittest

import cv2
-from ali_maas_datasets import PyDataset
+from pydatasets import PyDataset

from maas_lib.fileio import File
from maas_lib.pipelines import pipeline, util


tests/pipelines/test_text_classification.py (+1, -1)

@@ -5,7 +5,7 @@ import unittest
import zipfile
from pathlib import Path

-from ali_maas_datasets import PyDataset
+from pydatasets import PyDataset

from maas_lib.fileio import File
from maas_lib.models import Model


tests/pydataset/__init__.py (+0, -0) (new empty file)


tests/pydataset/test_py_dataset.py (+43, -0)

@@ -0,0 +1,43 @@
import unittest

import datasets as hfdata

from pydatasets import PyDataset


class PyDatasetTest(unittest.TestCase):

    def setUp(self):
        # ds1 initialized from in-memory json
        self.json_data = {
            'dummy': [{
                'a': i,
                'x': i * 10,
                'c': i * 100
            } for i in range(1, 11)]
        }
        hfds1 = hfdata.Dataset.from_dict(self.json_data)
        self.ds1 = PyDataset.from_hf_dataset(hfds1)

        # ds2 initialized from the Hugging Face hub
        hfds2 = hfdata.load_dataset(
            'glue', 'mrpc', revision='2.0.0', split='train')
        self.ds2 = PyDataset.from_hf_dataset(hfds2)

    def tearDown(self):
        pass

    def test_to_hf_dataset(self):
        hfds = self.ds1.to_hf_dataset()
        hfds1 = hfdata.Dataset.from_dict(self.json_data)
        self.assertEqual(hfds.data, hfds1.data)

        # simple map over the ten examples
        hfds = hfds.map(lambda e: {'new_feature': e['dummy']['a']})
        self.assertEqual(len(hfds['new_feature']), 10)

        hfds2 = self.ds2.to_hf_dataset()
        self.assertTrue(hfds2[0]['sentence1'].startswith('Amrozi'))


if __name__ == '__main__':
    unittest.main()
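A sketch of running the new tests, assuming the repository root is the working directory; note that setUp pulls glue/mrpc from the Hugging Face hub, so that fixture needs network access.

```python
# Discover and run every test module under tests/pydataset ('test*.py').
import unittest

suite = unittest.defaultTestLoader.discover('tests/pydataset')
unittest.TextTestRunner(verbosity=2).run(suite)
```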
