
[to #42339763] merge pydataset into maas-lib

* merge pydataset into the repo
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/8955999
Branch: master
feiwu.yfw authored, yingda.chen committed 3 years ago
Commit: 235880f300
10 changed files with 176 additions and 6 deletions:

1. docs/source/quick_start.md (+2, -1)
2. maas_lib/pipelines/base.py (+1, -1)
3. pydatasets/__init__.py (+1, -0)
4. pydatasets/py_dataset.py (+126, -0)
5. requirements/maas.txt (+0, -1)
6. requirements/runtime.txt (+1, -1)
7. tests/pipelines/test_image_matting.py (+1, -1)
8. tests/pipelines/test_text_classification.py (+1, -1)
9. tests/pydataset/__init__.py (+0, -0)
10. tests/pydataset/test_py_dataset.py (+43, -0)

docs/source/quick_start.md (+2, -1)

@@ -95,12 +95,13 @@ print(f'Output written to {osp.abspath("result.png")}')
```

In addition, the pipeline interface can also accept a Dataset as input, so the code above can equivalently be written as:

```python
import cv2
import os.path as osp
from maas_lib.pipelines import pipeline
from maas_lib.utils.constant import Tasks
from ali_maas_datasets import PyDataset
from pydatasets import PyDataset

# Build the PyDataset from image URLs; a local folder can also be used here via input_location = '/dir/to/images'
input_location = [

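# Illustrative sketch, not part of the original hunk: how a PyDataset built
# from a list of image URLs behaves, based on pydatasets/py_dataset.py added
# in this commit (the URL below is a placeholder):
#   urls = ['https://example.com/images/person.png']
#   ds = PyDataset.load(urls, target='image')
#   for record in ds:
#       print(record)  # yields the 'image' column value, i.e. the URL itself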

maas_lib/pipelines/base.py (+1, -1)

@@ -4,8 +4,8 @@ import os.path as osp
from abc import ABC, abstractmethod
from typing import Any, Dict, Generator, List, Tuple, Union

from ali_maas_datasets import PyDataset
from maas_hub.snapshot_download import snapshot_download
from pydatasets import PyDataset

from maas_lib.models import Model
from maas_lib.pipelines import util


pydatasets/__init__.py (+1, -0)

@@ -0,0 +1 @@
from .py_dataset import PyDataset

pydatasets/py_dataset.py (+126, -0)

@@ -0,0 +1,126 @@
import logging
from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence,
                    Union)

from datasets import Dataset, load_dataset

from maas_lib.utils.logger import get_logger

logger = get_logger()


class PyDataset:
    """A PyDataset backed by hugging face datasets."""

    _hf_ds = None  # holds the underlying HuggingFace Dataset

    def __init__(self, hf_ds: Dataset):
        self._hf_ds = hf_ds
        self.target = None

    def __iter__(self):
        if isinstance(self._hf_ds, Dataset):
            # A single Dataset: yield rows (or just the target column).
            for item in self._hf_ds:
                if self.target is not None:
                    yield item[self.target]
                else:
                    yield item
        else:
            # A DatasetDict: iterate over every split.
            for ds in self._hf_ds.values():
                for item in ds:
                    if self.target is not None:
                        yield item[self.target]
                    else:
                        yield item

    @classmethod
    def from_hf_dataset(cls,
                        hf_ds: Dataset,
                        target: str = None) -> 'PyDataset':
        dataset = cls(hf_ds)
        dataset.target = target
        return dataset

    @staticmethod
    def load(
        path: Union[str, list],
        target: Optional[str] = None,
        version: Optional[str] = None,
        name: Optional[str] = None,
        split: Optional[str] = None,
        data_dir: Optional[str] = None,
        data_files: Optional[Union[str, Sequence[str],
                                   Mapping[str, Union[str,
                                                      Sequence[str]]]]] = None
    ) -> 'PyDataset':
        """Load a pydataset from the MaaS Hub, Hugging Face Hub, urls, or a local dataset.

        Args:
            path (str): Path or name of the dataset.
            target (str, optional): Name of the column to output.
            version (str, optional): Version of the dataset script to load.
            name (str, optional): Defining the subset_name of the dataset.
            data_dir (str, optional): Defining the data_dir of the dataset configuration.
            data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s).
            split (str, optional): Which split of the data to load.

        Returns:
            pydataset (obj:`PyDataset`): PyDataset object for a certain dataset.
        """
        if isinstance(path, str):
            dataset = load_dataset(
                path,
                name=name,
                revision=version,
                split=split,
                data_dir=data_dir,
                data_files=data_files)
        elif isinstance(path, list):
            if target is None:
                target = 'target'
            # Wrap the raw list into a single-column in-memory dataset.
            dataset = Dataset.from_dict({target: [p for p in path]})
        else:
            raise TypeError('path must be a str or a list, but got'
                            f' {type(path)}')
        return PyDataset.from_hf_dataset(dataset, target=target)

    def to_torch_dataset(
        self,
        columns: Union[str, List[str]] = None,
        output_all_columns: bool = False,
        **format_kwargs,
    ):
        self._hf_ds.reset_format()
        self._hf_ds.set_format(
            type='torch',
            columns=columns,
            output_all_columns=output_all_columns,
            **format_kwargs)
        return self._hf_ds

    def to_tf_dataset(
        self,
        columns: Union[str, List[str]],
        batch_size: int,
        shuffle: bool,
        collate_fn: Callable,
        drop_remainder: bool = None,
        collate_fn_args: Dict[str, Any] = None,
        label_cols: Union[str, List[str]] = None,
        dummy_labels: bool = False,
        prefetch: bool = True,
    ):
        self._hf_ds.reset_format()
        return self._hf_ds.to_tf_dataset(
            columns,
            batch_size,
            shuffle,
            collate_fn,
            drop_remainder=drop_remainder,
            collate_fn_args=collate_fn_args,
            label_cols=label_cols,
            dummy_labels=dummy_labels,
            prefetch=prefetch)

    def to_hf_dataset(self) -> Dataset:
        self._hf_ds.reset_format()
        return self._hf_ds
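
For orientation, a brief usage sketch of the new class follows. It is illustrative only and not part of the committed file: the 'glue'/'mrpc' choice mirrors the unit test added in tests/pydataset/test_py_dataset.py below, and everything else relies only on the code above.

```python
from pydatasets import PyDataset

# Load a split from the Hugging Face Hub through the new wrapper.
ds = PyDataset.load('glue', name='mrpc', split='train')

# Iterating a PyDataset with no target column yields full record dicts.
for i, record in enumerate(ds):
    print(record['sentence1'])
    if i == 2:
        break

# Unwrap back to a plain datasets.Dataset when the hugging face API is needed.
hf_ds = ds.to_hf_dataset()
print(hf_ds.column_names)
```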

requirements/maas.txt (+0, -1)

@@ -1,3 +1,2 @@
http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/maas/maas_lib-0.1.1-py3-none-any.whl
https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl
https://mit-dataset.oss-cn-beijing.aliyuncs.com/release/ali_maas_datasets-0.0.1.dev0-py3-none-any.whl

requirements/runtime.txt (+1, -1)

@@ -1,6 +1,6 @@
addict
datasets
https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl
https://mit-dataset.oss-cn-beijing.aliyuncs.com/release/ali_maas_datasets-0.0.1.dev0-py3-none-any.whl
numpy
opencv-python-headless
Pillow


tests/pipelines/test_image_matting.py (+1, -1)

@@ -6,7 +6,7 @@ import tempfile
import unittest

import cv2
from ali_maas_datasets import PyDataset
from pydatasets import PyDataset

from maas_lib.fileio import File
from maas_lib.pipelines import pipeline, util


tests/pipelines/test_text_classification.py (+1, -1)

@@ -5,7 +5,7 @@ import unittest
import zipfile
from pathlib import Path

from ali_maas_datasets import PyDataset
from pydatasets import PyDataset

from maas_lib.fileio import File
from maas_lib.models import Model


tests/pydataset/__init__.py (+0, -0)


tests/pydataset/test_py_dataset.py (+43, -0)

@@ -0,0 +1,43 @@
import unittest

import datasets as hfdata
from pydatasets import PyDataset


class PyDatasetTest(unittest.TestCase):

    def setUp(self):
        # ds1 initialized from in-memory json
        self.json_data = {
            'dummy': [{
                'a': i,
                'x': i * 10,
                'c': i * 100
            } for i in range(1, 11)]
        }
        hfds1 = hfdata.Dataset.from_dict(self.json_data)
        self.ds1 = PyDataset.from_hf_dataset(hfds1)

        # ds2 initialized from the hf hub
        hfds2 = hfdata.load_dataset(
            'glue', 'mrpc', revision='2.0.0', split='train')
        self.ds2 = PyDataset.from_hf_dataset(hfds2)

    def tearDown(self):
        pass

    def test_to_hf_dataset(self):
        hfds = self.ds1.to_hf_dataset()
        hfds1 = hfdata.Dataset.from_dict(self.json_data)
        self.assertEqual(hfds.data, hfds1.data)

        # simple map function
        hfds = hfds.map(lambda e: {'new_feature': e['dummy']['a']})
        self.assertEqual(len(hfds['new_feature']), 10)

        hfds2 = self.ds2.to_hf_dataset()
        self.assertTrue(hfds2[0]['sentence1'].startswith('Amrozi'))


if __name__ == '__main__':
    unittest.main()
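
The torch and TensorFlow conversion helpers are not exercised by this test. A minimal hedged sketch of the torch path, assuming torch is installed and using placeholder column names, could look like:

```python
import datasets as hfdata
from pydatasets import PyDataset

# Tiny in-memory fixture in the same spirit as setUp() above.
ds = PyDataset.from_hf_dataset(
    hfdata.Dataset.from_dict({'a': [1, 2, 3], 'b': [10, 20, 30]}))

# Request torch tensors for the selected columns (requires torch).
torch_ds = ds.to_torch_dataset(columns=['a', 'b'])
print(torch_ds[0])  # e.g. {'a': tensor(1), 'b': tensor(10)}
```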
