@@ -19,7 +19,7 @@ can also create samplers with this module to sample data.
 """
 from .core.configuration import config
-from .engine.datasets import TFRecordDataset, ImageFolderDatasetV2, MnistDataset, MindDataset, \
+from .engine.datasets import TFRecordDataset, ImageFolderDatasetV2, MnistDataset, MindDataset, NumpySlicesDataset, \
     GeneratorDataset, ManifestDataset, Cifar10Dataset, Cifar100Dataset, VOCDataset, CocoDataset, CelebADataset,\
     TextFileDataset, Schema, Shuffle, zip, RandomDataset
 from .engine.samplers import DistributedSampler, PKSampler, RandomSampler, SequentialSampler, SubsetRandomSampler, \
@@ -29,6 +29,6 @@ from .engine.graphdata import GraphData
 __all__ = ["config", "ImageFolderDatasetV2", "MnistDataset",
            "MindDataset", "GeneratorDataset", "TFRecordDataset",
-           "ManifestDataset", "Cifar10Dataset", "Cifar100Dataset", "CelebADataset",
+           "ManifestDataset", "Cifar10Dataset", "Cifar100Dataset", "CelebADataset", "NumpySlicesDataset",
            "VOCDataset", "CocoDataset", "TextFileDataset", "Schema", "DistributedSampler", "PKSampler", "RandomSampler",
            "SequentialSampler", "SubsetRandomSampler", "WeightedRandomSampler", "zip", "GraphData"]
@@ -40,7 +40,7 @@ from mindspore import log as logger
 from . import samplers
 from .iterators import DictIterator, TupleIterator
 from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
-    check_rename, \
+    check_rename, check_numpyslicesdataset, \
    check_take, check_project, check_imagefolderdatasetv2, check_mnist_cifar_dataset, check_manifestdataset, \
    check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset,\
    check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat,\
@@ -4377,3 +4377,158 @@ class TextFileDataset(SourceDataset):
             return self.num_shards > 1

         return False
+
+
+class _NumpySlicesDataset:
+    """
+    Mainly for dealing with several formats of Python data, returning one row of data each time.
+    """
+
+    def __init__(self, data, column_list=None):
+        self.column_list = None
+        # Convert dict data (or a tuple of dicts) into tuple format
+        if isinstance(data, dict) or isinstance(data[0], dict):
+            data = self.process_dict(data)
+
+        if isinstance(data[0], tuple) or isinstance(data, tuple):
+            self.is_tuple = True
+            # Use a list so the per-column conversion below can assign in place
+            # (a bare tuple would raise TypeError on item assignment)
+            self.data = list(data)
+            if isinstance(data[0], tuple):
+                for i in range(len(self.data)):
+                    self.data[i] = np.array(self.data[i])
+        else:
+            self.is_tuple = False
+            self.data = np.array(data)
+
+        # Init column_list
+        if column_list is not None:
+            self.column_list = column_list
+        elif self.column_list is None:
+            self.column_list = []
+            column_num = len(self.data) if self.is_tuple else 1
+            for i in range(column_num):
+                self.column_list.append("column_" + str(i))
+
+    def __getitem__(self, index):
+        if self.is_tuple:
+            # One entry per column: pick row `index` from every column array
+            data_row = [self.data[i][index, ...] for i in range(len(self.data))]
+            data_res = tuple(data_row)
+        else:
+            # Single-column data: wrap the row so a tuple is always returned
+            data_res = (self.data[index, ...],)
+
+        return data_res
+
+    def __len__(self):
+        # For multi-column (tuple) data, the length of one column is the row count
+        if self.is_tuple:
+            return len(self.data[0])
+
+        return len(self.data)
+
+    def process_dict(self, input_data):
+        """
+        Convert dict-like data into tuple format; when the input is a tuple of dicts, compose it into one dict first.
+        """
+        # When input is a tuple of dicts, merge them into a single dict
+        if isinstance(input_data, tuple) and isinstance(input_data[0], dict):
+            data_dict = {}
+            for d in input_data:
+                data_dict.update(d)
+            input_data = data_dict
+
+        # Convert a pandas-like dict (values have a `.values` attribute, e.g. Series) into a general dict
+        data_keys = list(input_data.keys())
+        data_col = input_data[data_keys[0]]
+        if hasattr(data_col, "values"):
+            new_dict = {}
+            for key in data_keys:
+                item = input_data.pop(key)
+                new_dict[key] = item.values
+            input_data = new_dict
+
+        # Convert the data in the dict into tuples, one per column, recording the keys as column names
+        data = []
+        self.column_list = []
+        for key in input_data.keys():
+            self.column_list.append(key)
+            data.append(tuple(input_data[key]))
+        return data
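+
+# A quick illustration (consistent with the tests added below): the helper
+# slices column-major and returns one tuple per row, effectively
+#     _NumpySlicesDataset({"a": [1, 2], "b": [3, 4]})[0]  ->  (1, 3)
+#     _NumpySlicesDataset({"a": [1, 2], "b": [3, 4]})[1]  ->  (2, 4)
+#     _NumpySlicesDataset([1, 2, 3])[1]                   ->  (2,)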
+
+
+class NumpySlicesDataset(GeneratorDataset):
+    """
+    Create a dataset with given data slices, mainly for loading Python data into a dataset.
+
+    This dataset can take in a sampler. 'sampler' and 'shuffle' are mutually exclusive. The table
+    below shows what input arguments are allowed and their expected behavior.
+
+    .. list-table:: Expected Order Behavior of Using 'sampler' and 'shuffle'
+       :widths: 25 25 50
+       :header-rows: 1
+
+       * - Parameter 'sampler'
+         - Parameter 'shuffle'
+         - Expected Order Behavior
+       * - None
+         - None
+         - random order
+       * - None
+         - True
+         - random order
+       * - None
+         - False
+         - sequential order
+       * - Sampler object
+         - None
+         - order defined by sampler
+       * - Sampler object
+         - True
+         - not allowed
+       * - Sampler object
+         - False
+         - not allowed
+
+    Args:
+        data (list, tuple or dict): Input of given data. Supported data types include list, tuple, dict
+            and other NumPy formats. Input data will be sliced along its first dimension to generate many rows.
+            Loading large data this way is not recommended, since the whole input is loaded into memory.
+        column_names (list[str], optional): List of column names of the dataset (default=None). If column_names
+            is not provided and data is a dict, column_names will be its keys; otherwise columns are named
+            column_0, column_1, ...
+        num_samples (int, optional): The number of samples to be included in the dataset (default=None, all samples).
+        num_parallel_workers (int, optional): Number of subprocesses used to fetch the dataset in parallel (default=1).
+        shuffle (bool, optional): Whether or not to perform shuffle on the dataset. Random accessible input is
+            required (default=None, expected order behavior shown in the table).
+        sampler (Sampler/Iterable, optional): Object used to choose samples from the dataset. Random accessible
+            input is required (default=None, expected order behavior shown in the table).
+        num_shards (int, optional): Number of shards that the dataset should be divided into (default=None).
+            This argument should be specified only when 'num_samples' is None. Random accessible input is required.
+        shard_id (int, optional): The shard ID within num_shards (default=None). This argument should be specified
+            only when num_shards is also specified. Random accessible input is required.
+
+    Examples:
+        >>> import mindspore.dataset as ds
+        >>> # 1) Input data can be a list
+        >>> data = [1, 2, 3]
+        >>> dataset1 = ds.NumpySlicesDataset(data, column_names=["column_1"])
+        >>> # 2) Input data can be a dict, and column_names will be its keys
+        >>> data = {"a": [1, 2], "b": [3, 4]}
+        >>> dataset2 = ds.NumpySlicesDataset(data)
+        >>> # 3) Input data can be a tuple (or list of tuples); each tuple element becomes one column
+        >>> data = ((1, 2), (3, 4), (5, 6))
+        >>> dataset3 = ds.NumpySlicesDataset(data, column_names=["column_1", "column_2", "column_3"])
+        >>> # 4) Load data from a csv file
+        >>> import pandas as pd
+        >>> df = pd.read_csv("file.csv")
+        >>> dataset4 = ds.NumpySlicesDataset(dict(df), shuffle=False)
+    """
+
+    @check_numpyslicesdataset
+    def __init__(self, data, column_names=None, num_samples=None, num_parallel_workers=1, shuffle=None,
+                 sampler=None, num_shards=None, shard_id=None):
+        dataset = _NumpySlicesDataset(data, column_names)
+        super().__init__(dataset, column_names=dataset.column_list, num_samples=num_samples,
+                         num_parallel_workers=num_parallel_workers, shuffle=shuffle, sampler=sampler,
+                         num_shards=num_shards, shard_id=shard_id)
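+
+# Minimal usage sketch (mirrors the docstring examples above): unlike the
+# construction-only examples, this shows what iteration yields -- one NumPy
+# value per column per row, here 1 and 3, then 2 and 4.
+#     data = {"a": [1, 2], "b": [3, 4]}
+#     for row in NumpySlicesDataset(data, shuffle=False):
+#         print(row)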
@@ -1356,3 +1356,48 @@ def check_gnn_get_node_feature(method):
         return method(*args, **kwargs)

     return new_method
+
+
+def check_numpyslicesdataset(method):
+    """A wrapper that wraps a parameter checker around the original dataset (NumpySlicesDataset)."""
+
+    @wraps(method)
+    def new_method(*args, **kwargs):
+        param_dict = make_param_dict(method, args, kwargs)
+
+        # check data; required argument
+        data = param_dict.get('data')
+        if not isinstance(data, (list, tuple, dict, np.ndarray)):
+            raise TypeError("Unsupported data type: {}; only common Python data types such as "
+                            "list, tuple, dict and NumPy array are supported.".format(type(data)))
+        # len() also covers numpy arrays, whose truth value is ambiguous
+        if len(data) == 0:
+            raise ValueError("Input data is empty.")
+
+        # check column_names
+        column_names = param_dict.get('column_names')
+        if column_names is not None:
+            check_columns(column_names, "column_names")
+
+            # check the number of column names against the number of data columns
+            column_num = 1 if isinstance(column_names, str) else len(column_names)
+            if isinstance(data, dict):
+                data_column = len(list(data.keys()))
+                if column_num != data_column:
+                    raise ValueError("Size of column_names is {0}, but the data has {1} columns.".format(
+                        column_num, data_column))
+
+            # Consider input being a tuple of dicts
+            elif isinstance(data[0], dict):
+                data_column = sum(len(list(data[i].keys())) for i in range(len(data)))
+                if column_num != data_column:
+                    raise ValueError("Size of column_names is {0}, but the data has {1} columns.".format(
+                        column_num, data_column))
+
+            elif isinstance(data[0], tuple) or isinstance(data, tuple):
+                if column_num != len(data):
+                    raise ValueError("Size of column_names is {0}, but the data has {1} columns.".format(
+                        column_num, len(data)))
+            else:
+                if column_num != 1:
+                    raise ValueError(
+                        "Size of column_names is {0}, but it should be 1 since data is a list.".format(column_num))
+
+        return method(*args, **kwargs)
+
+    return new_method
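+
+# Example of a rejected call: NumpySlicesDataset({"a": [1, 2]}, column_names=["a", "b"])
+# raises ValueError here, since the dict provides one column but two names were given.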
@@ -12,3 +12,4 @@ setuptools >= 40.8.0
 matplotlib >= 3.1.3 # for ut test
 opencv-python >= 4.2.0.32 # for ut test
 sklearn >= 0.0 # for st test
+pandas >= 1.0.2 # for ut test
@@ -0,0 +1,6 @@
+age,sex,height,weight,slope,state,target
+65,0,161,45,93,fixed,1
+72,1,164,60,86,good,0
+45,0,174,70,79,bad,1
+73,1,173,65,70,good,1
+55,1,182,80,104,good,0
@@ -0,0 +1,201 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import numpy as np
+import pandas as pd
+
+import mindspore.dataset as de
+import mindspore.dataset.transforms.vision.c_transforms as vision
+from mindspore import log as logger
+
+
+def test_numpy_slices_list_1():
+    logger.info("Test slicing a 1D list.")
+
+    np_data = [1, 2, 3]
+    ds = de.NumpySlicesDataset(np_data, shuffle=False)
+    for i, data in enumerate(ds):
+        assert data[0] == np_data[i]
+
+
+def test_numpy_slices_list_2():
+    logger.info("Test slicing a 2D list into rows of 1D data.")
+
+    np_data = [[1, 2], [3, 4]]
+    ds = de.NumpySlicesDataset(np_data, column_names=["col1"], shuffle=False)
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], np_data[i]).all()
+
+
+def test_numpy_slices_list_3():
+    logger.info("Test slicing a nested list along the first dimension.")
+
+    np_data = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
+    ds = de.NumpySlicesDataset(np_data, column_names=["col1"], shuffle=False)
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], np_data[i]).all()
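+
+
+# The next test round-trips decoded image tensors through NumpySlicesDataset,
+# checking that a list of NumPy arrays built at runtime is accepted as input.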
+def test_numpy_slices_list_append():
+    logger.info("Test reading data of image list.")
+
+    DATA_DIR = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"]
+    resize_height, resize_width = 2, 2
+
+    data1 = de.TFRecordDataset(DATA_DIR)
+    resize_op = vision.Resize((resize_height, resize_width))
+    data1 = data1.map(input_columns=["image"], operations=[vision.Decode(True), resize_op])
+
+    res = []
+    for data in data1.create_dict_iterator():
+        res.append(data["image"])
+
+    ds = de.NumpySlicesDataset(res, column_names=["col1"], shuffle=False)
+    for i, data in enumerate(ds):
+        assert np.equal(data, res[i]).all()
+
+
+def test_numpy_slices_dict_1():
+    logger.info("Test dictionary structure data.")
+
+    np_data = {"a": [1, 2], "b": [3, 4]}
+    ds = de.NumpySlicesDataset(np_data, shuffle=False)
+    res = [[1, 3], [2, 4]]
+
+    for i, data in enumerate(ds):
+        assert data[0] == res[i][0]
+        assert data[1] == res[i][1]
+
+
+def test_numpy_slices_dict_2():
+    logger.info("Test input data being a tuple of dictionaries.")
+
+    data1, data2 = {"a": [1, 2]}, {"b": [3, 4]}
+    ds = de.NumpySlicesDataset((data1, data2), column_names=["col1", "col2"], shuffle=False)
+    res = [[1, 3], [2, 4]]
+
+    for i, data in enumerate(ds):
+        assert data[0] == res[i][0]
+        assert data[1] == res[i][1]
+
+
+def test_numpy_slices_tuple_1():
+    logger.info("Test slicing a list of tuples.")
+
+    np_data = [([1, 2], [3, 4]), ([11, 12], [13, 14]), ([21, 22], [23, 24])]
+    res = [[[1, 2], [11, 12], [21, 22]], [[3, 4], [13, 14], [23, 24]]]
+    ds = de.NumpySlicesDataset(np_data, shuffle=False)
+
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], res[i][0]).all()
+        assert np.equal(data[1], res[i][1]).all()
+        assert np.equal(data[2], res[i][2]).all()
+    assert sum(1 for _ in ds) == 2
+
+
+def test_numpy_slices_tuple_2():
+    logger.info("Test reading tuple data whose columns have different dimensions.")
+
+    features, labels = np.random.sample((5, 2)), np.random.sample((5, 1))
+    data = (features, labels)
+    ds = de.NumpySlicesDataset(data, column_names=["col1", "col2"], shuffle=False)
+
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], features[i]).all()
+        assert data[1] == labels[i]
+
+
+def test_numpy_slices_csv_value():
+    logger.info("Test loading values from a csv file.")
+
+    csv_file = "../data/dataset/testNumpySlicesDataset/heart.csv"
+    df = pd.read_csv(csv_file)
+    target = df.pop("target")
+    # Drop the non-numeric "state" column before converting to arrays
+    df.pop("state")
+    np_data = (df.values, target.values)
+
+    ds = de.NumpySlicesDataset(np_data, column_names=["col1", "col2"], shuffle=False)
+    for i, data in enumerate(ds):
+        assert np.equal(np_data[0][i], data[0]).all()
+        assert np.equal(np_data[1][i], data[1]).all()
+
+
+def test_numpy_slices_csv_dict():
+    logger.info("Test loading a csv file as a dict.")
+
+    csv_file = "../data/dataset/testNumpySlicesDataset/heart.csv"
+    df = pd.read_csv(csv_file)
+    # Drop the non-numeric "state" column before converting to arrays
+    df.pop("state")
+    res = df.values
+
+    ds = de.NumpySlicesDataset(dict(df), shuffle=False)
+    for i, data in enumerate(ds):
+        assert np.equal(data, res[i]).all()
+
+
+def test_numpy_slices_num_samples():
+    logger.info("Test num_samples.")
+
+    np_data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]
+    ds = de.NumpySlicesDataset(np_data, shuffle=False, num_samples=2)
+
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], np_data[i]).all()
+    assert sum(1 for _ in ds) == 2
+
+
+def test_numpy_slices_distributed_sampler():
+    logger.info("Test distributed sampler.")
+
+    np_data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]
+    ds = de.NumpySlicesDataset(np_data, shuffle=False, shard_id=0, num_shards=4)
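+    # With 8 rows split across num_shards=4, shard 0 is expected to read
+    # indices 0 and 4, so row i of this shard should equal np_data[i * 4].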
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], np_data[i * 4]).all()
+    assert sum(1 for _ in ds) == 2
+
+
+def test_numpy_slices_sequential_sampler():
+    logger.info("Test numpy_slices_dataset with SequentialSampler and repeat.")
+
+    np_data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]
+    ds = de.NumpySlicesDataset(np_data, sampler=de.SequentialSampler()).repeat(2)
+
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], np_data[i % 8]).all()
+
+
+if __name__ == "__main__":
+    test_numpy_slices_list_1()
+    test_numpy_slices_list_2()
+    test_numpy_slices_list_3()
+    test_numpy_slices_list_append()
+    test_numpy_slices_dict_1()
+    test_numpy_slices_dict_2()
+    test_numpy_slices_tuple_1()
+    test_numpy_slices_tuple_2()
+    test_numpy_slices_csv_value()
+    test_numpy_slices_csv_dict()
+    test_numpy_slices_num_samples()
+    test_numpy_slices_distributed_sampler()
+    test_numpy_slices_sequential_sampler()