From bc22c172b8e3b7a6c7dfad89d46393e20eeef223 Mon Sep 17 00:00:00 2001
From: ms_yan <6576637+ms_yan@user.noreply.gitee.com>
Date: Mon, 25 May 2020 12:06:24 +0800
Subject: [PATCH] add NumpySlicesDataset and its ut

---
 mindspore/dataset/__init__.py                 |   4 +-
 mindspore/dataset/engine/datasets.py          | 157 +++++++++++++-
 mindspore/dataset/engine/validators.py        |  45 ++++
 requirements.txt                              |   1 +
 .../dataset/testNumpySlicesDataset/heart.csv  |   6 +
 .../dataset/test_dataset_numpy_slices.py      | 201 ++++++++++++++++++
 6 files changed, 411 insertions(+), 3 deletions(-)
 create mode 100644 tests/ut/data/dataset/testNumpySlicesDataset/heart.csv
 create mode 100644 tests/ut/python/dataset/test_dataset_numpy_slices.py

diff --git a/mindspore/dataset/__init__.py b/mindspore/dataset/__init__.py
index a68ae8edcf..4016211236 100644
--- a/mindspore/dataset/__init__.py
+++ b/mindspore/dataset/__init__.py
@@ -19,7 +19,7 @@ can also create samplers with this module to sample data.
 """
 from .core.configuration import config
-from .engine.datasets import TFRecordDataset, ImageFolderDatasetV2, MnistDataset, MindDataset, \
+from .engine.datasets import TFRecordDataset, ImageFolderDatasetV2, MnistDataset, MindDataset, NumpySlicesDataset, \
     GeneratorDataset, ManifestDataset, Cifar10Dataset, Cifar100Dataset, VOCDataset, CocoDataset, CelebADataset,\
     TextFileDataset, Schema, Shuffle, zip, RandomDataset
 from .engine.samplers import DistributedSampler, PKSampler, RandomSampler, SequentialSampler, SubsetRandomSampler, \
@@ -29,6 +29,6 @@ from .engine.graphdata import GraphData
 __all__ = ["config", "ImageFolderDatasetV2", "MnistDataset", "MindDataset", "GeneratorDataset", "TFRecordDataset",
-           "ManifestDataset", "Cifar10Dataset", "Cifar100Dataset", "CelebADataset",
+           "ManifestDataset", "Cifar10Dataset", "Cifar100Dataset", "CelebADataset", "NumpySlicesDataset",
            "VOCDataset", "CocoDataset", "TextFileDataset", "Schema", "DistributedSampler", "PKSampler",
            "RandomSampler", "SequentialSampler", "SubsetRandomSampler", "WeightedRandomSampler", "zip", "GraphData"]
diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py
index 2beadbfff1..d667eccca5 100644
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@@ -40,7 +40,7 @@ from mindspore import log as logger
 from . import samplers
 from .iterators import DictIterator, TupleIterator
 from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
-    check_rename, \
+    check_rename, check_numpyslicesdataset, \
     check_take, check_project, check_imagefolderdatasetv2, check_mnist_cifar_dataset, check_manifestdataset, \
     check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset,\
     check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat,\
@@ -4377,3 +4377,158 @@ class TextFileDataset(SourceDataset):
             return self.num_shards > 1
 
         return False
+
+
+class _NumpySlicesDataset:
+    """
+    Mainly for dealing with several kinds of formats of Python data, and returns one row each time.
+    """
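+    # Every supported input (list, tuple, dict, pandas-like dict) is normalized
+    # here into either a single NumPy array (one column) or a list of NumPy
+    # arrays (one array per column), so that __getitem__ can slice a row out
+    # with plain NumPy indexing.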
+    def __init__(self, data, column_list=None):
+        self.column_list = None
+        # Convert dict data into tuple
+        if isinstance(data, dict) or isinstance(data[0], dict):
+            data = self.process_dict(data)
+
+        if isinstance(data[0], tuple) or isinstance(data, tuple):
+            self.is_tuple = True
+            # Copy into a list so the per-column conversion below also works
+            # when data itself is an (immutable) tuple of tuples
+            self.data = list(data)
+            if isinstance(data[0], tuple):
+                for i in range(len(self.data)):
+                    self.data[i] = np.array(self.data[i])
+        else:
+            self.is_tuple = False
+            self.data = np.array(data)
+
+        # Init column_list
+        if column_list is not None:
+            self.column_list = column_list
+        elif self.column_list is None:
+            self.column_list = []
+            column_num = len(self.data) if self.is_tuple else 1
+            for i in range(column_num):
+                self.column_list.append("column_" + str(i))
+
+    def __getitem__(self, index):
+        if self.is_tuple:
+            data_res = tuple(self.data[i][index, ...] for i in range(len(self.data)))
+        else:
+            data_res = (self.data[index, ...],)
+
+        return data_res
+
+    def __len__(self):
+        if self.is_tuple:
+            return len(self.data[0])
+        return len(self.data)
+
+    def process_dict(self, input_data):
+        """
+        Convert dict-like data into the tuple format; when the input is a tuple of dicts, compose them into one dict first.
+        """
+        # When input is a tuple of dicts, compose them into a single dict
+        if isinstance(input_data, tuple) and isinstance(input_data[0], dict):
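+            # dict.update keeps only the last value for a duplicated key, so
+            # the dicts in the tuple are expected to have distinct keys (each
+            # key becomes one output column)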
+            data_dict = {}
+            for d in input_data:
+                data_dict.update(d)
+            input_data = data_dict
+
+        # Convert a pandas-like dict (whose values have a "values" attribute) into a general dict
+        data_keys = list(input_data.keys())
+        data_col = input_data[data_keys[0]]
+        if hasattr(data_col, "values"):
+            new_dict = {}
+            for key in data_keys:
+                item1 = input_data.pop(key)
+                new_dict[key] = item1.values
+            input_data = new_dict
+
+        # Convert the data in the dict into tuples, one per column
+        data = []
+        self.column_list = []
+        keys = input_data.keys()
+        for key in keys:
+            self.column_list.append(key)
+            value = input_data[key]
+            data.append(tuple(value))
+
+        return data
+
+
+class NumpySlicesDataset(GeneratorDataset):
+    """
+    Create a dataset with given data slices, mainly for loading Python data into a dataset.
+
+    This dataset can take in a sampler. 'sampler' and 'shuffle' are mutually exclusive. The table
+    below shows what input arguments are allowed and their expected behavior.
+
+    .. list-table:: Expected Order Behavior of Using 'sampler' and 'shuffle'
+       :widths: 25 25 50
+       :header-rows: 1
+
+       * - Parameter 'sampler'
+         - Parameter 'shuffle'
+         - Expected Order Behavior
+       * - None
+         - None
+         - random order
+       * - None
+         - True
+         - random order
+       * - None
+         - False
+         - sequential order
+       * - Sampler object
+         - None
+         - order defined by sampler
+       * - Sampler object
+         - True
+         - not allowed
+       * - Sampler object
+         - False
+         - not allowed
+
+    Args:
+        data (list, tuple or dict): The given data. Supported data types include list, tuple, dict and other NumPy
+            formats. The input data will be sliced along its first dimension to generate the rows. Loading a large
+            amount of data this way is not recommended, since all of it is loaded into memory.
+        column_names (list[str], optional): List of column names of the dataset (default=None). If column_names is
+            not provided and data is a dict, column_names will be its keys; otherwise the columns will be named
+            column_0, column_1 ...
+        num_samples (int, optional): The number of samples to be included in the dataset (default=None, all samples).
+        num_parallel_workers (int, optional): Number of subprocesses used to fetch the dataset in parallel
+            (default=1).
+        shuffle (bool, optional): Whether or not to perform shuffle on the dataset (default=None, expected order
+            behavior shown in the table). Random-accessible input is required.
+        sampler (Sampler/Iterable, optional): Object used to choose samples from the dataset (default=None, expected
+            order behavior shown in the table). Random-accessible input is required.
+        num_shards (int, optional): Number of shards that the dataset should be divided into (default=None). This
+            argument should be specified only when num_samples is None. Random-accessible input is required.
+        shard_id (int, optional): The shard ID within num_shards (default=None). This argument should be specified
+            only when num_shards is also specified. Random-accessible input is required.
+
+    Examples:
+        >>> import mindspore.dataset as ds
+        >>> # 1) Input data can be a list
+        >>> data = [1, 2, 3]
+        >>> dataset1 = ds.NumpySlicesDataset(data, column_names=["column_1"])
+        >>> # 2) Input data can be a dict, and column_names will be its keys
+        >>> data = {"a": [1, 2], "b": [3, 4]}
+        >>> dataset2 = ds.NumpySlicesDataset(data)
+        >>> # 3) Input data can be a tuple (or list of tuples), and each tuple element refers to data in one column
+        >>> data = ((1, 2), (3, 4), (5, 6))
+        >>> dataset3 = ds.NumpySlicesDataset(data, column_names=["column_1", "column_2", "column_3"])
+        >>> # 4) Load data from a csv file
+        >>> import pandas as pd
+        >>> df = pd.read_csv("file.csv")
+        >>> dataset4 = ds.NumpySlicesDataset(dict(df), shuffle=False)
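+        >>> # 5) A tuple of NumPy arrays also works, one array per column
+        >>> #    (same pattern as in the accompanying unit tests)
+        >>> import numpy as np
+        >>> features, labels = np.random.sample((5, 2)), np.random.sample((5, 1))
+        >>> dataset5 = ds.NumpySlicesDataset((features, labels), column_names=["feature", "label"])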
+    """
+    @check_numpyslicesdataset
+    def __init__(self, data, column_names=None, num_samples=None, num_parallel_workers=1, shuffle=None,
+                 sampler=None, num_shards=None, shard_id=None):
+        dataset = _NumpySlicesDataset(data, column_names)
+        super().__init__(dataset, column_names=dataset.column_list, num_samples=num_samples,
+                         num_parallel_workers=num_parallel_workers, shuffle=shuffle, sampler=sampler,
+                         num_shards=num_shards, shard_id=shard_id)
diff --git a/mindspore/dataset/engine/validators.py b/mindspore/dataset/engine/validators.py
index bda4560a51..ff434c718e 100644
--- a/mindspore/dataset/engine/validators.py
+++ b/mindspore/dataset/engine/validators.py
@@ -1356,3 +1356,48 @@ def check_gnn_get_node_feature(method):
         return method(*args, **kwargs)
 
     return new_method
+
+
+def check_numpyslicesdataset(method):
+    """A wrapper that wraps a parameter checker around the original dataset (NumpySlicesDataset)."""
+
+    @wraps(method)
+    def new_method(*args, **kwargs):
+        param_dict = make_param_dict(method, args, kwargs)
+
+        # check data; required argument
+        data = param_dict.get('data')
+        if not isinstance(data, (list, tuple, dict, np.ndarray)):
+            raise TypeError("Unsupported data type: {}, only support some common python data type, "
+                            "like list, tuple, dict, and numpy array.".format(type(data)))
+        # use len() so the emptiness check also works for NumPy arrays, where
+        # bare truth testing of a multi-element array raises an error
+        if len(data) == 0:
+            raise ValueError("Input data is empty.")
+
+        # check column_names
+        column_names = param_dict.get('column_names')
+        if column_names is not None:
+            check_columns(column_names, "column_names")
+
+            # check the number of columns named in column_names
+            column_num = 1 if isinstance(column_names, str) else len(column_names)
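+            # Expected column counts by input type: one per key for a dict
+            # (summing keys across a tuple of dicts), one per element for
+            # tuple data, and exactly one for a plain list.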
+            if isinstance(data, dict):
+                data_column = len(list(data.keys()))
+                if column_num != data_column:
+                    raise ValueError("Number of column names is {0}, but number of columns in data is {1}."
+                                     .format(column_num, data_column))
+
+            # Consider the case where input is a tuple of dicts
+            elif isinstance(data[0], dict):
+                data_column = sum(len(list(d.keys())) for d in data)
+                if column_num != data_column:
+                    raise ValueError("Number of column names is {0}, but number of columns in data is {1}."
+                                     .format(column_num, data_column))
+
+            elif isinstance(data[0], tuple) or isinstance(data, tuple):
+                if column_num != len(data):
+                    raise ValueError("Number of column names is {0}, but number of columns in data is {1}."
+                                     .format(column_num, len(data)))
+            else:
+                if column_num != 1:
+                    raise ValueError("Number of column names is {0}, but exactly 1 is required "
+                                     "when data is a list.".format(column_num))
+
+        return method(*args, **kwargs)
+
+    return new_method
diff --git a/requirements.txt b/requirements.txt
index b3e1736341..2f3516ad1b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,4 @@ setuptools >= 40.8.0
 matplotlib >= 3.1.3 # for ut test
 opencv-python >= 4.2.0.32 # for ut test
 sklearn >= 0.0 # for st test
+pandas >= 1.0.2 # for ut test
diff --git a/tests/ut/data/dataset/testNumpySlicesDataset/heart.csv b/tests/ut/data/dataset/testNumpySlicesDataset/heart.csv
new file mode 100644
index 0000000000..92bc9db643
--- /dev/null
+++ b/tests/ut/data/dataset/testNumpySlicesDataset/heart.csv
@@ -0,0 +1,6 @@
+age,sex,height,weight,slope,state,target
+65,0,161,45,93,fixed,1
+72,1,164,60,86,good,0
+45,0,174,70,79,bad,1
+73,1,173,65,70,good,1
+55,1,182,80,104,good,0
\ No newline at end of file
diff --git a/tests/ut/python/dataset/test_dataset_numpy_slices.py b/tests/ut/python/dataset/test_dataset_numpy_slices.py
new file mode 100644
index 0000000000..d9d6e39744
--- /dev/null
+++ b/tests/ut/python/dataset/test_dataset_numpy_slices.py
@@ -0,0 +1,201 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import numpy as np
+import pandas as pd
+
+import mindspore.dataset as de
+import mindspore.dataset.transforms.vision.c_transforms as vision
+from mindspore import log as logger
+
+
+def test_numpy_slices_list_1():
+    logger.info("Test slicing a 1D list.")
+
+    np_data = [1, 2, 3]
+    ds = de.NumpySlicesDataset(np_data, shuffle=False)
+
+    for i, data in enumerate(ds):
+        assert data[0] == np_data[i]
+
+
+def test_numpy_slices_list_2():
+    logger.info("Test slicing a 2D list into 1D rows.")
+
+    np_data = [[1, 2], [3, 4]]
+    ds = de.NumpySlicesDataset(np_data, column_names=["col1"], shuffle=False)
+
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], np_data[i]).all()
+
+
+def test_numpy_slices_list_3():
+    logger.info("Test slicing a list along the first dimension.")
+
+    np_data = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
+    ds = de.NumpySlicesDataset(np_data, column_names=["col1"], shuffle=False)
+
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], np_data[i]).all()
+
+
+def test_numpy_slices_list_append():
+    logger.info("Test reading data from an image list.")
+
+    DATA_DIR = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"]
+    resize_height, resize_width = 2, 2
+
+    data1 = de.TFRecordDataset(DATA_DIR)
+    resize_op = vision.Resize((resize_height, resize_width))
+    data1 = data1.map(input_columns=["image"], operations=[vision.Decode(True), resize_op])
+
+    res = []
+    for data in data1.create_dict_iterator():
+        res.append(data["image"])
+
+    ds = de.NumpySlicesDataset(res, column_names=["col1"], shuffle=False)
+
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], res[i]).all()
+
+
+def test_numpy_slices_dict_1():
+    logger.info("Test dictionary-structured data.")
+
+    np_data = {"a": [1, 2], "b": [3, 4]}
+    ds = de.NumpySlicesDataset(np_data, shuffle=False)
+    res = [[1, 3], [2, 4]]
+
+    for i, data in enumerate(ds):
+        assert data[0] == res[i][0]
+        assert data[1] == res[i][1]
+
+
+def test_numpy_slices_dict_2():
+    logger.info("Test input data that is a tuple of dictionaries.")
+
+    data1, data2 = {"a": [1, 2]}, {"b": [3, 4]}
+    ds = de.NumpySlicesDataset((data1, data2), column_names=["col1", "col2"], shuffle=False)
+    res = [[1, 3], [2, 4]]
+
+    for i, data in enumerate(ds):
+        assert data[0] == res[i][0]
+        assert data[1] == res[i][1]
+
+
+def test_numpy_slices_tuple_1():
+    logger.info("Test slicing a list of tuples.")
+
+    np_data = [([1, 2], [3, 4]), ([11, 12], [13, 14]), ([21, 22], [23, 24])]
+    res = [[[1, 2], [11, 12], [21, 22]], [[3, 4], [13, 14], [23, 24]]]
+    ds = de.NumpySlicesDataset(np_data, shuffle=False)
+
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], res[i][0]).all()
+        assert np.equal(data[1], res[i][1]).all()
+        assert np.equal(data[2], res[i][2]).all()
+
+    assert sum([1 for _ in ds]) == 2
+
+
+def test_numpy_slices_tuple_2():
+    logger.info("Test reading tuple data of different dimensions.")
+
+    features, labels = np.random.sample((5, 2)), np.random.sample((5, 1))
+    data = (features, labels)
+
+    ds = de.NumpySlicesDataset(data, column_names=["col1", "col2"], shuffle=False)
+
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], features[i]).all()
+        assert data[1] == labels[i]
+
+
+def test_numpy_slices_csv_value():
+    logger.info("Test loading the values of a csv file.")
+
+    csv_file = "../data/dataset/testNumpySlicesDataset/heart.csv"
+
+    df = pd.read_csv(csv_file)
+    target = df.pop("target")
+    df.pop("state")
+    np_data = (df.values, target.values)
+
+    ds = de.NumpySlicesDataset(np_data, column_names=["col1", "col2"], shuffle=False)
+
+    for i, data in enumerate(ds):
+        assert np.equal(np_data[0][i], data[0]).all()
+        assert np.equal(np_data[1][i], data[1]).all()
+
+
+def test_numpy_slices_csv_dict():
+    logger.info("Test loading a csv file as a dict.")
+
+    csv_file = "../data/dataset/testNumpySlicesDataset/heart.csv"
+    df = pd.read_csv(csv_file)
+    df.pop("state")
+    res = df.values
+
+    ds = de.NumpySlicesDataset(dict(df), shuffle=False)
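+    # dict(df) yields a dict of pandas Series; _NumpySlicesDataset detects
+    # their "values" attribute and converts each Series into a NumPy column.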
+
+    for i, data in enumerate(ds):
+        assert np.equal(data, res[i]).all()
+
+
+def test_numpy_slices_num_samples():
+    logger.info("Test num_samples.")
+
+    np_data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]
+    ds = de.NumpySlicesDataset(np_data, shuffle=False, num_samples=2)
+
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], np_data[i]).all()
+
+    assert sum([1 for _ in ds]) == 2
+
+
+def test_numpy_slices_distributed_sampler():
+    logger.info("Test distributed sampler.")
+
+    np_data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]
+    ds = de.NumpySlicesDataset(np_data, shuffle=False, shard_id=0, num_shards=4)
+
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], np_data[i * 4]).all()
+
+    assert sum([1 for _ in ds]) == 2
+
+
+def test_numpy_slices_sequential_sampler():
+    logger.info("Test numpy_slices_dataset with SequentialSampler and repeat.")
+
+    np_data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]
+    ds = de.NumpySlicesDataset(np_data, sampler=de.SequentialSampler()).repeat(2)
+
+    for i, data in enumerate(ds):
+        assert np.equal(data[0], np_data[i % 8]).all()
+
+
+if __name__ == "__main__":
+    test_numpy_slices_list_1()
+    test_numpy_slices_list_2()
+    test_numpy_slices_list_3()
+    test_numpy_slices_list_append()
+    test_numpy_slices_dict_1()
+    test_numpy_slices_dict_2()
+    test_numpy_slices_tuple_1()
+    test_numpy_slices_tuple_2()
+    test_numpy_slices_csv_value()
+    test_numpy_slices_csv_dict()
+    test_numpy_slices_num_samples()
+    test_numpy_slices_distributed_sampler()
+    test_numpy_slices_sequential_sampler()