From: @ming__blue (tags/v1.2.0-rc1)
| @@ -88,15 +88,8 @@ def zip(datasets): | |||||
| TypeError: If datasets is not a tuple. | TypeError: If datasets is not a tuple. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir1 = "path/to/imagefolder_directory1" | |||||
| >>> dataset_dir2 = "path/to/imagefolder_directory2" | |||||
| >>> ds1 = ds.ImageFolderDataset(dataset_dir1, num_parallel_workers=8) | |||||
| >>> ds2 = ds.ImageFolderDataset(dataset_dir2, num_parallel_workers=8) | |||||
| >>> | |||||
| >>> # Create a dataset which is the combination of ds1 and ds2 | |||||
| >>> data = ds.zip((ds1, ds2)) | |||||
| >>> # Create a dataset which is the combination of dataset_1 and dataset_2 | |||||
| >>> dataset = ds.zip((dataset_1, dataset_2)) | |||||
| """ | """ | ||||
| if len(datasets) <= 1: | if len(datasets) <= 1: | ||||
| raise ValueError( | raise ValueError( | ||||
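For readers who want to try the renamed zip example without image folders on disk, a minimal self-contained sketch on in-memory data (the names ds_1/ds_2 and the col_a/col_b columns are illustrative assumptions, not part of this API's documentation):

    import mindspore.dataset as ds

    # Two datasets with non-overlapping column names can be zipped row by row.
    ds_1 = ds.NumpySlicesDataset([1, 2, 3], column_names=["col_a"], shuffle=False)
    ds_2 = ds.NumpySlicesDataset([4, 5, 6], column_names=["col_b"], shuffle=False)
    zipped = ds.zip((ds_1, ds_2))
    for row in zipped.create_dict_iterator(num_epochs=1, output_numpy=True):
        print(row["col_a"], row["col_b"])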
| @@ -319,28 +312,27 @@ class Dataset: | |||||
| BucketBatchByLengthDataset, dataset bucketed and batched by length. | BucketBatchByLengthDataset, dataset bucketed and batched by length. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # data is an instance of Dataset object. | |||||
| >>> | |||||
| >>> # Create a dataset where rows are bucketed by element length and then | >>> # Create a dataset where rows are bucketed by element length and then | ||||
| >>> # padded and batched per bucket, using the settings defined below. | >>> # padded and batched per bucket, using the settings defined below. | ||||
| >>> import numpy as np | |||||
| >>> def generate_2_columns(n): | |||||
| ... for i in range(n): | |||||
| ... yield (np.array([i]), np.array([j for j in range(i + 1)])) | |||||
| >>> column_names = ["col1", "col2"] | >>> column_names = ["col1", "col2"] | ||||
| >>> dataset = ds.GeneratorDataset(generate_2_columns(202), column_names) | |||||
| >>> bucket_boundaries = [5, 10] | >>> bucket_boundaries = [5, 10] | ||||
| >>> bucket_batch_sizes = [5, 1, 1] | >>> bucket_batch_sizes = [5, 1, 1] | ||||
| >>> element_length_function = (lambda col1, col2: max(len(col1), len(col2))) | >>> element_length_function = (lambda col1, col2: max(len(col1), len(col2))) | ||||
| >>> | |||||
| >>> # Will pad col1 to shape [2, bucket_boundaries[i]] where i is the | >>> # Will pad col1 to shape [2, bucket_boundaries[i]] where i is the | ||||
| >>> # index of the bucket that is currently being batched. | >>> # index of the bucket that is currently being batched. | ||||
| >>> # Will pad col2 to a shape where each dimension is the longest in all | >>> # Will pad col2 to a shape where each dimension is the longest in all | ||||
| >>> # the elements currently being batched. | >>> # the elements currently being batched. | ||||
| >>> pad_info = {"col1", ([2, None], -1)} | |||||
| >>> pad_info = {"col1": ([2, None], -1)} | |||||
| >>> pad_to_bucket_boundary = True | >>> pad_to_bucket_boundary = True | ||||
| >>> | |||||
| >>> data = data.bucket_batch_by_length(column_names, bucket_boundaries, | |||||
| >>> bucket_batch_sizes, | |||||
| >>> element_length_function, pad_info, | |||||
| >>> pad_to_bucket_boundary) | |||||
| >>> dataset = dataset.bucket_batch_by_length(column_names, bucket_boundaries, | |||||
| ... bucket_batch_sizes, | |||||
| ... element_length_function, pad_info, | |||||
| ... pad_to_bucket_boundary) | |||||
| """ | """ | ||||
| return BucketBatchByLengthDataset(self, column_names, bucket_boundaries, bucket_batch_sizes, | return BucketBatchByLengthDataset(self, column_names, bucket_boundaries, bucket_batch_sizes, | ||||
| element_length_function, pad_info, | element_length_function, pad_info, | ||||
| @@ -397,26 +389,21 @@ class Dataset: | |||||
| BatchDataset, dataset batched. | BatchDataset, dataset batched. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # data is an instance of Dataset object. | |||||
| >>> | |||||
| >>> # Create a dataset where every 100 rows is combined into a batch | >>> # Create a dataset where every 100 rows is combined into a batch | ||||
| >>> # and drops the last incomplete batch if there is one. | >>> # and drops the last incomplete batch if there is one. | ||||
| >>> data = data.batch(100, True) | |||||
| >>> | |||||
| >>> dataset = dataset.batch(100, True) | |||||
| >>> # Resize image according to its batch number; if it is the 5th batch, resize to (5^2, 5^2) = (25, 25) | >>> # Resize image according to its batch number; if it is the 5th batch, resize to (5^2, 5^2) = (25, 25) | ||||
| >>> def np_resize(col, batchInfo): | >>> def np_resize(col, batchInfo): | ||||
| >>> output = col.copy() | |||||
| >>> s = (batchInfo.get_batch_num() + 1) ** 2 | |||||
| >>> index = 0 | |||||
| >>> for c in col: | |||||
| >>> img = Image.fromarray(c.astype('uint8')).convert('RGB') | |||||
| >>> img = img.resize((s, s), Image.ANTIALIAS) | |||||
| >>> output[index] = np.array(img) | |||||
| >>> index += 1 | |||||
| >>> return (output,) | |||||
| >>> data = data.batch(batch_size=8, input_columns=["image"], per_batch_map=np_resize) | |||||
| ... output = col.copy() | |||||
| ... s = (batchInfo.get_batch_num() + 1) ** 2 | |||||
| ... index = 0 | |||||
| ... for c in col: | |||||
| ... img = Image.fromarray(c.astype('uint8')).convert('RGB') | |||||
| ... img = img.resize((s, s), Image.ANTIALIAS) | |||||
| ... output[index] = np.array(img) | |||||
| ... index += 1 | |||||
| ... return (output,) | |||||
| >>> dataset = dataset.batch(batch_size=8, input_columns=["image"], per_batch_map=np_resize) | |||||
| """ | """ | ||||
| return BatchDataset(self, batch_size, drop_remainder, num_parallel_workers, per_batch_map, input_columns, | return BatchDataset(self, batch_size, drop_remainder, num_parallel_workers, per_batch_map, input_columns, | ||||
| output_columns, column_order, pad_info, python_multiprocessing) | output_columns, column_order, pad_info, python_multiprocessing) | ||||
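Because the np_resize example above assumes PIL and NumPy are already imported, here is a smaller self-contained sketch of per_batch_map that needs no image library (the generator source and the scaling logic are illustrative assumptions):

    import numpy as np
    import mindspore.dataset as ds

    def gen():
        for i in range(8):
            yield (np.array([i], dtype=np.float32),)

    # per_batch_map receives one list of samples per input column plus a BatchInfo
    # object, and must return a tuple with one list per output column.
    def scale_by_batch_num(col, batch_info):
        factor = batch_info.get_batch_num() + 1
        return ([sample * factor for sample in col],)

    dataset = ds.GeneratorDataset(gen, column_names=["data"])
    dataset = dataset.batch(batch_size=2, input_columns=["data"],
                            per_batch_map=scale_by_batch_num)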
| @@ -438,13 +425,34 @@ class Dataset: | |||||
| RuntimeError: If condition name already exists. | RuntimeError: If condition name already exists. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> import numpy as np | |||||
| >>> def gen(): | |||||
| ... for i in range(100): | |||||
| ... yield (np.array(i),) | |||||
| >>> | |||||
| >>> class Augment: | |||||
| ... def __init__(self, loss): | |||||
| ... self.loss = loss | |||||
| ... | |||||
| ... def preprocess(self, input_): | |||||
| ... return input_ | |||||
| ... | |||||
| ... def update(self, data): | |||||
| ... self.loss = data["loss"] | |||||
| >>> | >>> | ||||
| >>> # data is an instance of Dataset object. | |||||
| >>> data = data.sync_wait("callback1") | |||||
| >>> data = data.batch(batch_size) | |||||
| >>> for batch_data in data.create_dict_iterator(): | |||||
| >>> data = data.sync_update("callback1") | |||||
| >>> batch_size = 4 | |||||
| >>> dataset = ds.GeneratorDataset(gen, column_names=["input"]) | |||||
| >>> | |||||
| >>> aug = Augment(0) | |||||
| >>> dataset = dataset.sync_wait(condition_name="policy", callback=aug.update) | |||||
| >>> dataset = dataset.map(operations=[aug.preprocess], input_columns=["input"]) | |||||
| >>> dataset = dataset.batch(batch_size) | |||||
| >>> count = 0 | |||||
| >>> for data in dataset.create_dict_iterator(num_epochs=1, output_numpy=True): | |||||
| ... assert data["input"][0] == count | |||||
| ... count += batch_size | |||||
| ... data = {"loss": count} | |||||
| ... dataset.sync_update(condition_name="policy", data=data) | |||||
| """ | """ | ||||
| return SyncWaitDataset(self, condition_name, num_batch, callback) | return SyncWaitDataset(self, condition_name, num_batch, callback) | ||||
| @@ -474,14 +482,11 @@ class Dataset: | |||||
| RuntimeError: If sync operators exist before shuffle. | RuntimeError: If sync operators exist before shuffle. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # data is an instance of Dataset object. | |||||
| >>> # dataset is an instance of Dataset object. | |||||
| >>> # Optionally set the seed for the first epoch | >>> # Optionally set the seed for the first epoch | ||||
| >>> ds.config.set_seed(58) | >>> ds.config.set_seed(58) | ||||
| >>> | |||||
| >>> # Create a shuffled dataset using a shuffle buffer of size 4 | >>> # Create a shuffled dataset using a shuffle buffer of size 4 | ||||
| >>> data = data.shuffle(4) | |||||
| >>> dataset = dataset.shuffle(4) | |||||
| """ | """ | ||||
| return ShuffleDataset(self, buffer_size) | return ShuffleDataset(self, buffer_size) | ||||
| @@ -500,17 +505,14 @@ class Dataset: | |||||
| Dataset, the dataset after the function has been applied. | Dataset, the dataset after the function has been applied. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> # Declare a function which returns a Dataset object | >>> # Declare a function which returns a Dataset object | ||||
| >>> def flat_map_func(x): | >>> def flat_map_func(x): | ||||
| >>> data_dir = text.to_str(x[0]) | |||||
| >>> d = ds.ImageFolderDataset(data_dir) | |||||
| >>> return d | |||||
| >>> # data is an instance of a Dataset object. | |||||
| >>> data = ds.TextFileDataset(DATA_FILE) | |||||
| >>> data = data.flat_map(flat_map_func) | |||||
| ... image_folder_dataset_dir = text.to_str(x[0]) | |||||
| ... d = ds.ImageFolderDataset(image_folder_dataset_dir) | |||||
| ... return d | |||||
| >>> # dataset is an instance of a Dataset object. | |||||
| >>> dataset = ds.TextFileDataset(text_file_dataset_dir) | |||||
| >>> dataset = dataset.flat_map(flat_map_func) | |||||
| Raises: | Raises: | ||||
| TypeError: If `func` is not a function. | TypeError: If `func` is not a function. | ||||
| @@ -584,13 +586,9 @@ class Dataset: | |||||
| MapDataset, dataset after mapping operation. | MapDataset, dataset after mapping operation. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> import mindspore.dataset.vision.c_transforms as c_transforms | |||||
| >>> | |||||
| >>> # data is an instance of Dataset which has 2 columns, "image" and "label". | |||||
| >>> # dataset is an instance of Dataset which has 2 columns, "image" and "label". | |||||
| >>> # ds_pyfunc is an instance of Dataset which has 3 columns, "col0", "col1", and "col2". | >>> # ds_pyfunc is an instance of Dataset which has 3 columns, "col0", "col1", and "col2". | ||||
| >>> # Each column is a 2D array of integers. | >>> # Each column is a 2D array of integers. | ||||
| >>> | |||||
| >>> # Set the global configuration value for num_parallel_workers to be 2. | >>> # Set the global configuration value for num_parallel_workers to be 2. | ||||
| >>> # Operations which use this configuration value will use 2 worker threads, | >>> # Operations which use this configuration value will use 2 worker threads, | ||||
| >>> # unless otherwise specified in the operator's constructor. | >>> # unless otherwise specified in the operator's constructor. | ||||
| @@ -599,8 +597,8 @@ class Dataset: | |||||
| >>> ds.config.set_num_parallel_workers(2) | >>> ds.config.set_num_parallel_workers(2) | ||||
| >>> | >>> | ||||
| >>> # Define two operations, where each operation accepts 1 input column and outputs 1 column. | >>> # Define two operations, where each operation accepts 1 input column and outputs 1 column. | ||||
| >>> decode_op = c_transforms.Decode(rgb_format=True) | |||||
| >>> random_jitter_op = c_transforms.RandomColorAdjust((0.8, 0.8), (1, 1), (1, 1), (0, 0)) | |||||
| >>> decode_op = c_vision.Decode(rgb_format=True) | |||||
| >>> random_jitter_op = c_vision.RandomColorAdjust((0.8, 0.8), (1, 1), (1, 1), (0, 0)) | |||||
| >>> | >>> | ||||
| >>> # 1) Simple map example | >>> # 1) Simple map example | ||||
| >>> | >>> | ||||
| @@ -610,31 +608,31 @@ class Dataset: | |||||
| >>> # Apply decode_op on column "image". This column will be replaced by the outputted | >>> # Apply decode_op on column "image". This column will be replaced by the outputted | ||||
| >>> # column of decode_op. Since column_order is not provided, both columns "image" | >>> # column of decode_op. Since column_order is not provided, both columns "image" | ||||
| >>> # and "label" will be propagated to the child node in their original order. | >>> # and "label" will be propagated to the child node in their original order. | ||||
| >>> ds_decoded = data.map(operations, input_columns) | |||||
| >>> dataset = dataset.map(operations, input_columns) | |||||
| >>> | >>> | ||||
| >>> # Rename column "image" to "decoded_image". | >>> # Rename column "image" to "decoded_image". | ||||
| >>> output_columns = ["decoded_image"] | >>> output_columns = ["decoded_image"] | ||||
| >>> ds_decoded = data.map(operations, input_columns, output_columns) | |||||
| >>> dataset = dataset.map(operations, input_columns, output_columns) | |||||
| >>> | >>> | ||||
| >>> # Specify the order of the columns. | >>> # Specify the order of the columns. | ||||
| >>> column_order ["label", "image"] | >>> column_order ["label", "image"] | ||||
| >>> ds_decoded = data.map(operations, input_columns, None, column_order) | |||||
| >>> dataset = dataset.map(operations, input_columns, None, column_order) | |||||
| >>> | >>> | ||||
| >>> # Rename column "image" to "decoded_image" and also specify the order of the columns. | >>> # Rename column "image" to "decoded_image" and also specify the order of the columns. | ||||
| >>> column_order ["label", "decoded_image"] | >>> column_order ["label", "decoded_image"] | ||||
| >>> output_columns = ["decoded_image"] | >>> output_columns = ["decoded_image"] | ||||
| >>> ds_decoded = data.map(operations, input_columns, output_columns, column_order) | |||||
| >>> dataset = dataset.map(operations, input_columns, output_columns, column_order) | |||||
| >>> | >>> | ||||
| >>> # Rename column "image" to "decoded_image" and keep only this column. | >>> # Rename column "image" to "decoded_image" and keep only this column. | ||||
| >>> column_order ["decoded_image"] | >>> column_order ["decoded_image"] | ||||
| >>> output_columns = ["decoded_image"] | >>> output_columns = ["decoded_image"] | ||||
| >>> ds_decoded = data.map(operations, input_columns, output_columns, column_order) | |||||
| >>> dataset = dataset.map(operations, input_columns, output_columns, column_order) | |||||
| >>> | >>> | ||||
| >>> # A simple example using pyfunc: Renaming columns and specifying column order | >>> # A simple example using pyfunc: Renaming columns and specifying column order | ||||
| >>> # work in the same way as the previous examples. | >>> # work in the same way as the previous examples. | ||||
| >>> input_columns = ["col0"] | >>> input_columns = ["col0"] | ||||
| >>> operations = [(lambda x: x + 1)] | >>> operations = [(lambda x: x + 1)] | ||||
| >>> ds_mapped = ds_pyfunc.map(operations, input_columns) | |||||
| >>> dataset = dataset.map(operations, input_columns) | |||||
| >>> | >>> | ||||
| >>> # 2) Map example with more than one operation | >>> # 2) Map example with more than one operation | ||||
| >>> | >>> | ||||
| @@ -651,20 +649,20 @@ class Dataset: | |||||
| >>> # the column outputted by random_jitter_op (the very last operation). All other | >>> # the column outputted by random_jitter_op (the very last operation). All other | ||||
| >>> # columns are unchanged. Since column_order is not specified, the order of the | >>> # columns are unchanged. Since column_order is not specified, the order of the | ||||
| >>> # columns will remain the same. | >>> # columns will remain the same. | ||||
| >>> ds_mapped = data.map(operations, input_columns) | |||||
| >>> dataset = dataset.map(operations, input_columns) | |||||
| >>> | >>> | ||||
| >>> # Create a dataset that is identical to ds_mapped, except the column "image" | >>> # Create a dataset that is identical to ds_mapped, except the column "image" | ||||
| >>> # that is outputted by random_jitter_op is renamed to "image_transformed". | >>> # that is outputted by random_jitter_op is renamed to "image_transformed". | ||||
| >>> # Specifying column order works in the same way as examples in 1). | >>> # Specifying column order works in the same way as examples in 1). | ||||
| >>> output_columns = ["image_transformed"] | >>> output_columns = ["image_transformed"] | ||||
| >>> ds_mapped_and_renamed = data.map(operation, input_columns, output_columns) | |||||
| >>> dataset = dataset.map(operations, input_columns, output_columns) | |||||
| >>> | >>> | ||||
| >>> # Multiple operations using pyfunc: Renaming columns and specifying column order | >>> # Multiple operations using pyfunc: Renaming columns and specifying column order | ||||
| >>> # work in the same way as examples in 1). | >>> # work in the same way as examples in 1). | ||||
| >>> input_columns = ["col0"] | >>> input_columns = ["col0"] | ||||
| >>> operations = [(lambda x: x + x), (lambda x: x - 1)] | >>> operations = [(lambda x: x + x), (lambda x: x - 1)] | ||||
| >>> output_columns = ["col0_mapped"] | >>> output_columns = ["col0_mapped"] | ||||
| >>> ds_mapped = ds_pyfunc.map(operations, input_columns, output_columns) | |||||
| >>> dataset = dataset.map(operations, input_columns, output_columns) | |||||
| >>> | >>> | ||||
| >>> # 3) Example where number of input columns is not equal to number of output columns | >>> # 3) Example where number of input columns is not equal to number of output columns | ||||
| >>> | >>> | ||||
| @@ -687,11 +685,11 @@ class Dataset: | |||||
| >>> | >>> | ||||
| >>> # Propagate all columns to the child node in this order: | >>> # Propagate all columns to the child node in this order: | ||||
| >>> column_order = ["col0", "col2", "mod2", "mod3", "mod5", "mod7", "col1"] | >>> column_order = ["col0", "col2", "mod2", "mod3", "mod5", "mod7", "col1"] | ||||
| >>> ds_mapped = ds_pyfunc.map(operations, input_columns, output_columns, column_order) | |||||
| >>> dataset = dataset.map(operations, input_columns, output_columns, column_order) | |||||
| >>> | >>> | ||||
| >>> # Propagate some columns to the child node in this order: | >>> # Propagate some columns to the child node in this order: | ||||
| >>> column_order = ["mod7", "mod3", "col1"] | >>> column_order = ["mod7", "mod3", "col1"] | ||||
| >>> ds_mapped = ds_pyfunc.map(operations, input_columns, output_columns, column_order) | |||||
| >>> dataset = dataset.map(operations, input_columns, output_columns, column_order) | |||||
| """ | """ | ||||
| return MapDataset(self, operations, input_columns, output_columns, column_order, num_parallel_workers, | return MapDataset(self, operations, input_columns, output_columns, column_order, num_parallel_workers, | ||||
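As a complement to the placeholder-path map examples above, a self-contained sketch of a pyfunc map with a renamed output column (the in-memory col0 data is an illustrative assumption):

    import numpy as np
    import mindspore.dataset as ds

    dataset = ds.NumpySlicesDataset({"col0": np.arange(6, dtype=np.int32)}, shuffle=False)
    # Apply a Python function to "col0" and rename its output column.
    dataset = dataset.map(operations=[(lambda x: x + 1)],
                          input_columns=["col0"],
                          output_columns=["col0_mapped"])
    for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        print(row["col0_mapped"])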
| @@ -716,10 +714,9 @@ class Dataset: | |||||
| FilterDataset, dataset filtered. | FilterDataset, dataset filtered. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> # The generator produces data in the range 0 ~ 63 | >>> # The generator produces data in the range 0 ~ 63 | ||||
| >>> # Filter out the data that is greater than or equal to 11 | >>> # Filter out the data that is greater than or equal to 11 | ||||
| >>> dataset_f = dataset.filter(predicate=lambda data: data < 11, input_columns = ["data"]) | |||||
| >>> dataset = dataset.filter(predicate=lambda data: data < 11, input_columns = ["data"]) | |||||
| """ | """ | ||||
| return FilterDataset(self, predicate, input_columns, num_parallel_workers) | return FilterDataset(self, predicate, input_columns, num_parallel_workers) | ||||
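A self-contained version of the filter example above, showing the 0-63 source the comments refer to (the NumpySlicesDataset source is an illustrative assumption):

    import numpy as np
    import mindspore.dataset as ds

    dataset = ds.NumpySlicesDataset(np.arange(64), column_names=["data"], shuffle=False)
    # Rows for which the predicate returns False are dropped.
    dataset = dataset.filter(predicate=lambda data: data < 11, input_columns=["data"])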
| @@ -742,22 +739,20 @@ class Dataset: | |||||
| RepeatDataset, dataset repeated. | RepeatDataset, dataset repeated. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # data is an instance of Dataset object. | |||||
| >>> # dataset is an instance of Dataset object. | |||||
| >>> | >>> | ||||
| >>> # Create a dataset where the dataset is repeated for 50 epochs | >>> # Create a dataset where the dataset is repeated for 50 epochs | ||||
| >>> repeated = data.repeat(50) | |||||
| >>> dataset = dataset.repeat(50) | |||||
| >>> | >>> | ||||
| >>> # Create a dataset where each epoch is shuffled individually | >>> # Create a dataset where each epoch is shuffled individually | ||||
| >>> shuffled_and_repeated = data.shuffle(10) | |||||
| >>> shuffled_and_repeated = shuffled_and_repeated.repeat(50) | |||||
| >>> dataset = dataset.shuffle(10) | |||||
| >>> dataset = dataset.repeat(50) | |||||
| >>> | >>> | ||||
| >>> # Create a dataset where the dataset is first repeated for | >>> # Create a dataset where the dataset is first repeated for | ||||
| >>> # 50 epochs before shuffling. The shuffle operator will treat | >>> # 50 epochs before shuffling. The shuffle operator will treat | ||||
| >>> # the entire 50 epochs as one big dataset. | >>> # the entire 50 epochs as one big dataset. | ||||
| >>> repeat_and_shuffle = data.repeat(50) | |||||
| >>> repeat_and_shuffle = repeat_and_shuffle.shuffle(10) | |||||
| >>> dataset = dataset.repeat(50) | |||||
| >>> dataset = dataset.shuffle(10) | |||||
| """ | """ | ||||
| return RepeatDataset(self, count) | return RepeatDataset(self, count) | ||||
| @@ -773,11 +768,9 @@ class Dataset: | |||||
| SkipDataset, dataset skipped. | SkipDataset, dataset skipped. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # data is an instance of Dataset object. | |||||
| >>> # dataset is an instance of Dataset object. | |||||
| >>> # Create a dataset which skips the first 3 elements of the input dataset | >>> # Create a dataset which skips the first 3 elements of the input dataset | ||||
| >>> data = data.skip(3) | |||||
| >>> dataset = dataset.skip(3) | |||||
| """ | """ | ||||
| return SkipDataset(self, count) | return SkipDataset(self, count) | ||||
| @@ -799,11 +792,9 @@ class Dataset: | |||||
| TakeDataset, dataset taken. | TakeDataset, dataset taken. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # data is an instance of Dataset object. | |||||
| >>> # dataset is an instance of Dataset object. | |||||
| >>> # Create a dataset that takes only the first 50 elements of the input dataset. | >>> # Create a dataset that takes only the first 50 elements of the input dataset. | ||||
| >>> data = data.take(50) | |||||
| >>> dataset = dataset.take(50) | |||||
| """ | """ | ||||
| return TakeDataset(self, count) | return TakeDataset(self, count) | ||||
| @@ -911,14 +902,10 @@ class Dataset: | |||||
| tuple(Dataset), a tuple of datasets that have been split. | tuple(Dataset), a tuple of datasets that have been split. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_files = "/path/to/text_file/*" | |||||
| >>> | |||||
| >>> # TextFileDataset is not a mappable dataset, so this non-optimized split will be called. | >>> # TextFileDataset is not a mappable dataset, so this non-optimized split will be called. | ||||
| >>> # Since many datasets have shuffle on by default, set shuffle to False if split will be called! | >>> # Since many datasets have shuffle on by default, set shuffle to False if split will be called! | ||||
| >>> data = ds.TextFileDataset(dataset_files, shuffle=False) | |||||
| >>> train, test = data.split([0.9, 0.1]) | |||||
| >>> dataset = ds.TextFileDataset(text_file_dataset_dir, shuffle=False) | |||||
| >>> train_dataset, test_dataset = dataset.split([0.9, 0.1]) | |||||
| """ | """ | ||||
| if self.is_shuffled(): | if self.is_shuffled(): | ||||
| logger.warning("Dataset is shuffled before split.") | logger.warning("Dataset is shuffled before split.") | ||||
| @@ -960,11 +947,8 @@ class Dataset: | |||||
| ZipDataset, dataset zipped. | ZipDataset, dataset zipped. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # ds1 and ds2 are instances of Dataset object | |||||
| >>> # Create a dataset which is the combination of ds1 and ds2 | |||||
| >>> data = ds1.zip(ds2) | |||||
| >>> # Create a dataset which is the combination of dataset and dataset_1 | |||||
| >>> dataset = dataset.zip(dataset_1) | |||||
| """ | """ | ||||
| if isinstance(datasets, tuple): | if isinstance(datasets, tuple): | ||||
| datasets = (self, *datasets) | datasets = (self, *datasets) | ||||
| @@ -990,14 +974,10 @@ class Dataset: | |||||
| ConcatDataset, dataset concatenated. | ConcatDataset, dataset concatenated. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # ds1 and ds2 are instances of Dataset object | |||||
| >>> | |||||
| >>> # Create a dataset by concatenating ds1 and ds2 with "+" operator | |||||
| >>> data1 = ds1 + ds2 | |||||
| >>> # Create a dataset by concatenating ds1 and ds2 with concat operation | |||||
| >>> data1 = ds1.concat(ds2) | |||||
| >>> # Create a dataset by concatenating dataset_1 and dataset_2 with "+" operator | |||||
| >>> dataset = dataset_1 + dataset_2 | |||||
| >>> # Create a dataset by concatenating dataset_1 and dataset_2 with concat operation | |||||
| >>> dataset = dataset_1.concat(dataset_2) | |||||
| """ | """ | ||||
| if isinstance(datasets, Dataset): | if isinstance(datasets, Dataset): | ||||
| datasets = [self] + [datasets] | datasets = [self] + [datasets] | ||||
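A minimal runnable sketch of both concatenation forms on in-memory data (the column name col1 and the values are illustrative assumptions); the datasets being concatenated must share column names and types:

    import mindspore.dataset as ds

    dataset_1 = ds.NumpySlicesDataset([1, 2, 3], column_names=["col1"], shuffle=False)
    dataset_2 = ds.NumpySlicesDataset([4, 5, 6], column_names=["col1"], shuffle=False)
    dataset = dataset_1 + dataset_2            # "+" operator
    dataset = dataset_1.concat(dataset_2)      # equivalent concat() call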
| @@ -1020,16 +1000,14 @@ class Dataset: | |||||
| RenameDataset, dataset renamed. | RenameDataset, dataset renamed. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # data is an instance of Dataset object. | |||||
| >>> # dataset is an instance of Dataset object. | |||||
| >>> input_columns = ["input_col1", "input_col2", "input_col3"] | >>> input_columns = ["input_col1", "input_col2", "input_col3"] | ||||
| >>> output_columns = ["output_col1", "output_col2", "output_col3"] | >>> output_columns = ["output_col1", "output_col2", "output_col3"] | ||||
| >>> | >>> | ||||
| >>> # Create a dataset where input_col1 is renamed to output_col1, and | >>> # Create a dataset where input_col1 is renamed to output_col1, and | ||||
| >>> # input_col2 is renamed to output_col2, and input_col3 is renamed | >>> # input_col2 is renamed to output_col2, and input_col3 is renamed | ||||
| >>> # to output_col3. | >>> # to output_col3. | ||||
| >>> data = data.rename(input_columns=input_columns, output_columns=output_columns) | |||||
| >>> dataset = dataset.rename(input_columns=input_columns, output_columns=output_columns) | |||||
| """ | """ | ||||
| return RenameDataset(self, input_columns, output_columns) | return RenameDataset(self, input_columns, output_columns) | ||||
| @@ -1049,14 +1027,12 @@ class Dataset: | |||||
| ProjectDataset, dataset projected. | ProjectDataset, dataset projected. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # data is an instance of Dataset object | |||||
| >>> # dataset is an instance of Dataset object | |||||
| >>> columns_to_project = ["column3", "column1", "column2"] | >>> columns_to_project = ["column3", "column1", "column2"] | ||||
| >>> | >>> | ||||
| >>> # Create a dataset that consists of column3, column1, column2 | >>> # Create a dataset that consists of column3, column1, column2 | ||||
| >>> # in that order, regardless of the original order of columns. | >>> # in that order, regardless of the original order of columns. | ||||
| >>> data = data.project(columns=columns_to_project) | |||||
| >>> dataset = dataset.project(columns=columns_to_project) | |||||
| """ | """ | ||||
| return ProjectDataset(self, columns) | return ProjectDataset(self, columns) | ||||
| @@ -1084,11 +1060,17 @@ class Dataset: | |||||
| Vocab, vocab built from the dataset. | Vocab, vocab built from the dataset. | ||||
| Example: | Example: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # data is an instance of Dataset object | |||||
| >>> data = data.build_vocab(columns=["column3", "column1", "column2"], freq_range=(1, 10), top_k=5, | |||||
| >>> special_tokens=["<pad>", "<unk>"], special_first=True) | |||||
| >>> def gen_corpus(): | |||||
| ... # key: word, value: number of occurrences, reason for using letters is so their order is apparent | |||||
| ... corpus = {"Z": 4, "Y": 4, "X": 4, "W": 3, "U": 3, "V": 2, "T": 1} | |||||
| ... for k, v in corpus.items(): | |||||
| ... yield (np.array([k] * v, dtype='S'),) | |||||
| >>> column_names = ["column1","column2","column3"] | |||||
| >>> dataset = ds.GeneratorDataset(gen_corpus, column_names) | |||||
| >>> dataset = dataset.build_vocab(columns=["column3", "column1", "column2"], | |||||
| ... freq_range=(1, 10), top_k=5, | |||||
| ... special_tokens=["<pad>", "<unk>"], | |||||
| ... special_first=True, vocab='vocab') | |||||
| """ | """ | ||||
| vocab = cde.Vocab() | vocab = cde.Vocab() | ||||
| @@ -1143,13 +1125,19 @@ class Dataset: | |||||
| SentencePieceVocab, vocab built from the dataset. | SentencePieceVocab, vocab built from the dataset. | ||||
| Example: | Example: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # data is an instance of Dataset object | |||||
| >>> data = data.build_sentencepiece_vocab(columns=["column3", "column1", "column2"], vocab_size=5000, | |||||
| >>> character_coverage=0.9995, model_type=SentencePieceModel.Unigram, | |||||
| >>> params={}) | |||||
| >>> from mindspore.dataset.text import SentencePieceModel | |||||
| >>> def gen_corpus(): | |||||
| ... # key: word, value: number of occurrences, reason for using letters is so their order is apparent | |||||
| ... corpus = {"Z": 4, "Y": 4, "X": 4, "W": 3, "U": 3, "V": 2, "T": 1} | |||||
| ... for k, v in corpus.items(): | |||||
| ... yield (np.array([k] * v, dtype='S'),) | |||||
| >>> column_names = ["column1","column2","column3"] | |||||
| >>> dataset = ds.GeneratorDataset(gen_corpus, column_names) | |||||
| >>> dataset = dataset.build_sentencepiece_vocab(columns=["column3", "column1", "column2"], | |||||
| ... vocab_size=5000, | |||||
| ... character_coverage=0.9995, | |||||
| ... model_type=SentencePieceModel.Unigram, | |||||
| ... params={}, vocab='vocab') | |||||
| """ | """ | ||||
| vocab = cde.SentencePieceVocab() | vocab = cde.SentencePieceVocab() | ||||
| @@ -1184,17 +1172,15 @@ class Dataset: | |||||
| Dataset, the dataset after the function has been applied. | Dataset, the dataset after the function has been applied. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # data is an instance of Dataset object | |||||
| >>> # dataset is an instance of Dataset object | |||||
| >>> | >>> | ||||
| >>> # Declare an apply_func function which returns a Dataset object | >>> # Declare an apply_func function which returns a Dataset object | ||||
| >>> def apply_func(ds): | |||||
| >>> ds = ds.batch(2) | |||||
| >>> return ds | |||||
| >>> def apply_func(data): | |||||
| ... data = data.batch(2) | |||||
| ... return data | |||||
| >>> | >>> | ||||
| >>> # Use apply to call apply_func | >>> # Use apply to call apply_func | ||||
| >>> data = data.apply(apply_func) | |||||
| >>> dataset = dataset.apply(apply_func) | |||||
| Raises: | Raises: | ||||
| TypeError: If apply_func is not a function. | TypeError: If apply_func is not a function. | ||||
| @@ -1356,16 +1342,14 @@ class Dataset: | |||||
| TupleIterator, tuple iterator over the dataset. | TupleIterator, tuple iterator over the dataset. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # data is an instance of Dataset object | |||||
| >>> # dataset is an instance of Dataset object | |||||
| >>> | >>> | ||||
| >>> # Create an iterator | >>> # Create an iterator | ||||
| >>> # The columns in the data obtained by the iterator will not be changed. | |||||
| >>> iterator = data.create_tuple_iterator() | |||||
| >>> # The columns in the dataset obtained by the iterator will not be changed. | |||||
| >>> iterator = dataset.create_tuple_iterator() | |||||
| >>> for item in iterator: | >>> for item in iterator: | ||||
| >>> # convert the returned tuple to a list and print | |||||
| >>> print(list(item)) | |||||
| ... # convert the returned tuple to a list and print | |||||
| ... print(list(item)) | |||||
| """ | """ | ||||
| if output_numpy is None: | if output_numpy is None: | ||||
| output_numpy = False | output_numpy = False | ||||
| @@ -1391,16 +1375,14 @@ class Dataset: | |||||
| DictIterator, dictionary iterator over the dataset. | DictIterator, dictionary iterator over the dataset. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # data is an instance of Dataset object | |||||
| >>> # dataset is an instance of Dataset object | |||||
| >>> | >>> | ||||
| >>> # create an iterator | >>> # create an iterator | ||||
| >>> # The columns in the data obtained by the iterator might be changed. | >>> # The columns in the data obtained by the iterator might be changed. | ||||
| >>> iterator = data.create_dict_iterator() | |||||
| >>> iterator = dataset.create_dict_iterator() | |||||
| >>> for item in iterator: | >>> for item in iterator: | ||||
| >>> # print the data in column1 | |||||
| >>> print(item["column1"]) | |||||
| ... # print the data in column1 | |||||
| ... print(item["column1"]) | |||||
| """ | """ | ||||
| if output_numpy is None: | if output_numpy is None: | ||||
| output_numpy = False | output_numpy = False | ||||
| @@ -1422,11 +1404,9 @@ class Dataset: | |||||
| tuple, tuple of the input index information. | tuple, tuple of the input index information. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # data is an instance of Dataset object | |||||
| >>> data = ds.NumpySlicesDataset([1, 2, 3], column_names=["col_1"]) | |||||
| >>> print(data.input_indexs()) | |||||
| >>> # dataset is an instance of Dataset object | |||||
| >>> dataset = ds.NumpySlicesDataset([1, 2, 3], column_names=["col_1"]) | |||||
| >>> print(dataset.input_indexs) | |||||
| """ | """ | ||||
| if self._input_indexs != (): | if self._input_indexs != (): | ||||
| return self._input_indexs | return self._input_indexs | ||||
| @@ -1718,15 +1698,12 @@ class MappableDataset(SourceDataset): | |||||
| new_sampler (Sampler): The sampler to use for the current dataset. | new_sampler (Sampler): The sampler to use for the current dataset. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir = "/path/to/imagefolder_directory" | |||||
| >>> # Note: A SequentialSampler is created by default | >>> # Note: A SequentialSampler is created by default | ||||
| >>> data = ds.ImageFolderDataset(dataset_dir) | |||||
| >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir) | |||||
| >>> | >>> | ||||
| >>> # Use a DistributedSampler instead of the SequentialSampler | >>> # Use a DistributedSampler instead of the SequentialSampler | ||||
| >>> new_sampler = ds.DistributedSampler(10, 2) | >>> new_sampler = ds.DistributedSampler(10, 2) | ||||
| >>> data.use_sampler(new_sampler) | |||||
| >>> dataset.use_sampler(new_sampler) | |||||
| """ | """ | ||||
| if new_sampler is None: | if new_sampler is None: | ||||
| raise TypeError("Input sampler can not be None.") | raise TypeError("Input sampler can not be None.") | ||||
| @@ -1804,21 +1781,17 @@ class MappableDataset(SourceDataset): | |||||
| tuple(Dataset), a tuple of datasets that have been split. | tuple(Dataset), a tuple of datasets that have been split. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir = "/path/to/imagefolder_directory" | |||||
| >>> | |||||
| >>> # Since many datasets have shuffle on by default, set shuffle to False if split will be called! | >>> # Since many datasets have shuffle on by default, set shuffle to False if split will be called! | ||||
| >>> data = ds.ImageFolderDataset(dataset_dir, shuffle=False) | |||||
| >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir, shuffle=False) | |||||
| >>> | >>> | ||||
| >>> # Set the seed, and tell split to use this seed when randomizing. | >>> # Set the seed, and tell split to use this seed when randomizing. | ||||
| >>> # This is needed because sharding will be done later | >>> # This is needed because sharding will be done later | ||||
| >>> ds.config.set_seed(58) | >>> ds.config.set_seed(58) | ||||
| >>> train, test = data.split([0.9, 0.1]) | |||||
| >>> train_dataset, test_dataset = dataset.split([0.9, 0.1]) | |||||
| >>> | >>> | ||||
| >>> # To shard the train dataset, use a DistributedSampler | >>> # To shard the train dataset, use a DistributedSampler | ||||
| >>> train_sampler = ds.DistributedSampler(10, 2) | >>> train_sampler = ds.DistributedSampler(10, 2) | ||||
| >>> train.use_sampler(train_sampler) | |||||
| >>> train_dataset.use_sampler(train_sampler) | |||||
| """ | """ | ||||
| if self.is_shuffled(): | if self.is_shuffled(): | ||||
| logger.warning("Dataset is shuffled before split.") | logger.warning("Dataset is shuffled before split.") | ||||
| @@ -3062,20 +3035,17 @@ class ImageFolderDataset(MappableDataset): | |||||
| ValueError: If shard_id is invalid (< 0 or >= num_shards). | ValueError: If shard_id is invalid (< 0 or >= num_shards). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # Set path to the imagefolder directory. | |||||
| >>> # This directory needs to contain sub-directories which contain the images | |||||
| >>> dataset_dir = "/path/to/imagefolder_directory" | |||||
| >>> | |||||
| >>> # 1) Read all samples (image files) in dataset_dir with 8 threads | |||||
| >>> imagefolder_dataset = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8) | |||||
| >>> # 1) Read all samples (image files) in image_folder_dataset_dir with 8 threads | |||||
| >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir, | |||||
| ... num_parallel_workers=8) | |||||
| >>> | >>> | ||||
| >>> # 2) Read all samples (image files) from folder cat and folder dog with label 0 and 1 | >>> # 2) Read all samples (image files) from folder cat and folder dog with label 0 and 1 | ||||
| >>> imagefolder_dataset = ds.ImageFolderDataset(dataset_dir, class_indexing={"cat":0, "dog":1}) | |||||
| >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir, | |||||
| ... class_indexing={"cat":0, "dog":1}) | |||||
| >>> | >>> | ||||
| >>> # 3) Read all samples (image files) in dataset_dir with extensions .JPEG and .png (case sensitive) | |||||
| >>> imagefolder_dataset = ds.ImageFolderDataset(dataset_dir, extensions=[".JPEG", ".png"]) | |||||
| >>> # 3) Read all samples (image files) in image_folder_dataset_dir with extensions .JPEG and .png (case sensitive) | |||||
| >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir, | |||||
| ... extensions=[".JPEG", ".png"]) | |||||
| """ | """ | ||||
| @check_imagefolderdataset | @check_imagefolderdataset | ||||
| @@ -3195,9 +3165,8 @@ class MnistDataset(MappableDataset): | |||||
| (default=None, expected order behavior shown in the table). | (default=None, expected order behavior shown in the table). | ||||
| sampler (Sampler, optional): Object used to choose samples from the | sampler (Sampler, optional): Object used to choose samples from the | ||||
| dataset (default=None, expected order behavior shown in the table). | dataset (default=None, expected order behavior shown in the table). | ||||
| num_shards (int, optional): Number of shards that the dataset will be divided | |||||
| into (default=None). When this argument is specified, 'num_samples' reflects | |||||
| the max sample number of per shard. | |||||
| num_shards (int, optional): Number of shards that the dataset will be divided into (default=None). | |||||
| When this argument is specified, 'num_samples' reflects the maximum number of samples per shard. | |||||
| shard_id (int, optional): The shard ID within num_shards (default=None). This | shard_id (int, optional): The shard ID within num_shards (default=None). This | ||||
| argument can only be specified when num_shards is also specified. | argument can only be specified when num_shards is also specified. | ||||
| cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. | cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. | ||||
| @@ -3211,11 +3180,8 @@ class MnistDataset(MappableDataset): | |||||
| ValueError: If shard_id is invalid (< 0 or >= num_shards). | ValueError: If shard_id is invalid (< 0 or >= num_shards). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir = "/path/to/mnist_folder" | |||||
| >>> # Read 3 samples from MNIST dataset | >>> # Read 3 samples from MNIST dataset | ||||
| >>> mnist_dataset = ds.MnistDataset(dataset_dir=dataset_dir, num_samples=3) | |||||
| >>> dataset = ds.MnistDataset(dataset_dir=mnist_dataset_dir, num_samples=3) | |||||
| >>> # Note: In mnist_dataset dataset, each dictionary has keys "image" and "label" | >>> # Note: In mnist_dataset dataset, each dictionary has keys "image" and "label" | ||||
| """ | """ | ||||
| @@ -3718,33 +3684,31 @@ class GeneratorDataset(MappableDataset): | |||||
| option could be beneficial if the Python operation is computationally heavy (default=True). | option could be beneficial if the Python operation is computationally heavy (default=True). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # 1) Multidimensional generator function as callable input | >>> # 1) Multidimensional generator function as callable input | ||||
| >>> def GeneratorMD(): | >>> def GeneratorMD(): | ||||
| >>> for i in range(64): | |||||
| >>> yield (np.array([[i, i + 1], [i + 2, i + 3]]),) | |||||
| ... for i in range(64): | |||||
| ... yield (np.array([[i, i + 1], [i + 2, i + 3]]),) | |||||
| >>> # Create multi_dimension_generator_dataset with GeneratorMD and column name "multi_dimensional_data" | >>> # Create multi_dimension_generator_dataset with GeneratorMD and column name "multi_dimensional_data" | ||||
| >>> multi_dimension_generator_dataset = ds.GeneratorDataset(GeneratorMD, ["multi_dimensional_data"]) | >>> multi_dimension_generator_dataset = ds.GeneratorDataset(GeneratorMD, ["multi_dimensional_data"]) | ||||
| >>> | >>> | ||||
| >>> # 2) Multi-column generator function as callable input | >>> # 2) Multi-column generator function as callable input | ||||
| >>> def GeneratorMC(maxid = 64): | >>> def GeneratorMC(maxid = 64): | ||||
| >>> for i in range(maxid): | |||||
| >>> yield (np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]])) | |||||
| ... for i in range(maxid): | |||||
| ... yield (np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]])) | |||||
| >>> # Create multi_column_generator_dataset with GeneratorMC and column names "col1" and "col2" | >>> # Create multi_column_generator_dataset with GeneratorMC and column names "col1" and "col2" | ||||
| >>> multi_column_generator_dataset = ds.GeneratorDataset(GeneratorMC, ["col1", "col2"]) | >>> multi_column_generator_dataset = ds.GeneratorDataset(GeneratorMC, ["col1", "col2"]) | ||||
| >>> | >>> | ||||
| >>> # 3) Iterable dataset as iterable input | >>> # 3) Iterable dataset as iterable input | ||||
| >>> class MyIterable(): | >>> class MyIterable(): | ||||
| >>> def __iter__(self): | |||||
| >>> return # User implementation | |||||
| ... def __iter__(self): | |||||
| ... return # User implementation | |||||
| >>> # Create iterable_generator_dataset with MyIterable object | >>> # Create iterable_generator_dataset with MyIterable object | ||||
| >>> iterable_generator_dataset = ds.GeneratorDataset(MyIterable(), ["col1"]) | >>> iterable_generator_dataset = ds.GeneratorDataset(MyIterable(), ["col1"]) | ||||
| >>> | >>> | ||||
| >>> # 4) Random accessible dataset as random accessible input | >>> # 4) Random accessible dataset as random accessible input | ||||
| >>> class MyRA(): | >>> class MyRA(): | ||||
| >>> def __getitem__(self, index): | |||||
| >>> return # User implementation | |||||
| ... def __getitem__(self, index): | |||||
| ... return # User implementation | |||||
| >>> # Create ra_generator_dataset with MyRA object | >>> # Create ra_generator_dataset with MyRA object | ||||
| >>> ra_generator_dataset = ds.GeneratorDataset(MyRA(), ["col1"]) | >>> ra_generator_dataset = ds.GeneratorDataset(MyRA(), ["col1"]) | ||||
| >>> # List/Dict/Tuple is also random accessible | >>> # List/Dict/Tuple is also random accessible | ||||
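Because MyIterable and MyRA above are stubs, a small complete random-accessible source may help; __getitem__ plus __len__ is all GeneratorDataset needs in this case (the class name and data are illustrative assumptions):

    import numpy as np
    import mindspore.dataset as ds

    class MyAccessible:
        """A random-accessible source: indexable and sized."""
        def __init__(self):
            self._data = np.arange(5, dtype=np.int64)

        def __getitem__(self, index):
            return (np.array(self._data[index]),)

        def __len__(self):
            return len(self._data)

    dataset = ds.GeneratorDataset(MyAccessible(), column_names=["col1"], shuffle=False)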
| @@ -3882,22 +3846,21 @@ class TFRecordDataset(SourceDataset): | |||||
| (default=None which means no cache is used). | (default=None which means no cache is used). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> import mindspore.common.dtype as mstype | >>> import mindspore.common.dtype as mstype | ||||
| >>> | >>> | ||||
| >>> dataset_files = ["/path/to/1", "/path/to/2"] # contains 1 or multiple tf data files | |||||
| >>> tfrecord_dataset_dir = ["/path/to/tfrecord_dataset_file"] # contains 1 or multiple tf data files | |||||
| >>> | >>> | ||||
| >>> # 1) Get all rows from dataset_files with no explicit schema | |||||
| >>> # 1) Get all rows from tfrecord_dataset_dir with no explicit schema | |||||
| >>> # The meta-data in the first row will be used as a schema. | >>> # The meta-data in the first row will be used as a schema. | ||||
| >>> tfdataset = ds.TFRecordDataset(dataset_files=dataset_files) | |||||
| >>> dataset = ds.TFRecordDataset(dataset_files=tfrecord_dataset_dir) | |||||
| >>> | >>> | ||||
| >>> # 2) Get all rows from dataset_files with user-defined schema | |||||
| >>> schema = ds.Schema() | |||||
| >>> # 2) Get all rows from tfrecord_dataset_dir with user-defined schema | |||||
| >>> schema = ds.Schema("/path/to/tfrecord_schema_file") | |||||
| >>> schema.add_column('col_1d', de_type=mstype.int64, shape=[2]) | >>> schema.add_column('col_1d', de_type=mstype.int64, shape=[2]) | ||||
| >>> tfdataset = ds.TFRecordDataset(dataset_files=dataset_files, schema=schema) | |||||
| >>> dataset = ds.TFRecordDataset(dataset_files=tfrecord_dataset_dir, schema=schema) | |||||
| >>> | >>> | ||||
| >>> # 3) Get all rows from dataset_files with schema file "./schema.json" | |||||
| >>> tfdataset = ds.TFRecordDataset(dataset_files=dataset_files, schema="./schema.json") | |||||
| >>> # 3) Get all rows from tfrecord_dataset_dir with schema file "./schema.json" | |||||
| >>> dataset = ds.TFRecordDataset(dataset_files=tfrecord_dataset_dir, schema="./schema.json") | |||||
| """ | """ | ||||
| def parse(self, children=None): | def parse(self, children=None): | ||||
| @@ -4075,16 +4038,12 @@ class ManifestDataset(MappableDataset): | |||||
| ValueError: If shard_id is invalid (< 0 or >= num_shards). | ValueError: If shard_id is invalid (< 0 or >= num_shards). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_file = "/path/to/manifest_file.manifest" | |||||
| >>> | |||||
| >>> # 1) Read all samples specified in manifest_file dataset with 8 threads for training | |||||
| >>> manifest_dataset = ds.ManifestDataset(dataset_file, usage="train", num_parallel_workers=8) | |||||
| >>> # 1) Read all samples specified in manifest_dataset_dir dataset with 8 threads for training | |||||
| >>> dataset = ds.ManifestDataset(manifest_dataset_dir, usage="train", num_parallel_workers=8) | |||||
| >>> | >>> | ||||
| >>> # 2) Read samples (specified in manifest_file.manifest) for shard 0 | >>> # 2) Read samples (specified in manifest_file.manifest) for shard 0 | ||||
| >>> # in a 2-way distributed training setup | >>> # in a 2-way distributed training setup | ||||
| >>> manifest_dataset = ds.ManifestDataset(dataset_file, num_shards=2, shard_id=0) | |||||
| >>> dataset = ds.ManifestDataset(manifest_dataset_dir, num_shards=2, shard_id=0) | |||||
| """ | """ | ||||
| @@ -4239,18 +4198,14 @@ class Cifar10Dataset(MappableDataset): | |||||
| ValueError: If shard_id is invalid (< 0 or >= num_shards). | ValueError: If shard_id is invalid (< 0 or >= num_shards). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir = "/path/to/cifar10_dataset_directory" | |||||
| >>> | |||||
| >>> # 1) Get all samples from CIFAR10 dataset in sequence | >>> # 1) Get all samples from CIFAR10 dataset in sequence | ||||
| >>> dataset = ds.Cifar10Dataset(dataset_dir=dataset_dir, shuffle=False) | |||||
| >>> dataset = ds.Cifar10Dataset(dataset_dir=cifar10_dataset_dir, shuffle=False) | |||||
| >>> | >>> | ||||
| >>> # 2) Randomly select 350 samples from CIFAR10 dataset | >>> # 2) Randomly select 350 samples from CIFAR10 dataset | ||||
| >>> dataset = ds.Cifar10Dataset(dataset_dir=dataset_dir, num_samples=350, shuffle=True) | |||||
| >>> dataset = ds.Cifar10Dataset(dataset_dir=cifar10_dataset_dir, num_samples=350, shuffle=True) | |||||
| >>> | >>> | ||||
| >>> # 3) Get samples from CIFAR10 dataset for shard 0 in a 2-way distributed training | >>> # 3) Get samples from CIFAR10 dataset for shard 0 in a 2-way distributed training | ||||
| >>> dataset = ds.Cifar10Dataset(dataset_dir=dataset_dir, num_shards=2, shard_id=0) | |||||
| >>> dataset = ds.Cifar10Dataset(dataset_dir=cifar10_dataset_dir, num_shards=2, shard_id=0) | |||||
| >>> | >>> | ||||
| >>> # In CIFAR10 dataset, each dictionary has keys "image" and "label" | >>> # In CIFAR10 dataset, each dictionary has keys "image" and "label" | ||||
| """ | """ | ||||
| @@ -4381,15 +4336,11 @@ class Cifar100Dataset(MappableDataset): | |||||
| ValueError: If shard_id is invalid (< 0 or >= num_shards). | ValueError: If shard_id is invalid (< 0 or >= num_shards). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir = "/path/to/cifar100_dataset_directory" | |||||
| >>> | |||||
| >>> # 1) Get all samples from CIFAR100 dataset in sequence | >>> # 1) Get all samples from CIFAR100 dataset in sequence | ||||
| >>> cifar100_dataset = ds.Cifar100Dataset(dataset_dir=dataset_dir, shuffle=False) | |||||
| >>> dataset = ds.Cifar100Dataset(dataset_dir=cifar100_dataset_dir, shuffle=False) | |||||
| >>> | >>> | ||||
| >>> # 2) Randomly select 350 samples from CIFAR100 dataset | >>> # 2) Randomly select 350 samples from CIFAR100 dataset | ||||
| >>> cifar100_dataset = ds.Cifar100Dataset(dataset_dir=dataset_dir, num_samples=350, shuffle=True) | |||||
| >>> dataset = ds.Cifar100Dataset(dataset_dir=cifar100_dataset_dir, num_samples=350, shuffle=True) | |||||
| >>> | >>> | ||||
| >>> # In CIFAR100 dataset, each dictionary has 3 keys: "image", "fine_label" and "coarse_label" | >>> # In CIFAR100 dataset, each dictionary has 3 keys: "image", "fine_label" and "coarse_label" | ||||
| """ | """ | ||||
| @@ -4544,12 +4495,11 @@ class Schema: | |||||
| RuntimeError: If schema file failed to load. | RuntimeError: If schema file failed to load. | ||||
| Example: | Example: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> import mindspore.common.dtype as mstype | >>> import mindspore.common.dtype as mstype | ||||
| >>> | >>> | ||||
| >>> # Create schema; specify column name, mindspore.dtype and shape of the column | >>> # Create schema; specify column name, mindspore.dtype and shape of the column | ||||
| >>> schema = ds.Schema() | >>> schema = ds.Schema() | ||||
| >>> schema.add_column('col1', de_type=mindspore.int64, shape=[2]) | |||||
| >>> schema.add_column('col1', de_type=mstype.int64, shape=[2]) | |||||
| """ | """ | ||||
| @check_schema | @check_schema | ||||
| @@ -4733,21 +4683,17 @@ class VOCDataset(MappableDataset): | |||||
| ValueError: If shard_id is invalid (< 0 or >= num_shards). | ValueError: If shard_id is invalid (< 0 or >= num_shards). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir = "/path/to/voc_dataset_directory" | |||||
| >>> | |||||
| >>> # 1) Read VOC data for segmentation training | >>> # 1) Read VOC data for segmentation training | ||||
| >>> voc_dataset = ds.VOCDataset(dataset_dir, task="Segmentation", usage="train") | |||||
| >>> dataset = ds.VOCDataset(voc_dataset_dir, task="Segmentation", usage="train") | |||||
| >>> | >>> | ||||
| >>> # 2) Read VOC data for detection training | >>> # 2) Read VOC data for detection training | ||||
| >>> voc_dataset = ds.VOCDataset(dataset_dir, task="Detection", usage="train") | |||||
| >>> dataset = ds.VOCDataset(voc_dataset_dir, task="Detection", usage="train") | |||||
| >>> | >>> | ||||
| >>> # 3) Read all VOC dataset samples in dataset_dir with 8 threads in random order | |||||
| >>> voc_dataset = ds.VOCDataset(dataset_dir, task="Detection", usage="train", num_parallel_workers=8) | |||||
| >>> # 3) Read all VOC dataset samples in voc_dataset_dir with 8 threads in random order | |||||
| >>> dataset = ds.VOCDataset(voc_dataset_dir, task="Detection", usage="train", num_parallel_workers=8) | |||||
| >>> | >>> | ||||
| >>> # 4) Read then decode all VOC dataset samples in dataset_dir in sequence | |||||
| >>> voc_dataset = ds.VOCDataset(dataset_dir, task="Detection", usage="train", decode=True, shuffle=False) | |||||
| >>> # 4) Read then decode all VOC dataset samples in voc_dataset_dir in sequence | |||||
| >>> dataset = ds.VOCDataset(voc_dataset_dir, task="Detection", usage="train", decode=True, shuffle=False) | |||||
| >>> | >>> | ||||
| >>> # In VOC dataset, if task='Segmentation', each dictionary has keys "image" and "target" | >>> # In VOC dataset, if task='Segmentation', each dictionary has keys "image" and "target" | ||||
| >>> # In VOC dataset, if task='Detection', each dictionary has keys "image" and "annotation" | >>> # In VOC dataset, if task='Detection', each dictionary has keys "image" and "annotation" | ||||
| @@ -4928,22 +4874,17 @@ class CocoDataset(MappableDataset): | |||||
| ValueError: If shard_id is invalid (< 0 or >= num_shards). | ValueError: If shard_id is invalid (< 0 or >= num_shards). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir = "/path/to/coco_dataset_directory/image_folder" | |||||
| >>> annotation_file = "/path/to/coco_dataset_directory/annotation_folder/annotation.json" | |||||
| >>> | |||||
| >>> # 1) Read COCO data for Detection task | >>> # 1) Read COCO data for Detection task | ||||
| >>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Detection') | |||||
| >>> dataset = ds.CocoDataset(coco_dataset_dir, annotation_file=coco_annotation_file, task='Detection') | |||||
| >>> | >>> | ||||
| >>> # 2) Read COCO data for Stuff task | >>> # 2) Read COCO data for Stuff task | ||||
| >>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Stuff') | |||||
| >>> dataset = ds.CocoDataset(coco_dataset_dir, annotation_file=coco_annotation_file, task='Stuff') | |||||
| >>> | >>> | ||||
| >>> # 3) Read COCO data for Panoptic task | >>> # 3) Read COCO data for Panoptic task | ||||
| >>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Panoptic') | |||||
| >>> dataset = ds.CocoDataset(coco_dataset_dir, annotation_file=coco_annotation_file, task='Panoptic') | |||||
| >>> | >>> | ||||
| >>> # 4) Read COCO data for Keypoint task | >>> # 4) Read COCO data for Keypoint task | ||||
| >>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Keypoint') | |||||
| >>> dataset = ds.CocoDataset(coco_dataset_dir, annotation_file=coco_annotation_file, task='Keypoint') | |||||
| >>> | >>> | ||||
| >>> # In COCO dataset, each dictionary has keys "image" and "annotation" | >>> # In COCO dataset, each dictionary has keys "image" and "annotation" | ||||
| """ | """ | ||||
| @@ -5071,10 +5012,7 @@ class CelebADataset(MappableDataset): | |||||
| (default=None which means no cache is used). | (default=None which means no cache is used). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir = "/path/to/celeba_directory" | |||||
| >>> dataset = ds.CelebADataset(dataset_dir=dataset_dir, usage='train') | |||||
| >>> dataset = ds.CelebADataset(dataset_dir=celeba_dataset_dir, usage='train') | |||||
| """ | """ | ||||
| def parse(self, children=None): | def parse(self, children=None): | ||||
| @@ -5185,10 +5123,8 @@ class CLUEDataset(SourceDataset): | |||||
| (default=None which means no cache is used). | (default=None which means no cache is used). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_files = ["/path/to/1", "/path/to/2"] # contains 1 or multiple text files | |||||
| >>> dataset = ds.CLUEDataset(dataset_files=dataset_files, task='AFQMC', usage='train') | |||||
| >>> clue_dataset_dir = ["/path/to/clue_dataset_file"] # contains 1 or multiple text files | |||||
| >>> dataset = ds.CLUEDataset(dataset_files=clue_dataset_dir, task='AFQMC', usage='train') | |||||
| """ | """ | ||||
| def parse(self, children=None): | def parse(self, children=None): | ||||
| @@ -5421,10 +5357,8 @@ class CSVDataset(SourceDataset): | |||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_files = ["/path/to/1", "/path/to/2"] # contains 1 or multiple text files | |||||
| >>> dataset = ds.CSVDataset(dataset_files=dataset_files, column_names=['col1', 'col2', 'col3', 'col4']) | |||||
| >>> csv_dataset_dir = ["/path/to/csv_dataset_file"] | |||||
| >>> dataset = ds.CSVDataset(dataset_files=csv_dataset_dir, column_names=['col1', 'col2', 'col3', 'col4']) | |||||
| """ | """ | ||||
| def parse(self, children=None): | def parse(self, children=None): | ||||
| @@ -5528,10 +5462,8 @@ class TextFileDataset(SourceDataset): | |||||
| (default=None which means no cache is used). | (default=None which means no cache is used). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_files = ["/path/to/1", "/path/to/2"] # contains 1 or multiple text files | |||||
| >>> dataset = ds.TextFileDataset(dataset_files=dataset_files) | |||||
| >>> # contains 1 or multiple text files | |||||
| >>> dataset = ds.TextFileDataset(dataset_files=text_file_dataset_dir) | |||||
| """ | """ | ||||
| def parse(self, children=None): | def parse(self, children=None): | ||||
| @@ -5725,24 +5657,22 @@ class NumpySlicesDataset(GeneratorDataset): | |||||
| when num_shards is also specified. Random accessible input is required. | when num_shards is also specified. Random accessible input is required. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> # 1) Input data can be a list | >>> # 1) Input data can be a list | ||||
| >>> data = [1, 2, 3] | >>> data = [1, 2, 3] | ||||
| >>> dataset1 = ds.NumpySlicesDataset(data, column_names=["column_1"]) | |||||
| >>> dataset = ds.NumpySlicesDataset(data, column_names=["column_1"]) | |||||
| >>> | >>> | ||||
| >>> # 2) Input data can be a dictionary, and column_names will be its keys | >>> # 2) Input data can be a dictionary, and column_names will be its keys | ||||
| >>> data = {"a": [1, 2], "b": [3, 4]} | >>> data = {"a": [1, 2], "b": [3, 4]} | ||||
| >>> dataset2 = ds.NumpySlicesDataset(data) | |||||
| >>> dataset = ds.NumpySlicesDataset(data) | |||||
| >>> | >>> | ||||
| >>> # 3) Input data can be a tuple of lists (or NumPy arrays), each tuple element refers to data in each column | >>> # 3) Input data can be a tuple of lists (or NumPy arrays), each tuple element refers to data in each column | ||||
| >>> data = ([1, 2], [3, 4], [5, 6]) | >>> data = ([1, 2], [3, 4], [5, 6]) | ||||
| >>> dataset3 = ds.NumpySlicesDataset(data, column_names=["column_1", "column_2", "column_3"]) | |||||
| >>> dataset = ds.NumpySlicesDataset(data, column_names=["column_1", "column_2", "column_3"]) | |||||
| >>> | >>> | ||||
| >>> # 4) Load data from CSV file | >>> # 4) Load data from CSV file | ||||
| >>> import pandas as pd | >>> import pandas as pd | ||||
| >>> df = pd.read_csv("file.csv") | |||||
| >>> dataset4 = ds.NumpySlicesDataset(dict(df), shuffle=False) | |||||
| >>> df = pd.read_csv(csv_dataset_dir) | |||||
| >>> dataset = ds.NumpySlicesDataset(dict(df), shuffle=False) | |||||
| """ | """ | ||||
| @check_numpyslicesdataset | @check_numpyslicesdataset | ||||
| @@ -5787,9 +5717,9 @@ class PaddedDataset(GeneratorDataset): | |||||
| ValueError: If the padded_samples is empty. | ValueError: If the padded_samples is empty. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> data1 = [{'image': np.zeros(1, np.uint8)}, {'image': np.zeros(2, np.uint8)}] | |||||
| >>> ds1 = ds.PaddedDataset(data1) | |||||
| >>> import numpy as np | |||||
| >>> data = [{'image': np.zeros(1, np.uint8)}, {'image': np.zeros(2, np.uint8)}] | |||||
| >>> dataset = ds.PaddedDataset(data) | |||||
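| >>> # A PaddedDataset is typically concatenated onto an existing dataset (for example with | |||||
| >>> # the + operator) so that every shard ends up with the same number of rows; the | |||||
| >>> # concatenation target is omitted here and could be any other mappable dataset | |||||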
| """ | """ | ||||
| @check_paddeddataset | @check_paddeddataset | ||||
| @@ -72,11 +72,9 @@ class GraphData: | |||||
| the server automatically exits (default=True). | the server automatically exits (default=True). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> data_graph = ds.GraphData('dataset_file', 2) | |||||
| >>> nodes = data_graph.get_all_nodes(0) | |||||
| >>> features = data_graph.get_node_feature(nodes, [1]) | |||||
| >>> graph_dataset = ds.GraphData(graph_dataset_dir, 2) | |||||
| >>> nodes = graph_dataset.get_all_nodes(0) | |||||
| >>> features = graph_dataset.get_node_feature(nodes, [1]) | |||||
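| >>> # here the second argument (2) is num_parallel_workers, the number of workers used | |||||
| >>> # to process the graph in parallel | |||||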
| """ | """ | ||||
| @check_gnn_graphdata | @check_gnn_graphdata | ||||
| @@ -116,10 +114,7 @@ class GraphData: | |||||
| numpy.ndarray, array of nodes. | numpy.ndarray, array of nodes. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> data_graph = ds.GraphData('dataset_file', 2) | |||||
| >>> nodes = data_graph.get_all_nodes(0) | |||||
| >>> nodes = graph_dataset.get_all_nodes(0) | |||||
| Raises: | Raises: | ||||
| TypeError: If `node_type` is not integer. | TypeError: If `node_type` is not integer. | ||||
| @@ -140,10 +135,7 @@ class GraphData: | |||||
| numpy.ndarray, array of edges. | numpy.ndarray, array of edges. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> data_graph = ds.GraphData('dataset_file', 2) | |||||
| >>> nodes = data_graph.get_all_edges(0) | |||||
| >>> edges = graph_dataset.get_all_edges(0) | |||||
| Raises: | Raises: | ||||
| TypeError: If `edge_type` is not integer. | TypeError: If `edge_type` is not integer. | ||||
| @@ -183,11 +175,8 @@ class GraphData: | |||||
| numpy.ndarray, array of neighbors. | numpy.ndarray, array of neighbors. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> data_graph = ds.GraphData('dataset_file', 2) | |||||
| >>> nodes = data_graph.get_all_nodes(0) | |||||
| >>> neighbors = data_graph.get_all_neighbors(nodes, 0) | |||||
| >>> nodes = graph_dataset.get_all_nodes(0) | |||||
| >>> neighbors = graph_dataset.get_all_neighbors(nodes, 0) | |||||
| Raises: | Raises: | ||||
| TypeError: If `node_list` is not list or ndarray. | TypeError: If `node_list` is not list or ndarray. | ||||
| @@ -222,11 +211,8 @@ class GraphData: | |||||
| numpy.ndarray, array of neighbors. | numpy.ndarray, array of neighbors. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> data_graph = ds.GraphData('dataset_file', 2) | |||||
| >>> nodes = data_graph.get_all_nodes(0) | |||||
| >>> neighbors = data_graph.get_sampled_neighbors(nodes, [2, 2], [0, 0]) | |||||
| >>> nodes = graph_dataset.get_all_nodes(0) | |||||
| >>> neighbors = graph_dataset.get_sampled_neighbors(nodes, [2, 2], [0, 0]) | |||||
| Raises: | Raises: | ||||
| TypeError: If `node_list` is not list or ndarray. | TypeError: If `node_list` is not list or ndarray. | ||||
| @@ -254,11 +240,8 @@ class GraphData: | |||||
| numpy.ndarray, array of neighbors. | numpy.ndarray, array of neighbors. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> data_graph = ds.GraphData('dataset_file', 2) | |||||
| >>> nodes = data_graph.get_all_nodes(0) | |||||
| >>> neg_neighbors = data_graph.get_neg_sampled_neighbors(nodes, 5, 0) | |||||
| >>> nodes = graph_dataset.get_all_nodes(0) | |||||
| >>> neg_neighbors = graph_dataset.get_neg_sampled_neighbors(nodes, 5, 0) | |||||
| Raises: | Raises: | ||||
| TypeError: If `node_list` is not list or ndarray. | TypeError: If `node_list` is not list or ndarray. | ||||
| @@ -283,11 +266,8 @@ class GraphData: | |||||
| numpy.ndarray, array of features. | numpy.ndarray, array of features. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> data_graph = ds.GraphData('dataset_file', 2) | |||||
| >>> nodes = data_graph.get_all_nodes(0) | |||||
| >>> features = data_graph.get_node_feature(nodes, [1]) | |||||
| >>> nodes = graph_dataset.get_all_nodes(0) | |||||
| >>> features = graph_dataset.get_node_feature(nodes, [1]) | |||||
| Raises: | Raises: | ||||
| TypeError: If `node_list` is not list or ndarray. | TypeError: If `node_list` is not list or ndarray. | ||||
| @@ -315,11 +295,8 @@ class GraphData: | |||||
| numpy.ndarray, array of features. | numpy.ndarray, array of features. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> data_graph = ds.GraphData('dataset_file', 2) | |||||
| >>> edges = data_graph.get_all_edges(0) | |||||
| >>> features = data_graph.get_edge_feature(edges, [1]) | |||||
| >>> edges = graph_dataset.get_all_edges(0) | |||||
| >>> features = graph_dataset.get_edge_feature(edges, [1]) | |||||
| Raises: | Raises: | ||||
| TypeError: If `edge_list` is not list or ndarray. | TypeError: If `edge_list` is not list or ndarray. | ||||
| @@ -370,10 +347,7 @@ class GraphData: | |||||
| numpy.ndarray, array of nodes. | numpy.ndarray, array of nodes. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> data_graph = ds.GraphData('dataset_file', 2) | |||||
| >>> nodes = data_graph.random_walk([1,2], [1,2,1,2,1]) | |||||
| >>> nodes = graph_dataset.random_walk([1,2], [1,2,1,2,1]) | |||||
| Raises: | Raises: | ||||
| TypeError: If `target_nodes` is not list or ndarray. | TypeError: If `target_nodes` is not list or ndarray. | ||||
| @@ -321,13 +321,11 @@ class DistributedSampler(BuiltinSampler): | |||||
| should be no more than num_shards. | should be no more than num_shards. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir = "path/to/imagefolder_directory" | |||||
| >>> | |||||
| >>> # creates a distributed sampler with 10 shards in total. This shard is shard 5. | >>> # creates a distributed sampler with 10 shards in total. This shard is shard 5. | ||||
| >>> sampler = ds.DistributedSampler(10, 5) | >>> sampler = ds.DistributedSampler(10, 5) | ||||
| >>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler) | |||||
| >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir, | |||||
| ... num_parallel_workers=8, | |||||
| ... sampler=sampler) | |||||
| Raises: | Raises: | ||||
| ValueError: If num_shards is not positive. | ValueError: If num_shards is not positive. | ||||
| @@ -403,13 +401,11 @@ class PKSampler(BuiltinSampler): | |||||
| num_samples (int, optional): The number of samples to draw (default=None, all elements). | num_samples (int, optional): The number of samples to draw (default=None, all elements). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir = "path/to/imagefolder_directory" | |||||
| >>> | |||||
| >>> # creates a PKSampler that will get 3 samples from every class. | >>> # creates a PKSampler that will get 3 samples from every class. | ||||
| >>> sampler = ds.PKSampler(3) | >>> sampler = ds.PKSampler(3) | ||||
| >>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler) | |||||
| >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir, | |||||
| ... num_parallel_workers=8, | |||||
| ... sampler=sampler) | |||||
| Raises: | Raises: | ||||
| ValueError: If num_val is not positive. | ValueError: If num_val is not positive. | ||||
| @@ -472,13 +468,11 @@ class RandomSampler(BuiltinSampler): | |||||
| num_samples (int, optional): Number of elements to sample (default=None, all elements). | num_samples (int, optional): Number of elements to sample (default=None, all elements). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir = "path/to/imagefolder_directory" | |||||
| >>> | |||||
| >>> # creates a RandomSampler | >>> # creates a RandomSampler | ||||
| >>> sampler = ds.RandomSampler() | >>> sampler = ds.RandomSampler() | ||||
| >>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler) | |||||
| >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir, | |||||
| ... num_parallel_workers=8, | |||||
| ... sampler=sampler) | |||||
| Raises: | Raises: | ||||
| ValueError: If replacement is not boolean. | ValueError: If replacement is not boolean. | ||||
| @@ -528,13 +522,11 @@ class SequentialSampler(BuiltinSampler): | |||||
| num_samples (int, optional): Number of elements to sample (default=None, all elements). | num_samples (int, optional): Number of elements to sample (default=None, all elements). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir = "path/to/imagefolder_directory" | |||||
| >>> | |||||
| >>> # creates a SequentialSampler | >>> # creates a SequentialSampler | ||||
| >>> sampler = ds.SequentialSampler() | >>> sampler = ds.SequentialSampler() | ||||
| >>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler) | |||||
| >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir, | |||||
| ... num_parallel_workers=8, | |||||
| ... sampler=sampler) | |||||
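| >>> # A hypothetical variation using the start_index and num_samples arguments below: | |||||
| >>> # read only 4 samples, starting sequentially from row 2 | |||||
| >>> sampler = ds.SequentialSampler(start_index=2, num_samples=4) | |||||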
| """ | """ | ||||
| def __init__(self, start_index=None, num_samples=None): | def __init__(self, start_index=None, num_samples=None): | ||||
| @@ -579,15 +571,13 @@ class SubsetSampler(BuiltinSampler): | |||||
| num_samples (int, optional): Number of elements to sample (default=None, all elements). | num_samples (int, optional): Number of elements to sample (default=None, all elements). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir = "path/to/imagefolder_directory" | |||||
| >>> | |||||
| >>> indices = [0, 1, 2, 3, 7, 88, 119] | |||||
| >>> indices = [0, 1, 2, 3, 4, 5] | |||||
| >>> | >>> | ||||
| >>> # creates a SubsetSampler, will sample from the provided indices | |||||
| >>> sampler = ds.SubsetSampler(indices) | |||||
| >>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler) | |||||
| >>> # creates a SubsetSampler, will sample from the provided indices | |||||
| >>> sampler = ds.SubsetSampler(indices) | |||||
| >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir, | |||||
| ... num_parallel_workers=8, | |||||
| ... sampler=sampler) | |||||
| """ | """ | ||||
| def __init__(self, indices, num_samples=None): | def __init__(self, indices, num_samples=None): | ||||
| @@ -679,15 +669,13 @@ class WeightedRandomSampler(BuiltinSampler): | |||||
| replacement (bool): If True, put the sample ID back for the next draw (default=True). | replacement (bool): If True, put the sample ID back for the next draw (default=True). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> | |||||
| >>> dataset_dir = "path/to/imagefolder_directory" | |||||
| >>> | |||||
| >>> weights = [0.9, 0.01, 0.4, 0.8, 0.1, 0.1, 0.3] | >>> weights = [0.9, 0.01, 0.4, 0.8, 0.1, 0.1, 0.3] | ||||
| >>> | >>> | ||||
| >>> # creates a WeightedRandomSampler that will sample 4 elements with replacement | >>> # creates a WeightedRandomSampler that will sample 4 elements with replacement | ||||
| >>> sampler = ds.WeightedRandomSampler(weights, 4) | >>> sampler = ds.WeightedRandomSampler(weights, 4) | ||||
| >>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler) | |||||
| >>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir, | |||||
| ... num_parallel_workers=8, | |||||
| ... sampler=sampler) | |||||
| Raises: | Raises: | ||||
| ValueError: If num_samples is not positive. | ValueError: If num_samples is not positive. | ||||
| @@ -40,16 +40,13 @@ def serialize(dataset, json_filepath=""): | |||||
| OSError: cannot open a file | OSError: cannot open a file | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> import mindspore.dataset.transforms.c_transforms as C | |||||
| >>> DATA_DIR = "../../data/testMnistData" | |||||
| >>> data = ds.MnistDataset(DATA_DIR, 100) | |||||
| >>> one_hot_encode = C.OneHot(10) # num_classes is input argument | |||||
| >>> data = data.map(operation=one_hot_encode, input_column_names="label") | |||||
| >>> data = data.batch(batch_size=10, drop_remainder=True) | |||||
| >>> | |||||
| >>> ds.engine.serialize(data, json_filepath="mnist_dataset_pipeline.json") # serialize it to json file | |||||
| >>> serialized_data = ds.engine.serialize(data) # serialize it to Python dict | |||||
| >>> dataset = ds.MnistDataset(mnist_dataset_dir, 100) | |||||
| >>> one_hot_encode = c_transforms.OneHot(10) # num_classes is input argument | |||||
| >>> dataset = dataset.map(operations=one_hot_encode, input_columns="label") | |||||
| >>> dataset = dataset.batch(batch_size=10, drop_remainder=True) | |||||
| >>> # serialize it to json file | |||||
| >>> ds.engine.serialize(dataset, json_filepath="/path/to/mnist_dataset_pipeline.json") | |||||
| >>> serialized_data = ds.engine.serialize(dataset) # serialize it to Python dict | |||||
| """ | """ | ||||
| return dataset.to_json(json_filepath) | return dataset.to_json(json_filepath) | ||||
| @@ -69,20 +66,16 @@ def deserialize(input_dict=None, json_filepath=None): | |||||
| OSError: cannot open a file. | OSError: cannot open a file. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> import mindspore.dataset.transforms.c_transforms as C | |||||
| >>> DATA_DIR = "../../data/testMnistData" | |||||
| >>> data = ds.MnistDataset(DATA_DIR, 100) | |||||
| >>> one_hot_encode = C.OneHot(10) # num_classes is input argument | |||||
| >>> data = data.map(operation=one_hot_encode, input_column_names="label") | |||||
| >>> data = data.batch(batch_size=10, drop_remainder=True) | |||||
| >>> | |||||
| >>> dataset = ds.MnistDataset(mnist_dataset_dir, 100) | |||||
| >>> one_hot_encode = c_transforms.OneHot(10) # num_classes is input argument | |||||
| >>> dataset = dataset.map(operations=one_hot_encode, input_columns="label") | |||||
| >>> dataset = dataset.batch(batch_size=10, drop_remainder=True) | |||||
| >>> # Use case 1: to/from json file | >>> # Use case 1: to/from json file | ||||
| >>> ds.engine.serialize(data, json_filepath="mnist_dataset_pipeline.json") | |||||
| >>> data = ds.engine.deserialize(json_filepath="mnist_dataset_pipeline.json") | |||||
| >>> ds.engine.serialize(dataset, json_filepath="/path/to/mnist_dataset_pipeline.json") | |||||
| >>> dataset = ds.engine.deserialize(json_filepath="/path/to/mnist_dataset_pipeline.json") | |||||
| >>> # Use case 2: to/from Python dictionary | >>> # Use case 2: to/from Python dictionary | ||||
| >>> serialized_data = ds.engine.serialize(data) | |||||
| >>> data = ds.engine.deserialize(input_dict=serialized_data) | |||||
| >>> serialized_data = ds.engine.serialize(dataset) | |||||
| >>> dataset = ds.engine.deserialize(input_dict=serialized_data) | |||||
| """ | """ | ||||
| data = None | data = None | ||||
| @@ -24,21 +24,18 @@ and use Lookup to find the index of tokens in Vocab. | |||||
| class attributes (self.xxx) to support save() and load(). | class attributes (self.xxx) to support save() and load(). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> dataset_file = "path/to/text_file_path" | |||||
| >>> text_file_dataset_dir = "/path/to/text_file_dataset_file" | |||||
| >>> # Create a dataset for text sentences saved as line data in a file | >>> # Create a dataset for text sentences saved as line data in a file | ||||
| >>> data1 = ds.TextFileDataset(dataset_file, shuffle=False) | |||||
| >>> text_file_dataset = ds.TextFileDataset(text_file_dataset_dir, shuffle=False) | |||||
| >>> # Tokenize sentences to unicode characters | >>> # Tokenize sentences to unicode characters | ||||
| >>> tokenizer = text.UnicodeCharTokenizer() | >>> tokenizer = text.UnicodeCharTokenizer() | ||||
| >>> # Load vocabulary from list | >>> # Load vocabulary from list | ||||
| >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您']) | >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您']) | ||||
| >>> # Use Lookup operator to map tokens to ids | >>> # Use Lookup operator to map tokens to ids | ||||
| >>> lookup = text.Lookup(vocab) | >>> lookup = text.Lookup(vocab) | ||||
| >>> data1 = data1.map(operations=[tokenizer, lookup]) | |||||
| >>> for i in data1.create_dict_iterator(): | |||||
| >>> print(i) | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=[tokenizer, lookup]) | |||||
| >>> for i in text_file_dataset.create_dict_iterator(): | |||||
| ... print(i) | |||||
| >>> # if text line in dataset_file is: | >>> # if text line in dataset_file is: | ||||
| >>> # 深圳欢迎您 | >>> # 深圳欢迎您 | ||||
| >>> # then the output will be: | >>> # then the output will be: | ||||
| @@ -132,17 +129,18 @@ class JiebaTokenizer(TextTensorOperation): | |||||
| with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> from mindspore.dataset.text import JiebaMode | |||||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | >>> # If with_offsets=False, default output one column {["text", dtype=str]} | ||||
| >>> tokenizer_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=False) | |||||
| >>> data1 = data1.map(operations=tokenizer_op) | |||||
| >>> jieba_hmm_file = "/path/to/jieba/hmm/file" | |||||
| >>> jieba_mp_file = "/path/to/jieba/mp/file" | |||||
| >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=False) | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op) | |||||
| >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], | >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], | ||||
| >>> # ["offsets_limit", dtype=uint32]} | |||||
| >>> tokenizer_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) | |||||
| >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"], | |||||
| >>> output_columns=["token", "offsets_start", "offsets_limit"], | |||||
| >>> column_order=["token", "offsets_start", "offsets_limit"]) | |||||
| ... # ["offsets_limit", dtype=uint32]} | |||||
| >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=True) | |||||
| >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"], | |||||
| ... output_columns=["token", "offsets_start", "offsets_limit"], | |||||
| ... column_order=["token", "offsets_start", "offsets_limit"]) | |||||
| """ | """ | ||||
| @check_jieba_init | @check_jieba_init | ||||
| @@ -178,14 +176,16 @@ class JiebaTokenizer(TextTensorOperation): | |||||
| the better chance the word will be tokenized (default=None, use default frequency). | the better chance the word will be tokenized (default=None, use default frequency). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP) | |||||
| >>> with open(VOCAB_FILE, 'r') as f: | |||||
| >>> from mindspore.dataset.text import JiebaMode | |||||
| >>> jieba_hmm_file = "/path/to/jieba/hmm/file" | |||||
| >>> jieba_mp_file = "/path/to/jieba/mp/file" | |||||
| >>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=text.JiebaMode.MP) | |||||
| >>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file" | |||||
| >>> with open(sentence_piece_vocab_file, 'r') as f: | |||||
| >>> for line in f: | >>> for line in f: | ||||
| >>> word = line.split(',')[0] | |||||
| >>> jieba_op.add_word(word) | |||||
| >>> data1 = data1.map(operations=jieba_op, input_columns=["text"]) | |||||
| ... word = line.split(',')[0] | |||||
| ... jieba_op.add_word(word) | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"]) | |||||
| """ | """ | ||||
| if freq is None: | if freq is None: | ||||
| @@ -210,12 +210,13 @@ class JiebaTokenizer(TextTensorOperation): | |||||
| word3 freq3 | word3 freq3 | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> from mindspore.dataset.text import JiebaMode | |||||
| >>> jieba_hmm_file = "/path/to/jieba/hmm/file" | |||||
| >>> jieba_mp_file = "/path/to/jieba/mp/file" | |||||
| >>> user_dict = {"男默女泪": 10} | >>> user_dict = {"男默女泪": 10} | ||||
| >>> jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||||
| >>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP) | |||||
| >>> jieba_op.add_dict(user_dict) | >>> jieba_op.add_dict(user_dict) | ||||
| >>> data1 = data1.map(operations=jieba_op, input_columns=["text"]) | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"]) | |||||
| """ | """ | ||||
| if isinstance(user_dict, str): | if isinstance(user_dict, str): | ||||
| @@ -283,13 +284,11 @@ class Lookup(TextTensorOperation): | |||||
| data_type (mindspore.dtype, optional): mindspore.dtype that lookup maps string to (default=mstype.int32) | data_type (mindspore.dtype, optional): mindspore.dtype that lookup maps string to (default=mstype.int32) | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> # Load vocabulary from list | >>> # Load vocabulary from list | ||||
| >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您']) | >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您']) | ||||
| >>> # Use Lookup operator to map tokens to ids | >>> # Use Lookup operator to map tokens to ids | ||||
| >>> lookup = text.Lookup(vocab) | >>> lookup = text.Lookup(vocab) | ||||
| >>> data1 = data1.map(operations=[lookup]) | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=[lookup]) | |||||
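| >>> # The data_type argument above can widen the id type; for instance (assuming | |||||
| >>> # mindspore.common.dtype has been imported as mstype): | |||||
| >>> lookup = text.Lookup(vocab, data_type=mstype.int64) | |||||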
| """ | """ | ||||
| @check_lookup | @check_lookup | ||||
| @@ -323,9 +322,7 @@ class Ngram(TextTensorOperation): | |||||
| (default=None, which means whitespace is used). | (default=None, which means whitespace is used). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> data1 = data1.map(operations=text.Ngram(3, separator=" ")) | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=text.Ngram(3, separator="")) | |||||
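| >>> # For example, a 3-gram over the tokens ["a", "b", "c", "d"] with separator "-" would | |||||
| >>> # produce ["a-b-c", "b-c-d"] (illustrative values, not tied to any particular dataset) | |||||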
| """ | """ | ||||
| @check_ngram | @check_ngram | ||||
| @@ -349,11 +346,12 @@ class SentencePieceTokenizer(TextTensorOperation): | |||||
| out_type (Union[str, int]): The type of output. | out_type (Union[str, int]): The type of output. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {}) | |||||
| >>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType | |||||
| >>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file" | |||||
| >>> vocab = text.SentencePieceVocab.from_file([sentence_piece_vocab_file], 5000, 0.9995, | |||||
| ... SentencePieceModel.UNIGRAM, {}) | |||||
| >>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING) | >>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING) | ||||
| >>> data1 = data1.map(operations=tokenizer) | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=tokenizer) | |||||
| """ | """ | ||||
| def __init__(self, mode, out_type): | def __init__(self, mode, out_type): | ||||
| @@ -390,7 +388,6 @@ class SlidingWindow(TextTensorOperation): | |||||
| >>> # | [3,4,5]] | | >>> # | [3,4,5]] | | ||||
| >>> # +--------------+ | >>> # +--------------+ | ||||
| """ | """ | ||||
| @check_slidingwindow | @check_slidingwindow | ||||
| def __init__(self, width, axis=0): | def __init__(self, width, axis=0): | ||||
| self.width = width | self.width = width | ||||
| @@ -418,11 +415,11 @@ class ToNumber(TextTensorOperation): | |||||
| RuntimeError: If strings are invalid to cast, or are out of range after being cast. | RuntimeError: If strings are invalid to cast, or are out of range after being cast. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> import mindspore.common.dtype as mstype | >>> import mindspore.common.dtype as mstype | ||||
| >>> | |||||
| >>> data = [["1", "2", "3"]] | |||||
| >>> dataset = ds.NumpySlicesDataset(data) | |||||
| >>> to_number_op = text.ToNumber(mstype.int8) | >>> to_number_op = text.ToNumber(mstype.int8) | ||||
| >>> data1 = data1.map(operations=to_number_op) | |||||
| >>> dataset = dataset.map(operations=to_number_op) | |||||
| """ | """ | ||||
| @check_to_number | @check_to_number | ||||
| @@ -514,15 +511,15 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp): | |||||
| >>> | >>> | ||||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | >>> # If with_offsets=False, default output one column {["text", dtype=str]} | ||||
| >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]', | >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]', | ||||
| >>> max_bytes_per_token=100, with_offsets=False) | |||||
| ... max_bytes_per_token=100, with_offsets=False) | |||||
| >>> data1 = data1.map(operations=tokenizer_op) | >>> data1 = data1.map(operations=tokenizer_op) | ||||
| >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], | >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], | ||||
| >>> # ["offsets_limit", dtype=uint32]} | >>> # ["offsets_limit", dtype=uint32]} | ||||
| >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]', | >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]', | ||||
| >>> max_bytes_per_token=100, with_offsets=True) | |||||
| ... max_bytes_per_token=100, with_offsets=True) | |||||
| >>> data2 = data2.map(operations=tokenizer_op, | >>> data2 = data2.map(operations=tokenizer_op, | ||||
| >>> input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||||
| >>> column_order=["token", "offsets_start", "offsets_limit"]) | |||||
| ... input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||||
| ... column_order=["token", "offsets_start", "offsets_limit"]) | |||||
| """ | """ | ||||
| @check_wordpiece_tokenizer | @check_wordpiece_tokenizer | ||||
| @@ -545,11 +542,9 @@ class PythonTokenizer: | |||||
| tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens. | tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> def my_tokenizer(line): | >>> def my_tokenizer(line): | ||||
| >>> return line.split() | |||||
| >>> data1 = data1.map(operations=text.PythonTokenizer(my_tokenizer)) | |||||
| ... return line.split() | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=text.PythonTokenizer(my_tokenizer)) | |||||
| """ | """ | ||||
| @check_python_tokenizer | @check_python_tokenizer | ||||
| @@ -590,26 +585,27 @@ if platform.system().lower() != 'windows': | |||||
| with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | >>> # If with_offsets=False, default output one column {["text", dtype=str]} | ||||
| >>> tokenizer_op = text.BasicTokenizer(lower_case=False, | >>> tokenizer_op = text.BasicTokenizer(lower_case=False, | ||||
| >>> keep_whitespace=False, | |||||
| >>> normalization_form=NormalizeForm.NONE, | |||||
| >>> preserve_unused_token=True, | |||||
| >>> with_offsets=False) | |||||
| >>> data1 = data1.map(operations=tokenizer_op) | |||||
| ... keep_whitespace=False, | |||||
| ... normalization_form=NormalizeForm.NONE, | |||||
| ... preserve_unused_token=True, | |||||
| ... with_offsets=False) | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op) | |||||
| >>> # If with_offsets=True, then output three columns {["token", dtype=str], | >>> # If with_offsets=True, then output three columns {["token", dtype=str], | ||||
| >>> # ["offsets_start", dtype=uint32], | >>> # ["offsets_start", dtype=uint32], | ||||
| >>> # ["offsets_limit", dtype=uint32]} | >>> # ["offsets_limit", dtype=uint32]} | ||||
| >>> tokenizer_op = text.BasicTokenizer(lower_case=False, | >>> tokenizer_op = text.BasicTokenizer(lower_case=False, | ||||
| >>> keep_whitespace=False, | |||||
| >>> normalization_form=NormalizeForm.NONE, | |||||
| >>> preserve_unused_token=True, | |||||
| >>> with_offsets=True) | |||||
| >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"], | |||||
| >>> output_columns=["token", "offsets_start", "offsets_limit"], | |||||
| >>> column_order=["token", "offsets_start", "offsets_limit"]) | |||||
| ... keep_whitespace=False, | |||||
| ... normalization_form=NormalizeForm.NONE, | |||||
| ... preserve_unused_token=True, | |||||
| ... with_offsets=True) | |||||
| >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"], | |||||
| ... output_columns=["token", "offsets_start", | |||||
| ... "offsets_limit"], | |||||
| ... column_order=["token", "offsets_start", | |||||
| ... "offsets_limit"]) | |||||
| """ | """ | ||||
| @check_basic_tokenizer | @check_basic_tokenizer | ||||
| @@ -653,24 +649,32 @@ if platform.system().lower() != 'windows': | |||||
| with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> from mindspore.dataset.text import NormalizeForm | |||||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | >>> # If with_offsets=False, default output one column {["text", dtype=str]} | ||||
| >>> vocab_list = ["床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头", "望", "低", | |||||
| ... "思", "故", "乡","繁", "體", "字", "嘿", "哈", "大", "笑", "嘻", "i", "am", "mak", | |||||
| ... "make", "small", "mistake", "##s", "during", "work", "##ing", "hour", "😀", "😃", | |||||
| ... "😄", "😁", "+", "/", "-", "=", "12", "28", "40", "16", " ", "I", "[CLS]", "[SEP]", | |||||
| ... "[UNK]", "[PAD]", "[MASK]", "[unused1]", "[unused10]"] | |||||
| >>> vocab = text.Vocab.from_list(vocab_list) | |||||
| >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100, | >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100, | ||||
| >>> unknown_token='[UNK]', lower_case=False, keep_whitespace=False, | |||||
| >>> normalization_form=NormalizeForm.NONE, preserve_unused_token=True, | |||||
| >>> with_offsets=False) | |||||
| >>> data1 = data1.map(operations=tokenizer_op) | |||||
| ... unknown_token='[UNK]', lower_case=False, keep_whitespace=False, | |||||
| ... normalization_form=NormalizeForm.NONE, preserve_unused_token=True, | |||||
| ... with_offsets=False) | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op) | |||||
| >>> # If with_offsets=True, then output three columns {["token", dtype=str], | >>> # If with_offsets=True, then output three columns {["token", dtype=str], | ||||
| >>> # ["offsets_start", dtype=uint32], | >>> # ["offsets_start", dtype=uint32], | ||||
| >>> # ["offsets_limit", dtype=uint32]} | >>> # ["offsets_limit", dtype=uint32]} | ||||
| >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100, | >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100, | ||||
| >>> unknown_token='[UNK]', lower_case=False, keep_whitespace=False, | |||||
| >>> normalization_form=NormalizeForm.NONE, preserve_unused_token=True, | |||||
| >>> with_offsets=True) | |||||
| >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"], | |||||
| >>> output_columns=["token", "offsets_start", "offsets_limit"], | |||||
| >>> column_order=["token", "offsets_start", "offsets_limit"]) | |||||
| ... unknown_token='[UNK]', lower_case=False, keep_whitespace=False, | |||||
| ... normalization_form=NormalizeForm.NONE, preserve_unused_token=True, | |||||
| ... with_offsets=True) | |||||
| >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"], | |||||
| ... output_columns=["token", "offsets_start", | |||||
| ... "offsets_limit"], | |||||
| ... column_order=["token", "offsets_start", | |||||
| ... "offsets_limit"]) | |||||
| """ | """ | ||||
| @check_bert_tokenizer | @check_bert_tokenizer | ||||
| @@ -704,10 +708,8 @@ if platform.system().lower() != 'windows': | |||||
| CaseFold is not supported on Windows platform yet. | CaseFold is not supported on Windows platform yet. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> case_op = text.CaseFold() | >>> case_op = text.CaseFold() | ||||
| >>> data1 = data1.map(operations=case_op) | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=case_op) | |||||
| """ | """ | ||||
| def parse(self): | def parse(self): | ||||
| @@ -734,10 +736,9 @@ if platform.system().lower() != 'windows': | |||||
| - NormalizeForm.NFKD, normalize with Normalization Form KD. | - NormalizeForm.NFKD, normalize with Normalization Form KD. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> from mindspore.dataset.text import NormalizeForm | |||||
| >>> normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFC) | >>> normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFC) | ||||
| >>> data1 = data1.map(operations=normalize_op) | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=normalize_op) | |||||
| """ | """ | ||||
| def __init__(self, normalize_form=NormalizeForm.NFKC): | def __init__(self, normalize_form=NormalizeForm.NFKC): | ||||
| @@ -767,12 +768,10 @@ if platform.system().lower() != 'windows': | |||||
| if True, replace all matched elements (default=True). | if True, replace all matched elements (default=True). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> pattern = 'Canada' | >>> pattern = 'Canada' | ||||
| >>> replace = 'China' | >>> replace = 'China' | ||||
| >>> replace_op = text.RegexReplace(pattern, replace) | >>> replace_op = text.RegexReplace(pattern, replace) | ||||
| >>> data1 = data1.map(operations=replace_op) | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=replace_op) | |||||
| """ | """ | ||||
| def __init__(self, pattern, replace, replace_all=True): | def __init__(self, pattern, replace, replace_all=True): | ||||
| @@ -802,18 +801,19 @@ if platform.system().lower() != 'windows': | |||||
| with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | >>> # If with_offsets=False, default output one column {["text", dtype=str]} | ||||
| >>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=False) | |||||
| >>> data1 = data1.map(operations=tokenizer_op) | |||||
| >>> delim_pattern = r"[ |,]" | |||||
| >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=False) | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op) | |||||
| >>> # If with_offsets=True, then output three columns {["token", dtype=str], | >>> # If with_offsets=True, then output three columns {["token", dtype=str], | ||||
| >>> # ["offsets_start", dtype=uint32], | >>> # ["offsets_start", dtype=uint32], | ||||
| >>> # ["offsets_limit", dtype=uint32]} | >>> # ["offsets_limit", dtype=uint32]} | ||||
| >>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True) | |||||
| >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"], | |||||
| >>> output_columns=["token", "offsets_start", "offsets_limit"], | |||||
| >>> column_order=["token", "offsets_start", "offsets_limit"]) | |||||
| >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=True) | |||||
| >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"], | |||||
| ... output_columns=["token", "offsets_start", | |||||
| ... "offsets_limit"], | |||||
| ... column_order=["token", "offsets_start", | |||||
| ... "offsets_limit"]) | |||||
| """ | """ | ||||
| @check_regex_tokenizer | @check_regex_tokenizer | ||||
| @@ -838,18 +838,19 @@ if platform.system().lower() != 'windows': | |||||
| with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | >>> # If with_offsets=False, default output one column {["text", dtype=str]} | ||||
| >>> tokenizer_op = text.UnicodeScriptTokenizerOp(keep_whitespace=True, with_offsets=False) | |||||
| >>> data1 = data1.map(operations=tokenizer_op) | |||||
| >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False) | |||||
| >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op) | |||||
| >>> # If with_offsets=True, then output three columns {["token", dtype=str], | >>> # If with_offsets=True, then output three columns {["token", dtype=str], | ||||
| >>> # ["offsets_start", dtype=uint32], | >>> # ["offsets_start", dtype=uint32], | ||||
| >>> # ["offsets_limit", dtype=uint32]} | >>> # ["offsets_limit", dtype=uint32]} | ||||
| >>> tokenizer_op = text.UnicodeScriptTokenizerOp(keep_whitespace=True, with_offsets=True) | |||||
| >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"], | |||||
| >>> output_columns=["token", "offsets_start", "offsets_limit"], | |||||
| >>> column_order=["token", "offsets_start", "offsets_limit"]) | |||||
| >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True) | |||||
| >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"], | |||||
| ... output_columns=["token", "offsets_start", | |||||
| ... "offsets_limit"], | |||||
| ... column_order=["token", "offsets_start", | |||||
| ... "offsets_limit"]) | |||||
| """ | """ | ||||
| @check_unicode_script_tokenizer | @check_unicode_script_tokenizer | ||||
| @@ -874,8 +875,6 @@ if platform.system().lower() != 'windows': | |||||
| with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.text as text | |||||
| >>> | |||||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | >>> # If with_offsets=False, default output one column {["text", dtype=str]} | ||||
| >>> tokenizer_op = text.WhitespaceTokenizer() | >>> tokenizer_op = text.WhitespaceTokenizer() | ||||
| >>> data1 = data1.map(operations=tokenizer_op) | >>> data1 = data1.map(operations=tokenizer_op) | ||||
| @@ -46,14 +46,8 @@ class OneHot(cde.OneHotOp): | |||||
| RuntimeError: feature size is bigger than num_classes. | RuntimeError: feature size is bigger than num_classes. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.transforms.c_transforms as c_transforms | |||||
| >>> import mindspore.dataset.vision.c_transforms as c_vision | |||||
| >>> | |||||
| >>> onehot_op = c_transforms.OneHot(num_classes=10) | >>> onehot_op = c_transforms.OneHot(num_classes=10) | ||||
| >>> data1 = data1.map(operations=onehot_op, input_columns=["label"]) | |||||
| >>> mixup_batch_op = c_vision.MixUpBatch(alpha=0.8) | |||||
| >>> data1 = data1.batch(4) | |||||
| >>> data1 = data1.map(operations=mixup_batch_op, input_columns=["image", "label"]) | |||||
| >>> mnist_dataset = mnist_dataset.map(operations=onehot_op, input_columns=["label"]) | |||||
| """ | """ | ||||
| @check_num_classes | @check_num_classes | ||||
| @@ -72,9 +66,15 @@ class Fill(cde.FillOp): | |||||
| to fill created tensor with. | to fill created tensor with. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.transforms.c_transforms as c_transforms | |||||
| >>> | |||||
| >>> import numpy as np | |||||
| >>> from mindspore.dataset import GeneratorDataset | |||||
| >>> # Generate 1d int numpy array from 0 - 63 | |||||
| >>> def generator_1d(): | |||||
| ...     for i in range(64): | |||||
| ...         yield (np.array([i]),) | |||||
| >>> generator_dataset = GeneratorDataset(generator_1d, column_names='col') | |||||
| >>> fill_op = c_transforms.Fill(3) | >>> fill_op = c_transforms.Fill(3) | ||||
| >>> generator_dataset = generator_dataset.map(operations=fill_op) | |||||
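| >>> # After this map, each row of generator_dataset keeps its original shape and type, | |||||
| >>> # but every element is replaced by the fill value 3 | |||||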
| """ | """ | ||||
| @check_fill_value | @check_fill_value | ||||
| @@ -90,10 +90,16 @@ class TypeCast(cde.TypeCastOp): | |||||
| data_type (mindspore.dtype): mindspore.dtype to be cast to. | data_type (mindspore.dtype): mindspore.dtype to be cast to. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.transforms.c_transforms as c_transforms | |||||
| >>> import numpy as np | |||||
| >>> import mindspore.common.dtype as mstype | >>> import mindspore.common.dtype as mstype | ||||
| >>> | |||||
| >>> from mindspore.dataset import GeneratorDataset | |||||
| >>> # Generate 1d int numpy array from 0 - 63 | |||||
| >>> def generator_1d(): | |||||
| ...     for i in range(64): | |||||
| ...         yield (np.array([i]),) | |||||
| >>> generator_dataset = GeneratorDataset(generator_1d, column_names='col') | |||||
| >>> type_cast_op = c_transforms.TypeCast(mstype.int32) | >>> type_cast_op = c_transforms.TypeCast(mstype.int32) | ||||
| >>> generator_dataset = generator_dataset.map(operations=type_cast_op) | |||||
| """ | """ | ||||
| @check_de_type | @check_de_type | ||||
| @@ -149,14 +155,15 @@ class Slice(cde.SliceOp): | |||||
| 5. :py:obj:`Ellipses`: Slice the whole dimension. Similar to `:` in Python indexing. | 5. :py:obj:`Ellipses`: Slice the whole dimension. Similar to `:` in Python indexing. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.transforms.c_transforms as c_transforms | |||||
| >>> | |||||
| >>> # Data before | >>> # Data before | ||||
| >>> # | col | | >>> # | col | | ||||
| >>> # +---------+ | >>> # +---------+ | ||||
| >>> # | [1,2,3] | | >>> # | [1,2,3] | | ||||
| >>> # +---------| | >>> # +---------| | ||||
| >>> data1 = data1.map(operations=c_transforms.Slice(slice(1,3))) # slice indices 1 and 2 only | |||||
| >>> data = [[1, 2, 3]] | |||||
| >>> numpy_slices_dataset = ds.NumpySlicesDataset(data, ["col"]) | |||||
| >>> # slice indices 1 and 2 only | |||||
| >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=c_transforms.Slice(slice(1,3))) | |||||
| >>> # Data after | >>> # Data after | ||||
| >>> # | col | | >>> # | col | | ||||
| >>> # +---------+ | >>> # +---------+ | ||||
| @@ -200,16 +207,17 @@ class Mask(cde.MaskOp): | |||||
| dtype (mindspore.dtype, optional): Type of the generated mask (Default to bool). | dtype (mindspore.dtype, optional): Type of the generated mask (Default to bool). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.transforms.c_transforms as c_transforms | |||||
| >>> | |||||
| >>> from mindspore.dataset.transforms.c_transforms import Relational | |||||
| >>> # Data before | >>> # Data before | ||||
| >>> # | col1 | | |||||
| >>> # | col | | |||||
| >>> # +---------+ | >>> # +---------+ | ||||
| >>> # | [1,2,3] | | >>> # | [1,2,3] | | ||||
| >>> # +---------+ | >>> # +---------+ | ||||
| >>> data1 = data1.map(operations=c_transforms.Mask(Relational.EQ, 2)) | |||||
| >>> data = [[1, 2, 3]] | |||||
| >>> numpy_slices_dataset = ds.NumpySlicesDataset(data, ["col"]) | |||||
| >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=c_transforms.Mask(Relational.EQ, 2)) | |||||
| >>> # Data after | >>> # Data after | ||||
| >>> # | col1 | | |||||
| >>> # | col | | |||||
| >>> # +--------------------+ | >>> # +--------------------+ | ||||
| >>> # | [False,True,False] | | >>> # | [False,True,False] | | ||||
| >>> # +--------------------+ | >>> # +--------------------+ | ||||
| @@ -233,14 +241,15 @@ class PadEnd(cde.PadEndOp): | |||||
| string in case of tensors of strings. | string in case of tensors of strings. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.transforms.c_transforms as c_transforms | |||||
| >>> | |||||
| >>> # Data before | >>> # Data before | ||||
| >>> # | col | | >>> # | col | | ||||
| >>> # +---------+ | >>> # +---------+ | ||||
| >>> # | [1,2,3] | | >>> # | [1,2,3] | | ||||
| >>> # +---------| | >>> # +---------| | ||||
| >>> data1 = data1.map(operations=c_transforms.PadEnd(pad_shape=[4], pad_value=10)) | |||||
| >>> data = [[1, 2, 3]] | |||||
| >>> numpy_slices_dataset = ds.NumpySlicesDataset(data, ["col"]) | |||||
| >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=c_transforms.PadEnd(pad_shape=[4], | |||||
| ... pad_value=10)) | |||||
| >>> # Data after | >>> # Data after | ||||
| >>> # | col | | >>> # | col | | ||||
| >>> # +------------+ | >>> # +------------+ | ||||
| @@ -265,12 +274,14 @@ class Concatenate(cde.ConcatenateOp): | |||||
| append (numpy.array, optional): NumPy array to be appended to the already concatenated tensors (Default=None). | append (numpy.array, optional): NumPy array to be appended to the already concatenated tensors (Default=None). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.transforms.c_transforms as c_transforms | |||||
| >>> | |||||
| >>> import numpy as np | |||||
| >>> # concatenate string | >>> # concatenate string | ||||
| >>> prepend_tensor = np.array(["dw", "df"], dtype='S') | >>> prepend_tensor = np.array(["dw", "df"], dtype='S') | ||||
| >>> append_tensor = np.array(["dwsdf", "df"], dtype='S') | >>> append_tensor = np.array(["dwsdf", "df"], dtype='S') | ||||
| >>> concatenate_op = c_transforms.Concatenate(0, prepend_tensor, append_tensor) | >>> concatenate_op = c_transforms.Concatenate(0, prepend_tensor, append_tensor) | ||||
| >>> data = [["This","is","a","string"]] | |||||
| >>> dataset = ds.NumpySlicesDataset(data) | |||||
| >>> dataset = dataset.map(operations=concatenate_op) | |||||
| """ | """ | ||||
| @check_concat_type | @check_concat_type | ||||
| @@ -287,15 +298,17 @@ class Duplicate(cde.DuplicateOp): | |||||
| Duplicate the input tensor to a new output tensor. The input tensor is carried over to the output list. | Duplicate the input tensor to a new output tensor. The input tensor is carried over to the output list. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.transforms.c_transforms as c_transforms | |||||
| >>> | |||||
| >>> # Data before | >>> # Data before | ||||
| >>> # | x | | >>> # | x | | ||||
| >>> # +---------+ | >>> # +---------+ | ||||
| >>> # | [1,2,3] | | >>> # | [1,2,3] | | ||||
| >>> # +---------+ | >>> # +---------+ | ||||
| >>> data1 = data1.map(operations=c_transforms.Duplicate(), input_columns=["x"], | |||||
| >>> output_columns=["x", "y"], column_order=["x", "y"]) | |||||
| >>> data = [[1,2,3]] | |||||
| >>> numpy_slices_dataset = ds.NumpySlicesDataset(data, ["x"]) | |||||
| >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=c_transforms.Duplicate(), | |||||
| ... input_columns=["x"], | |||||
| ... output_columns=["x", "y"], | |||||
| ... column_order=["x", "y"]) | |||||
| >>> # Data after | >>> # Data after | ||||
| >>> # | x | y | | >>> # | x | y | | ||||
| >>> # +---------+---------+ | >>> # +---------+---------+ | ||||
| @@ -319,15 +332,17 @@ class Unique(cde.UniqueOp): | |||||
| Call batch op before calling this function. | Call batch op before calling this function. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.transforms.c_transforms as c_transforms | |||||
| >>> | |||||
| >>> # Data before | >>> # Data before | ||||
| >>> # | x | | >>> # | x | | ||||
| >>> # +--------------------+ | >>> # +--------------------+ | ||||
| >>> # | [[0,1,2], [1,2,3]] | | >>> # | [[0,1,2], [1,2,3]] | | ||||
| >>> # +--------------------+ | >>> # +--------------------+ | ||||
| >>> data1 = data1.map(operations=c_transforms.Unique(), input_columns=["x"], | |||||
| >>> output_columns=["x", "y", "z"], column_order=["x", "y", "z"]) | |||||
| >>> data = [[[0,1,2], [1,2,3]]] | |||||
| >>> dataset = ds.NumpySlicesDataset(data, ["x"]) | |||||
| >>> dataset = dataset.map(operations=c_transforms.Unique(), | |||||
| ... input_columns=["x"], | |||||
| ... output_columns=["x", "y", "z"], | |||||
| ... column_order=["x", "y", "z"]) | |||||
| >>> # Data after | >>> # Data after | ||||
| >>> # | x | y |z | | >>> # | x | y |z | | ||||
| >>> # +---------+-----------------+---------+ | >>> # +---------+-----------------+---------+ | ||||
| @@ -343,11 +358,8 @@ class Compose(): | |||||
| transforms (list): List of transformations to be applied. | transforms (list): List of transformations to be applied. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.transforms.c_transforms as c_transforms | |||||
| >>> import mindspore.dataset.vision.c_transforms as c_vision | |||||
| >>> | |||||
| >>> compose = c_transforms.Compose([c_vision.Decode(), c_vision.RandomCrop(512)]) | >>> compose = c_transforms.Compose([c_vision.Decode(), c_vision.RandomCrop(512)]) | ||||
| >>> data1 = data1.map(operations=compose) | |||||
| >>> image_folder_dataset = image_folder_dataset.map(operations=compose) | |||||
| """ | """ | ||||
| @check_random_transform_ops | @check_random_transform_ops | ||||
| @@ -372,11 +384,8 @@ class RandomApply(): | |||||
| prob (float, optional): The probability to apply the transformation list (default=0.5) | prob (float, optional): The probability to apply the transformation list (default=0.5) | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.transforms.c_transforms as c_transforms | |||||
| >>> import mindspore.dataset.vision.c_transforms as c_vision | |||||
| >>> | |||||
| >>> rand_apply = c_transforms.RandomApply([c_vision.RandomCrop(512)]) | >>> rand_apply = c_transforms.RandomApply([c_vision.RandomCrop(512)]) | ||||
| >>> data1 = data1.map(operations=rand_apply) | |||||
| >>> image_folder_dataset = image_folder_dataset.map(operations=rand_apply) | |||||
| """ | """ | ||||
| @check_random_transform_ops | @check_random_transform_ops | ||||
| @@ -402,11 +411,8 @@ class RandomChoice(): | |||||
| transforms (list): List of transformations to be chosen from to apply. | transforms (list): List of transformations to be chosen from to apply. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.transforms.c_transforms as c_transforms | |||||
| >>> import mindspore.dataset.vision.c_transforms as c_vision | |||||
| >>> | |||||
| >>> rand_choice = c_transforms.RandomChoice([c_vision.CenterCrop(50), c_vision.RandomCrop(512)]) | >>> rand_choice = c_transforms.RandomChoice([c_vision.CenterCrop(50), c_vision.RandomCrop(512)]) | ||||
| >>> data1 = data1.map(operations=rand_choice) | |||||
| >>> image_folder_dataset = image_folder_dataset.map(operations=rand_choice) | |||||
| """ | """ | ||||
| @check_random_transform_ops | @check_random_transform_ops | ||||
| @@ -31,11 +31,9 @@ class OneHotOp: | |||||
| (Default=0.0 means no smoothing is applied.) | (Default=0.0 means no smoothing is applied.) | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.transforms as py_transforms | |||||
| >>> | |||||
| >>> transforms_list = [py_transforms.OneHotOp(num_classes=10, smoothing_rate=0.1)] | >>> transforms_list = [py_transforms.OneHotOp(num_classes=10, smoothing_rate=0.1)] | ||||
| >>> transform = py_transforms.Compose(transforms_list) | >>> transform = py_transforms.Compose(transforms_list) | ||||
| >>> data1 = data1.map(input_columns=["label"], operations=transform()) | |||||
| >>> mnist_dataset = mnist_dataset.map(operations=transform, input_columns=["label"]) | |||||
| """ | """ | ||||
| @check_one_hot_op | @check_one_hot_op | ||||
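A hedged, runnable sketch of the OneHotOp example, with a toy label column standing in for a real MNIST dataset (the NumpySlicesDataset and its label values are assumptions made for illustration):

    import numpy as np
    import mindspore.dataset as ds
    import mindspore.dataset.transforms.py_transforms as py_transforms

    # Toy labels standing in for the "label" column of an MnistDataset.
    mnist_like_dataset = ds.NumpySlicesDataset({"label": np.array([0, 3, 9])}, shuffle=False)
    transform = py_transforms.Compose([py_transforms.OneHotOp(num_classes=10, smoothing_rate=0.1)])
    mnist_like_dataset = mnist_like_dataset.map(operations=transform, input_columns=["label"])
    for row in mnist_like_dataset.create_dict_iterator(output_numpy=True):
        print(row["label"])  # smoothed one-hot encoding of the label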
| @@ -71,53 +69,44 @@ class Compose: | |||||
| transforms (list): List of transformations to be applied. | transforms (list): List of transformations to be applied. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> import mindspore.dataset.vision.py_transforms as py_vision | |||||
| >>> import mindspore.dataset.transforms.py_transforms as py_transforms | |||||
| >>> | |||||
| >>> dataset_dir = "path/to/imagefolder_directory" | |||||
| >>> image_folder_dataset_dir = "/path/to/image_folder_dataset_directory" | |||||
| >>> # create a dataset that reads all files in dataset_dir with 8 threads | >>> # create a dataset that reads all files in dataset_dir with 8 threads | ||||
| >>> data1 = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8) | |||||
| >>> image_folder_dataset = ds.ImageFolderDataset(image_folder_dataset_dir, num_parallel_workers=8) | |||||
| >>> # create a list of transformations to be applied to the image data | >>> # create a list of transformations to be applied to the image data | ||||
| >>> transform = py_transforms.Compose([py_vision.Decode(), | >>> transform = py_transforms.Compose([py_vision.Decode(), | ||||
| >>> py_vision.RandomHorizontalFlip(0.5), | |||||
| >>> py_vision.ToTensor(), | |||||
| >>> py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)), | |||||
| >>> py_vision.RandomErasing()]) | |||||
| >>> # apply the transform to the dataset through dataset.map() | |||||
| >>> data1 = data1.map(operations=transform, input_columns="image") | |||||
| ... py_vision.RandomHorizontalFlip(0.5), | |||||
| ... py_vision.ToTensor(), | |||||
| ... py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)), | |||||
| ... py_vision.RandomErasing()]) | |||||
| >>> # apply the transform to the dataset through dataset.map() | |||||
| >>> image_folder_dataset = image_folder_dataset.map(operations=transform, input_columns=["image"]) | |||||
| >>> | >>> | ||||
| >>> # Compose can also be invoked implicitly by just passing in a list of ops | >>> # Compose can also be invoked implicitly by just passing in a list of ops | ||||
| >>> # the above example then becomes: | >>> # the above example then becomes: | ||||
| >>> transform_list = [py_vision.Decode(), | >>> transform_list = [py_vision.Decode(), | ||||
| >>> py_vision.RandomHorizontalFlip(0.5), | |||||
| >>> py_vision.ToTensor(), | |||||
| >>> py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)), | |||||
| >>> py_vision.RandomErasing()] | |||||
| ... py_vision.RandomHorizontalFlip(0.5), | |||||
| ... py_vision.ToTensor(), | |||||
| ... py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)), | |||||
| ... py_vision.RandomErasing()] | |||||
| >>> | >>> | ||||
| >>> # apply the transform to the dataset through dataset.map() | >>> # apply the transform to the dataset through dataset.map() | ||||
| >>> data2 = data2.map(operations=transform_list, input_columns="image") | |||||
| >>> image_folder_dataset_1 = image_folder_dataset_1.map(operations=transform_list, input_columns=["image"]) | |||||
| >>> | >>> | ||||
| >>> # Certain C++ and Python ops can be combined, but not all of them | >>> # Certain C++ and Python ops can be combined, but not all of them | ||||
| >>> # An example of combined operations | >>> # An example of combined operations | ||||
| >>> import mindspore.dataset as ds | |||||
| >>> import mindspore.dataset.transforms.c_transforms as c_transforms | |||||
| >>> import mindspore.dataset.vision.c_transforms as c_vision | |||||
| >>> | |||||
| >>> data3 = ds.NumpySlicesDataset(arr, column_names=["cols"], shuffle=False) | |||||
| >>> arr = [0, 1] | |||||
| >>> dataset = ds.NumpySlicesDataset(arr, column_names=["cols"], shuffle=False) | |||||
| >>> transformed_list = [py_transforms.OneHotOp(2), c_transforms.Mask(c_transforms.Relational.EQ, 1)] | >>> transformed_list = [py_transforms.OneHotOp(2), c_transforms.Mask(c_transforms.Relational.EQ, 1)] | ||||
| >>> data3 = data3.map(operations=transformed_list, input_columns=["cols"]) | |||||
| >>> dataset = dataset.map(operations=transformed_list, input_columns=["cols"]) | |||||
| >>> | >>> | ||||
| >>> # Here is an example of mixing vision ops | >>> # Here is an example of mixing vision ops | ||||
| >>> data_dir = "/path/to/imagefolder_directory" | |||||
| >>> data4 = ds.ImageFolderDataset(dataset_dir=data_dir, shuffle=False) | |||||
| >>> input_columns = ["column_names"] | |||||
| >>> import numpy as np | |||||
| >>> op_list=[c_vision.Decode(), | >>> op_list=[c_vision.Decode(), | ||||
| >>> c_vision.Resize((224, 244)), | |||||
| >>> py_vision.ToPIL(), | |||||
| >>> np.array, # need to convert PIL image to a NumPy array to pass it to C++ operation | |||||
| >>> c_vision.Resize((24, 24))] | |||||
| >>> data4 = data4.map(operations=op_list, input_columns=input_columns) | |||||
| ... c_vision.Resize((224, 244)), | |||||
| ... py_vision.ToPIL(), | |||||
| ... np.array, # need to convert PIL image to a NumPy array to pass it to C++ operation | |||||
| ... c_vision.Resize((24, 24))] | |||||
| >>> image_folder_dataset = image_folder_dataset.map(operations=op_list, input_columns=["image"]) | |||||
| """ | """ | ||||
| @check_compose_list | @check_compose_list | ||||
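Of the pipelines in the Compose example above, the mixed C++/Python one is fully self-contained; as a hedged sketch, iterating it shows what the OneHotOp plus Mask combination produces:

    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as c_transforms
    import mindspore.dataset.transforms.py_transforms as py_transforms

    arr = [0, 1]
    dataset = ds.NumpySlicesDataset(arr, column_names=["cols"], shuffle=False)
    transformed_list = [py_transforms.OneHotOp(2), c_transforms.Mask(c_transforms.Relational.EQ, 1)]
    dataset = dataset.map(operations=transformed_list, input_columns=["cols"])
    for row in dataset.create_dict_iterator(output_numpy=True):
        # Each label is first one-hot encoded, then turned into a boolean mask
        # marking the positions equal to 1.
        print(row["cols"])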
| @@ -144,12 +133,14 @@ class RandomApply: | |||||
| prob (float, optional): The probability to apply the transformation list (default=0.5). | prob (float, optional): The probability to apply the transformation list (default=0.5). | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.vision.py_transforms as py_vision | |||||
| >>> from mindspore.dataset.transforms.py_transforms import Compose | >>> from mindspore.dataset.transforms.py_transforms import Compose | ||||
| >>> | |||||
| >>> Compose([py_vision.Decode(), | |||||
| >>> py_vision.RandomApply(transforms_list, prob=0.6), | |||||
| >>> py_vision.ToTensor()]) | |||||
| >>> transform_list = [py_vision.RandomHorizontalFlip(0.5), | |||||
| ... py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)), | |||||
| ... py_vision.RandomErasing()] | |||||
| >>> transforms = Compose([py_vision.Decode(), | |||||
| ... py_transforms.RandomApply(transform_list, prob=0.6), | |||||
| ... py_vision.ToTensor()]) | |||||
| >>> image_folder_dataset = image_folder_dataset.map(operations=transforms, input_columns=["image"]) | |||||
| """ | """ | ||||
| @check_random_apply | @check_random_apply | ||||
| @@ -178,12 +169,14 @@ class RandomChoice: | |||||
| transforms (list): List of transformations to be chosen from to apply. | transforms (list): List of transformations to be chosen from to apply. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.vision.py_transforms as py_vision | |||||
| >>> from mindspore.dataset.transforms.py_transforms import Compose, RandomChoice | >>> from mindspore.dataset.transforms.py_transforms import Compose, RandomChoice | ||||
| >>> | |||||
| >>> Compose([py_vision.Decode(), | |||||
| >>> RandomChoice(transforms_list), | |||||
| >>> py_vision.ToTensor()]) | |||||
| >>> transform_list = [py_vision.RandomHorizontalFlip(0.5), | |||||
| ... py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)), | |||||
| ... py_vision.RandomErasing()] | |||||
| >>> transforms = Compose([py_vision.Decode(), | |||||
| ... py_transforms.RandomChoice(transform_list), | |||||
| ... py_vision.ToTensor()]) | |||||
| >>> image_folder_dataset = image_folder_dataset.map(operations=transforms, input_columns=["image"]) | |||||
| """ | """ | ||||
| @check_transforms_list | @check_transforms_list | ||||
| @@ -211,12 +204,14 @@ class RandomOrder: | |||||
| transforms (list): List of the transformations to apply. | transforms (list): List of the transformations to apply. | ||||
| Examples: | Examples: | ||||
| >>> import mindspore.dataset.vision.py_transforms as py_vision | |||||
| >>> from mindspore.dataset.transforms.py_transforms import Compose | >>> from mindspore.dataset.transforms.py_transforms import Compose | ||||
| >>> | |||||
| >>> Compose([py_vision.Decode(), | |||||
| >>> py_vision.RandomOrder(transforms_list), | |||||
| >>> py_vision.ToTensor()]) | |||||
| >>> transform_list = [py_vision.RandomHorizontalFlip(0.5), | |||||
| ... py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)), | |||||
| ... py_vision.RandomErasing()] | |||||
| >>> transforms = Compose([py_vision.Decode(), | |||||
| ... py_transforms.RandomOrder(transform_list), | |||||
| ... py_vision.ToTensor()]) | |||||
| >>> image_folder_dataset = image_folder_dataset.map(operations=transforms, input_columns=["image"]) | |||||
| """ | """ | ||||
| @check_transforms_list | @check_transforms_list | ||||
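The three wrappers documented above (RandomApply, RandomChoice, RandomOrder) differ only in how they schedule the inner transform list; a hedged side-by-side sketch, with RandomColorAdjust chosen purely for illustration and image_folder_dataset assumed to exist as in the earlier Compose example:

    import mindspore.dataset.vision.py_transforms as py_vision
    import mindspore.dataset.transforms.py_transforms as py_transforms
    from mindspore.dataset.transforms.py_transforms import Compose

    transform_list = [py_vision.RandomHorizontalFlip(0.5), py_vision.RandomColorAdjust()]
    # RandomApply: with probability prob, run the whole list; otherwise skip it.
    apply_all_or_nothing = py_transforms.RandomApply(transform_list, prob=0.6)
    # RandomChoice: run exactly one op picked at random from the list.
    pick_one = py_transforms.RandomChoice(transform_list)
    # RandomOrder: run every op in the list, but in a random order.
    shuffle_order = py_transforms.RandomOrder(transform_list)

    transforms = Compose([py_vision.Decode(), apply_all_or_nothing, py_vision.ToTensor()])
    # image_folder_dataset is assumed to exist, as in the Compose example above.
    # image_folder_dataset = image_folder_dataset.map(operations=transforms, input_columns=["image"])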