diff --git a/mindspore/dataset/core/config.py b/mindspore/dataset/core/config.py index 134d9c74de..b279c69e61 100644 --- a/mindspore/dataset/core/config.py +++ b/mindspore/dataset/core/config.py @@ -13,7 +13,8 @@ # limitations under the License. # ============================================================================== """ -The configuration manager. +The configuration module provides various functions to set and get the supported +configuration parameters, and read a configuration file. """ import random import numpy @@ -35,18 +36,20 @@ def set_seed(seed): Note: This set_seed function sets the seed in the Python random library and numpy.random library for deterministic Python augmentations using randomness. This set_seed function should - be called with every iterator created to reset the random seed. In our pipeline this + be called with every iterator created to reset the random seed. In the pipeline, this does not guarantee deterministic results with num_parallel_workers > 1. Args: - seed(int): seed to be set. + seed(int): Seed to be set. Raises: ValueError: If seed is invalid (< 0 or > MAX_UINT_32). Examples: >>> import mindspore.dataset as ds - >>> # sets the new seed value, now operators with a random seed will use new seed value. + >>> + >>> # Set a new global configuration value for the seed value. + >>> # Operations with randomness will use the seed value to generate random values. >>> ds.config.set_seed(1000) """ if seed < 0 or seed > UINT32_MAX: @@ -72,14 +75,15 @@ def set_prefetch_size(size): Set the number of rows to be prefetched. Args: - size (int): total number of rows to be prefetched. + size (int): Total number of rows to be prefetched. Raises: ValueError: If prefetch_size is invalid (<= 0 or > MAX_INT_32). Examples: >>> import mindspore.dataset as ds - >>> # sets the new prefetch value. + >>> + >>> # Set a new global configuration value for the prefetch size. >>> ds.config.set_prefetch_size(1000) """ if size <= 0 or size > INT32_MAX: @@ -102,18 +106,20 @@ def set_num_parallel_workers(num): Set the default number of parallel workers. Args: - num (int): number of parallel workers to be used as a default for each operation. + num (int): Number of parallel workers to be used as a default for each operation. Raises: ValueError: If num_parallel_workers is invalid (<= 0 or > MAX_INT_32). Examples: >>> import mindspore.dataset as ds - >>> # sets the new parallel_workers value, now parallel dataset operators will run with 8 workers. + >>> + >>> # Set a new global configuration value for the number of parallel workers. + >>> # Now parallel dataset operators will run with 8 workers. >>> ds.config.set_num_parallel_workers(8) """ if num <= 0 or num > INT32_MAX: - raise ValueError("Num workers given is not within the required range.") + raise ValueError("Number of parallel workers given is not within the required range.") _config.set_num_parallel_workers(num) @@ -129,17 +135,18 @@ def get_num_parallel_workers(): def set_monitor_sampling_interval(interval): """ - Set the default interval(ms) of monitor sampling. + Set the default interval (in milliseconds) for monitor sampling. Args: - interval (int): interval(ms) to be used to performance monitor sampling. + interval (int): Interval (in milliseconds) to be used for performance monitor sampling. Raises: ValueError: If interval is invalid (<= 0 or > MAX_INT_32). Examples: >>> import mindspore.dataset as ds - >>> # sets the new interval value. + >>> + >>> # Set a new global configuration value for the monitor sampling interval. 
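+ >>> # (100 below is an illustrative value in milliseconds; the current
+ >>> # setting can be read back with ds.config.get_monitor_sampling_interval().)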
>>> ds.config.set_monitor_sampling_interval(100) """ if interval <= 0 or interval > INT32_MAX: @@ -152,7 +159,7 @@ def get_monitor_sampling_interval(): Get the default interval of performance monitor sampling. Returns: - Interval: interval(ms) of performance monitor sampling. + Interval: interval (in milliseconds) for performance monitor sampling. """ return _config.get_monitor_sampling_interval() @@ -163,18 +170,19 @@ def set_callback_timeout(timeout): In case of a deadlock, the wait function will exit after the timeout period. Args: - timeout (int): timeout(s) to be used to end teh wait in DSWaitedCallback in case of a deadlock. + timeout (int): Timeout (in seconds) to be used to end the wait in DSWaitedCallback in case of a deadlock. Raises: ValueError: If timeout is invalid (<= 0 or > MAX_INT_32). Examples: >>> import mindspore.dataset as ds - >>> # sets the new timout value. + >>> + >>> # Set a new global configuration value for the timeout value. >>> ds.config.set_callback_timeout(100) """ if timeout <= 0 or timeout > INT32_MAX: - raise ValueError("timeout given is not within the required range.") + raise ValueError("Timeout given is not within the required range.") _config.set_callback_timeout(timeout) @@ -201,25 +209,23 @@ def __str__(): def load(file): """ - Load configuration from a file. + Load configurations from a file. Args: - file (str): path the config file to be loaded. + file (str): Path of the configuration file to be loaded. Raises: RuntimeError: If file is invalid and parsing fails. Examples: >>> import mindspore.dataset as ds - >>> # sets the default value according to values in configuration file. + >>> + >>> # Set new default configuration values according to values in the configuration file. >>> ds.config.load("path/to/config/file") >>> # example config file: >>> # { >>> # "logFilePath": "/tmp", - >>> # "rowsPerBuffer": 32, >>> # "numParallelWorkers": 4, - >>> # "workerConnectorSize": 16, - >>> # "opConnectorSize": 16, >>> # "seed": 5489, >>> # "monitorSamplingInterval": 30 >>> # } diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py index 319b67f678..b156ac0003 100644 --- a/mindspore/dataset/engine/datasets.py +++ b/mindspore/dataset/engine/datasets.py @@ -84,7 +84,7 @@ def zip(datasets): >>> ds1 = ds.ImageFolderDataset(dataset_dir1, num_parallel_workers=8) >>> ds2 = ds.ImageFolderDataset(dataset_dir2, num_parallel_workers=8) >>> - >>> # creates a dataset which is the combination of ds1 and ds2 + >>> # Create a dataset which is the combination of ds1 and ds2 >>> data = ds.zip((ds1, ds2)) """ if len(datasets) <= 1: @@ -218,18 +218,19 @@ class Dataset: Examples: >>> import mindspore.dataset as ds + >>> >>> # data is an instance of Dataset object. >>> - >>> # creates a dataset where every 100 rows is combined into a batch + >>> # Create a dataset where every 100 rows is combined into a batch >>> # and drops the last incomplete batch if there is one. >>> column_names = ["col1", "col2"] >>> buket_boundaries = [5, 10] >>> bucket_batch_sizes = [5, 1, 1] >>> element_length_function = (lambda col1, col2: max(len(col1), len(col2))) >>> - >>> # will pad col1 to shape [2, bucket_boundaries[i]] where i is the + >>> # Will pad col1 to shape [2, bucket_boundaries[i]] where i is the >>> # index of the bucket that is currently being batched. - >>> # will pad col2 to a shape where each dimension is the longest in all + >>> # Will pad col2 to a shape where each dimension is the longest in all >>> # the elements currently being batched. 
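+ >>> # (A sketch of pad_info semantics: each key is a column name and each
+ >>> # value is a (pad_shape, pad_value) pair; a None dimension in pad_shape
+ >>> # is padded to the longest length in the batch, or to the bucket
+ >>> # boundary when pad_to_bucket_boundary is True.)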
>>> pad_info = {"col1": ([2, None], -1)} >>> pad_to_bucket_boundary = True @@ -291,8 +292,10 @@ class Dataset: Examples: >>> import mindspore.dataset as ds + >>> >>> # data is an instance of Dataset object. - >>> # creates a dataset where every 100 rows is combined into a batch + >>> + >>> # Create a dataset where every 100 rows is combined into a batch >>> # and drops the last incomplete batch if there is one. >>> data = data.batch(100, True) """ @@ -314,6 +317,7 @@ class Dataset: Examples: >>> import mindspore.dataset as ds + >>> >>> # data is an instance of Dataset object. >>> data = data.sync_wait("callback1") >>> data = data.batch(batch_size) @@ -349,11 +353,12 @@ class Dataset: Examples: >>> import mindspore.dataset as ds - >>> # data is an instance of Dataset object - >>> # optionally set the seed for the first epoch + >>> + >>> # data is an instance of Dataset object. + >>> # Optionally set the seed for the first epoch >>> ds.config.set_seed(58) >>> - >>> # creates a shuffled dataset using a shuffle buffer of size 4 + >>> # Create a shuffled dataset using a shuffle buffer of size 4 >>> data = data.shuffle(4) """ return ShuffleDataset(self, buffer_size) @@ -375,12 +380,13 @@ class Dataset: Examples: >>> import mindspore.dataset as ds >>> import mindspore.dataset.text as text - >>> # declare a function which returns a Dataset object + >>> + >>> # Declare a function which returns a Dataset object >>> def flat_map_func(x): >>> data_dir = text.to_str(x[0]) >>> d = ds.ImageFolderDataset(data_dir) >>> return d - >>> # data is a Dataset object + >>> # data is an instance of a Dataset object. >>> data = ds.TextFileDataset(DATA_FILE) >>> data = data.flat_map(flat_map_func) @@ -460,16 +466,17 @@ class Dataset: >>> import mindspore.dataset.vision.c_transforms as c_transforms >>> >>> # data is an instance of Dataset which has 2 columns, "image" and "label". - >>> # ds_pyfunc is an instance of Dataset which has 3 columns, "col0", "col1", and "col2". Each column is - >>> # a 2d array of integers. + >>> # ds_pyfunc is an instance of Dataset which has 3 columns, "col0", "col1", and "col2". + >>> # Each column is a 2D array of integers. >>> - >>> # This config is a global setting, meaning that all future operations which - >>> # uses this config value will use 2 worker threads, unless if specified - >>> # otherwise in their constructor. set_num_parallel_workers can be called - >>> # again later if a different number of worker threads are needed. + >>> # Set the global configuration value for num_parallel_workers to be 2. + >>> # Operations which use this configuration value will use 2 worker threads, + >>> # unless otherwise specified in the operator's constructor. + >>> # set_num_parallel_workers can be called again later if a different + >>> # global configuration value for the number of worker threads is desired. >>> ds.config.set_num_parallel_workers(2) >>> - >>> # Two operations, which takes 1 column for input and outputs 1 column. + >>> # Define two operations, where each operation accepts 1 input column and outputs 1 column. >>> decode_op = c_transforms.Decode(rgb_format=True) >>> random_jitter_op = c_transforms.RandomColorAdjust((0.8, 0.8), (1, 1), (1, 1), (0, 0)) >>> @@ -478,12 +485,12 @@ class Dataset: >>> operations = [decode_op] >>> input_columns = ["image"] >>> - >>> # Applies decode_op on column "image". This column will be replaced by the outputed + >>> # Apply decode_op on column "image". This column will be replaced by the outputted >>> # column of decode_op.
Since column_order is not provided, both columns "image" >>> # and "label" will be propagated to the child node in their original order. >>> ds_decoded = data.map(operations, input_columns) >>> - >>> # Rename column "image" to "decoded_image" + >>> # Rename column "image" to "decoded_image". >>> output_columns = ["decoded_image"] >>> ds_decoded = data.map(operations, input_columns, output_columns) >>> @@ -501,7 +508,7 @@ class Dataset: >>> output_columns = ["decoded_image"] >>> ds_decoded = data.map(operations, input_columns, output_columns, column_order) >>> - >>> # Simple example using pyfunc. Renaming columns and specifying column order + >>> # A simple example using pyfunc: Renaming columns and specifying column order >>> # work in the same way as the previous examples. >>> input_columns = ["col0"] >>> operations = [(lambda x: x + 1)] @@ -515,7 +522,7 @@ class Dataset: >>> >>> input_columns = ["image"] >>> - >>> # Creates a dataset where the images are decoded, then randomly color jittered. + >>> # Create a dataset where the images are decoded, then randomly color jittered. >>> # decode_op takes column "image" as input and outputs one column. The column >>> # outputted by decode_op is passed as input to random_jitter_op. >>> # random_jitter_op will output one column. Column "image" will be replaced by @@ -524,13 +531,13 @@ class Dataset: >>> # columns will remain the same. >>> ds_mapped = data.map(operations, input_columns) >>> - >>> # Creates a dataset that is identical to ds_mapped, except the column "image" + >>> # Create a dataset that is identical to ds_mapped, except the column "image" >>> # that is outputted by random_jitter_op is renamed to "image_transformed". >>> # Specifying column order works in the same way as examples in 1). >>> output_columns = ["image_transformed"] >>> ds_mapped_and_renamed = data.map(operation, input_columns, output_columns) >>> - >>> # Multiple operations using pyfunc. Renaming columns and specifying column order + >>> # Multiple operations using pyfunc: Renaming columns and specifying column order >>> # work in the same way as examples in 1). >>> input_columns = ["col0"] >>> operations = [(lambda x: x + x), (lambda x: x - 1)] @@ -543,15 +550,15 @@ class Dataset: >>> # operations[1] is a lambda that takes 3 columns as input and outputs 1 column. >>> # operations[1] is a lambda that takes 1 column as input and outputs 4 columns. >>> # - >>> # Note: the number of output columns of operation[i] must equal the number of + >>> # Note: The number of output columns of operation[i] must equal the number of >>> # input columns of operation[i+1]. Otherwise, this map call will also result >>> # in an error. >>> operations = [(lambda x y: (x, x + y, x + y + 1)), >>> (lambda x y z: x * y * z), >>> (lambda x: (x % 2, x % 3, x % 5, x % 7))] >>> - >>> # Note: because the number of input columns is not the same as the number of - >>> # output columns, the output_columns and column_order parameter must be + >>> # Note: Since the number of input columns is not the same as the number of + >>> # output columns, the output_columns and column_order parameters must be >>> # specified. Otherwise, this map call will also result in an error. >>> input_columns = ["col2", "col0"] >>> output_columns = ["mod2", "mod3", "mod5", "mod7"] @@ -614,15 +621,17 @@ class Dataset: Examples: >>> import mindspore.dataset as ds + >>> >>> # data is an instance of Dataset object. 
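+ >>> # (Note: whether shuffle is applied before or after repeat changes the
+ >>> # epoch semantics, as the examples below show.)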
- >>> # creates a dataset where the dataset is repeated for 50 epochs + >>> + >>> # Create a dataset where the dataset is repeated for 50 epochs >>> repeated = data.repeat(50) >>> - >>> # creates a dataset where each epoch is shuffled individually + >>> # Create a dataset where each epoch is shuffled individually >>> shuffled_and_repeated = data.shuffle(10) >>> shuffled_and_repeated = shuffled_and_repeated.repeat(50) >>> - >>> # creates a dataset where the dataset is first repeated for + >>> # Create a dataset where the dataset is first repeated for >>> # 50 epochs before shuffling. The shuffle operator will treat >>> # the entire 50 epochs as one big dataset. >>> repeat_and_shuffle = data.repeat(50) @@ -645,8 +654,9 @@ class Dataset: Examples: >>> import mindspore.dataset as ds + >>> >>> # data is an instance of Dataset object. - >>> # creates a dataset which skips first 3 elements from data + >>> # Create a dataset which skips first 3 elements from data >>> data = data.skip(3) """ return SkipDataset(self, count) @@ -670,8 +680,9 @@ class Dataset: Examples: >>> import mindspore.dataset as ds + >>> >>> # data is an instance of Dataset object. - >>> # creates a dataset where the dataset including 50 elements. + >>> # Create a dataset where the dataset includes 50 elements. >>> data = data.take(50) """ if count == -1: @@ -781,11 +792,11 @@ class Dataset: Examples: >>> import mindspore.dataset as ds >>> - >>> dataset_dir = "/path/to/text_file.txt" + >>> dataset_files = "/path/to/text_file/*" >>> - >>> # TextFileDataset is not a mappable dataset, so this non optimized split will be called. - >>> # many datasets have shuffle on by default, set shuffle to False if split will be called! - >>> data = ds.TextFileDataset(dataset_dir, shuffle=False) + >>> # TextFileDataset is not a mappable dataset, so this non-optimized split will be called. + >>> # Since many datasets have shuffle on by default, set shuffle to False if split will be called! + >>> data = ds.TextFileDataset(dataset_files, shuffle=False) >>> train, test = data.split([0.9, 0.1]) """ if self.is_shuffled(): @@ -829,8 +840,9 @@ class Dataset: Examples: >>> import mindspore.dataset as ds + >>> >>> # ds1 and ds2 are instances of Dataset object - >>> # creates a dataset which is the combination of ds1 and ds2 + >>> # Create a dataset which is the combination of ds1 and ds2 >>> data = ds1.zip(ds2) """ if isinstance(datasets, tuple): @@ -858,10 +870,12 @@ class Dataset: Examples: >>> import mindspore.dataset as ds + >>> >>> # ds1 and ds2 are instances of Dataset object - >>> # creates a dataset by concatenating ds1 and ds2 with "+" operator + >>> + >>> # Create a dataset by concatenating ds1 and ds2 with "+" operator >>> data1 = ds1 + ds2 - >>> # creates a dataset by concatenating ds1 and ds2 with concat operation + >>> # Create a dataset by concatenating ds1 and ds2 with concat operation >>> data1 = ds1.concat(ds2) """ if isinstance(datasets, Dataset): @@ -886,11 +900,12 @@ class Dataset: Examples: >>> import mindspore.dataset as ds + >>> >>> # data is an instance of Dataset object. >>> input_columns = ["input_col1", "input_col2", "input_col3"] >>> output_columns = ["output_col1", "output_col2", "output_col3"] >>> - >>> # creates a dataset where input_col1 is renamed to output_col1, and + >>> # Create a dataset where input_col1 is renamed to output_col1, and >>> # input_col2 is renamed to output_col2, and input_col3 is renamed >>> # to output_col3. 
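+ >>> # (The two lists pair up by position, so they must have the same length.)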
>>> data = data.rename(input_columns=input_columns, output_columns=output_columns) @@ -914,10 +929,11 @@ class Dataset: Examples: >>> import mindspore.dataset as ds + >>> >>> # data is an instance of Dataset object >>> columns_to_project = ["column3", "column1", "column2"] >>> - >>> # creates a dataset that consist of column3, column1, column2 + >>> # Create a dataset that consists of column3, column1, column2 >>> # in that order, regardless of the original order of columns. >>> data = data.project(columns=columns_to_project) """ @@ -945,12 +961,15 @@ class Dataset: Examples: >>> import mindspore.dataset as ds + >>> >>> # data is an instance of Dataset object - >>> # declare an apply_func function which returns a Dataset object + >>> + >>> # Declare an apply_func function which returns a Dataset object >>> def apply_func(ds): >>> ds = ds.batch(2) >>> return ds - >>> # use apply to call apply_func + >>> + >>> # Use apply to call apply_func >>> data = data.apply(apply_func) Raises: @@ -1150,8 +1169,10 @@ class Dataset: Examples: >>> import mindspore.dataset as ds + >>> >>> # data is an instance of Dataset object - >>> # create an iterator + >>> + >>> # Create an iterator >>> # The columns in the data obtained by the iterator will not be changed. >>> iterator = data.create_tuple_iterator() >>> for item in iterator: @@ -1171,8 +1192,6 @@ class Dataset: Args: num_epochs (int, optional): Maximum number of epochs that iterator can be iterated (default=-1, iterator can be iterated infinite number of epochs). - num_epochs (int, optional): maximum epochs that iterator can be iteratered, - if num_epochs = -1, iterator can be iteratered infinite epochs (default=-1) output_numpy (bool, optional): Whether or not to output NumPy datatype, if output_numpy=False, iterator will output MSTensor (default=False). @@ -1181,14 +1200,15 @@ class Dataset: Examples: >>> import mindspore.dataset as ds + >>> >>> # data is an instance of Dataset object + >>> >>> # create an iterator >>> # The columns in the data obtained by the iterator might be changed. >>> iterator = data.create_dict_iterator() >>> for item in iterator: >>> # print the data in column1 >>> print(item["column1"]) - """ if self._noop_mode(): return DummyIterator(self, 'dict') @@ -1426,10 +1446,10 @@ class MappableDataset(SourceDataset): >>> import mindspore.dataset as ds >>> >>> dataset_dir = "/path/to/imagefolder_directory" - >>> # a SequentialSampler is created by default + >>> # Note: A SequentialSampler is created by default >>> data = ds.ImageFolderDataset(dataset_dir) >>> - >>> # use a DistributedSampler instead of the SequentialSampler + >>> # Use a DistributedSampler instead of the SequentialSampler >>> new_sampler = ds.DistributedSampler(10, 2) >>> data.use_sampler(new_sampler) """ @@ -1514,15 +1534,15 @@ class MappableDataset(SourceDataset): >>> >>> dataset_dir = "/path/to/imagefolder_directory" >>> - >>> # many datasets have shuffle on by default, set shuffle to False if split will be called! + >>> # Since many datasets have shuffle on by default, set shuffle to False if split will be called! >>> data = ds.ImageFolderDataset(dataset_dir, shuffle=False) >>> - >>> # sets the seed, and tells split to use this seed when randomizing. This - >>> # is needed because we are sharding later + >>> # Set the seed, and tell split to use this seed when randomizing. 
+ >>> # This is needed because sharding will be done later >>> ds.config.set_seed(58) >>> train, test = data.split([0.9, 0.1]) >>> - >>> # if we want to shard the train dataset, we can use a DistributedSampler + >>> # To shard the train dataset, use a DistributedSampler >>> train_sampler = ds.DistributedSampler(10, 2) >>> train.use_sampler(train_sampler) """ @@ -1990,7 +2010,7 @@ class _PythonCallable: class MapDataset(DatasetOp): """ - The result of applying Map operator to the input Dataset. + The result of applying the Map operator to the input Dataset. Args: input_dataset (Dataset): Input Dataset to be mapped. @@ -2756,14 +2776,19 @@ class ImageFolderDataset(MappableDataset): Examples: >>> import mindspore.dataset as ds - >>> # path to imagefolder directory. This directory needs to contain sub-directories which contain the images + >>> + >>> # Set path to the imagefolder directory. + >>> # This directory needs to contain sub-directories which contain the images >>> dataset_dir = "/path/to/imagefolder_directory" - >>> # 1) read all samples (image files) in dataset_dir with 8 threads + >>> + >>> # 1) Read all samples (image files) in dataset_dir with 8 threads >>> imagefolder_dataset = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8) - >>> # 2) read all samples (image files) from folder cat and folder dog with label 0 and 1 - >>> imagefolder_dataset = ds.ImageFolderDataset(dataset_dir,class_indexing={"cat":0,"dog":1}) - >>> # 3) read all samples (image files) in dataset_dir with extensions .JPEG and .png (case sensitive) - >>> imagefolder_dataset = ds.ImageFolderDataset(dataset_dir, extensions=[".JPEG",".png"]) + >>> + >>> # 2) Read all samples (image files) from folder cat and folder dog with label 0 and 1 + >>> imagefolder_dataset = ds.ImageFolderDataset(dataset_dir, class_indexing={"cat":0, "dog":1}) + >>> + >>> # 3) Read all samples (image files) in dataset_dir with extensions .JPEG and .png (case sensitive) + >>> imagefolder_dataset = ds.ImageFolderDataset(dataset_dir, extensions=[".JPEG", ".png"]) """ @check_imagefolderdataset @@ -2912,10 +2937,11 @@ class MnistDataset(MappableDataset): Examples: >>> import mindspore.dataset as ds + >>> >>> dataset_dir = "/path/to/mnist_folder" - >>> # 1) read 3 samples from mnist_dataset + >>> # Read 3 samples from MNIST dataset >>> mnist_dataset = ds.MnistDataset(dataset_dir=dataset_dir, num_samples=3) - >>> # in mnist_dataset dataset, each dictionary has keys "image" and "label" + >>> # Note: In mnist_dataset dataset, each dictionary has keys "image" and "label" """ @check_mnist_cifar_dataset @@ -3418,35 +3444,39 @@ class GeneratorDataset(MappableDataset): Examples: >>> import mindspore.dataset as ds + >>> >>> # 1) Multidimensional generator function as callable input - >>> def generator_md(): + >>> def GeneratorMD(): >>> for i in range(64): >>> yield (np.array([[i, i + 1], [i + 2, i + 3]]),) - >>> # create multi_dimension_generator_dataset with GeneratorMD and column name "multi_dimensional_data" - >>> multi_dimension_generator_dataset = ds.GeneratorDataset(generator_md, ["multi_dimensional_data"]) + >>> # Create multi_dimension_generator_dataset with GeneratorMD and column name "multi_dimensional_data" + >>> multi_dimension_generator_dataset = ds.GeneratorDataset(GeneratorMD, ["multi_dimensional_data"]) + >>> >>> # 2) Multi-column generator function as callable input - >>> def generator_mc(maxid = 64): + >>> def GeneratorMC(maxid = 64): >>> for i in range(maxid): >>> yield (np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]])) - >>> 
# create multi_column_generator_dataset with GeneratorMC and column names "col1" and "col2" - >>> multi_column_generator_dataset = ds.GeneratorDataset(generator_mc, ["col1", "col2"]) + >>> # Create multi_column_generator_dataset with GeneratorMC and column names "col1" and "col2" + >>> multi_column_generator_dataset = ds.GeneratorDataset(GeneratorMC, ["col1", "col2"]) + >>> >>> # 3) Iterable dataset as iterable input >>> class MyIterable(): >>> def __iter__(self): >>> return # User implementation - >>> # create iterable_generator_dataset with MyIterable object + >>> # Create iterable_generator_dataset with MyIterable object >>> iterable_generator_dataset = ds.GeneratorDataset(MyIterable(), ["col1"]) - >>> # 4) Random accessible dataset as Random accessible input + >>> + >>> # 4) Random accessible dataset as random accessible input >>> class MyRA(): >>> def __getitem__(self, index): >>> return # User implementation - >>> # create ra_generator_dataset with MyRA object + >>> # Create ra_generator_dataset with MyRA object >>> ra_generator_dataset = ds.GeneratorDataset(MyRA(), ["col1"]) >>> # List/Dict/Tuple is also random accessible >>> list_generator = ds.GeneratorDataset([(np.array(0),), (np.array(1)), (np.array(2))], ["col1"]) + >>> >>> # 5) Built-in Sampler >>> my_generator = ds.GeneratorDataset(my_ds, ["img", "label"], sampler=samplers.RandomSampler()) - >>> """ @check_generatordataset @@ -3602,15 +3632,19 @@ class TFRecordDataset(SourceDataset): Examples: >>> import mindspore.dataset as ds >>> import mindspore.common.dtype as mstype + >>> >>> dataset_files = ["/path/to/1", "/path/to/2"] # contains 1 or multiple tf data files - >>> # 1) get all rows from dataset_files with no explicit schema: + >>> + >>> # 1) Get all rows from dataset_files with no explicit schema >>> # The meta-data in the first row will be used as a schema. 
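+ >>> # (This assumes all the listed files share the layout of that first row.)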
>>> tfdataset = ds.TFRecordDataset(dataset_files=dataset_files) - >>> # 2) get all rows from dataset_files with user-defined schema: + >>> + >>> # 2) Get all rows from dataset_files with user-defined schema >>> schema = ds.Schema() >>> schema.add_column('col_1d', de_type=mindspore.int64, shape=[2]) >>> tfdataset = ds.TFRecordDataset(dataset_files=dataset_files, schema=schema) - >>> # 3) get all rows from dataset_files with schema file "./schema.json": + >>> + >>> # 3) Get all rows from dataset_files with schema file "./schema.json" >>> tfdataset = ds.TFRecordDataset(dataset_files=dataset_files, schema="./schema.json") """ @@ -3773,10 +3807,14 @@ class ManifestDataset(MappableDataset): Examples: >>> import mindspore.dataset as ds + >>> >>> dataset_file = "/path/to/manifest_file.manifest" - >>> # 1) read all samples specified in manifest_file dataset with 8 threads for training: + >>> + >>> # 1) Read all samples specified in manifest_file dataset with 8 threads for training >>> manifest_dataset = ds.ManifestDataset(dataset_file, usage="train", num_parallel_workers=8) - >>> # 2) reads samples (specified in manifest_file.manifest) for shard 0 in a 2-way distributed training setup: + >>> + >>> # 2) Read samples (specified in manifest_file.manifest) for shard 0 + >>> # in a 2-way distributed training setup >>> manifest_dataset = ds.ManifestDataset(dataset_file, num_shards=2, shard_id=0) """ @@ -3951,14 +3989,19 @@ class Cifar10Dataset(MappableDataset): Examples: >>> import mindspore.dataset as ds + >>> >>> dataset_dir = "/path/to/cifar10_dataset_directory" - >>> # 1) get all samples from CIFAR10 dataset in sequence: - >>> dataset = ds.Cifar10Dataset(dataset_dir=dataset_dir,shuffle=False) - >>> # 2) randomly select 350 samples from CIFAR10 dataset: - >>> dataset = ds.Cifar10Dataset(dataset_dir=dataset_dir,num_samples=350, shuffle=True) - >>> # 3) get samples from CIFAR10 dataset for shard 0 in a 2 way distributed training: - >>> dataset = ds.Cifar10Dataset(dataset_dir=dataset_dir,num_shards=2,shard_id=0) - >>> # in CIFAR10 dataset, each dictionary has keys "image" and "label" + >>> + >>> # 1) Get all samples from CIFAR10 dataset in sequence + >>> dataset = ds.Cifar10Dataset(dataset_dir=dataset_dir, shuffle=False) + >>> + >>> # 2) Randomly select 350 samples from CIFAR10 dataset + >>> dataset = ds.Cifar10Dataset(dataset_dir=dataset_dir, num_samples=350, shuffle=True) + >>> + >>> # 3) Get samples from CIFAR10 dataset for shard 0 in a 2-way distributed training + >>> dataset = ds.Cifar10Dataset(dataset_dir=dataset_dir, num_shards=2, shard_id=0) + >>> + >>> # In CIFAR10 dataset, each dictionary has keys "image" and "label" """ @check_mnist_cifar_dataset @@ -4093,12 +4136,16 @@ class Cifar100Dataset(MappableDataset): Examples: >>> import mindspore.dataset as ds + >>> >>> dataset_dir = "/path/to/cifar100_dataset_directory" - >>> # 1) get all samples from CIFAR100 dataset in sequence: - >>> cifar100_dataset = ds.Cifar100Dataset(dataset_dir=dataset_dir,shuffle=False) - >>> # 2) randomly select 350 samples from CIFAR100 dataset: - >>> cifar100_dataset = ds.Cifar100Dataset(dataset_dir=dataset_dir,num_samples=350, shuffle=True) - >>> # in CIFAR100 dataset, each dictionary has 3 keys: "image", "fine_label" and "coarse_label" + >>> + >>> # 1) Get all samples from CIFAR100 dataset in sequence + >>> cifar100_dataset = ds.Cifar100Dataset(dataset_dir=dataset_dir, shuffle=False) + >>> + >>> # 2) Randomly select 350 samples from CIFAR100 dataset + >>> cifar100_dataset = ds.Cifar100Dataset(dataset_dir=dataset_dir, 
num_samples=350, shuffle=True) + >>> + >>> # In CIFAR100 dataset, each dictionary has 3 keys: "image", "fine_label" and "coarse_label" """ @check_mnist_cifar_dataset @@ -4265,7 +4312,8 @@ class Schema: Example: >>> import mindspore.dataset as ds >>> import mindspore.common.dtype as mstype - >>> # create schema, specify column name, mindspore.dtype and shape of the column + >>> + >>> # Create schema; specify column name, mindspore.dtype and shape of the column >>> schema = ds.Schema() >>> schema.add_column('col1', de_type=mindspore.int64, shape=[2]) """ @@ -4522,17 +4570,23 @@ class VOCDataset(MappableDataset): Examples: >>> import mindspore.dataset as ds + >>> >>> dataset_dir = "/path/to/voc_dataset_directory" - >>> # 1) read VOC data for segmenatation train + >>> + >>> # 1) Read VOC data for segmentation training >>> voc_dataset = ds.VOCDataset(dataset_dir, task="Segmentation", usage="train") - >>> # 2) read VOC data for detection train + >>> + >>> # 2) Read VOC data for detection training >>> voc_dataset = ds.VOCDataset(dataset_dir, task="Detection", usage="train") - >>> # 3) read all VOC dataset samples in dataset_dir with 8 threads in random order: + >>> + >>> # 3) Read all VOC dataset samples in dataset_dir with 8 threads in random order >>> voc_dataset = ds.VOCDataset(dataset_dir, task="Detection", usage="train", num_parallel_workers=8) - >>> # 4) read then decode all VOC dataset samples in dataset_dir in sequence: + >>> + >>> # 4) Read then decode all VOC dataset samples in dataset_dir in sequence >>> voc_dataset = ds.VOCDataset(dataset_dir, task="Detection", usage="train", decode=True, shuffle=False) - >>> # in VOC dataset, if task='Segmentation', each dictionary has keys "image" and "target" - >>> # in VOC dataset, if task='Detection', each dictionary has keys "image" and "annotation" + >>> + >>> # In VOC dataset, if task='Segmentation', each dictionary has keys "image" and "target" + >>> # In VOC dataset, if task='Detection', each dictionary has keys "image" and "annotation" """ @check_vocdataset @@ -4722,17 +4776,23 @@ class CocoDataset(MappableDataset): Examples: >>> import mindspore.dataset as ds + >>> >>> dataset_dir = "/path/to/coco_dataset_directory/image_folder" >>> annotation_file = "/path/to/coco_dataset_directory/annotation_folder/annotation.json" - >>> # 1) read COCO data for Detection task + >>> + >>> # 1) Read COCO data for Detection task >>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Detection') - >>> # 2) read COCO data for Stuff task + >>> + >>> # 2) Read COCO data for Stuff task >>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Stuff') - >>> # 3) read COCO data for Panoptic task + >>> + >>> # 3) Read COCO data for Panoptic task >>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Panoptic') - >>> # 4) read COCO data for Keypoint task + >>> + >>> # 4) Read COCO data for Keypoint task >>> coco_dataset = ds.CocoDataset(dataset_dir, annotation_file=annotation_file, task='Keypoint') - >>> # in COCO dataset, each dictionary has keys "image" and "annotation" + >>> + >>> # In COCO dataset, each dictionary has keys "image" and "annotation" """ @check_cocodataset @@ -4857,6 +4917,12 @@ class CelebADataset(MappableDataset): into (default=None). shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified.
+ + Examples: + >>> import mindspore.dataset as ds + >>> + >>> dataset_dir = "/path/to/celeba_directory" + >>> dataset = ds.CelebADataset(dataset_dir=dataset_dir, usage='train') """ @check_celebadataset @@ -4976,6 +5042,7 @@ class CLUEDataset(SourceDataset): Examples: >>> import mindspore.dataset as ds + >>> >>> dataset_files = ["/path/to/1", "/path/to/2"] # contains 1 or multiple text files >>> dataset = ds.CLUEDataset(dataset_files=dataset_files, task='AFQMC', usage='train') """ @@ -5162,7 +5229,7 @@ class CLUEDataset(SourceDataset): class CSVDataset(SourceDataset): """ - A source dataset that reads and parses CSV datasets. + A source dataset that reads and parses comma-separated values (CSV) datasets. Args: dataset_files (Union[str, list[str]]): String or list of files to be read or glob strings to search @@ -5192,6 +5259,7 @@ Examples: >>> import mindspore.dataset as ds + >>> >>> dataset_files = ["/path/to/1", "/path/to/2"] # contains 1 or multiple text files >>> dataset = ds.CSVDataset(dataset_files=dataset_files, column_names=['col1', 'col2', 'col3', 'col4']) """ @@ -5288,6 +5356,7 @@ class TextFileDataset(SourceDataset): argument can only be specified when num_shards is also specified. Examples: >>> import mindspore.dataset as ds + >>> >>> dataset_files = ["/path/to/1", "/path/to/2"] # contains 1 or multiple text files >>> dataset = ds.TextFileDataset(dataset_files=dataset_files) """ @@ -5455,10 +5524,10 @@ class NumpySlicesDataset(GeneratorDataset): Args: data (Union[list, tuple, dict]): Input of given data. Supported data types include: list, tuple, dict and other - NumPy formats. Input data will be sliced in first dimension and generate many rows. Large data is not - recommended to be loaded in this way as data is loading into memory. + NumPy formats. Input data will be sliced along the first dimension to generate additional rows. + Loading large amounts of data in this way is not recommended, since all of the data is loaded into memory. column_names (list[str], optional): List of column names of the dataset (default=None). If column_names is not - provided, when data is dict, column_names will be its key, otherwise it will be like column_1, column_2 ... + provided, when data is dict, column_names will be its keys, otherwise it will be like column_1, column_2 ... num_samples (int, optional): The number of samples to be included in the dataset (default=None, all images). num_parallel_workers (int, optional): Number of subprocesses used to fetch the dataset in parallel (default=1). shuffle (bool, optional): Whether or not to perform shuffle on the dataset. Random accessible input is required.
@@ -5472,16 +5541,20 @@ class NumpySlicesDataset(GeneratorDataset): Examples: >>> import mindspore.dataset as ds + >>> >>> # 1) Input data can be a list >>> data = [1, 2, 3] >>> dataset1 = ds.NumpySlicesDataset(data, column_names=["column_1"]) - >>> # 2) Input data can be a dict, and column_names will be its key + >>> + >>> # 2) Input data can be a dictionary, and column_names will be its keys >>> data = {"a": [1, 2], "b": [3, 4]} >>> dataset2 = ds.NumpySlicesDataset(data) + >>> >>> # 3) Input data can be a tuple of lists (or NumPy arrays), each tuple element refers to data in each column >>> data = ([1, 2], [3, 4], [5, 6]) >>> dataset3 = ds.NumpySlicesDataset(data, column_names=["column_1", "column_2", "column_3"]) - >>> # 4) Load data from csv file + >>> + >>> # 4) Load data from CSV file >>> import pandas as pd >>> df = pd.read_csv("file.csv") >>> dataset4 = ds.NumpySlicesDataset(dict(df), shuffle=False) diff --git a/mindspore/dataset/engine/samplers.py b/mindspore/dataset/engine/samplers.py index af29f3b665..c5039fb15c 100644 --- a/mindspore/dataset/engine/samplers.py +++ b/mindspore/dataset/engine/samplers.py @@ -223,7 +223,8 @@ class DistributedSampler(BuiltinSampler): shard_id (int): Shard ID of the current shard within num_shards. shuffle (bool, optional): If True, the indices are shuffled (default=True). num_samples (int, optional): The number of samples to draw (default=None, all elements). - offset(int, optional): Offset from shard when the element of dataset is allocated (default=-1). + offset(int, optional): The starting sample ID where access to elements in the dataset begins (default=-1). + Examples: >>> import mindspore.dataset as ds >>> diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py index e8709f90da..16886b7b1e 100644 --- a/mindspore/dataset/text/transforms.py +++ b/mindspore/dataset/text/transforms.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -The module text.transforms is inheritted from _c_dataengine +The module text.transforms is inherited from _c_dataengine and is implemented based on ICU4C and cppjieba in C++. It's a high performance module to process NLP text. Users can use Vocab to build their own dictionary, @@ -23,26 +23,26 @@ and use Lookup to find the index of tokens in Vocab. A constructor's arguments for every class in this module must be saved into the class attributes (self.xxx) to support save() and load(). 
- Examples: - >>> import mindspore.dataset as ds - >>> import mindspore.dataset.text as text - >>> - >>> dataset_file = "path/to/text_file_path" - >>> # sentences as line data saved in a file - >>> dataset = ds.TextFileDataset(dataset_file, shuffle=False) - >>> # tokenize sentence to unicode characters - >>> tokenizer = text.UnicodeCharTokenizer() - >>> # load vocabulary form list - >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您']) - >>> # lookup is an operation for mapping tokens to ids - >>> lookup = text.Lookup(vocab) - >>> dataset = dataset.map(operations=[tokenizer, lookup]) - >>> for i in dataset.create_dict_iterator(): - >>> print(i) - >>> # if text line in dataset_file is: - >>> # 深圳欢迎您 - >>> # then the output will be: - >>> # {'text': array([0, 1, 2, 3, 4], dtype=int32)} +Examples: + >>> import mindspore.dataset as ds + >>> import mindspore.dataset.text as text + >>> + >>> dataset_file = "path/to/text_file_path" + >>> # sentences as line data saved in a file + >>> dataset = ds.TextFileDataset(dataset_file, shuffle=False) + >>> # tokenize sentence to unicode characters + >>> tokenizer = text.UnicodeCharTokenizer() + >>> # load vocabulary from a list + >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您']) + >>> # lookup is an operation for mapping tokens to ids + >>> lookup = text.Lookup(vocab) + >>> dataset = dataset.map(operations=[tokenizer, lookup]) + >>> for i in dataset.create_dict_iterator(): + >>> print(i) + >>> # if text line in dataset_file is: + >>> # 深圳欢迎您 + >>> # then the output will be: + >>> # {'text': array([0, 1, 2, 3, 4], dtype=int32)} """ import os import re diff --git a/mindspore/dataset/transforms/c_transforms.py b/mindspore/dataset/transforms/c_transforms.py index 73752ff004..33bd9785f7 100644 --- a/mindspore/dataset/transforms/c_transforms.py +++ b/mindspore/dataset/transforms/c_transforms.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================== """ -This module c_transforms provides common operations, including OneHotOp and TypeCast. +The module transforms.c_transforms provides common operations, including OneHotOp and TypeCast. """ from enum import IntEnum import numpy as np diff --git a/mindspore/dataset/transforms/py_transforms.py b/mindspore/dataset/transforms/py_transforms.py index 0dc1445cdd..5aa3244dd6 100644 --- a/mindspore/dataset/transforms/py_transforms.py +++ b/mindspore/dataset/transforms/py_transforms.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """ -This module py_transforms is implemented basing on Python. It provides common +The module transforms.py_transforms is implemented based on Python. It provides common operations including OneHotOp.
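+
+Examples:
+    >>> # A minimal sketch of OneHotOp usage; num_classes=10 and the column
+    >>> # name "label" are illustrative choices, not fixed by the API.
+    >>> import mindspore.dataset.transforms.py_transforms as py_transforms
+    >>> one_hot = py_transforms.OneHotOp(num_classes=10, smoothing_rate=0.0)
+    >>> # Apply it to a dataset's label column through dataset.map():
+    >>> # dataset = dataset.map(operations=one_hot, input_columns="label")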
""" from .validators import check_one_hot_op, check_compose_list, check_random_apply, check_transforms_list, \ @@ -80,11 +79,11 @@ class Compose: >>> # create a dataset that reads all files in dataset_dir with 8 threads >>> dataset = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8) >>> # create a list of transformations to be applied to the image data - >>> transform = py_transform.Compose([py_vision.Decode(), - >>> py_vision.RandomHorizontalFlip(0.5), - >>> py_vision.ToTensor(), - >>> py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)), - >>> py_vision.RandomErasing()]) + >>> transform = py_transforms.Compose([py_vision.Decode(), + >>> py_vision.RandomHorizontalFlip(0.5), + >>> py_vision.ToTensor(), + >>> py_vision.Normalize((0.491, 0.482, 0.447), (0.247, 0.243, 0.262)), + >>> py_vision.RandomErasing()]) >>> # apply the transform to the dataset through dataset.map() >>> dataset = dataset.map(operations=transform, input_columns="image") """ diff --git a/mindspore/dataset/vision/c_transforms.py b/mindspore/dataset/vision/c_transforms.py index 12cd54ad2d..c92b609f67 100644 --- a/mindspore/dataset/vision/c_transforms.py +++ b/mindspore/dataset/vision/c_transforms.py @@ -22,26 +22,26 @@ to improve their training models. A constructor's arguments for every class in this module must be saved into the class attributes (self.xxx) to support save() and load(). - Examples: - >>> import mindspore.dataset as ds - >>> import mindspore.dataset.transforms.c_transforms as c_transforms - >>> import mindspore.dataset.vision.c_transforms as c_vision - >>> from mindspore.dataset.vision import Border, Inter - >>> - >>> dataset_dir = "path/to/imagefolder_directory" - >>> # create a dataset that reads all files in dataset_dir with 8 threads - >>> data1 = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8) - >>> # create a list of transformations to be applied to the image data - >>> transforms_list = [c_vision.Decode(), - >>> c_vision.Resize((256, 256), interpolation=Inter.LINEAR), - >>> c_vision.RandomCrop(200, padding_mode=Border.EDGE), - >>> c_vision.RandomRotation((0, 15)), - >>> c_vision.Normalize((100, 115.0, 121.0), (71.0, 68.0, 70.0)), - >>> c_vision.HWC2CHW()] - >>> onehot_op = c_transforms.OneHot(num_classes=10) - >>> # apply the transformation to the dataset through data1.map() - >>> data1 = data1.map(operations=transforms_list, input_columns="image") - >>> data1 = data1.map(operations=onehot_op, input_columns="label") +Examples: + >>> import mindspore.dataset as ds + >>> import mindspore.dataset.transforms.c_transforms as c_transforms + >>> import mindspore.dataset.vision.c_transforms as c_vision + >>> from mindspore.dataset.vision import Border, Inter + >>> + >>> dataset_dir = "path/to/imagefolder_directory" + >>> # create a dataset that reads all files in dataset_dir with 8 threads + >>> data1 = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8) + >>> # create a list of transformations to be applied to the image data + >>> transforms_list = [c_vision.Decode(), + >>> c_vision.Resize((256, 256), interpolation=Inter.LINEAR), + >>> c_vision.RandomCrop(200, padding_mode=Border.EDGE), + >>> c_vision.RandomRotation((0, 15)), + >>> c_vision.Normalize((100, 115.0, 121.0), (71.0, 68.0, 70.0)), + >>> c_vision.HWC2CHW()] + >>> onehot_op = c_transforms.OneHot(num_classes=10) + >>> # apply the transformation to the dataset through data1.map() + >>> data1 = data1.map(operations=transforms_list, input_columns="image") + >>> data1 = data1.map(operations=onehot_op, 
input_columns="label") """ import numbers import mindspore._c_dataengine as cde diff --git a/mindspore/dataset/vision/py_transforms.py b/mindspore/dataset/vision/py_transforms.py index 31185cc4dd..22eafa808f 100644 --- a/mindspore/dataset/vision/py_transforms.py +++ b/mindspore/dataset/vision/py_transforms.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """ The module vision.py_transforms is implemented based on Python PIL. This module provides many kinds of image augmentations. It also provides @@ -50,9 +49,9 @@ class ToTensor: Convert the input NumPy image array or PIL image of shape (H, W, C) to a NumPy ndarray of shape (C, H, W). Note: - The ranges of values in height and width dimension are converted from [0, 255] to [0.0, 1.0]. + The values in the input arrays are rescaled from [0, 255] to [0.0, 1.0]. The type is cast to output_type (default NumPy float32). - The range of channel dimension remains the same. + The number of channels remains the same. Args: output_type (NumPy datatype, optional): The datatype of the NumPy output (default=np.float32).
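+
+    Examples:
+        >>> # A minimal sketch of the conversion described above; the input
+        >>> # shape and fill value are illustrative.
+        >>> import numpy as np
+        >>> import mindspore.dataset.vision.py_transforms as py_vision
+        >>> img = (np.ones((32, 32, 3)) * 255).astype(np.uint8)  # (H, W, C)
+        >>> out = py_vision.ToTensor()(img)
+        >>> # The result has shape (3, 32, 32), dtype float32, values in [0.0, 1.0].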