
fix issues of MD (MindData):

1. fix the description of num_shards in datasets.py
2. add param validation for sampler
3. add a description for add_child in samplers.py
tags/v1.2.0-rc1
luoyang, 4 years ago
commit 415c8b08a5
4 changed files with 49 additions and 16 deletions
1. +4  -1   mindspore/dataset/core/validator_helpers.py
2. +28 -12  mindspore/dataset/engine/datasets.py
3. +15 -1   mindspore/dataset/engine/samplers.py
4. +2  -2   tests/ut/python/dataset/test_sampler.py

mindspore/dataset/core/validator_helpers.py (+4, -1)

@@ -288,14 +288,17 @@ def check_sampler_shuffle_shard_options(param_dict):
"""
shuffle, sampler = param_dict.get('shuffle'), param_dict.get('sampler')
num_shards, shard_id = param_dict.get('num_shards'), param_dict.get('shard_id')
num_samples = param_dict.get('num_samples')

type_check(sampler, (type(None), samplers.BuiltinSampler, samplers.Sampler), "sampler")

if sampler is not None:
if shuffle is not None:
raise RuntimeError("sampler and shuffle cannot be specified at the same time.")
if num_shards is not None:
if num_shards is not None or shard_id is not None:
raise RuntimeError("sampler and sharding cannot be specified at the same time.")
if num_samples is not None:
raise RuntimeError("sampler and num_samples cannot be specified at the same time.")

if num_shards is not None:
check_pos_int32(num_shards)
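For context, a minimal sketch of how the tightened checks surface at the Python API (cifar10_dataset_dir is only a placeholder path, mirroring the docstring example later in this commit):

>>> import mindspore.dataset as ds
>>> sampler = ds.SequentialSampler(num_samples=5)
>>> # Passing an explicit sampler together with num_samples (or with num_shards/shard_id)
>>> # is now rejected by check_sampler_shuffle_shard_options:
>>> ds.Cifar10Dataset(cifar10_dataset_dir, sampler=sampler, num_samples=20)
RuntimeError: sampler and num_samples cannot be specified at the same time.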


mindspore/dataset/engine/datasets.py (+28, -12)

@@ -3045,7 +3045,8 @@ class ImageFolderDataset(MappableDataset):
unique index starting from 0).
decode (bool, optional): Decode the images after reading (default=False).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -3194,7 +3195,8 @@ class MnistDataset(MappableDataset):
sampler (Sampler, optional): Object used to choose samples from the
dataset (default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -3277,6 +3279,7 @@ class MindDataset(MappableDataset):
shuffle (bool, optional): Whether or not to perform shuffle on the dataset
(default=None, performs shuffle).
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+When this argument is specified, 'num_samples' reflects the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
sampler (Sampler, optional): Object used to choose samples from the
@@ -3742,7 +3745,8 @@ class GeneratorDataset(MappableDataset):
sampler (Union[Sampler, Iterable], optional): Object used to choose samples from the dataset. Random accessible
input is required (default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
-When this argument is specified, 'num_samples' will not used. Random accessible input is required.
+Random accessible input is required. When this argument is specified, 'num_samples' reflects the max sample
+number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This argument must be specified only
when num_shards is also specified. Random accessible input is required.
python_multiprocessing (bool, optional): Parallelize Python operations with multiple worker process. This
@@ -3922,7 +3926,8 @@ class TFRecordDataset(SourceDataset):
- Shuffle.FILES: Shuffle files only.

num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
shard_equal_rows (bool, optional): Get equal rows for all shards(default=False). If shard_equal_rows
@@ -4115,7 +4120,8 @@ class ManifestDataset(MappableDataset):
class will be given a unique index starting from 0).
decode (bool, optional): decode the images after reading (default=False).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4279,7 +4285,8 @@ class Cifar10Dataset(MappableDataset):
sampler (Sampler, optional): Object used to choose samples from the
dataset (default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4420,7 +4427,8 @@ class Cifar100Dataset(MappableDataset):
sampler (Sampler, optional): Object used to choose samples from the
dataset (default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4510,7 +4518,8 @@ class RandomDataset(SourceDataset):
shuffle (bool, optional): Whether or not to perform shuffle on the dataset
(default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
"""
@@ -4766,7 +4775,8 @@ class VOCDataset(MappableDataset):
sampler (Sampler, optional): Object used to choose samples from the dataset
(default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4962,7 +4972,8 @@ class CocoDataset(MappableDataset):
sampler (Sampler, optional): Object used to choose samples from the dataset
(default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5115,7 +5126,8 @@ class CelebADataset(MappableDataset):
num_samples (int, optional): The number of images to be included in the dataset.
(default=None, all images).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5229,6 +5241,7 @@ class CLUEDataset(SourceDataset):
- Shuffle.FILES: Shuffle files only.

num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+When this argument is specified, 'num_samples' reflects the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5463,6 +5476,7 @@ class CSVDataset(SourceDataset):
- Shuffle.FILES: Shuffle files only.

num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+When this argument is specified, 'num_samples' reflects the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5575,6 +5589,7 @@ class TextFileDataset(SourceDataset):
- Shuffle.FILES: Shuffle files only.

num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+When this argument is specified, 'num_samples' reflects the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5779,7 +5794,8 @@ class NumpySlicesDataset(GeneratorDataset):
sampler (Union[Sampler, Iterable], optional): Object used to choose samples from the dataset. Random accessible
input is required (default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
-When this argument is specified, 'num_samples' will not used. Random accessible input is required.
+Random accessible input is required. When this argument is specified, 'num_samples' reflects the max
+sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This argument must be specified only
when num_shards is also specified. Random accessible input is required.
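In practice, the clarified wording means num_samples acts as a per-shard cap rather than a global one. A small sketch of the intended reading (image_folder_dir is a placeholder path):

>>> import mindspore.dataset as ds
>>> # Each of the 4 shards yields at most 100 samples, so up to 400 samples are read
>>> # across all shard_id values, not 100 in total.
>>> data = ds.ImageFolderDataset(image_folder_dir, num_samples=100, num_shards=4, shard_id=0)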



mindspore/dataset/engine/samplers.py (+15, -1)

@@ -137,6 +137,20 @@ class BuiltinSampler:
         pass
 
     def add_child(self, sampler):
+        """
+        Add a sub-sampler for given sampler. The sub-sampler will receive all data from the
+        output of parent sampler and apply its sample logic to return new samples.
+
+        Args:
+            sampler (Sampler): Object used to choose samples from the dataset. Only builtin
+                samplers(DistributedSampler, PKSampler, RandomSampler, SequentialSampler,
+                SubsetRandomSampler, WeightedRandomSampler) are supported.
+
+        Examples:
+            >>> sampler = ds.SequentialSampler(start_index=0, num_samples=3)
+            >>> sampler.add_child(ds.RandomSampler(num_samples=2))
+            >>> dataset = ds.Cifar10Dataset(cifar10_dataset_dir, sampler=sampler)
+        """
         self.child_sampler = sampler
 
     def get_child(self):
@@ -448,7 +462,7 @@ class SequentialSampler(BuiltinSampler):
     Samples the dataset elements sequentially, same as not having a sampler.
 
     Args:
-        start_index (int, optional): Index to start sampling at. (dafault=None, start at first ID)
+        start_index (int, optional): Index to start sampling at. (default=None, start at first ID)
         num_samples (int, optional): Number of elements to sample (default=None, all elements).
 
     Examples:
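To make the corrected SequentialSampler docstring concrete, a short sketch of start_index and num_samples together (image_folder_dir is again a placeholder path):

>>> import mindspore.dataset as ds
>>> # Reads rows 2, 3 and 4 of the dataset, in that order.
>>> sampler = ds.SequentialSampler(start_index=2, num_samples=3)
>>> data = ds.ImageFolderDataset(image_folder_dir, sampler=sampler)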


tests/ut/python/dataset/test_sampler.py (+2, -2)

@@ -232,9 +232,9 @@ def test_add_sampler_invalid_input():
assert "not an instance of a sampler" in str(info.value)

sampler = ds.SequentialSampler()
with pytest.raises(ValueError) as info:
with pytest.raises(RuntimeError) as info:
data2 = ds.ManifestDataset(manifest_file, sampler=sampler, num_samples=20)
assert "Conflicting arguments during sampler assignments" in str(info.value)
assert "sampler and num_samples cannot be specified at the same time" in str(info.value)


def test_distributed_sampler_invalid_offset():
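The updated expectation can be exercised in isolation with pytest (a sketch; it assumes the repository's unit-test data for ManifestDataset is available locally):

>>> import pytest
>>> pytest.main(["tests/ut/python/dataset/test_sampler.py", "-k", "test_add_sampler_invalid_input"])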

