diff --git a/mindspore/dataset/core/validator_helpers.py b/mindspore/dataset/core/validator_helpers.py index 46a16d100e..6497bb6021 100644 --- a/mindspore/dataset/core/validator_helpers.py +++ b/mindspore/dataset/core/validator_helpers.py @@ -288,14 +288,17 @@ def check_sampler_shuffle_shard_options(param_dict): """ shuffle, sampler = param_dict.get('shuffle'), param_dict.get('sampler') num_shards, shard_id = param_dict.get('num_shards'), param_dict.get('shard_id') + num_samples = param_dict.get('num_samples') type_check(sampler, (type(None), samplers.BuiltinSampler, samplers.Sampler), "sampler") if sampler is not None: if shuffle is not None: raise RuntimeError("sampler and shuffle cannot be specified at the same time.") - if num_shards is not None: + if num_shards is not None or shard_id is not None: raise RuntimeError("sampler and sharding cannot be specified at the same time.") + if num_samples is not None: + raise RuntimeError("sampler and num_samples cannot be specified at the same time.") if num_shards is not None: check_pos_int32(num_shards) diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py index c48fced552..f0b1e6006b 100644 --- a/mindspore/dataset/engine/datasets.py +++ b/mindspore/dataset/engine/datasets.py @@ -3045,7 +3045,8 @@ class ImageFolderDataset(MappableDataset): unique index starting from 0). decode (bool, optional): Decode the images after reading (default=False). num_shards (int, optional): Number of shards that the dataset will be divided - into (default=None). + into (default=None). When this argument is specified, 'num_samples' reflects + the maximum number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified. cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. 
@@ -3194,7 +3195,8 @@ class MnistDataset(MappableDataset): sampler (Sampler, optional): Object used to choose samples from the dataset (default=None, expected order behavior shown in the table). num_shards (int, optional): Number of shards that the dataset will be divided - into (default=None). + into (default=None). When this argument is specified, 'num_samples' reflects + the maximum number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified. cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. @@ -3277,6 +3279,7 @@ class MindDataset(MappableDataset): shuffle (bool, optional): Whether or not to perform shuffle on the dataset (default=None, performs shuffle). num_shards (int, optional): Number of shards that the dataset will be divided into (default=None). + When this argument is specified, 'num_samples' reflects the maximum number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified. sampler (Sampler, optional): Object used to choose samples from the @@ -3742,7 +3745,8 @@ class GeneratorDataset(MappableDataset): sampler (Union[Sampler, Iterable], optional): Object used to choose samples from the dataset. Random accessible input is required (default=None, expected order behavior shown in the table). num_shards (int, optional): Number of shards that the dataset will be divided into (default=None). - When this argument is specified, 'num_samples' will not used. Random accessible input is required. + Random accessible input is required. When this argument is specified, 'num_samples' reflects the maximum + number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument must be specified only when num_shards is also specified. Random accessible input is required. 
python_multiprocessing (bool, optional): Parallelize Python operations with multiple worker process. This @@ -3922,7 +3926,8 @@ class TFRecordDataset(SourceDataset): - Shuffle.FILES: Shuffle files only. num_shards (int, optional): Number of shards that the dataset will be divided - into (default=None). + into (default=None). When this argument is specified, 'num_samples' reflects + the maximum number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified. shard_equal_rows (bool, optional): Get equal rows for all shards(default=False). If shard_equal_rows @@ -4115,7 +4120,8 @@ class ManifestDataset(MappableDataset): class will be given a unique index starting from 0). decode (bool, optional): decode the images after reading (default=False). num_shards (int, optional): Number of shards that the dataset will be divided - into (default=None). + into (default=None). When this argument is specified, 'num_samples' reflects + the maximum number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified. cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. @@ -4279,7 +4285,8 @@ class Cifar10Dataset(MappableDataset): sampler (Sampler, optional): Object used to choose samples from the dataset (default=None, expected order behavior shown in the table). num_shards (int, optional): Number of shards that the dataset will be divided - into (default=None). + into (default=None). When this argument is specified, 'num_samples' reflects + the maximum number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified. cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. 
@@ -4420,7 +4427,8 @@ class Cifar100Dataset(MappableDataset): sampler (Sampler, optional): Object used to choose samples from the dataset (default=None, expected order behavior shown in the table). num_shards (int, optional): Number of shards that the dataset will be divided - into (default=None). + into (default=None). When this argument is specified, 'num_samples' reflects + the maximum number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified. cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. @@ -4510,7 +4518,8 @@ class RandomDataset(SourceDataset): shuffle (bool, optional): Whether or not to perform shuffle on the dataset (default=None, expected order behavior shown in the table). num_shards (int, optional): Number of shards that the dataset will be divided - into (default=None). + into (default=None). When this argument is specified, 'num_samples' reflects + the maximum number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified. """ @@ -4766,7 +4775,8 @@ class VOCDataset(MappableDataset): sampler (Sampler, optional): Object used to choose samples from the dataset (default=None, expected order behavior shown in the table). num_shards (int, optional): Number of shards that the dataset will be divided - into (default=None). + into (default=None). When this argument is specified, 'num_samples' reflects + the maximum number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified. cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. 
@@ -4962,7 +4972,8 @@ class CocoDataset(MappableDataset): sampler (Sampler, optional): Object used to choose samples from the dataset (default=None, expected order behavior shown in the table). num_shards (int, optional): Number of shards that the dataset will be divided - into (default=None). + into (default=None). When this argument is specified, 'num_samples' reflects + the maximum number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified. cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. @@ -5115,7 +5126,8 @@ class CelebADataset(MappableDataset): num_samples (int, optional): The number of images to be included in the dataset. (default=None, all images). num_shards (int, optional): Number of shards that the dataset will be divided - into (default=None). + into (default=None). When this argument is specified, 'num_samples' reflects + the maximum number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified. cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. @@ -5229,6 +5241,7 @@ class CLUEDataset(SourceDataset): - Shuffle.FILES: Shuffle files only. num_shards (int, optional): Number of shards that the dataset will be divided into (default=None). + When this argument is specified, 'num_samples' reflects the maximum number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified. cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. @@ -5463,6 +5476,7 @@ class CSVDataset(SourceDataset): - Shuffle.FILES: Shuffle files only. num_shards (int, optional): Number of shards that the dataset will be divided into (default=None). 
+ When this argument is specified, 'num_samples' reflects the maximum number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified. cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. @@ -5575,6 +5589,7 @@ class TextFileDataset(SourceDataset): - Shuffle.FILES: Shuffle files only. num_shards (int, optional): Number of shards that the dataset will be divided into (default=None). + When this argument is specified, 'num_samples' reflects the maximum number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument can only be specified when num_shards is also specified. cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing. @@ -5779,7 +5794,8 @@ class NumpySlicesDataset(GeneratorDataset): sampler (Union[Sampler, Iterable], optional): Object used to choose samples from the dataset. Random accessible input is required (default=None, expected order behavior shown in the table). num_shards (int, optional): Number of shards that the dataset will be divided into (default=None). - When this argument is specified, 'num_samples' will not used. Random accessible input is required. + Random accessible input is required. When this argument is specified, 'num_samples' reflects the maximum + number of samples per shard. shard_id (int, optional): The shard ID within num_shards (default=None). This argument must be specified only when num_shards is also specified. Random accessible input is required. diff --git a/mindspore/dataset/engine/samplers.py b/mindspore/dataset/engine/samplers.py index 1fa866ae29..69672e24a5 100644 --- a/mindspore/dataset/engine/samplers.py +++ b/mindspore/dataset/engine/samplers.py @@ -137,6 +137,20 @@ class BuiltinSampler: pass def add_child(self, sampler): + """ + Add a sub-sampler for the given sampler. 
The sub-sampler will receive all data from the + output of the parent sampler and apply its sample logic to return new samples. + + Args: + sampler (Sampler): Object used to choose samples from the dataset. Only builtin + samplers (DistributedSampler, PKSampler, RandomSampler, SequentialSampler, + SubsetRandomSampler, WeightedRandomSampler) are supported. + + Examples: + >>> sampler = ds.SequentialSampler(start_index=0, num_samples=3) + >>> sampler.add_child(ds.RandomSampler(num_samples=2)) + >>> dataset = ds.Cifar10Dataset(cifar10_dataset_dir, sampler=sampler) + """ self.child_sampler = sampler def get_child(self): @@ -448,7 +462,7 @@ class SequentialSampler(BuiltinSampler): Samples the dataset elements sequentially, same as not having a sampler. Args: - start_index (int, optional): Index to start sampling at. (dafault=None, start at first ID) + start_index (int, optional): Index to start sampling at. (default=None, start at first ID) num_samples (int, optional): Number of elements to sample (default=None, all elements). Examples: diff --git a/tests/ut/python/dataset/test_sampler.py b/tests/ut/python/dataset/test_sampler.py index 80688bed5e..258c2d907a 100644 --- a/tests/ut/python/dataset/test_sampler.py +++ b/tests/ut/python/dataset/test_sampler.py @@ -232,9 +232,9 @@ def test_add_sampler_invalid_input(): assert "not an instance of a sampler" in str(info.value) sampler = ds.SequentialSampler() - with pytest.raises(ValueError) as info: + with pytest.raises(RuntimeError) as info: data2 = ds.ManifestDataset(manifest_file, sampler=sampler, num_samples=20) - assert "Conflicting arguments during sampler assignments" in str(info.value) + assert "sampler and num_samples cannot be specified at the same time" in str(info.value) def test_distributed_sampler_invalid_offset():