
fix issues of MD (MindData):

1. fix the description of num_shards in datasets.py
2. add param validation for sampler
3. add a description for add_child in samplers.py
tags/v1.2.0-rc1
luoyang, 4 years ago
commit 415c8b08a5
4 changed files with 49 additions and 16 deletions
1. +4  -1   mindspore/dataset/core/validator_helpers.py
2. +28 -12  mindspore/dataset/engine/datasets.py
3. +15 -1   mindspore/dataset/engine/samplers.py
4. +2  -2   tests/ut/python/dataset/test_sampler.py

mindspore/dataset/core/validator_helpers.py (+4, -1)

@@ -288,14 +288,17 @@ def check_sampler_shuffle_shard_options(param_dict):
"""
shuffle, sampler = param_dict.get('shuffle'), param_dict.get('sampler')
num_shards, shard_id = param_dict.get('num_shards'), param_dict.get('shard_id')
num_samples = param_dict.get('num_samples')

type_check(sampler, (type(None), samplers.BuiltinSampler, samplers.Sampler), "sampler")

if sampler is not None:
if shuffle is not None:
raise RuntimeError("sampler and shuffle cannot be specified at the same time.")
if num_shards is not None:
if num_shards is not None or shard_id is not None:
raise RuntimeError("sampler and sharding cannot be specified at the same time.")
if num_samples is not None:
raise RuntimeError("sampler and num_samples cannot be specified at the same time.")

if num_shards is not None:
check_pos_int32(num_shards)
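For context, a minimal sketch of how the tightened checks surface at the Python API (cifar10_dataset_dir is only a placeholder path, mirroring the docstring example later in this commit):

>>> import mindspore.dataset as ds
>>> sampler = ds.SequentialSampler(num_samples=5)
>>> # Passing an explicit sampler together with num_samples (or with num_shards/shard_id)
>>> # is now rejected by check_sampler_shuffle_shard_options:
>>> ds.Cifar10Dataset(cifar10_dataset_dir, sampler=sampler, num_samples=20)
RuntimeError: sampler and num_samples cannot be specified at the same time.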


mindspore/dataset/engine/datasets.py (+28, -12)

@@ -3045,7 +3045,8 @@ class ImageFolderDataset(MappableDataset):
unique index starting from 0).
decode (bool, optional): Decode the images after reading (default=False).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -3194,7 +3195,8 @@ class MnistDataset(MappableDataset):
sampler (Sampler, optional): Object used to choose samples from the
dataset (default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -3277,6 +3279,7 @@ class MindDataset(MappableDataset):
shuffle (bool, optional): Whether or not to perform shuffle on the dataset
(default=None, performs shuffle).
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+When this argument is specified, 'num_samples' reflects the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
sampler (Sampler, optional): Object used to choose samples from the
@@ -3742,7 +3745,8 @@ class GeneratorDataset(MappableDataset):
sampler (Union[Sampler, Iterable], optional): Object used to choose samples from the dataset. Random accessible
input is required (default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
-When this argument is specified, 'num_samples' will not used. Random accessible input is required.
+Random accessible input is required. When this argument is specified, 'num_samples' reflects the max sample
+number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This argument must be specified only
when num_shards is also specified. Random accessible input is required.
python_multiprocessing (bool, optional): Parallelize Python operations with multiple worker process. This
@@ -3922,7 +3926,8 @@ class TFRecordDataset(SourceDataset):
- Shuffle.FILES: Shuffle files only.

num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
shard_equal_rows (bool, optional): Get equal rows for all shards(default=False). If shard_equal_rows
@@ -4115,7 +4120,8 @@ class ManifestDataset(MappableDataset):
class will be given a unique index starting from 0).
decode (bool, optional): decode the images after reading (default=False).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4279,7 +4285,8 @@ class Cifar10Dataset(MappableDataset):
sampler (Sampler, optional): Object used to choose samples from the
dataset (default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4420,7 +4427,8 @@ class Cifar100Dataset(MappableDataset):
sampler (Sampler, optional): Object used to choose samples from the
dataset (default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4510,7 +4518,8 @@ class RandomDataset(SourceDataset):
shuffle (bool, optional): Whether or not to perform shuffle on the dataset
(default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
"""
@@ -4766,7 +4775,8 @@ class VOCDataset(MappableDataset):
sampler (Sampler, optional): Object used to choose samples from the dataset
(default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -4962,7 +4972,8 @@ class CocoDataset(MappableDataset):
sampler (Sampler, optional): Object used to choose samples from the dataset
(default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5115,7 +5126,8 @@ class CelebADataset(MappableDataset):
num_samples (int, optional): The number of images to be included in the dataset.
(default=None, all images).
num_shards (int, optional): Number of shards that the dataset will be divided
-into (default=None).
+into (default=None). When this argument is specified, 'num_samples' reflects
+the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5229,6 +5241,7 @@ class CLUEDataset(SourceDataset):
- Shuffle.FILES: Shuffle files only.

num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+When this argument is specified, 'num_samples' reflects the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5463,6 +5476,7 @@ class CSVDataset(SourceDataset):
- Shuffle.FILES: Shuffle files only.

num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+When this argument is specified, 'num_samples' reflects the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5575,6 +5589,7 @@ class TextFileDataset(SourceDataset):
- Shuffle.FILES: Shuffle files only.

num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+When this argument is specified, 'num_samples' reflects the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
@@ -5779,7 +5794,8 @@ class NumpySlicesDataset(GeneratorDataset):
sampler (Union[Sampler, Iterable], optional): Object used to choose samples from the dataset. Random accessible
input is required (default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
-When this argument is specified, 'num_samples' will not used. Random accessible input is required.
+Random accessible input is required. When this argument is specified, 'num_samples' reflects the max
+sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This argument must be specified only
when num_shards is also specified. Random accessible input is required.
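In practice, the clarified wording means num_samples acts as a per-shard cap rather than a global one. A small sketch of the intended reading (image_folder_dir is a placeholder path):

>>> import mindspore.dataset as ds
>>> # Each of the 4 shards yields at most 100 samples, so up to 400 samples are read
>>> # across all shard_id values, not 100 in total.
>>> data = ds.ImageFolderDataset(image_folder_dir, num_samples=100, num_shards=4, shard_id=0)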



mindspore/dataset/engine/samplers.py (+15, -1)

@@ -137,6 +137,20 @@ class BuiltinSampler:
         pass
 
     def add_child(self, sampler):
+        """
+        Add a sub-sampler for given sampler. The sub-sampler will receive all data from the
+        output of parent sampler and apply its sample logic to return new samples.
+
+        Args:
+            sampler (Sampler): Object used to choose samples from the dataset. Only builtin
+                samplers(DistributedSampler, PKSampler, RandomSampler, SequentialSampler,
+                SubsetRandomSampler, WeightedRandomSampler) are supported.
+
+        Examples:
+            >>> sampler = ds.SequentialSampler(start_index=0, num_samples=3)
+            >>> sampler.add_child(ds.RandomSampler(num_samples=2))
+            >>> dataset = ds.Cifar10Dataset(cifar10_dataset_dir, sampler=sampler)
+        """
         self.child_sampler = sampler
 
     def get_child(self):
@@ -448,7 +462,7 @@ class SequentialSampler(BuiltinSampler):
     Samples the dataset elements sequentially, same as not having a sampler.
 
     Args:
-        start_index (int, optional): Index to start sampling at. (dafault=None, start at first ID)
+        start_index (int, optional): Index to start sampling at. (default=None, start at first ID)
         num_samples (int, optional): Number of elements to sample (default=None, all elements).
 
     Examples:
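To make the corrected SequentialSampler docstring concrete, a short sketch of start_index and num_samples together (image_folder_dir is again a placeholder path):

>>> import mindspore.dataset as ds
>>> # Reads rows 2, 3 and 4 of the dataset, in that order.
>>> sampler = ds.SequentialSampler(start_index=2, num_samples=3)
>>> data = ds.ImageFolderDataset(image_folder_dir, sampler=sampler)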


tests/ut/python/dataset/test_sampler.py (+2, -2)

@@ -232,9 +232,9 @@ def test_add_sampler_invalid_input():
assert "not an instance of a sampler" in str(info.value)

sampler = ds.SequentialSampler()
with pytest.raises(ValueError) as info:
with pytest.raises(RuntimeError) as info:
data2 = ds.ManifestDataset(manifest_file, sampler=sampler, num_samples=20)
assert "Conflicting arguments during sampler assignments" in str(info.value)
assert "sampler and num_samples cannot be specified at the same time" in str(info.value)


def test_distributed_sampler_invalid_offset():
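The updated expectation can be exercised in isolation with pytest (a sketch; it assumes the repository's unit-test data for ManifestDataset is available locally):

>>> import pytest
>>> pytest.main(["tests/ut/python/dataset/test_sampler.py", "-k", "test_add_sampler_invalid_input"])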

