Browse Source

Add examples for commonly used dataset APIs

tags/v1.6.0
ms_yan 4 years ago
parent
commit
fe36c57961
7 changed files with 140 additions and 31 deletions
  1. +4
    -2
      mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
  2. +3
    -3
      mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc
  3. +2
    -1
      mindspore/ccsrc/minddata/dataset/kernels/image/sharpness_op.cc
  4. +9
    -7
      mindspore/dataset/callback/ds_callback.py
  5. +99
    -13
      mindspore/dataset/engine/datasets.py
  6. +16
    -2
      mindspore/dataset/engine/samplers.py
  7. +7
    -3
      mindspore/dataset/text/utils.py

+ 4
- 2
mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc View File

@@ -513,8 +513,10 @@ Status DeviceQueueOp::RetryPushData(unsigned int handle, const std::vector<DataI
BlockQueueStatus_T ret = GpuBufferMgr::GetInstance().Push(handle, items, WAIT_TIME);
if (ret) {
if (ret == BlockQueueStatus_T::ERROR_INPUT) {
return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
"Invalid data, check the output of dataset with creating iterator and print data item.");
return Status(
StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
"Invalid data, the types or shapes of current row is different with previous row(i.e. do batch operation but "
"drop_reminder is False, or without resize image into the same size, these will cause shapes differs).");
} else {
if (!stop_send_) {
if (!flag_log) {


+ 3
- 3
mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc View File

@@ -922,8 +922,8 @@ Status AdjustBrightness(const std::shared_ptr<Tensor> &input, std::shared_ptr<Te
}
CHECK_FAIL_RETURN_UNEXPECTED(
input_cv->shape().Size() > CHANNEL_INDEX,
"AdjustBrightness: image rank should not bigger than:" + std::to_string(CHANNEL_INDEX) + ", but got" +
std::to_string(input_cv->shape().Size()));
"AdjustBrightness: image rank should not bigger than:" + std::to_string(CHANNEL_INDEX) +
", but got: " + std::to_string(input_cv->shape().Size()));
int num_channels = input_cv->shape()[CHANNEL_INDEX];
// Rank of the image represents how many dimensions, image is expected to be HWC
if (input_cv->Rank() != DEFAULT_IMAGE_RANK || num_channels != DEFAULT_IMAGE_CHANNELS) {
@@ -949,7 +949,7 @@ Status AdjustContrast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tens
}
CHECK_FAIL_RETURN_UNEXPECTED(input_cv->shape().Size() > CHANNEL_INDEX,
"AdjustContrast: image rank should not bigger than:" + std::to_string(CHANNEL_INDEX) +
", but got" + std::to_string(input_cv->shape().Size()));
", but got: " + std::to_string(input_cv->shape().Size()));
int num_channels = input_cv->shape()[CHANNEL_INDEX];
if (input_cv->Rank() != DEFAULT_IMAGE_CHANNELS || num_channels != DEFAULT_IMAGE_CHANNELS) {
RETURN_STATUS_UNEXPECTED("AdjustContrast: image shape is not <H,W,C> or channel is not 3, got image rank: " +


+ 2
- 1
mindspore/ccsrc/minddata/dataset/kernels/image/sharpness_op.cc View File

@@ -35,7 +35,8 @@ Status SharpnessOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_pt
}

if (input_cv->Rank() == 1 || input_cv->mat().dims > 2) {
RETURN_STATUS_UNEXPECTED("Sharpness: shape of input is not <H,W,C> or <H,W>, but got rank: " + input_cv->Rank());
RETURN_STATUS_UNEXPECTED("Sharpness: shape of input is not <H,W,C> or <H,W>, but got rank: " +
std::to_string(input_cv->Rank()));
}

/// creating a smoothing filter. 1, 1, 1,


+ 9
- 7
mindspore/dataset/callback/ds_callback.py View File

@@ -33,9 +33,9 @@ class DSCallback:
>>> from mindspore.dataset import DSCallback
>>>
>>> class PrintInfo(DSCallback):
>>> def ds_epoch_end(self, ds_run_context):
>>> print(cb_params.cur_epoch_num)
>>> print(cb_params.cur_step_num)
... def ds_epoch_end(self, ds_run_context):
... print(cb_params.cur_epoch_num)
... print(cb_params.cur_step_num)
>>>
>>> # dataset is an instance of Dataset object
>>> dataset = dataset.map(operations=op, callbacks=PrintInfo())
@@ -71,7 +71,7 @@ class DSCallback:

def ds_step_begin(self, ds_run_context):
"""
Called before n steps are started.
Called before each step starts.

Args:
ds_run_context (RunContext): Include some information of the pipeline.
@@ -79,7 +79,7 @@ class DSCallback:

def ds_step_end(self, ds_run_context):
"""
Called after n steps are finished.
Called after each step has finished.

Args:
ds_run_context (RunContext): Include some information of the pipeline.
@@ -89,7 +89,8 @@ class DSCallback:
"""
Creates a runtime (C++) object from the callback methods defined by the user.

Returns: _c_dataengine.PyDSCallback
Returns:
_c_dataengine.PyDSCallback.
"""
c_cb = PyDSCallback(self.step_size)
at_least_one = False
@@ -225,7 +226,8 @@ class WaitedDSCallback(Callback, DSCallback):
"""
Creates a runtime (C++) object from the callback methods defined by the user. This method is internal.

Returns: _c_dataengine.PyDSCallback
Returns:
_c_dataengine.PyDSCallback.
"""
c_cb = PyDSCallback(self.step_size)
at_least_one = False


+ 99
- 13
mindspore/dataset/engine/datasets.py View File

@@ -417,22 +417,21 @@ class Dataset:
BucketBatchByLengthDataset, dataset bucketed and batched by length.

Examples:
>>> # Create a dataset where every 100 rows are combined into a batch
>>> # Create a dataset where certain counts rows are combined into a batch
>>> # and drops the last incomplete batch if there is one.
>>> import numpy as np
>>> def generate_2_columns(n):
... for i in range(n):
... yield (np.array([i]), np.array([j for j in range(i + 1)]))
>>>
>>> column_names = ["col1", "col2"]
>>> dataset = ds.GeneratorDataset(generate_2_columns(202), column_names)
>>> dataset = ds.GeneratorDataset(generate_2_columns(8), column_names)
>>> bucket_boundaries = [5, 10]
>>> bucket_batch_sizes = [5, 1, 1]
>>> bucket_batch_sizes = [2, 1, 1]
>>> element_length_function = (lambda col1, col2: max(len(col1), len(col2)))
>>> # Will pad col1 to shape [2, bucket_boundaries[i]] where i is the
>>> # Will pad col2 to shape [bucket_boundaries[i]] where i is the
>>> # index of the bucket that is currently being batched.
>>> # Will pad col2 to a shape where each dimension is the longest in all
>>> # the elements currently being batched.
>>> pad_info = {"col1": ([2, None], -1)}
>>> pad_info = {"col2": ([None], -1)}
>>> pad_to_bucket_boundary = True
>>> dataset = dataset.bucket_batch_by_length(column_names, bucket_boundaries,
... bucket_batch_sizes,
@@ -1145,7 +1144,7 @@ class Dataset:
Returns:
Vocab, vocab built from the dataset.

Example:
Examples:
>>> import numpy as np
>>>
>>> def gen_corpus():
@@ -1213,7 +1212,7 @@ class Dataset:
Returns:
SentencePieceVocab, vocab built from the dataset.

Example:
Examples:
>>> from mindspore.dataset.text import SentencePieceModel
>>>
>>> def gen_corpus():
@@ -1549,6 +1548,10 @@ class Dataset:

Returns:
list, list of column names in the dataset.

Examples:
>>> # dataset is an instance object of Dataset
>>> col_names = dataset.get_col_names()
"""
if self._col_names is None:
runtime_getter = self._init_tree_getters()
@@ -1563,6 +1566,10 @@ class Dataset:

Returns:
list, list of shapes of each column.

Examples:
>>> # dataset is an instance object of Dataset
>>> output_shapes = dataset.output_shapes()
"""
if self.saved_output_shapes is None:
runtime_getter = self._init_tree_getters()
@@ -1580,6 +1587,10 @@ class Dataset:

Returns:
list, list of data types.

Examples:
>>> # dataset is an instance object of Dataset
>>> output_types = dataset.output_types()
"""
if self.saved_output_types is None:
runtime_getter = self._init_tree_getters()
@@ -1597,6 +1608,10 @@ class Dataset:

Returns:
int, number of batches.

Examples:
>>> # dataset is an instance object of Dataset
>>> dataset_size = dataset.get_dataset_size()
"""
if self.dataset_size is None:
runtime_getter = self.__init_size_getter()
@@ -1612,6 +1627,16 @@ class Dataset:
Args:
columns (dict): A dict contains shape information of each column in dataset.
The value of shape[i] is :py:obj:`None` indicates that the data length of shape[i] is dynamic.

Examples:
>>> import numpy as np
>>>
>>> def generator1():
>>> for i in range(1, 100):
>>> yield np.ones((16, i, 83)), np.array(i)
>>>
>>> dataset = ds.GeneratorDataset(generator1, ["data1", "data2"])
>>> dataset.set_dynamic_columns(columns={"data1": [16, None, 83], "data2": []})
"""
if not isinstance(columns, dict):
raise TypeError("Pass a dict to set dynamic shape, example: {\"data1\": [16, None, 256]}")
@@ -1624,6 +1649,17 @@ class Dataset:

Returns:
lists, min_shapes, max_shapes of source data.

Examples:
>>> import numpy as np
>>>
>>> def generator1():
>>> for i in range(1, 100):
>>> yield np.ones((16, i, 83)), np.array(i)
>>>
>>> dataset = ds.GeneratorDataset(generator1, ["data1", "data2"])
>>> dataset.set_dynamic_columns(columns={"data1": [16, None, 83], "data2": []})
>>> min_shapes, max_shapes = dataset.dynamic_min_max_shapes()
"""
if self.saved_min_shapes is None or self.saved_max_shapes is None:
self.saved_output_shapes, self.saved_min_shapes, self.saved_max_shapes = self._dynamic_output_shapes()
@@ -1712,6 +1748,10 @@ class Dataset:

Returns:
int, number of classes.

Examples:
>>> # dataset is an instance object of Dataset
>>> num_classes = dataset.num_classes()
"""
if self._num_classes is None:
runtime_getter = self._init_tree_getters()
@@ -1771,6 +1811,10 @@ class Dataset:

Returns:
int, the number of data in a batch.

Examples:
>>> # dataset is an instance object of Dataset
>>> batch_size = dataset.get_batch_size()
"""
if self._batch_size is None:
runtime_getter = self._init_tree_getters()
@@ -1785,6 +1829,10 @@ class Dataset:

Returns:
int, the count of repeat.

Examples:
>>> # dataset is an instance object of Dataset
>>> repeat_count = dataset.get_repeat_count()
"""
if self._repeat_count is None:
runtime_getter = self._init_tree_getters()
@@ -1801,6 +1849,10 @@ class Dataset:
dict, a str-to-int mapping from label name to index.
dict, a str-to-list<int> mapping from label name to index for Coco ONLY. The second number
in the list is used to indicate the super category.

Examples:
>>> # dataset is an instance object of Dataset
>>> class_indexing = dataset.get_class_indexing()
"""
if self.children:
return self.children[0].get_class_indexing()
@@ -1927,7 +1979,18 @@ class MappableDataset(SourceDataset):
self.sampler = samplers.select_sampler(num_samples, sampler, shuffle, num_shards, shard_id)

def add_sampler(self, new_sampler):
""" add a sampler """
"""
Add a sampler for the current dataset.

Args:
new_sampler (Sampler): The sampler to be added as the parent sampler for current dataset.

Examples:
>>> # dataset is an instance object of Dataset
>>> # use a DistributedSampler instead
>>> new_sampler = ds.DistributedSampler(10, 2)
>>> dataset.add_sampler(new_sampler)
"""
# note: By adding a sampler, the sampled IDs will flow to new_sampler
# after first passing through the current samplers attached to this dataset.
self.dataset_size = None
@@ -4523,6 +4586,12 @@ class ManifestDataset(MappableDataset):

Returns:
dict, a str-to-int mapping from label name to index.

Examples:
>>> manifest_dataset_dir = "/path/to/manifest_dataset_file"
>>>
>>> dataset = ds.ManifestDataset(dataset_file=manifest_dataset_dir)
>>> class_indexing = dataset.get_class_indexing()
"""
if self.class_indexing is None or not self.class_indexing:
if self._class_indexing is None:
@@ -4843,7 +4912,7 @@ class Schema:
Raises:
RuntimeError: If schema file failed to load.

Example:
Examples:
>>> from mindspore import dtype as mstype
>>>
>>> # Create schema; specify column name, mindspore.dtype and shape of the column
@@ -4896,7 +4965,7 @@ class Schema:
RuntimeError: If column's name field is missing.
RuntimeError: If column's type field is missing.

Example:
Examples:
>>> schema = Schema()
>>> columns1 = [{'name': 'image', 'type': 'int8', 'shape': [3, 3]},
>>> {'name': 'label', 'type': 'int8', 'shape': [1]}]
@@ -5220,6 +5289,12 @@ class VOCDataset(MappableDataset):

Returns:
dict, a str-to-int mapping from label name to index.

Examples:
>>> voc_dataset_dir = "/path/to/voc_dataset_directory"
>>>
>>> dataset = ds.VOCDataset(dataset_dir=voc_dataset_dir)
>>> class_indexing = dataset.get_class_indexing()
"""
if self.task != "Detection":
raise NotImplementedError("Only 'Detection' support get_class_indexing.")
@@ -5423,7 +5498,18 @@ class CocoDataset(MappableDataset):
Get the class index.

Returns:
dict, a str-to-list<int> mapping from label name to index
dict, a str-to-list<int> mapping from label name to index.

Examples:
>>> coco_dataset_dir = "/path/to/coco_dataset_directory/images"
>>> coco_annotation_file = "/path/to/coco_dataset_directory/annotation_file"
>>>
>>> # Read COCO data for Detection task
>>> dataset = ds.CocoDataset(dataset_dir=coco_dataset_dir,
... annotation_file=coco_annotation_file,
... task='Detection')
>>>
>>> class_indexing = dataset.get_class_indexing()
"""
if self.task not in {"Detection", "Panoptic"}:
raise NotImplementedError("Only 'Detection' and 'Panoptic' support get_class_indexing.")


+ 16
- 2
mindspore/dataset/engine/samplers.py View File

@@ -121,7 +121,17 @@ class BuiltinSampler:
self.child_sampler = sampler

def get_child(self):
""" Get the child sampler. """
"""
Get the child sampler of given sampler.

Returns:
Sampler, The child sampler of given sampler.

Examples:
>>> sampler = ds.SequentialSampler(start_index=0, num_samples=3)
>>> sampler.add_child(ds.RandomSampler(num_samples=2))
>>> child_sampler = sampler.get_child()
"""
return self.child_sampler

def parse_child(self):
@@ -189,6 +199,10 @@ class BuiltinSampler:

Returns:
int, the number of samples, or None.

Examples:
>>> sampler = ds.SequentialSampler(start_index=0, num_samples=3)
>>> num_samplers = sampler.get_num_samples()
"""
if self.child_sampler is not None:
child_samples = self.child_sampler.get_num_samples()
@@ -306,7 +320,7 @@ class Sampler(BuiltinSampler):

class DistributedSampler(BuiltinSampler):
"""
A sampler that accesses a shard of the dataset.
A sampler that accesses a shard of the dataset, which helps divide the dataset into multiple subsets for distributed training.

Args:
num_shards (int): Number of shards to divide the dataset into.


+ 7
- 3
mindspore/dataset/text/utils.py View File

@@ -120,7 +120,7 @@ class Vocab(cde.Vocab):
Vocab, vocab built from the file.

Examples:
>>> vocab = text.Vocab.from_file("/path/to/wordpiece/vocab/file", ",", None, ["<pad>", "<unk>"], True)
>>> vocab = text.Vocab.from_file("/path/to/simple/vocab/file", ",", None, ["<pad>", "<unk>"], True)
"""
if vocab_size is None:
vocab_size = -1
@@ -183,6 +183,7 @@ class SentencePieceVocab(cde.SentencePieceVocab):
SentencePieceVocab, vocab built from the dataset.

Examples:
>>> from mindspore.dataset.text import SentencePieceModel
>>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False)
>>> vocab = text.SentencePieceVocab.from_dataset(dataset, ["text"], 5000, 0.9995,
... SentencePieceModel.UNIGRAM, {})
@@ -226,6 +227,7 @@ class SentencePieceVocab(cde.SentencePieceVocab):
SentencePieceVocab, vocab built from the file.

Examples:
>>> from mindspore.dataset.text import SentencePieceModel
>>> vocab = text.SentencePieceVocab.from_file(["/path/to/sentence/piece/vocab/file"], 5000, 0.9995,
... SentencePieceModel.UNIGRAM, {})
"""
@@ -244,6 +246,7 @@ class SentencePieceVocab(cde.SentencePieceVocab):
filename(str): The name of the file.

Examples:
>>> from mindspore.dataset.text import SentencePieceModel
>>> vocab = text.SentencePieceVocab.from_file(["/path/to/sentence/piece/vocab/file"], 5000, 0.9995,
... SentencePieceModel.UNIGRAM, {})
>>> text.SentencePieceVocab.save_model(vocab, "./", "m.model")
@@ -263,9 +266,10 @@ def to_str(array, encoding='utf8'):
numpy.ndarray, NumPy array of `str`.

Examples:
>>> dataset = ds.TextFileDataset("/path/to/text_file_dataset_file", shuffle=False)
>>> text_file_dataset_dir = ["/path/to/text_file_dataset_file"]
>>> dataset = ds.TextFileDataset(dataset_files=text_file_dataset_dir, shuffle=False)
>>> for item in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
>>> print(text.to_str(item["text"]))
... print(text.to_str(item["text"]))
"""

if not isinstance(array, np.ndarray):


Loading…
Cancel
Save