ms: add examples for commonly used APIs

4 years ago · fe36c57961
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
@@ -513,8 +513,10 @@ Status DeviceQueueOp::RetryPushData(unsigned int handle, const std::vector<DataI
    BlockQueueStatus_T ret = GpuBufferMgr::GetInstance().Push(handle, items, WAIT_TIME);
    if (ret) {
      if (ret == BlockQueueStatus_T::ERROR_INPUT) {
        return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
                      "Invalid data, check the output of dataset with creating iterator and print data item.");
        return Status(
          StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
          "Invalid data, the types or shapes of current row is different with previous row(i.e. do batch operation but "
          "drop_reminder is False, or without resize image into the same size, these will cause shapes differs).");
      } else {
        if (!stop_send_) {
          if (!flag_log) {
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc
@@ -922,8 +922,8 @@ Status AdjustBrightness(const std::shared_ptr<Tensor> &input, std::shared_ptr<Te
    }
    CHECK_FAIL_RETURN_UNEXPECTED(
      input_cv->shape().Size() > CHANNEL_INDEX,
      "AdjustBrightness: image rank should not bigger than:" + std::to_string(CHANNEL_INDEX) + ", but got" +
        std::to_string(input_cv->shape().Size()));
      "AdjustBrightness: image rank should not bigger than:" + std::to_string(CHANNEL_INDEX) +
        ", but got: " + std::to_string(input_cv->shape().Size()));
    int num_channels = input_cv->shape()[CHANNEL_INDEX];
    // Rank of the image represents how many dimensions, image is expected to be HWC
    if (input_cv->Rank() != DEFAULT_IMAGE_RANK || num_channels != DEFAULT_IMAGE_CHANNELS) {
@@ -949,7 +949,7 @@ Status AdjustContrast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tens
    }
    CHECK_FAIL_RETURN_UNEXPECTED(input_cv->shape().Size() > CHANNEL_INDEX,
                                 "AdjustContrast: image rank should not bigger than:" + std::to_string(CHANNEL_INDEX) +
                                   ", but got" + std::to_string(input_cv->shape().Size()));
                                   ", but got: " + std::to_string(input_cv->shape().Size()));
    int num_channels = input_cv->shape()[CHANNEL_INDEX];
    if (input_cv->Rank() != DEFAULT_IMAGE_CHANNELS || num_channels != DEFAULT_IMAGE_CHANNELS) {
      RETURN_STATUS_UNEXPECTED("AdjustContrast: image shape is not <H,W,C> or channel is not 3, got image rank: " +
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/sharpness_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/sharpness_op.cc
@@ -35,7 +35,8 @@ Status SharpnessOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_pt
    }

    if (input_cv->Rank() == 1 || input_cv->mat().dims > 2) {
      RETURN_STATUS_UNEXPECTED("Sharpness: shape of input is not <H,W,C> or <H,W>, but got rank: " + input_cv->Rank());
      RETURN_STATUS_UNEXPECTED("Sharpness: shape of input is not <H,W,C> or <H,W>, but got rank: " +
                               std::to_string(input_cv->Rank()));
    }

    /// creating a smoothing filter. 1, 1, 1,
--- a/mindspore/dataset/callback/ds_callback.py
+++ b/mindspore/dataset/callback/ds_callback.py
@@ -33,9 +33,9 @@ class DSCallback:
        >>> from mindspore.dataset import DSCallback
        >>>
        >>> class PrintInfo(DSCallback):
        >>>     def ds_epoch_end(self, ds_run_context):
        >>>         print(cb_params.cur_epoch_num)
        >>>         print(cb_params.cur_step_num)
        ...     def ds_epoch_end(self, ds_run_context):
        ...         print(cb_params.cur_epoch_num)
        ...         print(cb_params.cur_step_num)
        >>>
        >>> # dataset is an instance of Dataset object
        >>> dataset = dataset.map(operations=op, callbacks=PrintInfo())
@@ -71,7 +71,7 @@ class DSCallback:

    def ds_step_begin(self, ds_run_context):
        """
        Called before n steps are started.
        Called before each step starts.

        Args:
            ds_run_context (RunContext): Include some information of the pipeline.
@@ -79,7 +79,7 @@ class DSCallback:

    def ds_step_end(self, ds_run_context):
        """
        Called after n steps are finished.
        Called after each step finishes.

        Args:
            ds_run_context (RunContext): Include some information of the pipeline.
@@ -89,7 +89,8 @@ class DSCallback:
        """
        Creates a runtime (C++) object from the callback methods defined by the user.

        Returns: _c_dataengine.PyDSCallback
        Returns:
            _c_dataengine.PyDSCallback.
        """
        c_cb = PyDSCallback(self.step_size)
        at_least_one = False
@@ -225,7 +226,8 @@ class WaitedDSCallback(Callback, DSCallback):
        """
        Creates a runtime (C++) object from the callback methods defined by the user. This method is internal.

        Returns: _c_dataengine.PyDSCallback
        Returns:
            _c_dataengine.PyDSCallback.
        """
        c_cb = PyDSCallback(self.step_size)
        at_least_one = False
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@@ -417,22 +417,21 @@ class Dataset:
            BucketBatchByLengthDataset, dataset bucketed and batched by length.

        Examples:
            >>> # Create a dataset where every 100 rows are combined into a batch
            >>> # Create a dataset where a certain number of rows are combined into a batch
            >>> # and drops the last incomplete batch if there is one.
            >>> import numpy as np
            >>> def generate_2_columns(n):
            ...     for i in range(n):
            ...         yield (np.array([i]), np.array([j for j in range(i + 1)]))
            >>>
            >>> column_names = ["col1", "col2"]
            >>> dataset = ds.GeneratorDataset(generate_2_columns(202), column_names)
            >>> dataset = ds.GeneratorDataset(generate_2_columns(8), column_names)
            >>> bucket_boundaries = [5, 10]
            >>> bucket_batch_sizes = [5, 1, 1]
            >>> bucket_batch_sizes = [2, 1, 1]
            >>> element_length_function = (lambda col1, col2: max(len(col1), len(col2)))
            >>> # Will pad col1 to shape [2, bucket_boundaries[i]] where i is the
            >>> # Will pad col2 to shape [bucket_boundaries[i]] where i is the
            >>> # index of the bucket that is currently being batched.
            >>> # Will pad col2 to a shape where each dimension is the longest in all
            >>> # the elements currently being batched.
            >>> pad_info = {"col1": ([2, None], -1)}
            >>> pad_info = {"col2": ([None], -1)}
            >>> pad_to_bucket_boundary = True
            >>> dataset = dataset.bucket_batch_by_length(column_names, bucket_boundaries,
            ...                                          bucket_batch_sizes,
@@ -1145,7 +1144,7 @@ class Dataset:
        Returns:
            Vocab, vocab built from the dataset.

        Example:
        Examples:
            >>> import numpy as np
            >>>
            >>> def gen_corpus():
@@ -1213,7 +1212,7 @@ class Dataset:
        Returns:
            SentencePieceVocab, vocab built from the dataset.

        Example:
        Examples:
            >>> from mindspore.dataset.text import SentencePieceModel
            >>>
            >>> def gen_corpus():
@@ -1549,6 +1548,10 @@ class Dataset:

        Returns:
            list, list of column names in the dataset.

        Examples:
            >>> # dataset is an instance object of Dataset
            >>> col_names = dataset.get_col_names()
        """
        if self._col_names is None:
            runtime_getter = self._init_tree_getters()
@@ -1563,6 +1566,10 @@ class Dataset:

        Returns:
            list, list of shapes of each column.

        Examples:
            >>> # dataset is an instance object of Dataset
            >>> output_shapes = dataset.output_shapes()
        """
        if self.saved_output_shapes is None:
            runtime_getter = self._init_tree_getters()
@@ -1580,6 +1587,10 @@ class Dataset:

        Returns:
            list, list of data types.

        Examples:
            >>> # dataset is an instance object of Dataset
            >>> output_types = dataset.output_types()
        """
        if self.saved_output_types is None:
            runtime_getter = self._init_tree_getters()
@@ -1597,6 +1608,10 @@ class Dataset:

        Returns:
            int, number of batches.

        Examples:
            >>> # dataset is an instance object of Dataset
            >>> dataset_size = dataset.get_dataset_size()
        """
        if self.dataset_size is None:
            runtime_getter = self.__init_size_getter()
@@ -1612,6 +1627,16 @@ class Dataset:
        Args:
            columns (dict): A dict contains shape information of each column in dataset.
                The value of shape[i] is :py:obj:`None` indicates that the data length of shape[i] is dynamic.

        Examples:
            >>> import numpy as np
            >>>
            >>> def generator1():
            ...     for i in range(1, 100):
            ...         yield np.ones((16, i, 83)), np.array(i)
            >>>
            >>> dataset = ds.GeneratorDataset(generator1, ["data1", "data2"])
            >>> dataset.set_dynamic_columns(columns={"data1": [16, None, 83], "data2": []})
        """
        if not isinstance(columns, dict):
            raise TypeError("Pass a dict to set dynamic shape, example: {\"data1\": [16, None, 256]}")
@@ -1624,6 +1649,17 @@ class Dataset:

        Returns:
            lists, min_shapes, max_shapes of source data.

        Examples:
            >>> import numpy as np
            >>>
            >>> def generator1():
            ...     for i in range(1, 100):
            ...         yield np.ones((16, i, 83)), np.array(i)
            >>>
            >>> dataset = ds.GeneratorDataset(generator1, ["data1", "data2"])
            >>> dataset.set_dynamic_columns(columns={"data1": [16, None, 83], "data2": []})
            >>> min_shapes, max_shapes = dataset.dynamic_min_max_shapes()
        """
        if self.saved_min_shapes is None or self.saved_max_shapes is None:
            self.saved_output_shapes, self.saved_min_shapes, self.saved_max_shapes = self._dynamic_output_shapes()
@@ -1712,6 +1748,10 @@ class Dataset:

        Returns:
            int, number of classes.

        Examples:
            >>> # dataset is an instance object of Dataset
            >>> num_classes = dataset.num_classes()
        """
        if self._num_classes is None:
            runtime_getter = self._init_tree_getters()
@@ -1771,6 +1811,10 @@ class Dataset:

        Returns:
            int, the number of data in a batch.

        Examples:
            >>> # dataset is an instance object of Dataset
            >>> batch_size = dataset.get_batch_size()
        """
        if self._batch_size is None:
            runtime_getter = self._init_tree_getters()
@@ -1785,6 +1829,10 @@ class Dataset:

        Returns:
            int, the count of repeat.

        Examples:
            >>> # dataset is an instance object of Dataset
            >>> repeat_count = dataset.get_repeat_count()
        """
        if self._repeat_count is None:
            runtime_getter = self._init_tree_getters()
@@ -1801,6 +1849,10 @@ class Dataset:
            dict, a str-to-int mapping from label name to index.
            dict, a str-to-list<int> mapping from label name to index for Coco ONLY. The second number
            in the list is used to indicate the super category.

        Examples:
            >>> # dataset is an instance object of Dataset
            >>> class_indexing = dataset.get_class_indexing()
        """
        if self.children:
            return self.children[0].get_class_indexing()
@@ -1927,7 +1979,18 @@ class MappableDataset(SourceDataset):
        self.sampler = samplers.select_sampler(num_samples, sampler, shuffle, num_shards, shard_id)

    def add_sampler(self, new_sampler):
        """ add a sampler """
        """
        Add a sampler for the current dataset.

        Args:
            new_sampler (Sampler): The sampler to be added as the parent sampler for the current dataset.

        Examples:
            >>> # dataset is an instance object of Dataset
            >>> # use a DistributedSampler instead
            >>> new_sampler = ds.DistributedSampler(10, 2)
            >>> dataset.add_sampler(new_sampler)
        """
        # note: By adding a sampler, the sampled IDs will flow to new_sampler
        # after first passing through the current samplers attached to this dataset.
        self.dataset_size = None
@@ -4523,6 +4586,12 @@ class ManifestDataset(MappableDataset):

        Returns:
            dict, a str-to-int mapping from label name to index.

        Examples:
            >>> manifest_dataset_dir = "/path/to/manifest_dataset_file"
            >>>
            >>> dataset = ds.ManifestDataset(dataset_file=manifest_dataset_dir)
            >>> class_indexing = dataset.get_class_indexing()
        """
        if self.class_indexing is None or not self.class_indexing:
            if self._class_indexing is None:
@@ -4843,7 +4912,7 @@ class Schema:
    Raises:
        RuntimeError: If schema file failed to load.

    Example:
    Examples:
        >>> from mindspore import dtype as mstype
        >>>
        >>> # Create schema; specify column name, mindspore.dtype and shape of the column
@@ -4896,7 +4965,7 @@ class Schema:
            RuntimeError: If column's name field is missing.
            RuntimeError: If column's type field is missing.

        Example:
        Examples:
            >>> schema = Schema()
            >>> columns1 = [{'name': 'image', 'type': 'int8', 'shape': [3, 3]},
            >>>             {'name': 'label', 'type': 'int8', 'shape': [1]}]
@@ -5220,6 +5289,12 @@ class VOCDataset(MappableDataset):

        Returns:
            dict, a str-to-int mapping from label name to index.

        Examples:
            >>> voc_dataset_dir = "/path/to/voc_dataset_directory"
            >>>
            >>> dataset = ds.VOCDataset(dataset_dir=voc_dataset_dir)
            >>> class_indexing = dataset.get_class_indexing()
        """
        if self.task != "Detection":
            raise NotImplementedError("Only 'Detection' support get_class_indexing.")
@@ -5423,7 +5498,18 @@ class CocoDataset(MappableDataset):
        Get the class index.

        Returns:
            dict, a str-to-list<int> mapping from label name to index
            dict, a str-to-list<int> mapping from label name to index.

        Examples:
            >>> coco_dataset_dir = "/path/to/coco_dataset_directory/images"
            >>> coco_annotation_file = "/path/to/coco_dataset_directory/annotation_file"
            >>>
            >>> # Read COCO data for Detection task
            >>> dataset = ds.CocoDataset(dataset_dir=coco_dataset_dir,
            ...                          annotation_file=coco_annotation_file,
            ...                          task='Detection')
            >>>
            >>> class_indexing = dataset.get_class_indexing()
        """
        if self.task not in {"Detection", "Panoptic"}:
            raise NotImplementedError("Only 'Detection' and 'Panoptic' support get_class_indexing.")
--- a/mindspore/dataset/engine/samplers.py
+++ b/mindspore/dataset/engine/samplers.py
@@ -121,7 +121,17 @@ class BuiltinSampler:
        self.child_sampler = sampler

    def get_child(self):
        """ Get the child sampler. """
        """
        Get the child sampler of the given sampler.

        Returns:
            Sampler, the child sampler of the given sampler.

        Examples:
            >>> sampler = ds.SequentialSampler(start_index=0, num_samples=3)
            >>> sampler.add_child(ds.RandomSampler(num_samples=2))
            >>> child_sampler = sampler.get_child()
        """
        return self.child_sampler

    def parse_child(self):
@@ -189,6 +199,10 @@ class BuiltinSampler:

        Returns:
            int, the number of samples, or None.

        Examples:
            >>> sampler = ds.SequentialSampler(start_index=0, num_samples=3)
            >>> num_samples = sampler.get_num_samples()
        """
        if self.child_sampler is not None:
            child_samples = self.child_sampler.get_num_samples()
@@ -306,7 +320,7 @@ class Sampler(BuiltinSampler):

 class DistributedSampler(BuiltinSampler):
    """
    A sampler that accesses a shard of the dataset.
    A sampler that accesses a shard of the dataset. It helps divide the dataset into multiple subsets for
    distributed training.

    Args:
        num_shards (int): Number of shards to divide the dataset into.
--- a/mindspore/dataset/text/utils.py
+++ b/mindspore/dataset/text/utils.py
@@ -120,7 +120,7 @@ class Vocab(cde.Vocab):
            Vocab, vocab built from the file.

        Examples:
            >>> vocab = text.Vocab.from_file("/path/to/wordpiece/vocab/file", ",", None, ["<pad>", "<unk>"], True)
            >>> vocab = text.Vocab.from_file("/path/to/simple/vocab/file", ",", None, ["<pad>", "<unk>"], True)
        """
        if vocab_size is None:
            vocab_size = -1
@@ -183,6 +183,7 @@ class SentencePieceVocab(cde.SentencePieceVocab):
            SentencePieceVocab, vocab built from the dataset.

        Examples:
            >>> from mindspore.dataset.text import SentencePieceModel
            >>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False)
            >>> vocab = text.SentencePieceVocab.from_dataset(dataset, ["text"], 5000, 0.9995,
            ...                                              SentencePieceModel.UNIGRAM, {})
@@ -226,6 +227,7 @@ class SentencePieceVocab(cde.SentencePieceVocab):
            SentencePieceVocab, vocab built from the file.

        Examples:
            >>> from mindspore.dataset.text import SentencePieceModel
            >>> vocab = text.SentencePieceVocab.from_file(["/path/to/sentence/piece/vocab/file"], 5000, 0.9995,
            ...                                           SentencePieceModel.UNIGRAM, {})
        """
@@ -244,6 +246,7 @@ class SentencePieceVocab(cde.SentencePieceVocab):
            filename(str): The name of the file.

        Examples:
            >>> from mindspore.dataset.text import SentencePieceModel
            >>> vocab = text.SentencePieceVocab.from_file(["/path/to/sentence/piece/vocab/file"], 5000, 0.9995,
            ...                                           SentencePieceModel.UNIGRAM, {})
            >>> text.SentencePieceVocab.save_model(vocab, "./", "m.model")
@@ -263,9 +266,10 @@ def to_str(array, encoding='utf8'):
        numpy.ndarray, NumPy array of `str`.

    Examples:
        >>> dataset = ds.TextFileDataset("/path/to/text_file_dataset_file", shuffle=False)
        >>> text_file_dataset_dir = ["/path/to/text_file_dataset_file"]
        >>> dataset = ds.TextFileDataset(dataset_files=text_file_dataset_dir, shuffle=False)
        >>> for item in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        >>>     print(text.to_str(item["text"]))
        ...     print(text.to_str(item["text"]))
    """

    if not isinstance(array, np.ndarray):