From: @tiancixiao
Reviewed-by: @pandoublefeng, @liucunwei
Signed-off-by: @pandoublefeng, @liucunwei
tags/v1.2.0-rc1
@@ -1732,10 +1732,7 @@ class MappableDataset(SourceDataset):
new_sampler (Sampler): The sampler to use for the current dataset.
Examples:
>>> # Note: A SequentialSampler is created by default
>>> dataset = ds.ImageFolderDataset(image_folder_dataset_dir)
- >>>
- >>> # Use a DistributedSampler instead of the SequentialSampler
+ >>> # use a DistributedSampler instead
>>> new_sampler = ds.DistributedSampler(10, 2)
>>> dataset.use_sampler(new_sampler)
"""
@@ -2888,15 +2885,15 @@ class MnistDataset(MappableDataset):
The generated dataset has two columns ['image', 'label'].
The type of the image tensor is uint8. The label is a scalar uint32 tensor.
- This dataset can take in a sampler. 'sampler' and 'shuffle' are mutually exclusive. The table
+ This dataset can take in a sampler. `sampler` and `shuffle` are mutually exclusive. The table
below shows what input arguments are allowed and their expected behavior.
.. list-table:: Expected Order Behavior of Using 'sampler' and 'shuffle'
:widths: 25 25 50
:header-rows: 1
- * - Parameter 'sampler'
-   - Parameter 'shuffle'
+ * - Parameter `sampler`
+   - Parameter `shuffle`
- Expected Order Behavior
* - None
- None
@@ -2937,19 +2934,19 @@ class MnistDataset(MappableDataset):
dataset_dir (str): Path to the root directory that contains the dataset.
usage (str, optional): Usage of this dataset, can be "train", "test" or "all". "train" will read from 60,000
train samples, "test" will read from 10,000 test samples, "all" will read from all 70,000 samples.
- (default=None, all samples)
+ (default=None, will read all samples)
num_samples (int, optional): The number of images to be included in the dataset
- (default=None, all images).
+ (default=None, will read all images).
num_parallel_workers (int, optional): Number of workers to read the data
- (default=None, set in the config).
+ (default=None, will use value set in the config).
shuffle (bool, optional): Whether or not to perform shuffle on the dataset
(default=None, expected order behavior shown in the table).
sampler (Sampler, optional): Object used to choose samples from the
dataset (default=None, expected order behavior shown in the table).
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
- When this argument is specified, 'num_samples' reflects the max sample number of per shard.
- shard_id (int, optional): The shard ID within num_shards (default=None). This
- argument can only be specified when num_shards is also specified.
+ When this argument is specified, `num_samples` reflects the maximum number of samples per shard.
+ shard_id (int, optional): The shard ID within `num_shards` (default=None). This
+ argument can only be specified when `num_shards` is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
(default=None, which means no cache is used).
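For reference, a minimal sketch of how the documented arguments interact, assuming a hypothetical ./mnist_data directory holding the extracted MNIST files:

    import mindspore.dataset as ds

    MNIST_DIR = "./mnist_data"  # hypothetical path

    # `sampler` and `shuffle` are mutually exclusive: pass one or the other.
    by_shuffle = ds.MnistDataset(MNIST_DIR, usage="train", shuffle=True)
    by_sampler = ds.MnistDataset(MNIST_DIR, usage="train",
                                 sampler=ds.RandomSampler(num_samples=1000))

    # With num_shards/shard_id, num_samples caps the samples of each shard.
    shard0 = ds.MnistDataset(MNIST_DIR, usage="train", num_samples=100,
                             num_shards=4, shard_id=0)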
@@ -3587,15 +3584,15 @@ class ManifestDataset(MappableDataset):
The shape of the image column is [image_size] if decode flag is False, or [H,W,C]
otherwise.
The type of the image tensor is uint8. The label is a scalar uint64 tensor.
- This dataset can take in a sampler. 'sampler' and 'shuffle' are mutually exclusive. The table
+ This dataset can take in a sampler. `sampler` and `shuffle` are mutually exclusive. The table
below shows what input arguments are allowed and their expected behavior.
- .. list-table:: Expected Order Behavior of Using 'sampler' and 'shuffle'
+ .. list-table:: Expected Order Behavior of Using `sampler` and `shuffle`
:widths: 25 25 50
:header-rows: 1
- * - Parameter 'sampler'
-   - Parameter 'shuffle'
+ * - Parameter `sampler`
+   - Parameter `shuffle`
- Expected Order Behavior
* - None
- None
@@ -3618,11 +3615,11 @@ class ManifestDataset(MappableDataset):
Args:
dataset_file (str): File to be read.
- usage (str, optional): acceptable usages include train, eval and inference (default="train").
+ usage (str, optional): Acceptable usages include "train", "eval" and "inference" (default="train").
num_samples (int, optional): The number of images to be included in the dataset.
- (default=None, all images).
+ (default=None, will include all images).
num_parallel_workers (int, optional): Number of workers to read the data
- (default=None, number set in the config).
+ (default=None, will use value set in the config).
shuffle (bool, optional): Whether to perform shuffle on the dataset (default=None, expected
order behavior shown in the table).
sampler (Sampler, optional): Object used to choose samples from the
@@ -3632,10 +3629,10 @@ class ManifestDataset(MappableDataset):
class will be given a unique index starting from 0).
decode (bool, optional): decode the images after reading (default=False).
num_shards (int, optional): Number of shards that the dataset will be divided
- into (default=None). When this argument is specified, 'num_samples' reflects
+ into (default=None). When this argument is specified, `num_samples` reflects
the maximum number of samples per shard.
- shard_id (int, optional): The shard ID within num_shards (default=None). This
- argument can only be specified when num_shards is also specified.
+ shard_id (int, optional): The shard ID within `num_shards` (default=None). This
+ argument can only be specified when `num_shards` is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
(default=None, which means no cache is used).
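A minimal usage sketch of the arguments touched above; the manifest path is hypothetical:

    import mindspore.dataset as ds

    dataset = ds.ManifestDataset("./data.manifest", usage="train", decode=True)
    # Columns are ['image', 'label'] as documented.
    for item in dataset.create_dict_iterator(output_numpy=True):
        print(item["image"].shape, item["label"])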
@@ -4195,7 +4192,8 @@ class CocoDataset(MappableDataset):
"""
A source dataset for reading and parsing COCO dataset.
- CocoDataset support four kinds of task: 2017 Train/Val/Test Detection, Keypoints, Stuff, Panoptic.
+ `CocoDataset` supports four kinds of tasks, which are Object Detection, Keypoint Detection, Stuff Segmentation and
+ Panoptic Segmentation of the 2017 Train/Val/Test dataset.
The generated dataset has multiple columns:
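As a hedged illustration of the four task kinds, a sketch with hypothetical COCO paths; `task` selects which annotations are parsed and therefore which output columns appear:

    import mindspore.dataset as ds

    COCO_DIR = "./coco/val2017"                               # hypothetical paths
    ANNOTATION = "./coco/annotations/instances_val2017.json"

    # task can be "Detection", "Keypoint", "Stuff" or "Panoptic";
    # "Detection" yields the columns ['image', 'bbox', 'category_id', 'iscrowd'].
    dataset = ds.CocoDataset(COCO_DIR, annotation_file=ANNOTATION,
                             task="Detection", decode=True)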
@@ -4339,11 +4337,12 @@ class CocoDataset(MappableDataset):
class CelebADataset(MappableDataset):
"""
- A source dataset for reading and parsing CelebA dataset. Currently supported: list_attr_celeba.txt only.
+ A source dataset for reading and parsing CelebA dataset. Currently it only supports reading
+ `list_attr_celeba.txt`, which contains the attribute annotations of the dataset.
Note:
The generated dataset has two columns ['image', 'attr'].
- The type of the image tensor is uint8. The attribute tensor is uint32 and one hot type.
+ The image tensor is of the uint8 type. The attribute tensor is of the uint32 type and one hot encoded.
Citation of CelebA dataset.
@@ -4376,20 +4375,20 @@ class CelebADataset(MappableDataset):
Args:
dataset_dir (str): Path to the root directory that contains the dataset.
- num_parallel_workers (int, optional): Number of workers to read the data (default=value set in the config).
+ num_parallel_workers (int, optional): Number of workers to read the data (default=None, will use value set in
+ the config).
shuffle (bool, optional): Whether to perform shuffle on the dataset (default=None).
- usage (str): one of 'all', 'train', 'valid' or 'test'.
+ usage (str): One of 'all', 'train', 'valid' or 'test' (default='all', will read all samples).
sampler (Sampler, optional): Object used to choose samples from the dataset (default=None).
decode (bool, optional): decode the images after reading (default=False).
- extensions (list[str], optional): List of file extensions to be
- included in the dataset (default=None).
- num_samples (int, optional): The number of images to be included in the dataset.
- (default=None, all images).
+ extensions (list[str], optional): List of file extensions to be included in the dataset (default=None).
+ num_samples (int, optional): The number of images to be included in the dataset
+ (default=None, will include all images).
num_shards (int, optional): Number of shards that the dataset will be divided
- into (default=None). When this argument is specified, 'num_samples' reflects
+ into (default=None). When this argument is specified, `num_samples` reflects
the maximum number of samples per shard.
- shard_id (int, optional): The shard ID within num_shards (default=None). This
- argument can only be specified when num_shards is also specified.
+ shard_id (int, optional): The shard ID within `num_shards` (default=None). This
+ argument can only be specified when `num_shards` is also specified.
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
(default=None, which means no cache is used).
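A brief sketch of reading the attribute annotations described above; the dataset directory is hypothetical and must contain the images plus list_attr_celeba.txt:

    import mindspore.dataset as ds

    celeba = ds.CelebADataset("./celeba", usage="train", decode=True)
    for item in celeba.create_dict_iterator(output_numpy=True):
        # 'attr' is a one-hot style uint32 tensor of the 40 face attributes
        print(item["image"].shape, item["attr"])
        break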
@@ -63,6 +63,7 @@ class TextTensorOperation(TensorOperation):
"""
Base class of Text Tensor Ops
"""
def __call__(self, input_tensor):
if not isinstance(input_tensor, list):
input_list = [input_tensor]
@@ -95,13 +96,11 @@ DE_C_INTER_JIEBA_MODE = {
JiebaMode.HMM: cde.JiebaMode.DE_JIEBA_HMM
}
DE_C_INTER_SENTENCEPIECE_LOADTYPE = {
SPieceTokenizerLoadType.FILE: cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KFILE,
SPieceTokenizerLoadType.MODEL: cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KMODEL
}
DE_C_INTER_SENTENCEPIECE_OUTTYPE = {
SPieceTokenizerOutType.STRING: cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KString,
SPieceTokenizerOutType.INT: cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KINT
@@ -282,7 +281,7 @@ class Lookup(TextTensorOperation):
vocab (Vocab): A vocabulary object.
unknown_token (str, optional): Word used for lookup if the word being looked up is out-of-vocabulary (OOV).
If unknown_token is OOV, a runtime error will be thrown (default=None).
- data_type (mindspore.dtype, optional): mindspore.dtype that lookup maps string to (default=mstype.int32)
+ data_type (mindspore.dtype, optional): mindspore.dtype that the lookup maps strings to (default=mindspore.int32)
Examples:
>>> # Load vocabulary from list
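A small sketch of Lookup with an in-memory vocabulary (the tokens here are illustrative):

    import mindspore.common.dtype as mstype
    import mindspore.dataset.text as text

    vocab = text.Vocab.from_list(["deep", "learning", "<unk>"])
    # OOV words map to unknown_token; output ids use the requested data_type.
    lookup = text.Lookup(vocab, unknown_token="<unk>", data_type=mstype.int32)
    # text_dataset = text_dataset.map(operations=lookup)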
@@ -309,18 +308,19 @@ class Ngram(TextTensorOperation):
Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an overview of what n-gram is and how it works.
Args:
- n (list[int]): n in n-gram, n >= 1. n is a list of positive integers. For example, if n=[4,3], then the result
+ n (list[int]): n in n-gram, which is a list of positive integers. For example, if n=[4, 3], then the result
would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
- for a n-gram, an empty string will be returned. For example, 3 grams on ["mindspore","best"] will result in
+ for an n-gram, an empty string will be returned. For example, 3 grams on ["mindspore", "best"] will result in
an empty string produced.
- left_pad (tuple, optional): ("pad_token", pad_width). Padding performed on left side of the sequence. pad_width
- will be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__" (default=None).
- right_pad (tuple, optional): ("pad_token", pad_width). Padding performed on right side of the sequence.
- pad_width will be capped at n-1. right_pad=("-":2) would pad right side of the sequence with "--"
- (default=None).
- separator (str, optional): symbol used to join strings together. For example. if 2-gram is
+ left_pad (tuple, optional): Padding performed on left side of the sequence shaped like ("pad_token", pad_width).
+ `pad_width` will be capped at n-1. For example, specifying left_pad=("_", 2) would pad left side of the
+ sequence with "__" (default=None).
+ right_pad (tuple, optional): Padding performed on right side of the sequence shaped like
+ ("pad_token", pad_width). `pad_width` will be capped at n-1. For example, specifying right_pad=("-", 2)
+ would pad right side of the sequence with "--" (default=None).
+ separator (str, optional): Symbol used to join strings together. For example, if the 2-gram is
["mindspore", "amazing"] with separator="-", the result would be ["mindspore-amazing"]
- (default=None, which means whitespace is used).
+ (default=None, which will use whitespace as separator).
Examples:
>>> text_file_dataset = text_file_dataset.map(operations=text.Ngram(3, separator=""))
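To make the padding semantics concrete, a sketch of a 2-gram with one pad token on each side (values illustrative):

    import mindspore.dataset.text as text

    # Pads the sequence to ["_", w1, ..., wn, "_"] before building 2-grams.
    ngram_op = text.Ngram([2], left_pad=("_", 1), right_pad=("_", 1), separator=" ")
    # On ["mindspore", "amazing"] this yields
    # ["_ mindspore", "mindspore amazing", "amazing _"].
    # text_file_dataset = text_file_dataset.map(operations=ngram_op)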
@@ -389,6 +389,7 @@ class SlidingWindow(TextTensorOperation):
>>> # | [3,4,5]] |
>>> # +--------------+
"""
@check_slidingwindow
def __init__(self, width, axis=0):
self.width = width
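A runnable variant of the docstring's example, assuming a NumpySlicesDataset as the input column:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    dataset = ds.NumpySlicesDataset([[1, 2, 3, 4, 5]], column_names=["col1"])
    # width=3 along axis 0: [1, 2, 3, 4, 5] -> [[1, 2, 3], [2, 3, 4], [3, 4, 5]]
    dataset = dataset.map(operations=text.SlidingWindow(3, 0))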
@@ -557,6 +558,7 @@ class PythonTokenizer:
tokens = self.tokenizer(in_array)
return tokens
if platform.system().lower() != 'windows':
DE_C_INTER_NORMALIZE_FORM = {
NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE,
@@ -575,12 +577,12 @@ if platform.system().lower() != 'windows':
BasicTokenizer is not supported on Windows platform yet.
Args:
- lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
+ lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 with `NFD` mode, RegexReplace operation
on input text to fold the text to lower case and strip accented characters. If False, only apply
- NormalizeUTF8('normalization_form' mode) operation on input text (default=False).
- keep_whitespace (bool, optional): If True, the whitespace will be kept in out tokens (default=False).
+ NormalizeUTF8 operation with the specified mode on input text (default=False).
+ keep_whitespace (bool, optional): If True, the whitespace will be kept in output tokens (default=False).
normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode. This is
- only effective when 'lower_case' is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
+ only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
preserve_unused_token (bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
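A short sketch of the two lower_case regimes described above (non-Windows only):

    import mindspore.dataset.text as text
    from mindspore.dataset.text import NormalizeForm

    # lower_case=True: CaseFold + NFD normalization + accent stripping.
    folding_tokenizer = text.BasicTokenizer(lower_case=True)

    # lower_case=False: only the requested normalization form is applied.
    plain_tokenizer = text.BasicTokenizer(lower_case=False,
                                          normalization_form=NormalizeForm.NFKC)
    # text_dataset = text_dataset.map(operations=folding_tokenizer)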
@@ -637,14 +639,14 @@ if platform.system().lower() != 'windows':
vocab (Vocab): A vocabulary object.
suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
- unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is empty string,
- return the token directly, else return 'unknown_token'(default='[UNK]').
- lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation
+ unknown_token (str, optional): When an unknown token is found, return the token directly if `unknown_token`
+ is an empty string, else return `unknown_token` instead (default='[UNK]').
+ lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 with `NFD` mode, RegexReplace operation
on input text to fold the text to lower case and strip accented characters. If False, only apply
- NormalizeUTF8('normalization_form' mode) operation on input text (default=False).
+ NormalizeUTF8 operation with the specified mode on input text (default=False).
keep_whitespace (bool, optional): If True, the whitespace will be kept in output tokens (default=False).
normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode,
- only effective when 'lower_case' is False. See NormalizeUTF8 for details (default='NONE').
+ only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
preserve_unused_token (bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
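To illustrate the unknown_token behavior, a small vocabulary sketch (tokens illustrative):

    import mindspore.dataset.text as text

    vocab = text.Vocab.from_list(["my", "favor", "##ite", "book", "[UNK]"])
    # Greedy longest-match-first subword split; unmatched pieces become '[UNK]'.
    tokenizer = text.WordpieceTokenizer(vocab, unknown_token="[UNK]")
    # "favorite" -> ["favor", "##ite"]; an OOV word -> ["[UNK]"]
    # text_dataset = text_dataset.map(operations=tokenizer)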
@@ -703,7 +705,8 @@ if platform.system().lower() != 'windows':
class CaseFold(TextTensorOperation):
"""
- Apply case fold operation on UTF-8 string tensor.
+ Apply case fold operation on UTF-8 string tensor, which is more aggressive than lowercasing and can convert
+ more characters into lower case.
Note:
CaseFold is not supported on Windows platform yet.
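One way to see what "aggressive" means here, as a hedged sketch: case folding is expected to handle characters that plain lowercasing leaves alone, e.g. the German sharp s:

    import mindspore.dataset.text as text

    case_op = text.CaseFold()
    # Full Unicode case folding, e.g. "Straße" should fold to "strasse",
    # whereas str.lower() would keep the "ß".
    # text_dataset = text_dataset.map(operations=case_op)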
@@ -59,23 +59,24 @@ class OneHot(cde.OneHotOp):
class Fill(cde.FillOp):
"""
- Tensor operation to create a tensor filled with input scalar value.
+ Tensor operation to fill all elements in the tensor with the specified value.
The output tensor will have the same shape and type as the input tensor.
Args:
fill_value (Union[str, bytes, int, float, bool]): scalar value
- to fill created tensor with.
+ to fill the tensor with.
Examples:
>>> import numpy as np
- >>> from mindspore.dataset import GeneratorDataset
- >>> # Generate 1d int numpy array from 0 - 63
+ >>> # generate a 1D integer numpy array from 0 to 4
>>> def generator_1d():
- >>>     for i in range(64):
+ ...     for i in range(5):
...         yield (np.array([i]),)
- >>> generator_dataset = GeneratorDataset(generator_1d,column_names='col')
+ >>> generator_dataset = ds.GeneratorDataset(generator_1d, column_names="col1")
+ >>> # [[0], [1], [2], [3], [4]]
>>> fill_op = c_transforms.Fill(3)
>>> generator_dataset = generator_dataset.map(operations=fill_op)
+ >>> # [[3], [3], [3], [3], [3]]
"""
@check_fill_value
@@ -351,6 +352,8 @@ class Unique(cde.UniqueOp):
>>> # +---------+-----------------+---------+
"""
class Compose():
"""
Compose a list of transforms into a single transform.
@@ -376,6 +379,7 @@ class Compose():
operations.append(op)
return cde.ComposeOperation(operations)
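A brief usage sketch for the Compose class shown above, chaining a decode with a resize (the transform choice is illustrative):

    import mindspore.dataset.transforms.c_transforms as c_transforms
    import mindspore.dataset.vision.c_transforms as c_vision

    # One fused transform instead of passing a list of separate operations.
    compose_op = c_transforms.Compose([c_vision.Decode(),
                                       c_vision.Resize((224, 224))])
    # image_folder_dataset = image_folder_dataset.map(operations=compose_op,
    #                                                 input_columns=["image"])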
class RandomApply():
"""
Randomly perform a series of transforms with a given probability.
@@ -62,6 +62,7 @@ class ImageTensorOperation(TensorOperation):
"""
Base class of Image Tensor Ops
"""
def __call__(self, input_tensor):
if not isinstance(input_tensor, list):
input_list = [input_tensor]
@@ -93,11 +94,9 @@ DE_C_BORDER_TYPE = {Border.CONSTANT: cde.BorderType.DE_BORDER_CONSTANT,
Border.REFLECT: cde.BorderType.DE_BORDER_REFLECT,
Border.SYMMETRIC: cde.BorderType.DE_BORDER_SYMMETRIC}
DE_C_IMAGE_BATCH_FORMAT = {ImageBatchFormat.NHWC: cde.ImageBatchFormat.DE_IMAGE_BATCH_FORMAT_NHWC,
ImageBatchFormat.NCHW: cde.ImageBatchFormat.DE_IMAGE_BATCH_FORMAT_NCHW}
DE_C_INTER_MODE = {Inter.NEAREST: cde.InterpolationMode.DE_INTER_NEAREST_NEIGHBOUR,
Inter.LINEAR: cde.InterpolationMode.DE_INTER_LINEAR,
Inter.CUBIC: cde.InterpolationMode.DE_INTER_CUBIC,
@@ -307,6 +306,7 @@ class Equalize(ImageTensorOperation):
>>> image_folder_dataset = image_folder_dataset.map(operations=transforms_list,
... input_columns=["image"])
"""
def parse(self):
return cde.EqualizeOperation()
@@ -337,6 +337,7 @@ class Invert(ImageTensorOperation):
>>> image_folder_dataset = image_folder_dataset.map(operations=transforms_list,
... input_columns=["image"])
"""
def parse(self):
return cde.InvertOperation()
@@ -729,7 +730,7 @@ class RandomCrop(ImageTensorOperation):
class RandomCropDecodeResize(ImageTensorOperation):
"""
- Equivalent to RandomResizedCrop, but crops before decodes.
+ A combination of `Crop`, `Decode` and `Resize`. It achieves better performance on JPEG images.
Args:
size (Union[int, sequence]): The size of the output image.
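Following the rewritten summary line above, a hedged usage sketch: the crop happens before the JPEG is decoded, so only the cropped region needs decompressing:

    import mindspore.dataset.vision.c_transforms as c_vision

    # Takes a raw (undecoded) image column; crop, then decode, then resize.
    op = c_vision.RandomCropDecodeResize(size=(224, 224))
    # image_folder_dataset = image_folder_dataset.map(operations=op,
    #                                                 input_columns=["image"])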
@@ -813,7 +814,7 @@ class RandomCropWithBBox(ImageTensorOperation):
Examples:
>>> decode_op = c_vision.Decode()
- >>> random_crop_with_bbox_op = c_vision.RandomCrop([512, 512], [200, 200, 200, 200])
+ >>> random_crop_with_bbox_op = c_vision.RandomCropWithBBox([512, 512], [200, 200, 200, 200])
>>> transforms_list = [decode_op, random_crop_with_bbox_op]
>>> image_folder_dataset = image_folder_dataset.map(operations=transforms_list,
... input_columns=["image"])