Browse Source

add common import

tags/v1.6.0
ms_yan 4 years ago
parent
commit
bfd306a6f2
11 changed files with 124 additions and 32 deletions
  1. +8
    -5
      mindspore/ccsrc/minddata/dataset/kernels/image/random_crop_and_resize_op.cc
  2. +7
    -0
      mindspore/dataset/__init__.py
  3. +7
    -0
      mindspore/dataset/audio/__init__.py
  4. +10
    -4
      mindspore/dataset/callback/ds_callback.py
  5. +6
    -0
      mindspore/dataset/core/config.py
  6. +26
    -22
      mindspore/dataset/engine/datasets.py
  7. +1
    -1
      mindspore/dataset/engine/graphdata.py
  8. +7
    -0
      mindspore/dataset/text/__init__.py
  9. +35
    -0
      mindspore/dataset/text/utils.py
  10. +9
    -0
      mindspore/dataset/transforms/__init__.py
  11. +8
    -0
      mindspore/dataset/vision/__init__.py

+ 8
- 5
mindspore/ccsrc/minddata/dataset/kernels/image/random_crop_and_resize_op.cc View File

@@ -109,12 +109,15 @@ Status RandomCropAndResizeOp::GetCropBox(int h_in, int w_in, int *x, int *y, int
// Note rnd_aspect_ is already a random distribution of the input aspect ratio in logarithmic scale.
double const sample_aspect = exp(rnd_aspect_(rnd_));

CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() / h_in) > w_in,
"RandomCropAndResizeOp: multiplication out of bounds");
CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() / h_in / w_in) > sample_scale,
"RandomCropAndResizeOp: multiplication out of bounds");
CHECK_FAIL_RETURN_UNEXPECTED(
(std::numeric_limits<int32_t>::max() / h_in) > w_in,
"RandomCropAndResizeOp: multiplication out of bounds, check image width and image height first.");
CHECK_FAIL_RETURN_UNEXPECTED(
(std::numeric_limits<int32_t>::max() / h_in / w_in) > sample_scale,
"RandomCropAndResizeOp: multiplication out of bounds, check image width, image height and sample scale first.");
CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() / h_in / w_in / sample_scale) > sample_aspect,
"RandomCropAndResizeOp: multiplication out of bounds");
"RandomCropAndResizeOp: multiplication out of bounds, check image width, image "
"height, sample scale and sample aspect first.");
*crop_width = static_cast<int32_t>(std::round(std::sqrt(h_in * w_in * sample_scale * sample_aspect)));
*crop_height = static_cast<int32_t>(std::round(*crop_width / sample_aspect));



+ 7
- 0
mindspore/dataset/__init__.py View File

@@ -22,6 +22,13 @@ Besides, this module provides APIs to sample data while loading.
We can enable cache in most of the dataset with its key arguments 'cache'. Please notice that cache is not supported
on Windows platform yet. Do not use it while loading and processing data on Windows. More introductions and limitations
can refer `Single-Node Tensor Cache <https://www.mindspore.cn/docs/programming_guide/en/master/cache.html>`_.

Common imported modules in corresponding API examples are as follows:

.. code-block::

import mindspore.dataset as ds
from mindspore.dataset.transforms import c_transforms
"""

from .core import config


+ 7
- 0
mindspore/dataset/audio/__init__.py View File

@@ -17,6 +17,13 @@ This module is to support audio augmentations.
It includes two parts: transforms and utils.
transforms is a high performance processing module with common audio operations.
utils provides some general methods for audio processing.

Common imported modules in corresponding API examples are as follows:

.. code-block::

import mindspore.dataset as ds
from mindspore.dataset import audio
"""
from . import transforms
from . import utils

+ 10
- 4
mindspore/dataset/callback/ds_callback.py View File

@@ -30,12 +30,15 @@ class DSCallback:
step_size (int, optional): The number of steps between the step_begin and step_end are called (Default=1).

Examples:
>>> from mindspore.dataset import DSCallback
>>>
>>> class PrintInfo(DSCallback):
>>> def ds_epoch_end(self, ds_run_context):
>>> print(ds_run_context.cur_epoch_num)
>>> print(ds_run_context.cur_step_num)
>>>
>>> data = data.map(operations=op, callbacks=PrintInfo())
>>> # dataset is an instance of Dataset object
>>> dataset = dataset.map(operations=op, callbacks=PrintInfo())
"""

@check_callback
@@ -127,9 +130,12 @@ class WaitedDSCallback(Callback, DSCallback):
will be equal to the batch size (Default=1).

Examples:
>>> my_cb = MyWaitedCallback(32)
>>> data = data.map(operations=AugOp(), callbacks=my_cb)
>>> data = data.batch(32)
>>> from mindspore.dataset import WaitedDSCallback
>>>
>>> my_cb = WaitedDSCallback(32)
>>> # dataset is an instance of Dataset object
>>> dataset = dataset.map(operations=AugOp(), callbacks=my_cb)
>>> dataset = dataset.batch(32)
>>> # define the model
>>> model.train(epochs, data, callbacks=[my_cb])
"""


+ 6
- 0
mindspore/dataset/core/config.py View File

@@ -15,6 +15,12 @@
"""
The configuration module provides various functions to set and get the supported
configuration parameters, and read a configuration file.

Common imported modules in corresponding API examples are as follows:

.. code-block::

import mindspore.dataset as ds
"""
import os
import platform


+ 26
- 22
mindspore/dataset/engine/datasets.py View File

@@ -264,7 +264,7 @@ class Dataset:
def close_pool(self):
"""
Close multiprocessing pool in dataset. If you are familiar with multiprocessing library, you can regard this
as a deconstructor for a processingPool object.
as a destructor for a processingPool object.
"""
if hasattr(self, 'process_pool') and self.process_pool is not None:
self.process_pool.close()
@@ -587,7 +587,7 @@ class Dataset:
RuntimeError: If exist sync operators before shuffle.

Examples:
>>> # dataset is an instance of Dataset object.
>>> # dataset is an instance object of Dataset
>>> # Optionally set the seed for the first epoch
>>> ds.config.set_seed(58)
>>> # Create a shuffled dataset using a shuffle buffer of size 4
@@ -823,7 +823,7 @@ class Dataset:
RepeatDataset, dataset repeated.

Examples:
>>> # dataset is an instance of Dataset object.
>>> # dataset is an instance object of Dataset
>>>
>>> # Create a dataset where the dataset is repeated for 50 epochs
>>> dataset = dataset.repeat(50)
@@ -852,7 +852,7 @@ class Dataset:
SkipDataset, dataset that containing rows like origin rows subtract skipped rows.

Examples:
>>> # dataset is an instance of Dataset object.
>>> # dataset is an instance object of Dataset
>>> # Create a dataset which skips first 3 elements from data
>>> dataset = dataset.skip(3)
"""
@@ -876,7 +876,7 @@ class Dataset:
TakeDataset, dataset taken.

Examples:
>>> # dataset is an instance of Dataset object.
>>> # dataset is an instance object of Dataset
>>> # Create a dataset where the dataset includes 50 elements.
>>> dataset = dataset.take(50)
"""
@@ -1085,7 +1085,7 @@ class Dataset:
RenameDataset, dataset renamed.

Examples:
>>> # dataset is an instance of Dataset object.
>>> # dataset is an instance object of Dataset
>>> input_columns = ["input_col1", "input_col2", "input_col3"]
>>> output_columns = ["output_col1", "output_col2", "output_col3"]
>>>
@@ -1112,7 +1112,7 @@ class Dataset:
ProjectDataset, dataset projected.

Examples:
>>> # dataset is an instance of Dataset object
>>> # dataset is an instance object of Dataset
>>> columns_to_project = ["column3", "column1", "column2"]
>>>
>>> # Create a dataset that consists of column3, column1, column2
@@ -1135,28 +1135,30 @@ class Dataset:
freq_range(tuple[int]): A tuple of integers (min_frequency, max_frequency). Words within the frequency
range will be stored.
Naturally 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency
an be set to default, which corresponds to 0/total_words separately
can be set to default, which corresponds to 0/total_words respectively.
top_k(int): Number of words to be built into vocab. top_k most frequent words are
taken. The top_k is applied after freq_range. If fewer than top_k words remain, all words will be taken
special_tokens(list[str]): A list of strings, each one is a special token
special_tokens(list[str]): A list of strings, each one is a special token.
special_first(bool): Whether special_tokens will be prepended/appended to vocab. If special_tokens
is specified and special_first is set to default, special_tokens will be prepended
is specified and special_first is set to default, special_tokens will be prepended.

Returns:
Vocab, vocab built from the dataset.

Example:
>>> import numpy as np
>>>
>>> def gen_corpus():
... # key: word, value: number of occurrences, reason for using letters is so their order is apparent
... corpus = {"Z": 4, "Y": 4, "X": 4, "W": 3, "U": 3, "V": 2, "T": 1}
... for k, v in corpus.items():
... yield (np.array([k] * v, dtype='S'),)
>>> column_names = ["column1", "column2", "column3"]
>>> column_names = ["column1"]
>>> dataset = ds.GeneratorDataset(gen_corpus, column_names)
>>> dataset = dataset.build_vocab(columns=["column3", "column1", "column2"],
>>> dataset = dataset.build_vocab(columns=["column1"],
... freq_range=(1, 10), top_k=5,
... special_tokens=["<pad>", "<unk>"],
... special_first=True,vocab='vocab')
... special_first=True)

"""
vocab = cde.Vocab()
@@ -1213,6 +1215,7 @@ class Dataset:

Example:
>>> from mindspore.dataset.text import SentencePieceModel
>>>
>>> def gen_corpus():
... # key: word, value: number of occurrences, reason for using letters is so their order is apparent
... corpus = {"Z": 4, "Y": 4, "X": 4, "W": 3, "U": 3, "V": 2, "T": 1}
@@ -1223,8 +1226,8 @@ class Dataset:
>>> dataset = dataset.build_sentencepiece_vocab(columns=["column3", "column1", "column2"],
... vocab_size=5000,
... character_coverage=0.9995,
... model_type=SentencePieceModel.Unigram,
... params={},vocab='vocab')
... model_type=SentencePieceModel.UNIGRAM,
... params={})
"""
vocab = cde.SentencePieceVocab()

@@ -1253,13 +1256,13 @@ class Dataset:

Args:
apply_func (function): A function that must take one 'Dataset' as an argument and
return a preprogressing 'Dataset'.
return a preprocessed 'Dataset'.

Returns:
Dataset, dataset applied by the function.

Examples:
>>> # dataset is an instance of Dataset object
>>> # dataset is an instance object of Dataset
>>>
>>> # Declare an apply_func function which returns a Dataset object
>>> def apply_func(data):
@@ -1427,7 +1430,7 @@ class Dataset:
TupleIterator, tuple iterator over the dataset.

Examples:
>>> # dataset is an instance of Dataset object
>>> # dataset is an instance object of Dataset
>>> iterator = dataset.create_tuple_iterator()
>>> for item in iterator:
... # item is a list
@@ -1459,7 +1462,7 @@ class Dataset:
DictIterator, dictionary iterator over the dataset.

Examples:
>>> # dataset is an instance of Dataset object
>>> # dataset is an instance object of Dataset
>>> iterator = dataset.create_dict_iterator()
>>> for item in iterator:
... # item is a dict
@@ -1487,7 +1490,7 @@ class Dataset:
tuple, tuple of the input index information.

Examples:
>>> # dataset is an instance of Dataset object
>>> # dataset is an instance object of Dataset
>>> # set input_indexs
>>> dataset.input_indexs = 10
>>> print(dataset.input_indexs)
@@ -1939,6 +1942,7 @@ class MappableDataset(SourceDataset):
new_sampler (Sampler): The sampler to use for the current dataset.

Examples:
>>> # dataset is an instance object of Dataset
>>> # use a DistributedSampler instead
>>> new_sampler = ds.DistributedSampler(10, 2)
>>> dataset.use_sampler(new_sampler)
@@ -1987,8 +1991,8 @@ class MappableDataset(SourceDataset):
1. There is an optimized split function, which will be called automatically when the dataset
that calls this function is a MappableDataset.
2. Dataset should not be sharded if split is going to be called. Instead, create a
DistributedSampler and specify a split to shard after splitting. If dataset is
sharded after a split, it is strongly recommended to set the same seed in each instance
DistributedSampler and specify a split to shard after splitting. If the dataset is
sharded after a split, it is strongly recommended setting the same seed in each instance
of execution, otherwise each shard may not be part of the same split (see Examples).
3. It is strongly recommended to not shuffle the dataset, but use randomize=True instead.
Shuffling the dataset may not be deterministic, which means the data in each split


+ 1
- 1
mindspore/dataset/engine/graphdata.py View File

@@ -333,7 +333,7 @@ class GraphData:
next-hop sampling. A maximum of 6-hop are allowed.

The sampling result is tiled into a list in the format of [input node, 1-hop sampling result,
2-hop samling result ...]
2-hop sampling result ...]

Args:
node_list (Union[list, numpy.ndarray]): The given list of nodes.


+ 7
- 0
mindspore/dataset/text/__init__.py View File

@@ -16,6 +16,13 @@ This module is to support text processing for NLP. It includes two parts:
transforms and utils. transforms is a high performance
NLP text processing module which is developed with ICU4C and cppjieba.
utils provides some general methods for NLP text processing.

Common imported modules in corresponding API examples are as follows:

.. code-block::

import mindspore.dataset as ds
from mindspore.dataset import text
"""
import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \


+ 35
- 0
mindspore/dataset/text/utils.py View File

@@ -66,6 +66,13 @@ class Vocab(cde.Vocab):

Returns:
Vocab, vocab built from the dataset.

Examples:
>>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False)
>>> vocab = text.Vocab.from_dataset(dataset, "text", freq_range=None, top_k=None,
... special_tokens=["<pad>", "<unk>"],
... special_first=True)
>>> dataset = dataset.map(operations=text.Lookup(vocab, "<unk>"), input_columns=["text"])
"""
return dataset.build_vocab(columns, freq_range, top_k, special_tokens, special_first)

@@ -84,6 +91,9 @@ class Vocab(cde.Vocab):

Returns:
Vocab, vocab built from the `list`.

Examples:
>>> vocab = text.Vocab.from_list(["w1", "w2", "w3"], special_tokens=["<unk>"], special_first=True)
"""
if special_tokens is None:
special_tokens = []
@@ -108,6 +118,9 @@ class Vocab(cde.Vocab):

Returns:
Vocab, vocab built from the file.

Examples:
>>> vocab = text.Vocab.from_file("/path/to/wordpiece/vocab/file", ",", None, ["<pad>", "<unk>"], True)
"""
if vocab_size is None:
vocab_size = -1
@@ -127,6 +140,9 @@ class Vocab(cde.Vocab):

Returns:
Vocab, vocab built from the `dict`.

Examples:
>>> vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "<unk>": 6})
"""

return super().from_dict(word_dict)
@@ -165,6 +181,11 @@ class SentencePieceVocab(cde.SentencePieceVocab):

Returns:
SentencePieceVocab, vocab built from the dataset.

Examples:
>>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False)
>>> vocab = text.SentencePieceVocab.from_dataset(dataset, ["text"], 5000, 0.9995,
... SentencePieceModel.UNIGRAM, {})
"""

return dataset.build_sentencepiece_vocab(col_names, vocab_size, character_coverage,
@@ -203,6 +224,10 @@ class SentencePieceVocab(cde.SentencePieceVocab):

Returns:
SentencePieceVocab, vocab built from the file.

Examples:
>>> vocab = text.SentencePieceVocab.from_file(["/path/to/sentence/piece/vocab/file"], 5000, 0.9995,
... SentencePieceModel.UNIGRAM, {})
"""
return super().from_file(file_path, vocab_size, character_coverage,
DE_C_INTER_SENTENCEPIECE_MODE[model_type], params)
@@ -217,6 +242,11 @@ class SentencePieceVocab(cde.SentencePieceVocab):
vocab(SentencePieceVocab): A SentencePiece object.
path(str): Path to store model.
filename(str): The name of the file.

Examples:
>>> vocab = text.SentencePieceVocab.from_file(["/path/to/sentence/piece/vocab/file"], 5000, 0.9995,
... SentencePieceModel.UNIGRAM, {})
>>> text.SentencePieceVocab.save_model(vocab, "./", "m.model")
"""
super().save_model(vocab, path, filename)

@@ -231,6 +261,11 @@ def to_str(array, encoding='utf8'):

Returns:
numpy.ndarray, NumPy array of `str`.

Examples:
>>> dataset = ds.TextFileDataset("/path/to/text_file_dataset_file", shuffle=False)
>>> for item in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
...     print(text.to_str(item["text"]))
"""

if not isinstance(array, np.ndarray):


+ 9
- 0
mindspore/dataset/transforms/__init__.py View File

@@ -15,6 +15,15 @@
This module is to support common augmentations. C_transforms is a high performance
image augmentation module which is developed with C++ OpenCV. Py_transforms
provide more kinds of image augmentations which is developed with Python PIL.

Common imported modules in corresponding API examples are as follows:

.. code-block::

import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as c_vision
from mindspore.dataset.transforms import c_transforms
from mindspore.dataset.transforms import py_transforms
"""
from .. import vision
from . import c_transforms


+ 8
- 0
mindspore/dataset/vision/__init__.py View File

@@ -16,6 +16,14 @@ This module is to support vision augmentations. It includes two parts:
c_transforms and py_transforms. C_transforms is a high performance
image augmentation module which is developed with c++ opencv. Py_transforms
provide more kinds of image augmentations which is developed with Python PIL.

Common imported modules in corresponding API examples are as follows:

.. code-block::

import mindspore.dataset.vision.c_transforms as c_vision
import mindspore.dataset.vision.py_transforms as py_vision
from mindspore.dataset.transforms import c_transforms
"""
from . import c_transforms
from . import py_transforms


Loading…
Cancel
Save