| @@ -109,12 +109,15 @@ Status RandomCropAndResizeOp::GetCropBox(int h_in, int w_in, int *x, int *y, int | |||
| // Note rnd_aspect_ is already a random distribution of the input aspect ratio in logarithmic sample_scale. | |||
| double const sample_aspect = exp(rnd_aspect_(rnd_)); | |||
| CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() / h_in) > w_in, | |||
| "RandomCropAndResizeOp: multiplication out of bounds"); | |||
| CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() / h_in / w_in) > sample_scale, | |||
| "RandomCropAndResizeOp: multiplication out of bounds"); | |||
| CHECK_FAIL_RETURN_UNEXPECTED( | |||
| (std::numeric_limits<int32_t>::max() / h_in) > w_in, | |||
| "RandomCropAndResizeOp: multiplication out of bounds, check image width and image height first."); | |||
| CHECK_FAIL_RETURN_UNEXPECTED( | |||
| (std::numeric_limits<int32_t>::max() / h_in / w_in) > sample_scale, | |||
| "RandomCropAndResizeOp: multiplication out of bounds, check image width, image height and sample scale first."); | |||
| CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() / h_in / w_in / sample_scale) > sample_aspect, | |||
| "RandomCropAndResizeOp: multiplication out of bounds"); | |||
| "RandomCropAndResizeOp: multiplication out of bounds, check image width, image " | |||
| "height, sample scale and sample aspect first."); | |||
| *crop_width = static_cast<int32_t>(std::round(std::sqrt(h_in * w_in * sample_scale * sample_aspect))); | |||
| *crop_height = static_cast<int32_t>(std::round(*crop_width / sample_aspect)); | |||
| @@ -22,6 +22,13 @@ Besides, this module provides APIs to sample data while loading. | |||
| We can enable cache in most of the dataset with its key arguments 'cache'. Please notice that cache is not supported | |||
| on Windows platform yet. Do not use it while loading and processing data on Windows. For more introductions and | |||
| limitations, please refer to `Single-Node Tensor Cache <https://www.mindspore.cn/docs/programming_guide/en/master/cache.html>`_. | |||
| Common imported modules in corresponding API examples are as follows: | |||
| .. code-block:: | |||
| import mindspore.dataset as ds | |||
| from mindspore.dataset.transforms import c_transforms | |||
| """ | |||
| from .core import config | |||
| @@ -17,6 +17,13 @@ This module is to support audio augmentations. | |||
| It includes two parts: transforms and utils. | |||
| transforms is a high performance processing module with common audio operations. | |||
| utils provides some general methods for audio processing. | |||
| Common imported modules in corresponding API examples are as follows: | |||
| .. code-block:: | |||
| import mindspore.dataset as ds | |||
| from mindspore.dataset import audio | |||
| """ | |||
| from . import transforms | |||
| from . import utils | |||
| @@ -30,12 +30,15 @@ class DSCallback: | |||
| step_size (int, optional): The number of steps between the step_begin and step_end are called (Default=1). | |||
| Examples: | |||
| >>> from mindspore.dataset import DSCallback | |||
| >>> | |||
| >>> class PrintInfo(DSCallback): | |||
| >>> def ds_epoch_end(self, ds_run_context): | |||
| >>> print(ds_run_context.cur_epoch_num) | |||
| >>> print(ds_run_context.cur_step_num) | |||
| >>> | |||
| >>> data = data.map(operations=op, callbacks=PrintInfo()) | |||
| >>> # dataset is an instance of Dataset object | |||
| >>> dataset = dataset.map(operations=op, callbacks=PrintInfo()) | |||
| """ | |||
| @check_callback | |||
| @@ -127,9 +130,12 @@ class WaitedDSCallback(Callback, DSCallback): | |||
| will be equal to the batch size (Default=1). | |||
| Examples: | |||
| >>> my_cb = MyWaitedCallback(32) | |||
| >>> data = data.map(operations=AugOp(), callbacks=my_cb) | |||
| >>> data = data.batch(32) | |||
| >>> from mindspore.dataset import WaitedDSCallback | |||
| >>> | |||
| >>> my_cb = WaitedDSCallback(32) | |||
| >>> # dataset is an instance of Dataset object | |||
| >>> dataset = dataset.map(operations=AugOp(), callbacks=my_cb) | |||
| >>> dataset = dataset.batch(32) | |||
| >>> # define the model | |||
| >>> model.train(epochs, data, callbacks=[my_cb]) | |||
| """ | |||
| @@ -15,6 +15,12 @@ | |||
| """ | |||
| The configuration module provides various functions to set and get the supported | |||
| configuration parameters, and read a configuration file. | |||
| Common imported modules in corresponding API examples are as follows: | |||
| .. code-block:: | |||
| import mindspore.dataset as ds | |||
| """ | |||
| import os | |||
| import platform | |||
| @@ -264,7 +264,7 @@ class Dataset: | |||
| def close_pool(self): | |||
| """ | |||
| Close multiprocessing pool in dataset. If you are familiar with multiprocessing library, you can regard this | |||
| as a deconstructor for a processingPool object. | |||
| as a destructor for a processingPool object. | |||
| """ | |||
| if hasattr(self, 'process_pool') and self.process_pool is not None: | |||
| self.process_pool.close() | |||
| @@ -587,7 +587,7 @@ class Dataset: | |||
| RuntimeError: If exist sync operators before shuffle. | |||
| Examples: | |||
| >>> # dataset is an instance of Dataset object. | |||
| >>> # dataset is an instance object of Dataset | |||
| >>> # Optionally set the seed for the first epoch | |||
| >>> ds.config.set_seed(58) | |||
| >>> # Create a shuffled dataset using a shuffle buffer of size 4 | |||
| @@ -823,7 +823,7 @@ class Dataset: | |||
| RepeatDataset, dataset repeated. | |||
| Examples: | |||
| >>> # dataset is an instance of Dataset object. | |||
| >>> # dataset is an instance object of Dataset | |||
| >>> | |||
| >>> # Create a dataset where the dataset is repeated for 50 epochs | |||
| >>> dataset = dataset.repeat(50) | |||
| @@ -852,7 +852,7 @@ class Dataset: | |||
| SkipDataset, dataset that containing rows like origin rows subtract skipped rows. | |||
| Examples: | |||
| >>> # dataset is an instance of Dataset object. | |||
| >>> # dataset is an instance object of Dataset | |||
| >>> # Create a dataset which skips first 3 elements from data | |||
| >>> dataset = dataset.skip(3) | |||
| """ | |||
| @@ -876,7 +876,7 @@ class Dataset: | |||
| TakeDataset, dataset taken. | |||
| Examples: | |||
| >>> # dataset is an instance of Dataset object. | |||
| >>> # dataset is an instance object of Dataset | |||
| >>> # Create a dataset where the dataset includes 50 elements. | |||
| >>> dataset = dataset.take(50) | |||
| """ | |||
| @@ -1085,7 +1085,7 @@ class Dataset: | |||
| RenameDataset, dataset renamed. | |||
| Examples: | |||
| >>> # dataset is an instance of Dataset object. | |||
| >>> # dataset is an instance object of Dataset | |||
| >>> input_columns = ["input_col1", "input_col2", "input_col3"] | |||
| >>> output_columns = ["output_col1", "output_col2", "output_col3"] | |||
| >>> | |||
| @@ -1112,7 +1112,7 @@ class Dataset: | |||
| ProjectDataset, dataset projected. | |||
| Examples: | |||
| >>> # dataset is an instance of Dataset object | |||
| >>> # dataset is an instance object of Dataset | |||
| >>> columns_to_project = ["column3", "column1", "column2"] | |||
| >>> | |||
| >>> # Create a dataset that consists of column3, column1, column2 | |||
| @@ -1135,28 +1135,30 @@ class Dataset: | |||
| freq_range(tuple[int]): A tuple of integers (min_frequency, max_frequency). Words within the frequency | |||
| range will be stored. | |||
| Naturally 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency | |||
| an be set to default, which corresponds to 0/total_words separately | |||
| can be set to default, which corresponds to 0/total_words separately. | |||
| top_k(int): Number of words to be built into vocab. top_k most frequent words are | |||
| taken. The top_k is taken after freq_range. If there are not enough top_k words, all words will be taken. | |||
| special_tokens(list[str]): A list of strings, each one is a special token | |||
| special_tokens(list[str]): A list of strings, each one is a special token. | |||
| special_first(bool): Whether special_tokens will be prepended/appended to vocab, If special_tokens | |||
| is specified and special_first is set to default, special_tokens will be prepended | |||
| is specified and special_first is set to default, special_tokens will be prepended. | |||
| Returns: | |||
| Vocab, vocab built from the dataset. | |||
| Example: | |||
| >>> import numpy as np | |||
| >>> | |||
| >>> def gen_corpus(): | |||
| ... # key: word, value: number of occurrences, reason for using letters is so their order is apparent | |||
| ... corpus = {"Z": 4, "Y": 4, "X": 4, "W": 3, "U": 3, "V": 2, "T": 1} | |||
| ... for k, v in corpus.items(): | |||
| ... yield (np.array([k] * v, dtype='S'),) | |||
| >>> column_names = ["column1", "column2", "column3"] | |||
| >>> column_names = ["column1"] | |||
| >>> dataset = ds.GeneratorDataset(gen_corpus, column_names) | |||
| >>> dataset = dataset.build_vocab(columns=["column3", "column1", "column2"], | |||
| >>> dataset = dataset.build_vocab(columns=["column1"], | |||
| ... freq_range=(1, 10), top_k=5, | |||
| ... special_tokens=["<pad>", "<unk>"], | |||
| ... special_first=True,vocab='vocab') | |||
| ... special_first=True) | |||
| """ | |||
| vocab = cde.Vocab() | |||
| @@ -1213,6 +1215,7 @@ class Dataset: | |||
| Example: | |||
| >>> from mindspore.dataset.text import SentencePieceModel | |||
| >>> | |||
| >>> def gen_corpus(): | |||
| ... # key: word, value: number of occurrences, reason for using letters is so their order is apparent | |||
| ... corpus = {"Z": 4, "Y": 4, "X": 4, "W": 3, "U": 3, "V": 2, "T": 1} | |||
| @@ -1223,8 +1226,8 @@ class Dataset: | |||
| >>> dataset = dataset.build_sentencepiece_vocab(columns=["column3", "column1", "column2"], | |||
| ... vocab_size=5000, | |||
| ... character_coverage=0.9995, | |||
| ... model_type=SentencePieceModel.Unigram, | |||
| ... params={},vocab='vocab') | |||
| ... model_type=SentencePieceModel.UNIGRAM, | |||
| ... params={}) | |||
| """ | |||
| vocab = cde.SentencePieceVocab() | |||
| @@ -1253,13 +1256,13 @@ class Dataset: | |||
| Args: | |||
| apply_func (function): A function that must take one 'Dataset' as an argument and | |||
| return a preprogressing 'Dataset'. | |||
| return a preprocessed 'Dataset'. | |||
| Returns: | |||
| Dataset, dataset applied by the function. | |||
| Examples: | |||
| >>> # dataset is an instance of Dataset object | |||
| >>> # dataset is an instance object of Dataset | |||
| >>> | |||
| >>> # Declare an apply_func function which returns a Dataset object | |||
| >>> def apply_func(data): | |||
| @@ -1427,7 +1430,7 @@ class Dataset: | |||
| TupleIterator, tuple iterator over the dataset. | |||
| Examples: | |||
| >>> # dataset is an instance of Dataset object | |||
| >>> # dataset is an instance object of Dataset | |||
| >>> iterator = dataset.create_tuple_iterator() | |||
| >>> for item in iterator: | |||
| ... # item is a list | |||
| @@ -1459,7 +1462,7 @@ class Dataset: | |||
| DictIterator, dictionary iterator over the dataset. | |||
| Examples: | |||
| >>> # dataset is an instance of Dataset object | |||
| >>> # dataset is an instance object of Dataset | |||
| >>> iterator = dataset.create_dict_iterator() | |||
| >>> for item in iterator: | |||
| ... # item is a dict | |||
| @@ -1487,7 +1490,7 @@ class Dataset: | |||
| tuple, tuple of the input index information. | |||
| Examples: | |||
| >>> # dataset is an instance of Dataset object | |||
| >>> # dataset is an instance object of Dataset | |||
| >>> # set input_indexs | |||
| >>> dataset.input_indexs = 10 | |||
| >>> print(dataset.input_indexs) | |||
| @@ -1939,6 +1942,7 @@ class MappableDataset(SourceDataset): | |||
| new_sampler (Sampler): The sampler to use for the current dataset. | |||
| Examples: | |||
| >>> # dataset is an instance object of Dataset | |||
| >>> # use a DistributedSampler instead | |||
| >>> new_sampler = ds.DistributedSampler(10, 2) | |||
| >>> dataset.use_sampler(new_sampler) | |||
| @@ -1987,8 +1991,8 @@ class MappableDataset(SourceDataset): | |||
| 1. There is an optimized split function, which will be called automatically when the dataset | |||
| that calls this function is a MappableDataset. | |||
| 2. Dataset should not be sharded if split is going to be called. Instead, create a | |||
| DistributedSampler and specify a split to shard after splitting. If dataset is | |||
| sharded after a split, it is strongly recommended to set the same seed in each instance | |||
| DistributedSampler and specify a split to shard after splitting. If the dataset is | |||
| sharded after a split, it is strongly recommended to set the same seed in each instance | |||
| of execution, otherwise each shard may not be part of the same split (see Examples). | |||
| 3. It is strongly recommended to not shuffle the dataset, but use randomize=True instead. | |||
| Shuffling the dataset may not be deterministic, which means the data in each split | |||
| @@ -333,7 +333,7 @@ class GraphData: | |||
| next-hop sampling. A maximum of 6-hop are allowed. | |||
| The sampling result is tiled into a list in the format of [input node, 1-hop sampling result, | |||
| 2-hop samling result ...] | |||
| 2-hop sampling result ...] | |||
| Args: | |||
| node_list (Union[list, numpy.ndarray]): The given list of nodes. | |||
| @@ -16,6 +16,13 @@ This module is to support text processing for NLP. It includes two parts: | |||
| transforms and utils. transforms is a high performance | |||
| NLP text processing module which is developed with ICU4C and cppjieba. | |||
| utils provides some general methods for NLP text processing. | |||
| Common imported modules in corresponding API examples are as follows: | |||
| .. code-block:: | |||
| import mindspore.dataset as ds | |||
| from mindspore.dataset import text | |||
| """ | |||
| import platform | |||
| from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \ | |||
| @@ -66,6 +66,13 @@ class Vocab(cde.Vocab): | |||
| Returns: | |||
| Vocab, vocab built from the dataset. | |||
| Examples: | |||
| >>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False) | |||
| >>> vocab = text.Vocab.from_dataset(dataset, "text", freq_range=None, top_k=None, | |||
| ... special_tokens=["<pad>", "<unk>"], | |||
| ... special_first=True) | |||
| >>> dataset = dataset.map(operations=text.Lookup(vocab, "<unk>"), input_columns=["text"]) | |||
| """ | |||
| return dataset.build_vocab(columns, freq_range, top_k, special_tokens, special_first) | |||
| @@ -84,6 +91,9 @@ class Vocab(cde.Vocab): | |||
| Returns: | |||
| Vocab, vocab built from the `list`. | |||
| Examples: | |||
| >>> vocab = text.Vocab.from_list(["w1", "w2", "w3"], special_tokens=["<unk>"], special_first=True) | |||
| """ | |||
| if special_tokens is None: | |||
| special_tokens = [] | |||
| @@ -108,6 +118,9 @@ class Vocab(cde.Vocab): | |||
| Returns: | |||
| Vocab, vocab built from the file. | |||
| Examples: | |||
| >>> vocab = text.Vocab.from_file("/path/to/wordpiece/vocab/file", ",", None, ["<pad>", "<unk>"], True) | |||
| """ | |||
| if vocab_size is None: | |||
| vocab_size = -1 | |||
| @@ -127,6 +140,9 @@ class Vocab(cde.Vocab): | |||
| Returns: | |||
| Vocab, vocab built from the `dict`. | |||
| Examples: | |||
| >>> vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "<unk>": 6}) | |||
| """ | |||
| return super().from_dict(word_dict) | |||
| @@ -165,6 +181,11 @@ class SentencePieceVocab(cde.SentencePieceVocab): | |||
| Returns: | |||
| SentencePieceVocab, vocab built from the dataset. | |||
| Examples: | |||
| >>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False) | |||
| >>> vocab = text.SentencePieceVocab.from_dataset(dataset, ["text"], 5000, 0.9995, | |||
| ... SentencePieceModel.UNIGRAM, {}) | |||
| """ | |||
| return dataset.build_sentencepiece_vocab(col_names, vocab_size, character_coverage, | |||
| @@ -203,6 +224,10 @@ class SentencePieceVocab(cde.SentencePieceVocab): | |||
| Returns: | |||
| SentencePieceVocab, vocab built from the file. | |||
| Examples: | |||
| >>> vocab = text.SentencePieceVocab.from_file(["/path/to/sentence/piece/vocab/file"], 5000, 0.9995, | |||
| ... SentencePieceModel.UNIGRAM, {}) | |||
| """ | |||
| return super().from_file(file_path, vocab_size, character_coverage, | |||
| DE_C_INTER_SENTENCEPIECE_MODE[model_type], params) | |||
| @@ -217,6 +242,11 @@ class SentencePieceVocab(cde.SentencePieceVocab): | |||
| vocab(SentencePieceVocab): A SentencePiece object. | |||
| path(str): Path to store model. | |||
| filename(str): The name of the file. | |||
| Examples: | |||
| >>> vocab = text.SentencePieceVocab.from_file(["/path/to/sentence/piece/vocab/file"], 5000, 0.9995, | |||
| ... SentencePieceModel.UNIGRAM, {}) | |||
| >>> text.SentencePieceVocab.save_model(vocab, "./", "m.model") | |||
| """ | |||
| super().save_model(vocab, path, filename) | |||
| @@ -231,6 +261,11 @@ def to_str(array, encoding='utf8'): | |||
| Returns: | |||
| numpy.ndarray, NumPy array of `str`. | |||
| Examples: | |||
| >>> dataset = ds.TextFileDataset("/path/to/text_file_dataset_file", shuffle=False) | |||
| >>> for item in dataset.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| ... print(text.to_str(item["text"])) | |||
| """ | |||
| if not isinstance(array, np.ndarray): | |||
| @@ -15,6 +15,15 @@ | |||
| This module is to support common augmentations. C_transforms is a high performance | |||
| image augmentation module which is developed with C++ OpenCV. Py_transforms | |||
| provide more kinds of image augmentations which is developed with Python PIL. | |||
| Common imported modules in corresponding API examples are as follows: | |||
| .. code-block:: | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as c_vision | |||
| from mindspore.dataset.transforms import c_transforms | |||
| from mindspore.dataset.transforms import py_transforms | |||
| """ | |||
| from .. import vision | |||
| from . import c_transforms | |||
| @@ -16,6 +16,14 @@ This module is to support vision augmentations. It includes two parts: | |||
| c_transforms and py_transforms. C_transforms is a high performance | |||
| image augmentation module which is developed with c++ opencv. Py_transforms | |||
| provide more kinds of image augmentations which is developed with Python PIL. | |||
| Common imported modules in corresponding API examples are as follows: | |||
| .. code-block:: | |||
| import mindspore.dataset.vision.c_transforms as c_vision | |||
| import mindspore.dataset.vision.py_transforms as py_vision | |||
| from mindspore.dataset.transforms import c_transforms | |||
| """ | |||
| from . import c_transforms | |||
| from . import py_transforms | |||