Merge pull request !4421 from ms_yan/api_comment (tags/v0.7.0-beta)
| @@ -267,9 +267,9 @@ class Dataset: | |||
| be dropped and not propagated to the child node. | |||
| num_parallel_workers (int, optional): Number of workers to process the Dataset in parallel (default=None). | |||
| per_batch_map (callable, optional): Per batch map callable. A callable which takes | |||
| (list[Tensor], list[Tensor], ..., BatchInfo) as input parameters. Each list[Tensor] represent a batch of | |||
| Tensors on a given column. The number of lists should match with number of entries in input_columns. The | |||
| last parameter of the callable should always be a BatchInfo object. | |||
| (list[Tensor], list[Tensor], ..., BatchInfo) as input parameters. Each list[Tensor] represents a batch | |||
| of Tensors on a given column. The number of lists should match with number of entries in input_columns. | |||
| The last parameter of the callable should always be a BatchInfo object. | |||
| input_columns (list[str], optional): List of names of the input columns. The size of the list should | |||
| match with signature of per_batch_map callable. | |||
| pad_info (dict, optional): Whether to perform padding on selected columns. pad_info={"col1":([224,224],0)} | |||
| @@ -2821,7 +2821,7 @@ class MnistDataset(MappableDataset): | |||
| num_samples (int, optional): The number of images to be included in the dataset | |||
| (default=None, all images). | |||
| num_parallel_workers (int, optional): Number of workers to read the data | |||
| (default=value, set in the config). | |||
| (default=None, set in the config). | |||
| shuffle (bool, optional): Whether or not to perform shuffle on the dataset | |||
| (default=None, expected order behavior shown in the table). | |||
| sampler (Sampler, optional): Object used to choose samples from the | |||
| @@ -2898,7 +2898,7 @@ class MnistDataset(MappableDataset): | |||
| class MindDataset(MappableDataset): | |||
| """ | |||
| A source dataset that reads from shard files and database. | |||
| A source dataset that reads MindRecord files. | |||
| Args: | |||
| dataset_file (Union[str, list[str]]): One of the file names, or a list of files, in the dataset. | |||
| @@ -3225,7 +3225,7 @@ class _GeneratorWorker(multiprocessing.Process): | |||
| class GeneratorDataset(MappableDataset): | |||
| """ | |||
| A source dataset that generate data from python by invoking python data source each epoch. | |||
| A source dataset that generates data from python by invoking python data source each epoch. | |||
| This dataset can take in a sampler. sampler and shuffle are mutually exclusive. Table | |||
| below shows what input args are allowed and their expected behavior. | |||
| @@ -3437,7 +3437,7 @@ class TFRecordDataset(SourceDataset): | |||
| Args: | |||
| dataset_files (Union[str, list[str]]): String or list of files to be read or glob strings to search for a | |||
| pattern of files. The list will be sorted in a lexicographical order. | |||
| pattern of files. The list will be sorted in a lexicographical order. | |||
| schema (Union[str, Schema], optional): Path to the json schema file or schema object (default=None). | |||
| If the schema is not provided, the meta data from the TFData file is considered the schema. | |||
| columns_list (list[str], optional): List of columns to be read (default=None, read all columns) | |||
| @@ -3620,7 +3620,7 @@ class ManifestDataset(MappableDataset): | |||
| Args: | |||
| dataset_file (str): File to be read. | |||
| usage (str, optional): Need train, eval or inference data (default="train"). | |||
| usage (str, optional): Acceptable usages include train, eval and inference (default="train"). | |||
| num_samples (int, optional): The number of images to be included in the dataset | |||
| (default=None, all images). | |||
| num_parallel_workers (int, optional): Number of workers to read the data | |||
| @@ -72,6 +72,7 @@ class Lookup(cde.LookupOp): | |||
| def __init__(self, vocab, unknown_token=None): | |||
| super().__init__(vocab, unknown_token) | |||
| class SlidingWindow(cde.SlidingWindowOp): | |||
| """ | |||
| TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension axis | |||
| @@ -101,6 +102,7 @@ class SlidingWindow(cde.SlidingWindowOp): | |||
| def __init__(self, width, axis=0): | |||
| super().__init__(width=width, axis=axis) | |||
| class Ngram(cde.NgramOp): | |||
| """ | |||
| TensorOp to generate n-gram from a 1-D string Tensor. | |||
| @@ -511,8 +513,8 @@ if platform.system().lower() != 'windows': | |||
| on input text to convert the text to lower case and strip accent characters; if False, only apply | |||
| NormalizeUTF8('normalization_form' mode) operation on input text(default=False). | |||
| keep_whitespace(bool, optional): If True, the whitespace will be kept in output tokens(default=False). | |||
| normalization_form(NormalizeForm, optional): Used to specify a specific normlaize mode, | |||
| only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE'). | |||
| normalization_form(NormalizeForm, optional): Used to specify a specific normalize mode, | |||
| only effective when 'lower_case' is False. See NormalizeUTF8 for details(default=NormalizeForm.NONE). | |||
| preserve_unused_token(bool, optional): If True, do not split special tokens like | |||
| '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True). | |||
| with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | |||
| @@ -132,12 +132,13 @@ class Vocab(cde.Vocab): | |||
| Build a vocab object from a dict. | |||
| Args: | |||
| word_dict (dict): dict contains word, id pairs where word should be str and id int. id is recommended to | |||
| start from 0 and be continuous. ValueError will be raised if id is negative. | |||
| word_dict (dict): dict contains word and id pairs, where word should be str and id be int. id is recommended | |||
| to start from 0 and be continuous. ValueError will be raised if id is negative. | |||
| """ | |||
| return super().from_dict(word_dict) | |||
| class SentencePieceVocab(cde.SentencePieceVocab): | |||
| """ | |||
| SentencePiece object that is used to segment words. | |||
| @@ -151,9 +152,9 @@ class SentencePieceVocab(cde.SentencePieceVocab): | |||
| Args: | |||
| dataset(Dataset): Dataset to build sentencepiece. | |||
| col_names(list): The list of the col name. | |||
| vocab_size(int): Vocabulary size, the type of uint32_t. | |||
| vocab_size(int): Vocabulary size. | |||
| character_coverage(float): Amount of characters covered by the model, good defaults are: 0.9995 for | |||
| languages. with rich character set like Japanse or Chinese and 1.0 for other languages with small | |||
| languages with rich character set like Japanese or Chinese and 1.0 for other languages with small | |||
| character set. | |||
| model_type(SentencePieceModel): Choose from unigram (default), bpe, char, or word. The input sentence | |||
| must be pretokenized when using word type. | |||
| @@ -261,6 +262,7 @@ class NormalizeForm(IntEnum): | |||
| NFD = 3 | |||
| NFKD = 4 | |||
| class SentencePieceModel(IntEnum): | |||
| """An enumeration for SentencePieceModel, effective enumeration types are UNIGRAM, BPE, CHAR, WORD.""" | |||
| UNIGRAM = 0 | |||
| @@ -275,11 +277,13 @@ DE_C_INTER_SENTENCEPIECE_MODE = { | |||
| SentencePieceModel.WORD: cde.SentencePieceModel.DE_SENTENCE_PIECE_WORD | |||
| } | |||
| class SPieceTokenizerOutType(IntEnum): | |||
| """An enumeration for SPieceTokenizerOutType, effective enumeration types are STRING, INT.""" | |||
| STRING = 0 | |||
| INT = 1 | |||
| class SPieceTokenizerLoadType(IntEnum): | |||
| """An enumeration for SPieceTokenizerLoadType, effective enumeration types are FILE, MODEL.""" | |||
| FILE = 0 | |||
| @@ -204,7 +204,7 @@ class Concatenate(cde.ConcatenateOp): | |||
| Tensor operation that concatenates all columns into a single tensor. | |||
| Args: | |||
| axis (int, optional): axis to concatenate the tensors along (Default=0). | |||
| axis (int, optional): concatenate the tensors along given axis (Default=0). | |||
| prepend (numpy.array, optional): numpy array to be prepended to the already concatenated tensors (Default=None). | |||
| append (numpy.array, optional): numpy array to be appended to the already concatenated tensors (Default=None). | |||
| """ | |||
| @@ -188,8 +188,8 @@ class Normalize(cde.NormalizeOp): | |||
| Normalize the input image with respect to mean and standard deviation. | |||
| Args: | |||
| mean (sequence): List or tuple of mean values for each channel, w.r.t channel order. | |||
| std (sequence): List or tuple of standard deviations for each channel, w.r.t. channel order. | |||
| mean (sequence): List or tuple of mean values for each channel, with respect to channel order. | |||
| std (sequence): List or tuple of standard deviations for each channel, with respect to channel order. | |||
| """ | |||
| @check_normalize_c | |||
| @@ -23,6 +23,7 @@ from .common.exceptions import ParamValueError, ParamTypeError | |||
| __all__ = ['FileReader'] | |||
| class FileReader: | |||
| """ | |||
| Class to read MindRecord File series. | |||
| @@ -31,7 +32,7 @@ class FileReader: | |||
| file_name (str, list[str]): One of the MindRecord files, or a list of files. | |||
| num_consumer(int, optional): Number of consumer threads which load data to memory (default=4). | |||
| It should not be smaller than 1 or larger than the number of CPU cores. | |||
| columns (list[str], optional): List of fields which correspond data would be read (default=None). | |||
| columns (list[str], optional): List of fields whose corresponding data will be read (default=None). | |||
| operator(int, optional): Reserved parameter for operators (default=None). | |||
| Raises: | |||
| @@ -275,7 +275,7 @@ class FileWriter: | |||
| def commit(self): | |||
| """ | |||
| Flush data to disk and generate the correspond db files. | |||
| Flush data to disk and generate the corresponding db files. | |||
| Returns: | |||
| MSRStatus, SUCCESS or FAILED. | |||
| @@ -25,12 +25,13 @@ from ..shardutils import check_filename | |||
| __all__ = ['ImageNetToMR'] | |||
| class ImageNetToMR: | |||
| """ | |||
| Class is for transformation from imagenet to MindRecord. | |||
| Args: | |||
| map_file (str): the map file which indicate label. | |||
| map_file (str): the map file which indicates label. | |||
| the map file content should look like this: | |||
| .. code-block:: | |||
| @@ -37,7 +37,7 @@ class MnistToMR: | |||
| Class is for transformation from Mnist to MindRecord. | |||
| Args: | |||
| source (str): directory which contain t10k-images-idx3-ubyte.gz, | |||
| source (str): directory which contains t10k-images-idx3-ubyte.gz, | |||
| train-images-idx3-ubyte.gz, t10k-labels-idx1-ubyte.gz, | |||
| train-labels-idx1-ubyte.gz. | |||
| destination (str): the MindRecord file directory to transform into. | |||