Merge pull request !2258 from ZiruiWu/fix_api_doc
@@ -141,6 +141,11 @@ Status BuildVocabOp::CollectorThread() {
     }
   }
   int64_t num_words = std::min(static_cast<int64_t>(words.size()), top_k_);
+  if (num_words == 0) {
+    MS_LOG(WARNING) << "No word falls in the frequency range: (" << freq_range_.first << "," << freq_range_.second
+                    << ") vocab would be empty (except for special tokens).";
+  }
   // this would take the top-k most frequent words
   std::partial_sort(words.begin(), words.begin() + num_words, words.end(),
                     [this](const std::string &w1, const std::string &w2) {
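This warning is user-visible from the Python API. A minimal sketch of a pipeline that would trigger it (the corpus and column name are invented for illustration):

```python
import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text

def gen():
    # Hypothetical corpus: every word occurs only once or twice.
    for line in ["mindspore builds vocabs", "vocabs index words"]:
        yield (np.array(line.split(" "), dtype='S'),)

dataset = ds.GeneratorDataset(gen(), column_names=["text"])
# No word occurs 5 to 10 times, so CollectorThread finds num_words == 0,
# logs the new WARNING, and the vocab stays empty apart from special tokens.
vocab = text.Vocab.from_dataset(dataset, columns=["text"], freq_range=(5, 10))
```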
@@ -323,9 +323,7 @@ Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) {
 // if mRecursive == false, don't go into folder of folders
 Status ImageFolderOp::RecursiveWalkFolder(Path *dir) {
   std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(dir);
-  if (dir_itr == nullptr) {
-    RETURN_STATUS_UNEXPECTED("Error encountered when indexing files");
-  }
+  RETURN_UNEXPECTED_IF_NULL(dir_itr);
   while (dir_itr->hasNext()) {
     Path subdir = dir_itr->next();
     if (subdir.IsDirectory()) {
@@ -32,6 +32,5 @@ __all__ = ["config", "ConfigurationManager", "zip",
            "ImageFolderDatasetV2", "MnistDataset",
            "MindDataset", "GeneratorDataset", "TFRecordDataset", "CLUEDataset",
            "ManifestDataset", "Cifar10Dataset", "Cifar100Dataset", "CelebADataset",
-           "VOCDataset", "CocoDataset", "TextFileDataset", "BuildVocabDataset", "Schema", "Schema",
-           "DistributedSampler", "PKSampler",
-           "RandomSampler", "SequentialSampler", "SubsetRandomSampler", "WeightedRandomSampler"]
+           "VOCDataset", "CocoDataset", "TextFileDataset", "Schema", "DistributedSampler",
+           "PKSampler", "RandomSampler", "SequentialSampler", "SubsetRandomSampler", "WeightedRandomSampler"]
@@ -42,8 +42,8 @@ from .iterators import DictIterator, TupleIterator
 from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
     check_rename, check_numpyslicesdataset, \
     check_take, check_project, check_imagefolderdatasetv2, check_mnist_cifar_dataset, check_manifestdataset, \
-    check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset,\
-    check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat,\
+    check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset, \
+    check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \
     check_split, check_bucket_batch_by_length, check_cluedataset
 from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist
@@ -895,26 +895,7 @@ class Dataset:
         return ProjectDataset(self, columns)

     def build_vocab(self, vocab, columns, freq_range, top_k):
-        """
-        Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
-        which contains top_k most frequent words (if top_k is specified)
-        This function is not meant to be called directly by user. To build vocab, please use the function
-        text.Vocab.from_dataset()
-
-        Args:
-            vocab(Vocab): vocab object
-            columns(str or list, optional): column names to get words from. It can be a list of column names.
-                (Default is None where all columns will be used. If any column isn't string type, will return error)
-            freq_range(tuple, optional): A tuple of integers (min_frequency, max_frequency). Words within the frequency
-                range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency
-                can be None, which corresponds to 0/total_words separately (default is None, all words are included)
-            top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
-                taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default is None
-                all words are included)
-
-        Returns:
-            BuildVocabDataset
-        """
+        """ Internal function for building a vocab"""
         return BuildVocabDataset(self, vocab, columns, freq_range, top_k)

     def apply(self, apply_func):
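For orientation, the supported public entry point that now carries the full documentation is text.Vocab.from_dataset; it still routes through this internal build_vocab, roughly as in this condensed paraphrase of the code changed later in this diff:

```python
import copy

def from_dataset_sketch(dataset, vocab, columns=None, freq_range=None, top_k=None):
    # Paraphrase of text.Vocab.from_dataset (see the text module hunk below):
    # clone the pipeline, append the internal BuildVocabDataset node, then
    # drain the iterator so the C++ CollectorThread populates the vocab.
    root = copy.deepcopy(dataset).build_vocab(vocab, columns, freq_range, top_k)
    for d in root.create_dict_iterator():
        if d is not None:
            raise ValueError("from_dataset should receive data other than None.")
    return vocab
```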
@@ -1468,6 +1449,7 @@ class DatasetOp(Dataset):
 # No need for __init__ since it is the same as the super's init
+

 class BucketBatchByLengthDataset(DatasetOp):
     """
     The result of applying BucketBatchByLength operator to the input dataset.
@@ -1608,7 +1590,7 @@ class BatchDataset(DatasetOp):
         Args:
             dataset (Dataset): dataset to be checked.
-            batchsize (int): batch size to notify.
+            batch_size (int): batch size to notify.
         """
         if isinstance(dataset, SyncWaitDataset):
             dataset.update_sync_batch_size(batch_size)
@@ -1646,7 +1628,7 @@ class BlockReleasePair:
     Args:
         init_release_rows (int): Number of lines to allow through the pipeline.
-        callback (function): The callback funciton that will be called when release is called.
+        callback (function): The callback function that will be called when release is called.
     """

     def __init__(self, init_release_rows, callback=None):
@@ -1710,7 +1692,7 @@ class SyncWaitDataset(DatasetOp):
         input_dataset (Dataset): Input dataset to apply flow control.
         num_batch (int): the number of batches without blocking at the start of each epoch.
         condition_name (str): The condition name that is used to toggle sending next row.
-        callback (function): The callback funciton that will be invoked when sync_update is called.
+        callback (function): The callback function that will be invoked when sync_update is called.

     Raises:
         RuntimeError: If condition name already exists.
@@ -2066,7 +2048,7 @@ class SkipDataset(DatasetOp):
     The result of applying Skip operator to the input Dataset.

     Args:
-        datasets (tuple): A tuple of datasets to be skipped.
+        input_dataset (tuple): A tuple of datasets to be skipped.
         count (int): Number of rows the dataset should be skipped.
     """
@@ -3055,7 +3037,7 @@ class GeneratorDataset(MappableDataset):
             provide either column_names or schema.
         column_types (list[mindspore.dtype], optional): List of column data types of the dataset (default=None).
             If provided, sanity check will be performed on generator output.
-        schema (Schema/String, optional): Path to the json schema file or schema object (default=None). Users are
+        schema (Schema/str, optional): Path to the json schema file or schema object (default=None). Users are
             required to provide either column_names or schema. If both are provided, schema will be used.
         num_samples (int, optional): The number of samples to be included in the dataset
             (default=None, all images).
@@ -4343,7 +4325,7 @@ class CelebADataset(MappableDataset):
         dataset_dir (str): Path to the root directory that contains the dataset.
         num_parallel_workers (int, optional): Number of workers to read the data (default=value set in the config).
         shuffle (bool, optional): Whether to perform shuffle on the dataset (default=None).
-        dataset_type (string): one of 'all', 'train', 'valid' or 'test'.
+        dataset_type (str): one of 'all', 'train', 'valid' or 'test'.
         sampler (Sampler, optional): Object used to choose samples from the dataset (default=None).
         decode (bool, optional): decode the images after reading (default=False).
         extensions (list[str], optional): List of file extensions to be
@@ -4874,18 +4856,15 @@ class BuildVocabDataset(DatasetOp):
         text.Vocab.from_dataset()

     Args:
-        vocab(Vocab): vocab object.
+        vocab(Vocab): text.vocab object.
         columns(str or list, optional): column names to get words from. It can be a list of column names (Default is
-            None, all columns are used, return error if any column isn't string).
+            None, all columns are used, return error if any column isn't string).
         freq_range(tuple, optional): A tuple of integers (min_frequency, max_frequency). Words within the frequency
-            range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency
-            can be None, which corresponds to 0/total_words separately (default is None, all words are included).
+            range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency
+            can be None, which corresponds to 0/total_words separately (default=None, all words are included).
         top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
-            taken. The top_k is taken after freq_range. If not enough top_k, all words will be taken (default is None
-            all words are included).
-
-        Returns:
-            BuildVocabDataset
+            taken. The top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None,
+            all words are included).
     """

     def __init__(self, input_dataset, vocab, columns, freq_range, top_k, prefetch_size=None):
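The clause "top_k is taken after freq_range" fixes an order of operations that is easy to get wrong. A plain-Python sketch of the documented selection rule, with invented word counts (this mirrors the documented behaviour, not the C++ implementation):

```python
from collections import Counter

# Hypothetical word frequencies collected from a dataset.
counts = Counter({"the": 9, "data": 5, "set": 5, "vocab": 2, "rare": 1})

def select(counts, freq_range=(None, None), top_k=None):
    lo = freq_range[0] if freq_range[0] is not None else 0
    hi = freq_range[1] if freq_range[1] is not None else max(counts.values())
    # Step 1: the freq_range filter keeps words whose frequency is in [lo, hi].
    kept = {w: c for w, c in counts.items() if lo <= c <= hi}
    # Step 2: order by descending frequency, ties broken lexicographically,
    # and only then apply top_k to the survivors.
    ordered = sorted(kept, key=lambda w: (-kept[w], w))
    return ordered if top_k is None else ordered[:top_k]

print(select(counts, freq_range=(2, None), top_k=3))  # ['the', 'data', 'set']
```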
@@ -30,8 +30,8 @@ class Lookup(cde.LookupOp):
     """
     Lookup operator that looks up a word to an id

     Args:
-        vocab(Vocab): a Vocab object
-        unknown(None,int): default id to lookup a word that is out of vocab
+        vocab(Vocab): a Vocab object.
+        unknown(int): default id to lookup a word that is out of vocab (default is None).
     """

     @check_lookup
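A hedged usage sketch of Lookup with the clarified unknown parameter (the vocab contents and pipeline are invented; what happens with the default unknown=None is not specified by this docstring):

```python
import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text

vocab = text.Vocab.from_list(["home", "is", "behind"])

def gen():
    yield (np.array(["home", "is", "elsewhere"], dtype='S'),)

data = ds.GeneratorDataset(gen(), column_names=["text"])
# unknown=0 maps the out-of-vocab word "elsewhere" to id 0 during lookup.
data = data.map(input_columns=["text"], operations=text.Lookup(vocab, unknown=0))
```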
@@ -45,16 +45,21 @@ class Lookup(cde.LookupOp):
 class Ngram(cde.NgramOp):
     """
     TensorOp to generate n-gram from a 1-D string Tensor
-    Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an explanation of what n-gram is.
+    Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an overview of what n-gram is and how it works.

     Args:
-        n(int or list): n in n-gram, n >= 1. n is a list of positive integers, for e.g. n=[4,3], The result
-            would be a 4-gram followed by a 3-gram in the same tensor.
-        left_pad(tuple, optional): ("pad_token",pad_width). Padding performed on left side of the sequence. pad_width
-            will be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__". (Default is None)
-        right_pad(tuple, optional): ("pad_token",pad_width). Padding performed on right side of the sequence. pad_width
-            will be capped at n-1. right_pad=("-":2) would pad right side of the sequence with "--". (Default is None)
+        n([int, list]): n in n-gram, n >= 1. n is a list of positive integers, for e.g. n=[4,3], The result
+            would be a 4-gram followed by a 3-gram in the same tensor. If number of words is not enough to make up for
+            a n-gram, an empty string would be returned. For e.g. 3 grams on ["mindspore","best"] would result in an
+            empty string be produced.
+        left_pad(tuple, optional): ("pad_token", pad_width). Padding performed on left side of the sequence. pad_width
+            will be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__" (Default is None).
+        right_pad(tuple, optional): ("pad_token", pad_width). Padding performed on right side of the sequence.
+            pad_width will be capped at n-1. right_pad=("-":2) would pad right side of the sequence with "--"
+            (Default is None).
         separator(str,optional): symbol used to join strings together. for e.g. if 2-gram the ["mindspore", "amazing"]
-            with separator="-" the result would be ["mindspore-amazing"]. (Default is None which means whitespace is used)
+            with separator="-" the result would be ["mindspore-amazing"] (Default is None which means whitespace is
+            used).
     """

     @check_ngram
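The padding and separator semantics documented above are exercised by the tests this PR also touches; a condensed sketch (input sentence invented, expected output derived from the docstring):

```python
import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text

def gen():
    yield (np.array("WildBill California 1977".split(" "), dtype='S'),)

dataset = ds.GeneratorDataset(gen(), column_names=["text"])
# 2-grams with one "_" pad on each side and "-" as separator should yield
# ["_-WildBill", "WildBill-California", "California-1977", "1977-_"].
dataset = dataset.map(input_columns=["text"],
                      operations=text.Ngram(2, ("_", 1), ("_", 1), separator="-"))
```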
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Some basic function for nlp
+Some basic function for text
 """
 from enum import IntEnum
@@ -25,42 +25,47 @@ from .validators import check_from_file, check_from_list, check_from_dict, check

 class Vocab(cde.Vocab):
     """
-    Vocab object that is used for lookup word
+    Vocab object that is used for lookup word.
     """

     @classmethod
     @check_from_dataset
     def from_dataset(cls, dataset, columns=None, freq_range=None, top_k=None):
         """
-        Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
-        which contains top_k most frequent words (if top_k is specified)
+        Build a vocab from a dataset. This would collect all unique words in a dataset and return a vocab within
+        the frequency range specified by user in freq_range. User would be warned if no words fall into the frequency.
+        Words in vocab are ordered from highest frequency to lowest frequency. Words with the same frequency would be
+        ordered lexicographically.

         Args:
             dataset(Dataset): dataset to build vocab from.
-            columns(str or list, optional): column names to get words from. It can be a list of column names.
-                (Default is None where all columns will be used. If any column isn't string type, will return error)
+            columns([str, list], optional): column names to get words from. It can be a list of column names.
+                (Default=None where all columns will be used. If any column isn't string type, will return error)
             freq_range(tuple, optional): A tuple of integers (min_frequency, max_frequency). Words within the frequency
-                range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency
-                can be None, which corresponds to 0/total_words separately (default is None, all words are included)
+                range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency=0 is the same as
+                min_frequency=1. max_frequency > total_words is the same as max_frequency = total_words.
+                min_frequency/max_frequency can be None, which corresponds to 0/total_words separately
+                (default=None, all words are included).
             top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
-                taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default is None
-                all words are included)
+                taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None
+                all words are included).

         return:
-            text.Vocab: vocab object built from dataset.
+            text.Vocab: Vocab object built from dataset.
         """
         vocab = Vocab()
         root = copy.deepcopy(dataset).build_vocab(vocab, columns, freq_range, top_k)
         for d in root.create_dict_iterator():
             if d is not None:
-                raise ValueError("from_dataset should receive data other than None")
+                raise ValueError("from_dataset should receive data other than None.")
         return vocab

     @classmethod
     @check_from_list
     def from_list(cls, word_list):
         """
-        build a vocab object from a list of word
+        build a vocab object from a list of word.

         Args:
-            word_list(list): a list of string where each element is a word
+            word_list(list): a list of string where each element is a word.
         """
         return super().from_list(word_list)
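An end-to-end sketch of the newly documented from_dataset contract (corpus invented): words outside freq_range are dropped, and the result is ordered by descending frequency with lexicographic tie-breaking.

```python
import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text

corpus = ["home is behind", "the world is ahead", "home is where it is"]

def gen():
    for line in corpus:
        yield (np.array(line.split(" "), dtype='S'),)

dataset = ds.GeneratorDataset(gen(), column_names=["text"])
# "is" occurs 4 times and "home" twice; every other word occurs once and is
# dropped by freq_range=(2, None). top_k=10 is a no-op here (only 2 survive).
vocab = text.Vocab.from_dataset(dataset, columns=["text"],
                                freq_range=(2, None), top_k=10)
```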
@@ -68,11 +73,12 @@ class Vocab(cde.Vocab):
     @check_from_file
     def from_file(cls, file_path, delimiter=None, vocab_size=None):
         """
-        build a vocab object from a list of word
+        build a vocab object from a list of word.

         Args:
-            file_path(str): path to the file which contains the vocab list
-            delimiter(None, str): a delimiter to break up each line in file, the first element is taken to be the word
-            vocab_size(None, int): number of words to read from file_path
+            file_path(str): path to the file which contains the vocab list.
+            delimiter(str, optional): a delimiter to break up each line in file, the first element is taken to be
+                the word (default=None).
+            vocab_size(int, optional): number of words to read from file_path (default=None, all words are taken).
         """
         return super().from_file(file_path, delimiter, vocab_size)
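A hedged sketch of from_file with the clarified defaults (the file path and its contents are hypothetical; per the docstring, only the first delimited field of each line is taken as the word):

```python
import mindspore.dataset.text as text

# vocab.txt (hypothetical), one entry per line, e.g.:
#   home,2
#   is,4
#   behind,1
# delimiter="," keeps only the first field as the word; vocab_size caps how
# many lines are read (default=None reads them all).
vocab = text.Vocab.from_file("vocab.txt", delimiter=",", vocab_size=100)
```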
@@ -82,7 +88,7 @@ class Vocab(cde.Vocab):
         """
         build a vocab object from a dict.

         Args:
-            word_dict(dict): dict contains word, id pairs. id should start from 2 and continuous
+            word_dict(dict): dict contains word, id pairs. id should start from 2 and be continuous.
         """
         return super().from_dict(word_dict)
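A minimal from_dict sketch honouring the clarified constraint (why ids start at 2 is not stated here; reserving ids 0 and 1, e.g. for special tokens, is an assumption):

```python
import mindspore.dataset.text as text

# ids start at 2 and are continuous, as the docstring now requires.
vocab = text.Vocab.from_dict({"home": 2, "is": 3, "behind": 4})
```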
@@ -100,7 +106,7 @@ def to_str(array, encoding='utf8'):
     """
     if not isinstance(array, np.ndarray):
-        raise ValueError('input should be a numpy array')
+        raise ValueError('input should be a numpy array.')

     return np.char.decode(array, encoding)
@@ -118,7 +124,7 @@ def to_bytes(array, encoding='utf8'):
     """
     if not isinstance(array, np.ndarray):
-        raise ValueError('input should be a numpy array')
+        raise ValueError('input should be a numpy array.')

     return np.char.encode(array, encoding)
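A quick round-trip sketch of the two helpers whose error messages were touched (array contents invented):

```python
import numpy as np
import mindspore.dataset.text as text

byte_array = np.array([b"mindspore", b"amazing"])
strings = text.to_str(byte_array)    # np.char.decode -> ['mindspore', 'amazing']
round_trip = text.to_bytes(strings)  # np.char.encode -> back to bytes
assert (round_trip == byte_array).all()
```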
@@ -24,7 +24,7 @@ from ..transforms.validators import check_uint32, check_pos_int64

 def check_lookup(method):
-    """A wrapper that wrap a parameter checker to the original function(crop operation)."""
+    """A wrapper that wrap a parameter checker to the original function."""

     @wraps(method)
     def new_method(self, *args, **kwargs):
@@ -35,10 +35,10 @@ def check_lookup(method):
         unknown = kwargs.get("unknown")

         if unknown is not None:
             if not (isinstance(unknown, int) and unknown >= 0):
-                raise ValueError("unknown needs to be a non-negative integer")
+                raise ValueError("unknown needs to be a non-negative integer.")

         if not isinstance(vocab, cde.Vocab):
-            raise ValueError("vocab is not an instance of cde.Vocab")
+            raise ValueError("vocab is not an instance of cde.Vocab.")

         kwargs["vocab"] = vocab
         kwargs["unknown"] = unknown
@@ -48,7 +48,7 @@ def check_lookup(method):

 def check_from_file(method):
-    """A wrapper that wrap a parameter checker to the original function(crop operation)."""
+    """A wrapper that wrap a parameter checker to the original function."""

     @wraps(method)
     def new_method(self, *args, **kwargs):
@@ -61,16 +61,16 @@ def check_from_file(method):
         vocab_size = kwargs.get("vocab_size")

         if not isinstance(file_path, str):
-            raise ValueError("file_path needs to be str")
+            raise ValueError("file_path needs to be str.")

         if delimiter is not None:
             if not isinstance(delimiter, str):
-                raise ValueError("delimiter needs to be str")
+                raise ValueError("delimiter needs to be str.")
         else:
             delimiter = ""

         if vocab_size is not None:
             if not (isinstance(vocab_size, int) and vocab_size > 0):
-                raise ValueError("vocab size needs to be a positive integer")
+                raise ValueError("vocab size needs to be a positive integer.")
         else:
             vocab_size = -1
         kwargs["file_path"] = file_path
@@ -82,7 +82,7 @@ def check_from_file(method):

 def check_from_list(method):
-    """A wrapper that wrap a parameter checker to the original function(crop operation)."""
+    """A wrapper that wrap a parameter checker to the original function."""

     @wraps(method)
     def new_method(self, *args, **kwargs):
@@ -90,10 +90,10 @@ def check_from_list(method):
         if "word_list" in kwargs:
             word_list = kwargs.get("word_list")
         if not isinstance(word_list, list):
-            raise ValueError("word_list needs to be a list of words")
+            raise ValueError("word_list needs to be a list of words.")
         for word in word_list:
             if not isinstance(word, str):
-                raise ValueError("each word in word list needs to be type str")
+                raise ValueError("each word in word list needs to be type str.")
         kwargs["word_list"] = word_list

         return method(self, **kwargs)
@@ -102,7 +102,7 @@ def check_from_list(method):

 def check_from_dict(method):
-    """A wrapper that wrap a parameter checker to the original function(crop operation)."""
+    """A wrapper that wrap a parameter checker to the original function."""

     @wraps(method)
     def new_method(self, *args, **kwargs):
@@ -110,12 +110,12 @@ def check_from_dict(method):
         if "word_dict" in kwargs:
             word_dict = kwargs.get("word_dict")
         if not isinstance(word_dict, dict):
-            raise ValueError("word_dict needs to be a list of word,id pairs")
+            raise ValueError("word_dict needs to be a list of word,id pairs.")
         for word, word_id in word_dict.items():
             if not isinstance(word, str):
-                raise ValueError("each word in word_dict needs to be type str")
+                raise ValueError("each word in word_dict needs to be type str.")
             if not (isinstance(word_id, int) and word_id >= 0):
-                raise ValueError("each word id needs to be positive integer")
+                raise ValueError("each word id needs to be positive integer.")
         kwargs["word_dict"] = word_dict

         return method(self, **kwargs)
@@ -135,11 +135,11 @@ def check_jieba_init(method):
         mp_path = kwargs.get("mp_path")
         if hmm_path is None:
             raise ValueError(
-                "the dict of HMMSegment in cppjieba is not provided")
+                "the dict of HMMSegment in cppjieba is not provided.")
         kwargs["hmm_path"] = hmm_path
         if mp_path is None:
             raise ValueError(
-                "the dict of MPSegment in cppjieba is not provided")
+                "the dict of MPSegment in cppjieba is not provided.")
         kwargs["mp_path"] = mp_path
         if model is not None:
             kwargs["model"] = model
@@ -160,7 +160,7 @@ def check_jieba_add_word(method):
         if "freq" in kwargs:
             freq = kwargs.get("freq")
         if word is None:
-            raise ValueError("word is not provided")
+            raise ValueError("word is not provided.")
         kwargs["word"] = word
         if freq is not None:
             check_uint32(freq)
@@ -179,7 +179,7 @@ def check_jieba_add_dict(method):
         if "user_dict" in kwargs:
             user_dict = kwargs.get("user_dict")
         if user_dict is None:
-            raise ValueError("user_dict is not provided")
+            raise ValueError("user_dict is not provided.")
         kwargs["user_dict"] = user_dict

         return method(self, **kwargs)
@@ -187,7 +187,7 @@ def check_jieba_add_dict(method):

 def check_from_dataset(method):
-    """A wrapper that wrap a parameter checker to the original function(crop operation)."""
+    """A wrapper that wrap a parameter checker to the original function."""

     # def from_dataset(cls, dataset, columns, freq_range=None, top_k=None):
     @wraps(method)
@@ -210,27 +210,27 @@ def check_from_dataset(method):
             for column in columns:
                 if not isinstance(column, str):
-                    raise ValueError("columns need to be a list of strings")
+                    raise ValueError("columns need to be a list of strings.")

         if freq_range is None:
             freq_range = (None, None)

         if not isinstance(freq_range, tuple) or len(freq_range) != 2:
-            raise ValueError("freq_range needs to be either None or a tuple of 2 integers or an int and a None")
+            raise ValueError("freq_range needs to be either None or a tuple of 2 integers or an int and a None.")

         for num in freq_range:
             if num is not None and (not isinstance(num, int)):
-                raise ValueError("freq_range needs to be either None or a tuple of 2 integers or an int and a None")
+                raise ValueError("freq_range needs to be either None or a tuple of 2 integers or an int and a None.")

         if isinstance(freq_range[0], int) and isinstance(freq_range[1], int):
             if freq_range[0] > freq_range[1] or freq_range[0] < 0:
-                raise ValueError("frequency range [a,b] should be 0 <= a <= b (a,b are inclusive)")
+                raise ValueError("frequency range [a,b] should be 0 <= a <= b (a,b are inclusive).")

         if top_k is not None and (not isinstance(top_k, int)):
-            raise ValueError("top_k needs to be a positive integer")
+            raise ValueError("top_k needs to be a positive integer.")

         if isinstance(top_k, int) and top_k <= 0:
-            raise ValueError("top_k needs to be a positive integer")
+            raise ValueError("top_k needs to be a positive integer.")

         kwargs["dataset"] = dataset
         kwargs["columns"] = columns
| def check_ngram(method): | def check_ngram(method): | ||||
| """A wrapper that wrap a parameter checker to the original function(crop operation).""" | |||||
| """A wrapper that wrap a parameter checker to the original function.""" | |||||
| @wraps(method) | @wraps(method) | ||||
| def new_method(self, *args, **kwargs): | def new_method(self, *args, **kwargs): | ||||
@@ -261,11 +261,11 @@ def check_ngram(method):
             n = [n]

         if not (isinstance(n, list) and n != []):
-            raise ValueError("n needs to be a non-empty list of positive integers")
+            raise ValueError("n needs to be a non-empty list of positive integers.")

         for gram in n:
             if not (isinstance(gram, int) and gram > 0):
-                raise ValueError("n in ngram needs to be a positive number\n")
+                raise ValueError("n in ngram needs to be a positive number.")

         if left_pad is None:
             left_pad = ("", 0)
@@ -275,20 +275,20 @@ def check_ngram(method):
         if not (isinstance(left_pad, tuple) and len(left_pad) == 2 and isinstance(left_pad[0], str) and isinstance(
                 left_pad[1], int)):
-            raise ValueError("left_pad needs to be a tuple of (str, int) str is pad token and int is pad_width")
+            raise ValueError("left_pad needs to be a tuple of (str, int) str is pad token and int is pad_width.")

         if not (isinstance(right_pad, tuple) and len(right_pad) == 2 and isinstance(right_pad[0], str) and isinstance(
                 right_pad[1], int)):
-            raise ValueError("right_pad needs to be a tuple of (str, int) str is pad token and int is pad_width")
+            raise ValueError("right_pad needs to be a tuple of (str, int) str is pad token and int is pad_width.")

         if not (left_pad[1] >= 0 and right_pad[1] >= 0):
-            raise ValueError("padding width need to be positive numbers")
+            raise ValueError("padding width need to be positive numbers.")

         if separator is None:
             separator = " "

         if not isinstance(separator, str):
-            raise ValueError("separator needs to be a string")
+            raise ValueError("separator needs to be a string.")

         kwargs["n"] = n
         kwargs["left_pad"] = left_pad
@@ -16,7 +16,7 @@
 Testing Ngram in mindspore.dataset
 """
 import mindspore.dataset as ds
-import mindspore.dataset.text as nlp
+import mindspore.dataset.text as text
 import numpy as np
@@ -39,7 +39,7 @@ def test_multiple_ngrams():
         yield (np.array(line.split(" "), dtype='S'),)

     dataset = ds.GeneratorDataset(gen(plates_mottos), column_names=["text"])
-    dataset = dataset.map(input_columns=["text"], operations=nlp.Ngram([1, 2, 3], ("_", 2), ("_", 2), " "))
+    dataset = dataset.map(input_columns=["text"], operations=text.Ngram([1, 2, 3], ("_", 2), ("_", 2), " "))

     i = 0
     for data in dataset.create_dict_iterator():
@@ -61,7 +61,7 @@ def test_simple_ngram():
         yield (np.array(line.split(" "), dtype='S'),)

     dataset = ds.GeneratorDataset(gen(plates_mottos), column_names=["text"])
-    dataset = dataset.map(input_columns=["text"], operations=nlp.Ngram(3, separator=None))
+    dataset = dataset.map(input_columns=["text"], operations=text.Ngram(3, separator=None))

     i = 0
     for data in dataset.create_dict_iterator():
@@ -73,11 +73,11 @@ def test_corner_cases():
     """ testing various corner cases and exceptions"""

     def test_config(input_line, output_line, n, l_pad=None, r_pad=None, sep=None):
-        def gen(text):
-            yield (np.array(text.split(" "), dtype='S'),)
+        def gen(texts):
+            yield (np.array(texts.split(" "), dtype='S'),)

         dataset = ds.GeneratorDataset(gen(input_line), column_names=["text"])
-        dataset = dataset.map(input_columns=["text"], operations=nlp.Ngram(n, l_pad, r_pad, separator=sep))
+        dataset = dataset.map(input_columns=["text"], operations=text.Ngram(n, l_pad, r_pad, separator=sep))
         for data in dataset.create_dict_iterator():
             assert [d.decode("utf8") for d in data["text"]] == output_line, output_line