utils.py
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The module text.utils provides some general methods for NLP text processing.
For example, you can use Vocab to build a dictionary,
and use to_bytes and to_str to encode and decode strings into a specified format.
"""
from enum import IntEnum
import copy
import numpy as np

import mindspore._c_dataengine as cde

from .validators import check_from_file, check_from_list, check_from_dict, check_from_dataset, \
    check_from_dataset_sentencepiece, check_from_file_sentencepiece, check_save_model

__all__ = [
    "Vocab", "SentencePieceVocab", "to_str", "to_bytes"
]


class Vocab(cde.Vocab):
    """
    Vocab object that is used to look up a word.
    It contains a map that maps each word (str) to an id (int).
    """

    @classmethod
    @check_from_dataset
    def from_dataset(cls, dataset, columns=None, freq_range=None, top_k=None, special_tokens=None,
                     special_first=True):
        """
        Build a vocab from a dataset.

        This would collect all unique words in a dataset and return a vocab within
        the frequency range specified by the user in freq_range. The user would be warned if no words fall into
        the frequency range. Words in the vocab are ordered from highest frequency to lowest frequency. Words
        with the same frequency would be ordered lexicographically.

        Args:
            dataset (Dataset): dataset to build the vocab from.
            columns (list[str], optional): column names to get words from. It can be a list of column names
                (default=None, where all columns will be used; an error is raised if any column is not of
                string type).
            freq_range (tuple, optional): a tuple of integers (min_frequency, max_frequency). Words within the
                frequency range will be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency=0
                is the same as min_frequency=1. max_frequency > total_words is the same as
                max_frequency = total_words. min_frequency/max_frequency can be None, which corresponds to
                0/total_words respectively (default=None, all words are included).
            top_k (int, optional): top_k > 0. Number of words to be built into the vocab. The top_k most frequent
                words are taken. top_k is applied after freq_range. If there are fewer than top_k words, all
                words will be taken (default=None, all words are included).
            special_tokens (list, optional): a list of strings, each one a special token, for example
                special_tokens=["<pad>", "<unk>"] (default=None, no special tokens will be added).
            special_first (bool, optional): whether special_tokens will be prepended or appended to the vocab.
                If special_tokens is specified and special_first is set to True, special_tokens will be
                prepended (default=True).

        Returns:
            Vocab, Vocab object built from the dataset.
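
        Examples:
            A minimal usage sketch. The file name "corpus.txt" is a placeholder, and "text" is assumed to be
            the column name that TextFileDataset produces:

            >>> import mindspore.dataset as ds
            >>> data = ds.TextFileDataset("corpus.txt", shuffle=False)
            >>> # keep words seen at least twice, capped at the 5000 most frequent
            >>> vocab = Vocab.from_dataset(data, columns=["text"], freq_range=(2, None), top_k=5000,
            ...                            special_tokens=["<pad>", "<unk>"], special_first=True)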
  61. """
  62. vocab = Vocab()
  63. if columns is None:
  64. columns = []
  65. if not isinstance(columns, list):
  66. columns = [columns]
  67. if freq_range is None:
  68. freq_range = (None, None)
  69. if special_tokens is None:
  70. special_tokens = []
  71. root = copy.deepcopy(dataset).build_vocab(vocab, columns, freq_range, top_k, special_tokens, special_first)
  72. for d in root.create_dict_iterator(num_epochs=1):
  73. if d is not None:
  74. raise ValueError("from_dataset should receive data other than None.")
  75. return vocab

    @classmethod
    @check_from_list
    def from_list(cls, word_list, special_tokens=None, special_first=True):
        """
        Build a vocab object from a list of words.

        Args:
            word_list (list): a list of strings, where each element is a word.
            special_tokens (list, optional): a list of strings, each one a special token, for example
                special_tokens=["<pad>", "<unk>"] (default=None, no special tokens will be added).
            special_first (bool, optional): whether special_tokens will be prepended or appended to the vocab.
                If special_tokens is specified and special_first is set to True, special_tokens will be
                prepended (default=True).
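
        Examples:
            A minimal sketch; ids follow list order, with special tokens first when special_first=True:

            >>> vocab = Vocab.from_list(["hello", "world"], special_tokens=["<pad>", "<unk>"],
            ...                         special_first=True)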
  87. """
  88. if special_tokens is None:
  89. special_tokens = []
  90. return super().from_list(word_list, special_tokens, special_first)

    @classmethod
    @check_from_file
    def from_file(cls, file_path, delimiter="", vocab_size=None, special_tokens=None, special_first=True):
        """
        Build a vocab object from a file.

        Args:
            file_path (str): path to the file which contains the vocab list.
            delimiter (str, optional): a delimiter to break up each line in the file; the first element is
                taken to be the word (default="").
            vocab_size (int, optional): number of words to read from file_path (default=None, all words are
                taken).
            special_tokens (list, optional): a list of strings, each one a special token, for example
                special_tokens=["<pad>", "<unk>"] (default=None, no special tokens will be added).
            special_first (bool, optional): whether special_tokens will be prepended or appended to the vocab.
                If special_tokens is specified and special_first is set to True, special_tokens will be
                prepended (default=True).
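
        Examples:
            A minimal sketch; "vocab.txt" is a placeholder file assumed to hold one entry per line, with the
            word before the delimiter:

            >>> vocab = Vocab.from_file("vocab.txt", delimiter=",", special_tokens=["<unk>"],
            ...                         special_first=True)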
  106. """
  107. if vocab_size is None:
  108. vocab_size = -1
  109. if special_tokens is None:
  110. special_tokens = []
  111. return super().from_file(file_path, delimiter, vocab_size, special_tokens, special_first)

    @classmethod
    @check_from_dict
    def from_dict(cls, word_dict):
        """
        Build a vocab object from a dict.

        Args:
            word_dict (dict): a dict that contains word and id pairs, where the word should be a str and the
                id an int. Ids are recommended to start from 0 and be continuous. A ValueError will be raised
                if an id is negative.
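
        Examples:
            A minimal sketch mapping words directly to ids:

            >>> vocab = Vocab.from_dict({"<pad>": 0, "<unk>": 1, "hello": 2, "world": 3})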
  120. """
  121. return super().from_dict(word_dict)


class SentencePieceVocab(cde.SentencePieceVocab):
    """
    SentencePiece object that is used to do word segmentation.
    """

    @classmethod
    @check_from_dataset_sentencepiece
    def from_dataset(cls, dataset, col_names, vocab_size, character_coverage, model_type, params):
        """
        Build a SentencePiece vocab from a dataset.

        Args:
            dataset (Dataset): dataset to build the SentencePiece vocab from.
            col_names (list): the list of column names.
            vocab_size (int): vocabulary size.
            character_coverage (float): amount of characters covered by the model. Good defaults are 0.9995
                for languages with a rich character set like Japanese or Chinese, and 1.0 for other languages
                with a small character set.
            model_type (SentencePieceModel): choose from unigram (default), bpe, char, or word. The input
                sentence must be pretokenized when using the word type.
            params (dict): a dictionary with no incoming parameters.

        Returns:
            SentencePiece, SentencePiece object built from the dataset.
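
        Examples:
            A minimal sketch; "corpus.txt" is a placeholder, "text" is assumed to be the column produced by
            TextFileDataset, and an empty dict is passed for params:

            >>> import mindspore.dataset as ds
            >>> data = ds.TextFileDataset("corpus.txt", shuffle=False)
            >>> vocab = SentencePieceVocab.from_dataset(data, ["text"], 5000, 0.9995,
            ...                                         SentencePieceModel.UNIGRAM, {})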
  143. """
  144. vocab = SentencePieceVocab()
  145. root = copy.deepcopy(dataset).build_sentencepiece_vocab(vocab, col_names, vocab_size, character_coverage,
  146. model_type, params)
  147. for d in root.create_dict_iterator(num_epochs=1):
  148. if d is None:
  149. raise ValueError("from_dataset should receive data other than None.")
  150. return vocab

    @classmethod
    @check_from_file_sentencepiece
    def from_file(cls, file_path, vocab_size, character_coverage, model_type, params):
        """
        Build a SentencePiece object from a file.

        Args:
            file_path (list): path to the file which contains the SentencePiece list.
            vocab_size (int): vocabulary size, of type uint32_t.
            character_coverage (float): amount of characters covered by the model. Good defaults are 0.9995
                for languages with a rich character set like Japanese or Chinese, and 1.0 for other languages
                with a small character set.
            model_type (SentencePieceModel): choose from unigram (default), bpe, char, or word. The input
                sentence must be pretokenized when using the word type.
            params (dict): a dictionary with no incoming parameters (the parameters are derived from the
                SentencePiece library).

                .. code-block::

                    input_sentence_size 0
                    max_sentencepiece_length 16
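
        Examples:
            A minimal sketch; the training file name is a placeholder, and file_path takes a list of paths:

            >>> vocab = SentencePieceVocab.from_file(["corpus.txt"], 5000, 0.9995,
            ...                                      SentencePieceModel.UNIGRAM, {})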
  169. """
  170. return super().from_file(file_path, vocab_size, character_coverage,
  171. DE_C_INTER_SENTENCEPIECE_MODE[model_type], params)

    @classmethod
    @check_save_model
    def save_model(cls, vocab, path, filename):
        """
        Save the model to the given file path.

        Args:
            vocab (SentencePieceVocab): a SentencePiece object.
            path (str): path to store the model.
            filename (str): the name of the file.
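
        Examples:
            A minimal sketch; the directory and file name are placeholders:

            >>> SentencePieceVocab.save_model(vocab, "./", "m.model")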
  181. """
  182. return super().save_model(vocab, path, filename)


def to_str(array, encoding='utf8'):
    """
    Convert a NumPy array of `bytes` to an array of `str` by decoding each element based on charset `encoding`.

    Args:
        array (numpy.ndarray): array of type `bytes` representing strings.
        encoding (str): the charset used for decoding.

    Returns:
        numpy.ndarray, NumPy array of `str`.
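
    Examples:
        A minimal sketch:

        >>> import numpy as np
        >>> to_str(np.array([b"hello", b"world"]))
        array(['hello', 'world'], dtype='<U5')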
  191. """
  192. if not isinstance(array, np.ndarray):
  193. raise ValueError('input should be a NumPy array.')
  194. return np.char.decode(array, encoding)


def to_bytes(array, encoding='utf8'):
    """
    Convert a NumPy array of `str` to an array of `bytes` by encoding each element based on charset `encoding`.

    Args:
        array (numpy.ndarray): array of type `str` representing strings.
        encoding (str): the charset used for encoding.

    Returns:
        numpy.ndarray, NumPy array of `bytes`.
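
    Examples:
        A minimal sketch, the inverse of to_str:

        >>> import numpy as np
        >>> to_bytes(np.array(["hello", "world"]))
        array([b'hello', b'world'], dtype='|S5')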
  203. """
  204. if not isinstance(array, np.ndarray):
  205. raise ValueError('input should be a NumPy array.')
  206. return np.char.encode(array, encoding)


class JiebaMode(IntEnum):
    """An enumeration for JiebaTokenizer, effective enumeration types are MIX, MP, HMM."""
    MIX = 0
    MP = 1
    HMM = 2


class NormalizeForm(IntEnum):
    """An enumeration for NormalizeUTF8, effective enumeration types are NONE, NFC, NFKC, NFD, NFKD."""
    NONE = 0
    NFC = 1
    NFKC = 2
    NFD = 3
    NFKD = 4


class SentencePieceModel(IntEnum):
    """An enumeration for SentencePieceModel, effective enumeration types are UNIGRAM, BPE, CHAR, WORD."""
    UNIGRAM = 0
    BPE = 1
    CHAR = 2
    WORD = 3


DE_C_INTER_SENTENCEPIECE_MODE = {
    SentencePieceModel.UNIGRAM: cde.SentencePieceModel.DE_SENTENCE_PIECE_UNIGRAM,
    SentencePieceModel.BPE: cde.SentencePieceModel.DE_SENTENCE_PIECE_BPE,
    SentencePieceModel.CHAR: cde.SentencePieceModel.DE_SENTENCE_PIECE_CHAR,
    SentencePieceModel.WORD: cde.SentencePieceModel.DE_SENTENCE_PIECE_WORD
}


class SPieceTokenizerOutType(IntEnum):
    """An enumeration for SPieceTokenizerOutType, effective enumeration types are STRING, INT."""
    STRING = 0
    INT = 1


class SPieceTokenizerLoadType(IntEnum):
    """An enumeration for SPieceTokenizerLoadType, effective enumeration types are FILE, MODEL."""
    FILE = 0
    MODEL = 1