
transforms.py 43 kB

  1. # Copyright 2020-2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """
  15. The module text.transforms is based on _c_dataengine
  16. and is implemented using ICU4C and cppjieba in C++.
  17. It is a high-performance module for processing NLP text.
  18. Users can use Vocab to build their own dictionary,
  19. use appropriate tokenizers to split sentences into different tokens,
  20. and use Lookup to find the index of tokens in Vocab.
  21. .. Note::
  22. The constructor arguments of every class in this module must be saved into
  23. class attributes (self.xxx) to support save() and load().
  24. Examples:
  25. >>> text_file_dataset_dir = "/path/to/text_file_dataset_file"
  26. >>> # Create a dataset for text sentences saved as line data in a file
  27. >>> text_file_dataset = ds.TextFileDataset(text_file_dataset_dir, shuffle=False)
  28. >>> # Tokenize sentences to unicode characters
  29. >>> tokenizer = text.UnicodeCharTokenizer()
  30. >>> # Load vocabulary from list
  31. >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
  32. >>> # Use Lookup operator to map tokens to ids
  33. >>> lookup = text.Lookup(vocab)
  34. >>> text_file_dataset = text_file_dataset.map(operations=[tokenizer, lookup])
  35. >>> for i in text_file_dataset.create_dict_iterator():
  36. ... print(i)
  37. >>> # if text line in dataset_file is:
  38. >>> # 深圳欢迎您
  39. >>> # then the output will be:
  40. >>> # {'text': array([0, 1, 2, 3, 4], dtype=int32)}
  41. """
  42. import os
  43. import re
  44. import platform
  45. import numpy as np
  46. import mindspore._c_dataengine as cde
  47. import mindspore.common.dtype as mstype
  48. from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType
  49. from .validators import check_lookup, check_jieba_add_dict, \
  50. check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \
  51. check_wordpiece_tokenizer, check_regex_tokenizer, check_basic_tokenizer, check_ngram, check_pair_truncate, \
  52. check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow
  53. from ..core.datatypes import mstype_to_detype
  54. from ..core.validator_helpers import replace_none
  55. from ..transforms.c_transforms import TensorOperation
  56. class TextTensorOperation(TensorOperation):
  57. """
  58. Base class of Text Tensor Ops
  59. """
  60. def __call__(self, input_tensor):
  61. if not isinstance(input_tensor, list):
  62. input_list = [input_tensor]
  63. else:
  64. input_list = input_tensor
  65. tensor_list = []
  66. for tensor in input_list:
  67. if not isinstance(tensor, str):
  68. raise TypeError("Input should be string or list of strings, got {}.".format(type(tensor)))
  69. tensor_list.append(cde.Tensor(np.asarray(tensor)))
  70. callable_op = cde.Execute(self.parse())
  71. output_list = callable_op(tensor_list)
  72. for i, element in enumerate(output_list):
  73. arr = element.as_array()
  74. if arr.dtype.char == 'S':
  75. output_list[i] = to_str(arr)
  76. else:
  77. output_list[i] = arr
  78. if not isinstance(input_tensor, list) and len(output_list) == 1:
  79. output_list = output_list[0]
  80. return output_list
  81. def parse(self):
  82. raise NotImplementedError("TextTensorOperation has to implement parse() method.")
  83. DE_C_INTER_JIEBA_MODE = {
  84. JiebaMode.MIX: cde.JiebaMode.DE_JIEBA_MIX,
  85. JiebaMode.MP: cde.JiebaMode.DE_JIEBA_MP,
  86. JiebaMode.HMM: cde.JiebaMode.DE_JIEBA_HMM
  87. }
  88. DE_C_INTER_SENTENCEPIECE_LOADTYPE = {
  89. SPieceTokenizerLoadType.FILE: cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KFILE,
  90. SPieceTokenizerLoadType.MODEL: cde.SPieceTokenizerLoadType.DE_SPIECE_TOKENIZER_LOAD_KMODEL
  91. }
  92. DE_C_INTER_SENTENCEPIECE_OUTTYPE = {
  93. SPieceTokenizerOutType.STRING: cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KString,
  94. SPieceTokenizerOutType.INT: cde.SPieceTokenizerOutType.DE_SPIECE_TOKENIZER_OUTTYPE_KINT
  95. }
  96. class JiebaTokenizer(TextTensorOperation):
  97. """
  98. Tokenize Chinese string into words based on dictionary.
  99. Note:
  100. The integrity of the HMMSegment algorithm and MPSegment algorithm files must be confirmed.
  101. Args:
  102. hmm_path (str): Dictionary file used by the HMMSegment algorithm.
  103. The dictionary can be obtained on the official website of cppjieba.
  104. mp_path (str): Dictionary file used by the MPSegment algorithm.
  105. The dictionary can be obtained on the official website of cppjieba.
  106. mode (JiebaMode, optional): Valid values can be any of [JiebaMode.MP, JiebaMode.HMM,
  107. JiebaMode.MIX](default=JiebaMode.MIX).
  108. - JiebaMode.MP, tokenize with MPSegment algorithm.
  109. - JiebaMode.HMM, tokenize with Hidden Markov Model Segment algorithm.
  110. - JiebaMode.MIX, tokenize with a mix of the MPSegment and HMMSegment algorithms.
  111. with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
  112. Examples:
  113. >>> from mindspore.dataset.text import JiebaMode
  114. >>> # If with_offsets=False, default output one column {["text", dtype=str]}
  115. >>> jieba_hmm_file = "/path/to/jieba/hmm/file"
  116. >>> jieba_mp_file = "/path/to/jieba/mp/file"
  117. >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=False)
  118. >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
  119. >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
  120. ... # ["offsets_limit", dtype=uint32]}
  121. >>> tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP, with_offsets=True)
  122. >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
  123. ... output_columns=["token", "offsets_start", "offsets_limit"],
  124. ... column_order=["token", "offsets_start", "offsets_limit"])
  125. """
  126. @check_jieba_init
  127. def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX, with_offsets=False):
  128. if not isinstance(mode, JiebaMode):
  129. raise TypeError("Wrong input type for mode, should be JiebaMode.")
  130. self.mode = mode
  131. self.__check_path__(hmm_path)
  132. self.hmm_path = hmm_path
  133. self.__check_path__(mp_path)
  134. self.mp_path = mp_path
  135. self.with_offsets = with_offsets
  136. self.words = []
  137. def parse(self):
  138. jieba_tokenizer = cde.JiebaTokenizerOperation(self.hmm_path, self.mp_path,
  139. DE_C_INTER_JIEBA_MODE[self.mode],
  140. self.with_offsets)
  141. for word in self.words:
  142. jieba_tokenizer.add_word(word[0], word[1])
  143. return jieba_tokenizer
  144. @check_jieba_add_word
  145. def add_word(self, word, freq=None):
  146. """
  147. Add user defined word to JiebaTokenizer's dictionary.
  148. Args:
  149. word (str): The word to be added to the JiebaTokenizer instance.
  150. The added word will not be written into the built-in dictionary on disk.
  151. freq (int, optional): The frequency of the word to be added. The higher the frequency,
  152. the better chance the word will be tokenized (default=None, use default frequency).
  153. Examples:
  154. >>> from mindspore.dataset.text import JiebaMode
  155. >>> jieba_hmm_file = "/path/to/jieba/hmm/file"
  156. >>> jieba_mp_file = "/path/to/jieba/mp/file"
  157. >>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=text.JiebaMode.MP)
  158. >>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
  159. >>> with open(sentence_piece_vocab_file, 'r') as f:
  160. ...     for line in f:
  161. ... word = line.split(',')[0]
  162. ... jieba_op.add_word(word)
  163. >>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"])
  164. """
  165. if freq is None:
  166. self.words.append((word, 0))
  167. else:
  168. self.words.append((word, freq))
  169. @check_jieba_add_dict
  170. def add_dict(self, user_dict):
  171. """
  172. Add a user defined dictionary of words to JiebaTokenizer's dictionary.
  173. Args:
  174. user_dict (Union[str, dict]): One of two loading methods: a file path (str) pointing to a
  175. dictionary in the Jieba dictionary format, or a Python dictionary (dict) in the
  176. format {word1: freq1, word2: freq2, ...}.
  177. Jieba dictionary format: word (required), freq (optional), such as:
  178. .. code-block::
  179. word1 freq1
  180. word2 None
  181. word3 freq3
  182. Examples:
  183. >>> from mindspore.dataset.text import JiebaMode
  184. >>> jieba_hmm_file = "/path/to/jieba/hmm/file"
  185. >>> jieba_mp_file = "/path/to/jieba/mp/file"
  186. >>> user_dict = {"男默女泪": 10}
  187. >>> jieba_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP)
  188. >>> jieba_op.add_dict(user_dict)
  189. >>> text_file_dataset = text_file_dataset.map(operations=jieba_op, input_columns=["text"])
  190. """
  191. if isinstance(user_dict, str):
  192. self.__add_dict_py_file(user_dict)
  193. elif isinstance(user_dict, dict):
  194. for k, v in user_dict.items():
  195. self.add_word(k, v)
  196. else:
  197. raise TypeError("The type of user_dict must be str or dict.")
  198. def __add_dict_py_file(self, file_path):
  199. """Add user defined word by file"""
  200. words_list = self.__parser_file(file_path)
  201. for data in words_list:
  202. if data[1] is None:
  203. freq = 0
  204. else:
  205. freq = int(data[1])
  206. self.add_word(data[0], freq)
  207. def __parser_file(self, file_path):
  208. """parser user defined word by file"""
  209. if not os.path.exists(file_path):
  210. raise ValueError(
  211. "user dict file {} is not exist.".format(file_path))
  212. real_file_path = os.path.realpath(file_path)
  213. file_dict = open(real_file_path)
  214. data_re = re.compile('^(.+?)( [0-9]+)?$', re.U)
  215. words_list = []
  216. for item in file_dict:
  217. data = item.strip()
  218. if not isinstance(data, str):
  219. data = self.__decode(data)
  220. words = data_re.match(data).groups()
  221. if len(words) != 2:
  222. raise ValueError(
  223. "user dict file {} format error.".format(real_file_path))
  224. words_list.append(words)
  225. file_dict.close()
  226. return words_list
  227. def __decode(self, data):
  228. """decode the dict file to utf8"""
  229. try:
  230. data = data.decode('utf-8')
  231. except UnicodeDecodeError:
  232. raise ValueError("user dict file must be in utf8 format.")
  233. return data.lstrip('\ufeff')
  234. def __check_path__(self, model_path):
  235. """check model path"""
  236. if not os.path.exists(model_path):
  237. raise ValueError(
  238. " jieba mode file {} is not exist.".format(model_path))
  239. class Lookup(TextTensorOperation):
  240. """
  241. Look up a word and convert it to an id according to the input vocabulary table.
  242. Args:
  243. vocab (Vocab): A vocabulary object.
  244. unknown_token (str, optional): Word used for lookup when the word being looked up is out-of-vocabulary (OOV).
  245. If unknown_token is not specified or is itself OOV, a runtime error will be thrown when an OOV word
  246. is encountered (default=None).
  246. data_type (mindspore.dtype, optional): The data type that Lookup maps strings to (default=mindspore.int32).
  247. Examples:
  248. >>> # Load vocabulary from list
  249. >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
  250. >>> # Use Lookup operator to map tokens to ids
  251. >>> lookup = text.Lookup(vocab)
  252. >>> text_file_dataset = text_file_dataset.map(operations=[lookup])
  253. """
  254. @check_lookup
  255. def __init__(self, vocab, unknown_token=None, data_type=mstype.int32):
  256. self.vocab = vocab
  257. self.unknown_token = unknown_token
  258. self.data_type = data_type
  259. def parse(self):
  260. return cde.LookupOperation(self.vocab, self.unknown_token, str(mstype_to_detype(self.data_type)))
  261. class Ngram(TextTensorOperation):
  262. """
  263. TensorOp to generate n-gram from a 1-D string Tensor.
  264. Refer to https://en.wikipedia.org/wiki/N-gram#Examples for an overview of what n-gram is and how it works.
  265. Args:
  266. n (list[int]): n in n-gram, which is a list of positive integers. For example, if n=[4, 3], then the result
  267. would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
  268. an n-gram, an empty string will be returned. For example, applying 3-gram to ["mindspore", "best"] will result in
  269. an empty string being produced.
  270. left_pad (tuple, optional): Padding performed on left side of the sequence shaped like ("pad_token", pad_width).
  271. `pad_width` will be capped at n-1. For example, specifying left_pad=("_", 2) would pad left side of the
  272. sequence with "__" (default=None).
  273. right_pad (tuple, optional): Padding performed on right side of the sequence shaped like
  274. ("pad_token", pad_width). `pad_width` will be capped at n-1. For example, specifying right_pad=("-", 2)
  275. would pad right side of the sequence with "--" (default=None).
  276. separator (str, optional): Symbol used to join strings together. For example, if the 2-gram is
  277. ["mindspore", "amazing"] and separator="-", the result would be ["mindspore-amazing"]
  278. (default=" ", which uses a space character as the separator).
  279. Examples:
  280. >>> text_file_dataset = text_file_dataset.map(operations=text.Ngram(3, separator=""))
  281. """
  282. @check_ngram
  283. def __init__(self, n, left_pad=("", 0), right_pad=("", 0), separator=" "):
  284. self.ngrams = n
  285. self.left_pad = left_pad
  286. self.right_pad = right_pad
  287. self.separator = separator
  288. def parse(self):
  289. return cde.NgramOperation(self.ngrams, self.left_pad, self.right_pad, self.separator)
  290. class SentencePieceTokenizer(TextTensorOperation):
  291. """
  292. Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
  293. Args:
  294. mode (Union[str, SentencePieceVocab]): SentencePiece model to use. If a string is passed, it is treated
  295. as the path of a SentencePiece model file; if a SentencePieceVocab object is passed, it is used directly.
  296. out_type (Union[str, int]): The type of output.
  297. Examples:
  298. >>> from mindspore.dataset.text import SentencePieceModel, SPieceTokenizerOutType
  299. >>> sentence_piece_vocab_file = "/path/to/sentence/piece/vocab/file"
  300. >>> vocab = text.SentencePieceVocab.from_file([sentence_piece_vocab_file], 5000, 0.9995,
  301. ... SentencePieceModel.UNIGRAM, {})
  302. >>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
  303. >>> text_file_dataset = text_file_dataset.map(operations=tokenizer)
  304. """
  305. def __init__(self, mode, out_type):
  306. self.mode = mode
  307. self.out_type = out_type
  308. def parse(self):
  309. return cde.SentencePieceTokenizerOperation(self.mode, DE_C_INTER_SENTENCEPIECE_OUTTYPE[self.out_type])
  310. class SlidingWindow(TextTensorOperation):
  311. """
  312. TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension axis
  313. is a slice of data starting at the corresponding position, with a specified width.
  314. Args:
  315. width (int): The width of the window. It must be an integer and greater than zero.
  316. axis (int, optional): The axis along which the sliding window is computed (default=0).
  317. Examples:
  318. >>> import mindspore.dataset.text as text
  319. >>>
  320. >>> # Data before
  321. >>> # | col1 |
  322. >>> # +-------------+
  323. >>> # | [1,2,3,4,5] |
  324. >>> # +-------------+
  325. >>> data1 = data1.map(operations=text.SlidingWindow(3, 0))
  326. >>> # Data after
  327. >>> # | col1 |
  328. >>> # +-------------+
  329. >>> # | [[1,2,3], |
  330. >>> # | [2,3,4], |
  331. >>> # | [3,4,5]] |
  332. >>> # +-------------+
  333. """
  334. @check_slidingwindow
  335. def __init__(self, width, axis=0):
  336. self.width = width
  337. self.axis = axis
  338. def parse(self):
  339. return cde.SlidingWindowOperation(self.width, self.axis)
  340. class ToNumber(TextTensorOperation):
  341. """
  342. Tensor operation to convert every element of a string tensor to a number.
  343. Strings are cast according to the rules specified in the following links:
  344. https://en.cppreference.com/w/cpp/string/basic_string/stof,
  345. https://en.cppreference.com/w/cpp/string/basic_string/stoul,
  346. except that any strings which represent negative numbers cannot be cast to an
  347. unsigned integer type.
  348. Args:
  349. data_type (mindspore.dtype): mindspore.dtype to be cast to. Must be
  350. a numeric type.
  351. Raises:
  352. RuntimeError: If strings are invalid to cast, or are out of range after being cast.
  353. Examples:
  354. >>> import mindspore.common.dtype as mstype
  355. >>> data = [["1", "2", "3"]]
  356. >>> dataset = ds.NumpySlicesDataset(data)
  357. >>> to_number_op = text.ToNumber(mstype.int8)
  358. >>> dataset = dataset.map(operations=to_number_op)
  359. """
  360. @check_to_number
  361. def __init__(self, data_type):
  362. data_type = mstype_to_detype(data_type)
  363. self.data_type = str(data_type)
  364. def parse(self):
  365. return cde.ToNumberOperation(self.data_type)
  366. class TruncateSequencePair(TextTensorOperation):
  367. """
  368. Truncate a pair of rank-1 tensors such that the total length is less than max_length.
  369. This operation takes two input tensors and returns two output Tensors.
  370. Args:
  371. max_length (int): Maximum length required.
  372. Examples:
  373. >>> import mindspore.dataset.text as text
  374. >>>
  375. >>> # Data before
  376. >>> # | col1 | col2 |
  377. >>> # +---------+---------|
  378. >>> # | [1,2,3] | [4,5] |
  379. >>> # +---------+---------+
  380. >>> data1 = data1.map(operations=text.TruncateSequencePair(4))
  381. >>> # Data after
  382. >>> # | col1 | col2 |
  383. >>> # +---------+---------+
  384. >>> # | [1,2] | [4,5] |
  385. >>> # +---------+---------+
  386. """
  387. @check_pair_truncate
  388. def __init__(self, max_length):
  389. self.max_length = max_length
  390. def parse(self):
  391. return cde.TruncateSequencePairOperation(self.max_length)
  392. class UnicodeCharTokenizer(TextTensorOperation):
  393. """
  394. Tokenize a scalar tensor of UTF-8 string to Unicode characters.
  395. Args:
  396. with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
  397. Examples:
  398. >>> import mindspore.dataset.text as text
  399. >>>
  400. >>> # If with_offsets=False, default output one column {["text", dtype=str]}
  401. >>> tokenizer_op = text.UnicodeCharTokenizer()
  402. >>> data1 = data1.map(operations=tokenizer_op)
  403. >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
  404. >>> # ["offsets_limit", dtype=uint32]}
  405. >>> tokenizer_op = text.UnicodeCharTokenizer(True)
  406. >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
  407. >>> output_columns=["token", "offsets_start", "offsets_limit"],
  408. >>> column_order=["token", "offsets_start", "offsets_limit"])
  409. """
  410. @check_with_offsets
  411. def __init__(self, with_offsets=False):
  412. self.with_offsets = with_offsets
  413. def parse(self):
  414. return cde.UnicodeCharTokenizerOperation(self.with_offsets)
  415. # TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++
  416. class WordpieceTokenizer(cde.WordpieceTokenizerOp):
  417. """
  418. Tokenize scalar token or 1-D tokens to 1-D subword tokens.
  419. Args:
  420. vocab (Vocab): A vocabulary object.
  421. suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
  422. max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
  423. unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is empty string,
  424. return the token directly, else return 'unknown_token' (default='[UNK]').
  425. with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
  426. Examples:
  427. >>> import mindspore.dataset.text as text
  428. >>>
  429. >>> # If with_offsets=False, default output one column {["text", dtype=str]}
  430. >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
  431. ... max_bytes_per_token=100, with_offsets=False)
  432. >>> data1 = data1.map(operations=tokenizer_op)
  433. >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
  434. >>> # ["offsets_limit", dtype=uint32]}
  435. >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
  436. ... max_bytes_per_token=100, with_offsets=True)
  437. >>> data2 = data2.map(operations=tokenizer_op,
  438. ... input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
  439. ... column_order=["token", "offsets_start", "offsets_limit"])
  440. """
  441. @check_wordpiece_tokenizer
  442. def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100,
  443. unknown_token='[UNK]', with_offsets=False):
  444. self.vocab = vocab
  445. self.suffix_indicator = suffix_indicator
  446. self.max_bytes_per_token = max_bytes_per_token
  447. self.unknown_token = unknown_token
  448. self.with_offsets = with_offsets
  449. super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token,
  450. self.unknown_token, self.with_offsets)
  451. class PythonTokenizer:
  452. """
  453. Callable class that applies a user-defined string tokenizer.
  454. Args:
  455. tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens.
  456. Examples:
  457. >>> def my_tokenizer(line):
  458. ... return line.split()
  459. >>> text_file_dataset = text_file_dataset.map(operations=text.PythonTokenizer(my_tokenizer))
  460. """
  461. @check_python_tokenizer
  462. def __init__(self, tokenizer):
  463. self.tokenizer = np.vectorize(lambda x: np.array(tokenizer(x), dtype='U'), signature='()->(n)')
  464. self.random = False
  465. def __call__(self, in_array):
  466. in_array = to_str(in_array)
  467. tokens = self.tokenizer(in_array)
  468. return tokens
  469. if platform.system().lower() != 'windows':
  470. DE_C_INTER_NORMALIZE_FORM = {
  471. NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE,
  472. NormalizeForm.NFC: cde.NormalizeForm.DE_NORMALIZE_NFC,
  473. NormalizeForm.NFKC: cde.NormalizeForm.DE_NORMALIZE_NFKC,
  474. NormalizeForm.NFD: cde.NormalizeForm.DE_NORMALIZE_NFD,
  475. NormalizeForm.NFKD: cde.NormalizeForm.DE_NORMALIZE_NFKD
  476. }
  477. class BasicTokenizer(TextTensorOperation):
  478. """
  479. Tokenize a scalar tensor of UTF-8 string by specific rules.
  480. Note:
  481. BasicTokenizer is not supported on Windows platform yet.
  482. Args:
  483. lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 with `NFD` mode, RegexReplace operation
  484. on input text to fold the text to lower case and strip accent characters. If False, only apply
  485. NormalizeUTF8 operation with the specified mode on input text (default=False).
  486. keep_whitespace (bool, optional): If True, the whitespace will be kept in output tokens (default=False).
  487. normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode. This is
  488. only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
  489. preserve_unused_token (bool, optional): If True, do not split special tokens like
  490. '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
  491. with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
  492. Examples:
  493. >>> # If with_offsets=False, default output one column {["text", dtype=str]}
  494. >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
  495. ... keep_whitespace=False,
  496. ... normalization_form=NormalizeForm.NONE,
  497. ... preserve_unused_token=True,
  498. ... with_offsets=False)
  499. >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
  500. >>> # If with_offsets=True, then output three columns {["token", dtype=str],
  501. >>> # ["offsets_start", dtype=uint32],
  502. >>> # ["offsets_limit", dtype=uint32]}
  503. >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
  504. ... keep_whitespace=False,
  505. ... normalization_form=NormalizeForm.NONE,
  506. ... preserve_unused_token=True,
  507. ... with_offsets=True)
  508. >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
  509. ... output_columns=["token", "offsets_start",
  510. ... "offsets_limit"],
  511. ... column_order=["token", "offsets_start",
  512. ... "offsets_limit"])
  513. """
  514. @check_basic_tokenizer
  515. def __init__(self, lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
  516. preserve_unused_token=True, with_offsets=False):
  517. if not isinstance(normalization_form, NormalizeForm):
  518. raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.")
  519. self.lower_case = lower_case
  520. self.keep_whitespace = keep_whitespace
  521. self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
  522. self.preserve_unused_token = preserve_unused_token
  523. self.with_offsets = with_offsets
  524. def parse(self):
  525. return cde.BasicTokenizerOperation(self.lower_case, self.keep_whitespace, self.normalization_form,
  526. self.preserve_unused_token, self.with_offsets)
  527. class BertTokenizer(TextTensorOperation):
  528. """
  529. Tokenizer used for Bert text process.
  530. Note:
  531. BertTokenizer is not supported on Windows platform yet.
  532. Args:
  533. vocab (Vocab): A vocabulary object.
  534. suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
  535. max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
  536. unknown_token (str, optional): When an unknown token is found, return the token directly if `unknown_token`
  537. is an empty string, else return `unknown_token` instead (default='[UNK]').
  538. lower_case (bool, optional): If True, apply CaseFold, NormalizeUTF8 with `NFD` mode, RegexReplace operation
  539. on input text to fold the text to lower case and strip accented characters. If False, only apply
  540. NormalizeUTF8 operation with the specified mode on input text (default=False).
  541. keep_whitespace (bool, optional): If True, the whitespace will be kept in output tokens (default=False).
  542. normalization_form (NormalizeForm, optional): Used to specify a specific normalize mode,
  543. only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE).
  544. preserve_unused_token (bool, optional): If True, do not split special tokens like
  545. '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
  546. with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
  547. Examples:
  548. >>> from mindspore.dataset.text import NormalizeForm
  549. >>> # If with_offsets=False, default output one column {["text", dtype=str]}
  550. >>> vocab_list = ["床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头", "望", "低",
  551. ... "思", "故", "乡","繁", "體", "字", "嘿", "哈", "大", "笑", "嘻", "i", "am", "mak",
  552. ... "make", "small", "mistake", "##s", "during", "work", "##ing", "hour", "😀", "😃",
  553. ... "😄", "😁", "+", "/", "-", "=", "12", "28", "40", "16", " ", "I", "[CLS]", "[SEP]",
  554. ... "[UNK]", "[PAD]", "[MASK]", "[unused1]", "[unused10]"]
  555. >>> vocab = text.Vocab.from_list(vocab_list)
  556. >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
  557. ... unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
  558. ... normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
  559. ... with_offsets=False)
  560. >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
  561. >>> # If with_offsets=True, then output three columns {["token", dtype=str],
  562. >>> # ["offsets_start", dtype=uint32],
  563. >>> # ["offsets_limit", dtype=uint32]}
  564. >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
  565. ... unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
  566. ... normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
  567. ... with_offsets=True)
  568. >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
  569. ... output_columns=["token", "offsets_start",
  570. ... "offsets_limit"],
  571. ... column_order=["token", "offsets_start",
  572. ... "offsets_limit"])
  573. """
  574. @check_bert_tokenizer
  575. def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]',
  576. lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
  577. preserve_unused_token=True, with_offsets=False):
  578. if not isinstance(normalization_form, NormalizeForm):
  579. raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.")
  580. self.vocab = vocab
  581. self.suffix_indicator = suffix_indicator
  582. self.max_bytes_per_token = max_bytes_per_token
  583. self.unknown_token = unknown_token
  584. self.lower_case = lower_case
  585. self.keep_whitespace = keep_whitespace
  586. self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
  587. self.preserve_unused_token = preserve_unused_token
  588. self.with_offsets = with_offsets
  589. def parse(self):
  590. return cde.BertTokenizerOperation(self.vocab, self.suffix_indicator, self.max_bytes_per_token,
  591. self.unknown_token, self.lower_case, self.keep_whitespace,
  592. self.normalization_form, self.preserve_unused_token, self.with_offsets)
  593. class CaseFold(TextTensorOperation):
  594. """
  595. Apply case fold operation on UTF-8 string tensor. This is an aggressive normalization that converts more
  596. characters into lower case than simple lowercasing.
  597. Note:
  598. CaseFold is not supported on Windows platform yet.
  599. Examples:
  600. >>> case_op = text.CaseFold()
  601. >>> text_file_dataset = text_file_dataset.map(operations=case_op)
  602. """
  603. def parse(self):
  604. return cde.CaseFoldOperation()
  605. class NormalizeUTF8(TextTensorOperation):
  606. """
  607. Apply normalize operation on UTF-8 string tensor.
  608. Note:
  609. NormalizeUTF8 is not supported on Windows platform yet.
  610. Args:
  611. normalize_form (NormalizeForm, optional): Valid values can be any of [NormalizeForm.NONE,
  612. NormalizeForm.NFC, NormalizeForm.NFKC, NormalizeForm.NFD,
  613. NormalizeForm.NFKD](default=NormalizeForm.NFKC).
  614. See http://unicode.org/reports/tr15/ for details.
  615. - NormalizeForm.NONE, do nothing for input string tensor.
  616. - NormalizeForm.NFC, normalize with Normalization Form C.
  617. - NormalizeForm.NFKC, normalize with Normalization Form KC.
  618. - NormalizeForm.NFD, normalize with Normalization Form D.
  619. - NormalizeForm.NFKD, normalize with Normalization Form KD.
  620. Examples:
  621. >>> from mindspore.dataset.text import NormalizeForm
  622. >>> normalize_op = text.NormalizeUTF8(normalize_form=NormalizeForm.NFC)
  623. >>> text_file_dataset = text_file_dataset.map(operations=normalize_op)
  624. """
  625. def __init__(self, normalize_form=NormalizeForm.NFKC):
  626. if not isinstance(normalize_form, NormalizeForm):
  627. raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.")
  628. normalize_form = replace_none(normalize_form, NormalizeForm.NFKC)
  629. self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form]
  630. def parse(self):
  631. return cde.NormalizeUTF8Operation(self.normalize_form)
  632. class RegexReplace(TextTensorOperation):
  633. """
  634. Replace the parts of a UTF-8 string tensor that match the regular expression 'pattern' with 'replace'.
  635. See http://userguide.icu-project.org/strings/regexp for supported regex patterns.
  636. Note:
  637. RegexReplace is not supported on Windows platform yet.
  638. Args:
  639. pattern (str): The regular expression pattern.
  640. replace (str): The string to replace the matched element with.
  641. replace_all (bool, optional): If False, only replace first matched element;
  642. if True, replace all matched elements (default=True).
  643. Examples:
  644. >>> pattern = 'Canada'
  645. >>> replace = 'China'
  646. >>> replace_op = text.RegexReplace(pattern, replace)
  647. >>> text_file_dataset = text_file_dataset.map(operations=replace_op)
  648. """
  649. def __init__(self, pattern, replace, replace_all=True):
  650. self.pattern = pattern
  651. self.replace = replace
  652. self.replace_all = replace_all
  653. def parse(self):
  654. return cde.RegexReplaceOperation(self.pattern, self.replace, self.replace_all)
  655. class RegexTokenizer(TextTensorOperation):
  656. """
  657. Tokenize a scalar tensor of UTF-8 string by regex expression pattern.
  658. See http://userguide.icu-project.org/strings/regexp for supported regex patterns.
  659. Note:
  660. RegexTokenizer is not supported on Windows platform yet.
  661. Args:
  662. delim_pattern (str): The pattern of regex delimiters.
  663. The original string will be split by matched elements.
  664. keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
  665. if it can be matched by 'keep_delim_pattern'. The default value is an empty str ('')
  666. which means that delimiters will not be kept as an output token (default='').
  667. with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
  668. Examples:
  669. >>> # If with_offsets=False, default output one column {["text", dtype=str]}
  670. >>> delim_pattern = r"[ |,]"
  671. >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=False)
  672. >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
  673. >>> # If with_offsets=True, then output three columns {["token", dtype=str],
  674. >>> # ["offsets_start", dtype=uint32],
  675. >>> # ["offsets_limit", dtype=uint32]}
  676. >>> tokenizer_op = text.RegexTokenizer(delim_pattern, with_offsets=True)
  677. >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
  678. ... output_columns=["token", "offsets_start",
  679. ... "offsets_limit"],
  680. ... column_order=["token", "offsets_start",
  681. ... "offsets_limit"])
  682. """
  683. @check_regex_tokenizer
  684. def __init__(self, delim_pattern, keep_delim_pattern='', with_offsets=False):
  685. self.delim_pattern = delim_pattern
  686. self.keep_delim_pattern = keep_delim_pattern
  687. self.with_offsets = with_offsets
  688. def parse(self):
  689. return cde.RegexTokenizerOperation(self.delim_pattern, self.keep_delim_pattern, self.with_offsets)
  690. class UnicodeScriptTokenizer(TextTensorOperation):
  691. """
  692. Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
  693. Note:
  694. UnicodeScriptTokenizer is not supported on Windows platform yet.
  695. Args:
  696. keep_whitespace (bool, optional): Whether or not to emit whitespace tokens (default=False).
  697. with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
  698. Examples:
  699. >>> # If with_offsets=False, default output one column {["text", dtype=str]}
  700. >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False)
  701. >>> text_file_dataset = text_file_dataset.map(operations=tokenizer_op)
  702. >>> # If with_offsets=True, then output three columns {["token", dtype=str],
  703. >>> # ["offsets_start", dtype=uint32],
  704. >>> # ["offsets_limit", dtype=uint32]}
  705. >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
  706. >>> text_file_dataset_1 = text_file_dataset_1.map(operations=tokenizer_op, input_columns=["text"],
  707. ... output_columns=["token", "offsets_start",
  708. ... "offsets_limit"],
  709. ... column_order=["token", "offsets_start",
  710. ... "offsets_limit"])
  711. """
  712. @check_unicode_script_tokenizer
  713. def __init__(self, keep_whitespace=False, with_offsets=False):
  714. keep_whitespace = replace_none(keep_whitespace, False)
  715. with_offsets = replace_none(with_offsets, False)
  716. self.keep_whitespace = keep_whitespace
  717. self.with_offsets = with_offsets
  718. def parse(self):
  719. return cde.UnicodeScriptTokenizerOperation(self.keep_whitespace, self.with_offsets)
  720. class WhitespaceTokenizer(TextTensorOperation):
  721. """
  722. Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces, such as: ' ', '\\\\t', '\\\\r', '\\\\n'.
  723. Note:
  724. WhitespaceTokenizer is not supported on Windows platform yet.
  725. Args:
  726. with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).
  727. Examples:
  728. >>> # If with_offsets=False, default output one column {["text", dtype=str]}
  729. >>> tokenizer_op = text.WhitespaceTokenizer()
  730. >>> data1 = data1.map(operations=tokenizer_op)
  731. >>> # If with_offsets=True, then output three columns {["token", dtype=str],
  732. >>> # ["offsets_start", dtype=uint32],
  733. >>> # ["offsets_limit", dtype=uint32]}
  734. >>> tokenizer_op = text.WhitespaceTokenizer(True)
  735. >>> data2 = data2.map(operations=tokenizer_op, input_columns=["text"],
  736. >>> output_columns=["token", "offsets_start", "offsets_limit"],
  737. >>> column_order=["token", "offsets_start", "offsets_limit"])
  738. """
  739. @check_with_offsets
  740. def __init__(self, with_offsets=False):
  741. self.with_offsets = with_offsets
  742. def parse(self):
  743. return cde.WhitespaceTokenizerOperation(self.with_offsets)
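# A minimal eager-mode sketch (illustrative, not part of the upstream file). It exercises the
# __call__ path of TextTensorOperation defined above, which accepts a plain Python string or a
# list of strings. Exact output types can vary between MindSpore versions, so the comments below
# describe expected results rather than guaranteed ones; Vocab is assumed to be importable from
# the mindspore.dataset.text package, and this block is assumed to be run with
# `python -m mindspore.dataset.text.transforms` so that the relative imports above resolve.
if __name__ == "__main__":
    from mindspore.dataset.text import Vocab

    # Tokenize a single sentence into Unicode characters without building a dataset pipeline.
    chars = UnicodeCharTokenizer()("深圳欢迎您")
    print(chars)  # expected: ['深' '圳' '欢' '迎' '您']

    # Look up each character id eagerly; every list element is processed as its own scalar tensor.
    vocab = Vocab.from_list(['深', '圳', '欢', '迎', '您'])
    ids = Lookup(vocab)([str(c) for c in chars])
    print(ids)  # expected: ids 0..4, one per character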