You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

text.h 24 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543
  1. /**
  2. * Copyright 2020-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_
  17. #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_
  18. #include <memory>
  19. #include <optional>
  20. #include <string>
  21. #include <utility>
  22. #include <vector>
  23. #include "include/api/dual_abi_helper.h"
  24. #include "include/api/status.h"
  25. #include "minddata/dataset/include/constants.h"
  26. #include "minddata/dataset/include/transforms.h"
  27. namespace mindspore {
  28. namespace dataset {
  29. class Vocab;
  30. class SentencePieceVocab;
  31. class TensorOperation;
  32. // Transform operations for text
  33. namespace text {
  34. #ifndef _WIN32
/// \brief Tokenize a scalar tensor of UTF-8 string by specific rules.
/// \notes BasicTokenizer is not supported on Windows platform yet.
class BasicTokenizer final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation on input text to
  ///   fold the text to lower case and strip accents characters. If false, only apply
  ///   NormalizeUTF8('normalization_form' mode) operation on input text (default=false).
  /// \param[in] keep_whitespace If true, the whitespace will be kept in out tokens (default=false).
  /// \param[in] normalize_form Used to specify a specific normalize mode. This is only effective when 'lower_case' is
  ///   false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
  /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]',
  ///   '[MASK]' (default=true).
  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
  explicit BasicTokenizer(bool lower_case = false, bool keep_whitespace = false,
                          const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true,
                          bool with_offsets = false);

  /// \brief Destructor
  ~BasicTokenizer() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: constructor arguments are stored in an opaque Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
/// \brief Tokenizer used for Bert text process.
/// \notes BertTokenizer is not supported on Windows platform yet.
class BertTokenizer final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] vocab A Vocab object.
  /// \param[in] suffix_indicator Used to show that the subword is the last part of a word (default='##').
  /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
  /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty
  ///   string, else return the string specified (default='[UNK]').
  /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation on input text to
  ///   fold the text to lower case and strip accents characters. If false, only apply
  ///   NormalizeUTF8('normalization_form' mode) operation on input text (default=false).
  /// \param[in] keep_whitespace If true, the whitespace will be kept in out tokens (default=false).
  /// \param[in] normalize_form Used to specify a specific normalize mode. This is only effective when 'lower_case' is
  ///   false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
  /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]',
  ///   '[MASK]' (default=true).
  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
  explicit BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##",
                         int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]",
                         bool lower_case = false, bool keep_whitespace = false,
                         const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true,
                         bool with_offsets = false)
      // Delegate to the char-vector overload: the dual-ABI helpers convert std::string so the
      // public API stays ABI-stable across compiler/STL versions.
      : BertTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token),
                      lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets) {}

  explicit BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
                         int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool lower_case,
                         bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
                         bool with_offsets);

  /// \brief Destructor
  ~BertTokenizer() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: constructor arguments are stored in an opaque Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
/// \brief Apply case fold operation on UTF-8 string tensor.
class CaseFold final : public TensorTransform {
 public:
  /// \brief Constructor.
  CaseFold();

  /// \brief Destructor
  ~CaseFold() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;
};
  115. #endif
/// \brief Tokenize Chinese string into words based on dictionary.
/// \notes The integrity of the HMMSegment algorithm and MPSegment algorithm files must be confirmed.
class JiebaTokenizer final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] hmm_path Dictionary file is used by HMMSegment algorithm. The dictionary can be obtained on the
  ///   official website of cppjieba.
  /// \param[in] mp_path Dictionary file is used by MPSegment algorithm. The dictionary can be obtained on the
  ///   official website of cppjieba.
  /// \param[in] mode Valid values can be any of [JiebaMode.MP, JiebaMode.HMM, JiebaMode.MIX] (default=JiebaMode.MIX).
  ///   - JiebaMode.kMP, tokenize with MPSegment algorithm.
  ///   - JiebaMode.kHMM, tokenize with Hidden Markov Model Segment algorithm.
  ///   - JiebaMode.kMIX, tokenize with a mix of MPSegment and HMMSegment algorithm.
  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
  explicit JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
                          const JiebaMode &mode = JiebaMode::kMix, bool with_offsets = false)
      // Delegate to the char-vector overload for dual-ABI string safety.
      : JiebaTokenizer(StringToChar(hmm_path), StringToChar(mp_path), mode, with_offsets) {}

  explicit JiebaTokenizer(const std::vector<char> &hmm_path, const std::vector<char> &mp_path, const JiebaMode &mode,
                          bool with_offsets);

  /// \brief Destructor
  ~JiebaTokenizer() = default;

  /// \brief Add user defined word to JiebaTokenizer's dictionary.
  /// \param[in] word The word to be added to the JiebaTokenizer instance.
  ///   The added word will not be written into the built-in dictionary on disk.
  /// \param[in] freq The frequency of the word to be added. The higher the frequency,
  ///   the better chance the word will be tokenized (default=0, which uses the default frequency).
  Status AddWord(const std::string &word, int64_t freq = 0) { return AddWordChar(StringToChar(word), freq); }

  /// \brief Add user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary.
  /// \param[in] user_dict Vector of word-freq pairs to be added to JiebaTokenizer's dictionary.
  Status AddDict(const std::vector<std::pair<std::string, int64_t>> &user_dict) {
    return AddDictChar(PairStringInt64ToPairCharInt64(user_dict));
  }

  /// \brief Add user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary from a file.
  ///   Only valid word-freq pairs in user provided file will be added into the dictionary.
  ///   Rows containing invalid input will be ignored, no error nor warning Status is returned.
  /// \param[in] file_path Path to the dictionary which includes user defined word-freq pairs.
  Status AddDict(const std::string &file_path) { return AddDictChar(StringToChar(file_path)); }

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  /// \brief Parser user defined word by file.
  /// \param[in] file_path Path to the user defined file.
  /// \param[in] user_dict Vector of word-freq pairs extracted from the user provided file.
  Status ParserFile(const std::string &file_path, std::vector<std::pair<std::string, int64_t>> *const user_dict);

  /// \brief Used to translate all API string to vector of char and back
  Status AddWordChar(const std::vector<char> &word, int64_t freq = 0);

  /// \brief Used to translate all API string to vector of char and back
  Status AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict);

  /// \brief Used to translate all API string to vector of char and back
  Status AddDictChar(const std::vector<char> &file_path);

  // Pimpl: constructor arguments are stored in an opaque Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
/// \brief Look up a word into an id according to the input vocabulary table.
class Lookup final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] vocab a Vocab object.
  /// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov).
  ///   If unknown_token is oov, runtime error will be thrown. If unknown_token is {}, which means that not to
  ///   specify unknown_token when word being out of Vocabulary (default={}).
  /// \param[in] data_type type of the tensor after lookup, typically int32.
  explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token = {},
                  const std::string &data_type = "int32")
      // Delegate to the char-vector overload for dual-ABI string safety.
      : Lookup(vocab, OptionalStringToChar(unknown_token), StringToChar(data_type)) {}

  explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
                  const std::vector<char> &data_type);

  /// \brief Destructor
  ~Lookup() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: constructor arguments are stored in an opaque Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
/// \brief TensorOp to generate n-gram from a 1-D string Tensor.
class Ngram final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result
  ///   would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
  ///   for a n-gram, an empty string will be returned.
  /// \param[in] left_pad {"pad_token", pad_width}. Padding performed on left side of the sequence. pad_width will
  ///   be capped at n-1. left_pad=("_", 2) would pad left side of the sequence with "__" (default={"", 0}).
  /// \param[in] right_pad {"pad_token", pad_width}. Padding performed on right side of the sequence. pad_width will
  ///   be capped at n-1. right_pad=("-", 2) would pad right side of the sequence with "--" (default={"", 0}).
  /// \param[in] separator Symbol used to join strings together (default=" ").
  explicit Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad = {"", 0},
                 const std::pair<std::string, int32_t> &right_pad = {"", 0}, const std::string &separator = " ")
      // Delegate to the char-vector overload for dual-ABI string safety.
      : Ngram(ngrams, PairStringToChar(left_pad), PairStringToChar(right_pad), StringToChar(separator)) {}

  explicit Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
                 const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator);

  /// \brief Destructor
  ~Ngram() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: constructor arguments are stored in an opaque Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
  222. #ifndef _WIN32
/// \brief Apply normalize operation on UTF-8 string tensor.
class NormalizeUTF8 final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone, NormalizeForm::kNfc,
  ///   NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd] (default=NormalizeForm::kNfkc).
  ///   See http://unicode.org/reports/tr15/ for details.
  ///   - NormalizeForm.NONE, do nothing for input string tensor.
  ///   - NormalizeForm.NFC, normalize with Normalization Form C.
  ///   - NormalizeForm.NFKC, normalize with Normalization Form KC.
  ///   - NormalizeForm.NFD, normalize with Normalization Form D.
  ///   - NormalizeForm.NFKD, normalize with Normalization Form KD.
  explicit NormalizeUTF8(NormalizeForm normalize_form = NormalizeForm::kNfkc);

  /// \brief Destructor
  ~NormalizeUTF8() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: constructor arguments are stored in an opaque Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
  247. /// \brief Replace UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
  248. class RegexReplace final : public TensorTransform {
  249. public:
  250. /// \brief Constructor.
  251. /// \param[in] pattern The regex expression patterns.
  252. /// \param[in] replace The string to replace matched element.
  253. /// \param[in] replace_all Confirm whether to replace all. If false, only replace first matched element;
  254. /// if true, replace all matched elements (default=true).
  255. explicit RegexReplace(std::string pattern, std::string replace, bool replace_all = true)
  256. : RegexReplace(StringToChar(pattern), StringToChar(replace), replace_all) {}
  257. explicit RegexReplace(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all);
  258. /// \brief Destructor
  259. ~RegexReplace() = default;
  260. protected:
  261. /// \brief Function to convert TensorTransform object into a TensorOperation object.
  262. /// \return Shared pointer to TensorOperation object.
  263. std::shared_ptr<TensorOperation> Parse() override;
  264. private:
  265. struct Data;
  266. std::shared_ptr<Data> data_;
  267. };
  268. /// \brief Tokenize a scalar tensor of UTF-8 string by regex expression pattern.
  269. class RegexTokenizer final : public TensorTransform {
  270. public:
  271. /// \brief Constructor.
  272. /// \param[in] delim_pattern The pattern of regex delimiters.
  273. /// \param[in] keep_delim_pattern The string matched by 'delim_pattern' can be kept as a token if it can be
  274. /// matched by 'keep_delim_pattern'. The default value is an empty string ("")
  275. /// which means that delimiters will not be kept as an output token (default="").
  276. /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
  277. explicit RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "", bool with_offsets = false)
  278. : RegexTokenizer(StringToChar(delim_pattern), StringToChar(keep_delim_pattern), with_offsets) {}
  279. explicit RegexTokenizer(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern,
  280. bool with_offsets);
  281. /// \brief Destructor
  282. ~RegexTokenizer() = default;
  283. protected:
  284. /// \brief Function to convert TensorTransform object into a TensorOperation object.
  285. /// \return Shared pointer to TensorOperation object.
  286. std::shared_ptr<TensorOperation> Parse() override;
  287. private:
  288. struct Data;
  289. std::shared_ptr<Data> data_;
  290. };
  291. #endif
/// \brief Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
class SentencePieceTokenizer final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] vocab a SentencePieceVocab object.
  /// \param[in] out_type The type of output.
  SentencePieceTokenizer(const std::shared_ptr<SentencePieceVocab> &vocab,
                         mindspore::dataset::SPieceTokenizerOutType out_type);

  /// \brief Constructor.
  /// \param[in] vocab_path vocab model file path.
  /// \param[in] out_type The type of output.
  SentencePieceTokenizer(const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type)
      // Delegate to the char-vector overload for dual-ABI string safety.
      : SentencePieceTokenizer(StringToChar(vocab_path), out_type) {}

  SentencePieceTokenizer(const std::vector<char> &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type);

  /// \brief Destructor
  ~SentencePieceTokenizer() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: constructor arguments are stored in an opaque Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
/// \brief TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension
///   axis is a slice of data starting at the corresponding position, with a specified width.
class SlidingWindow final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] width The width of the window. It must be an integer and greater than zero.
  /// \param[in] axis The axis along which the sliding window is computed (default=0), axis support 0 or -1 only
  ///   for now.
  explicit SlidingWindow(const int32_t width, const int32_t axis = 0);

  /// \brief Destructor
  ~SlidingWindow() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: constructor arguments are stored in an opaque Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
/// \brief Tensor operation to convert every element of a string tensor to a number.
///   Strings are cast according to the rules specified in the following links:
///   https://en.cppreference.com/w/cpp/string/basic_string/stof,
///   https://en.cppreference.com/w/cpp/string/basic_string/stoul,
///   except that any strings which represent negative numbers cannot be cast to an unsigned integer type.
class ToNumber final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] data_type of the tensor to be cast to. Must be a numeric type.
  // Delegates to the char-vector overload for dual-ABI string safety.
  explicit ToNumber(const std::string &data_type) : ToNumber(StringToChar(data_type)) {}

  explicit ToNumber(const std::vector<char> &data_type);

  /// \brief Destructor
  ~ToNumber() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: constructor arguments are stored in an opaque Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
/// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length.
class TruncateSequencePair final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] max_length Maximum length required.
  explicit TruncateSequencePair(int32_t max_length);

  /// \brief Destructor
  ~TruncateSequencePair() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: constructor arguments are stored in an opaque Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
/// \brief Tokenize a scalar tensor of UTF-8 string to Unicode characters.
class UnicodeCharTokenizer final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
  explicit UnicodeCharTokenizer(bool with_offsets = false);

  /// \brief Destructor
  ~UnicodeCharTokenizer() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: constructor arguments are stored in an opaque Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
/// \brief Tokenize scalar token or 1-D tokens to 1-D subword tokens.
class WordpieceTokenizer final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] vocab A Vocab object.
  /// \param[in] suffix_indicator Used to show that the subword is the last part of a word (default='##').
  /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
  /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty
  ///   string, else return the string specified (default='[UNK]').
  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
  explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##",
                              int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]",
                              bool with_offsets = false)
      // Delegate to the char-vector overload for dual-ABI string safety.
      : WordpieceTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token),
                           with_offsets) {}

  explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
                              int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool with_offsets);

  /// \brief Destructor
  ~WordpieceTokenizer() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: constructor arguments are stored in an opaque Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
  415. #ifndef _WIN32
/// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
class UnicodeScriptTokenizer final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] keep_whitespace Whether or not emit whitespace tokens (default=false).
  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
  explicit UnicodeScriptTokenizer(bool keep_whitespace = false, bool with_offsets = false);

  /// \brief Destructor
  ~UnicodeScriptTokenizer() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: constructor arguments are stored in an opaque Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
/// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces.
class WhitespaceTokenizer final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] with_offsets Whether or not output offsets of tokens (default=false).
  explicit WhitespaceTokenizer(bool with_offsets = false);

  /// \brief Destructor
  ~WhitespaceTokenizer() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: constructor arguments are stored in an opaque Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
  449. #endif
  450. } // namespace text
  451. } // namespace dataset
  452. } // namespace mindspore
  453. #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_