You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

text.h 23 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_
  17. #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_
  18. #include <memory>
  19. #include <string>
  20. #include <utility>
  21. #include <vector>
  22. #include "mindspore/ccsrc/minddata/dataset/core/data_type.h"
  23. #include "minddata/dataset/core/constants.h"
  24. #include "minddata/dataset/include/transforms.h"
  25. #include "minddata/dataset/util/status.h"
  26. #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
  27. #include "minddata/dataset/text/sentence_piece_vocab.h"
  28. #include "minddata/dataset/text/vocab.h"
  29. namespace mindspore {
  30. namespace dataset {
  31. // Transform operations for text
  32. namespace text {
// Registered names of the text TensorOperation classes (in alphabetical order).
// Each Name() override below returns the matching constant.
constexpr char kBasicTokenizerOperation[] = "BasicTokenizer";
constexpr char kBertTokenizerOperation[] = "BertTokenizer";
constexpr char kCaseFoldOperation[] = "CaseFold";
constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer";
constexpr char kLookupOperation[] = "Lookup";
constexpr char kNgramOperation[] = "Ngram";
constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8";
constexpr char kRegexReplaceOperation[] = "RegexReplace";
constexpr char kRegexTokenizerOperation[] = "RegexTokenizer";
constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer";
constexpr char kSlidingWindowOperation[] = "SlidingWindow";
constexpr char kToNumberOperation[] = "ToNumber";
constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair";
constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer";
constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer";
// Text Op classes (in alphabetical order).
// NOTE: classes wrapped in '#ifndef _WIN32' are not supported on the Windows platform yet
// (see the per-operation \notes on the factory functions below).
#ifndef _WIN32
class BasicTokenizerOperation;
class BertTokenizerOperation;
class CaseFoldOperation;
#endif
class JiebaTokenizerOperation;
class LookupOperation;
class NgramOperation;
#ifndef _WIN32
class NormalizeUTF8Operation;
class RegexReplaceOperation;
class RegexTokenizerOperation;
#endif
class SentencePieceTokenizerOperation;
class SlidingWindowOperation;
class ToNumberOperation;
class TruncateSequencePairOperation;
class UnicodeCharTokenizerOperation;
#ifndef _WIN32
class UnicodeScriptTokenizerOperation;
class WhitespaceTokenizerOperation;
#endif
#ifndef _WIN32
/// \brief Tokenize a scalar tensor of UTF-8 string by specific rules.
/// \notes BasicTokenizer is not supported on Windows platform yet.
/// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations on the input
///     text to fold the text to lower case and strip accented characters. If false, only apply the
///     NormalizeUTF8('normalization_form' mode) operation on the input text (default=false).
/// \param[in] keep_whitespace If true, the whitespace will be kept in the output tokens (default=false).
/// \param[in] normalize_form Used to specify a specific normalize mode. This is only effective when 'lower_case' is
///     false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
/// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]',
///     '[MASK]' (default=true).
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<BasicTokenizerOperation> BasicTokenizer(bool lower_case = false, bool keep_whitespace = false,
                                                        const NormalizeForm normalize_form = NormalizeForm::kNone,
                                                        bool preserve_unused_token = true, bool with_offsets = false);

/// \brief Tokenizer used for Bert text process.
/// \notes BertTokenizer is not supported on Windows platform yet.
/// \param[in] vocab A Vocab object.
/// \param[in] suffix_indicator Used to show that the subword is the last part of a word (default='##').
/// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
/// \param[in] unknown_token When a token cannot be found: if 'unknown_token' is an empty string, return the token
///     directly; else return the string specified (default='[UNK]').
/// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations on the input
///     text to fold the text to lower case and strip accented characters. If false, only apply the
///     NormalizeUTF8('normalization_form' mode) operation on the input text (default=false).
/// \param[in] keep_whitespace If true, the whitespace will be kept in the output tokens (default=false).
/// \param[in] normalize_form Used to specify a specific normalize mode. This is only effective when 'lower_case' is
///     false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
/// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]',
///     '[MASK]' (default=true).
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<BertTokenizerOperation> BertTokenizer(const std::shared_ptr<Vocab> &vocab,
                                                      const std::string &suffix_indicator = "##",
                                                      int32_t max_bytes_per_token = 100,
                                                      const std::string &unknown_token = "[UNK]",
                                                      bool lower_case = false, bool keep_whitespace = false,
                                                      const NormalizeForm normalize_form = NormalizeForm::kNone,
                                                      bool preserve_unused_token = true, bool with_offsets = false);

/// \brief Apply case fold operation on a UTF-8 string tensor.
/// \notes CaseFold is not supported on Windows platform yet.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<CaseFoldOperation> CaseFold();
#endif
/// \brief Tokenize a Chinese string into words based on a dictionary.
/// \notes The integrity of the HMMSegment algorithm and MPSegment algorithm files must be confirmed.
/// \param[in] hmm_path Dictionary file used by the HMMSegment algorithm. The dictionary can be obtained on the
///     official website of cppjieba.
/// \param[in] mp_path Dictionary file used by the MPSegment algorithm. The dictionary can be obtained on the
///     official website of cppjieba.
/// \param[in] mode Valid values can be any of [JiebaMode::kMp, JiebaMode::kHmm, JiebaMode::kMix]
///     (default=JiebaMode::kMix).
///     - JiebaMode::kMp, tokenize with the MPSegment algorithm.
///     - JiebaMode::kHmm, tokenize with the Hidden Markov Model Segment algorithm.
///     - JiebaMode::kMix, tokenize with a mix of the MPSegment and HMMSegment algorithms.
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
                                                        const JiebaMode &mode = JiebaMode::kMix,
                                                        bool with_offsets = false);

/// \brief Lookup operator that looks up a word to an id.
/// \param[in] vocab A Vocab object.
/// \param[in] unknown_token Word to use for lookup if the word being looked up is out of vocabulary (OOV).
///     If unknown_token is itself OOV, a runtime error will be thrown.
/// \param[in] data_type Type of the tensor after lookup, typically int32.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
                                        const mindspore::dataset::DataType &data_type = DataType("int32"));

/// \brief TensorOp to generate n-grams from a 1-D string Tensor.
/// \param[in] ngrams A vector of positive integers. For example, if ngrams={4, 3}, then the result
///     would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
///     an n-gram, an empty string will be returned.
/// \param[in] left_pad {"pad_token", pad_width}. Padding performed on the left side of the sequence. pad_width is
///     capped at n-1. left_pad=("_", 2) would pad the left side of the sequence with "__" (default={"", 0}).
/// \param[in] right_pad {"pad_token", pad_width}. Padding performed on the right side of the sequence. pad_width is
///     capped at n-1. right_pad=("-", 2) would pad the right side of the sequence with "--" (default={"", 0}).
/// \param[in] separator Symbol used to join strings together (default=" ").
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<NgramOperation> Ngram(const std::vector<int32_t> &ngrams,
                                      const std::pair<std::string, int32_t> &left_pad = {"", 0},
                                      const std::pair<std::string, int32_t> &right_pad = {"", 0},
                                      const std::string &separator = " ");
#ifndef _WIN32
/// \brief Apply normalize operation on a UTF-8 string tensor.
/// \notes NormalizeUTF8 is not supported on Windows platform yet.
/// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone, NormalizeForm::kNfc,
///     NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd] (default=NormalizeForm::kNfkc).
///     See http://unicode.org/reports/tr15/ for details.
///     - NormalizeForm::kNone, do nothing for the input string tensor.
///     - NormalizeForm::kNfc, normalize with Normalization Form C.
///     - NormalizeForm::kNfkc, normalize with Normalization Form KC.
///     - NormalizeForm::kNfd, normalize with Normalization Form D.
///     - NormalizeForm::kNfkd, normalize with Normalization Form KD.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<NormalizeUTF8Operation> NormalizeUTF8(NormalizeForm normalize_form = NormalizeForm::kNfkc);

/// \brief Replace parts of a UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
/// \notes RegexReplace is not supported on Windows platform yet.
/// \param[in] pattern The regex expression pattern.
/// \param[in] replace The string to replace each matched element with.
/// \param[in] replace_all Whether to replace all matches. If false, only replace the first matched element;
///     if true, replace all matched elements (default=true).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<RegexReplaceOperation> RegexReplace(std::string pattern, std::string replace, bool replace_all = true);

/// \brief Tokenize a scalar tensor of UTF-8 string by the regex expression pattern.
/// \notes RegexTokenizer is not supported on Windows platform yet.
/// \param[in] delim_pattern The pattern of regex delimiters.
/// \param[in] keep_delim_pattern The string matched by 'delim_pattern' can be kept as a token if it can be
///     matched by 'keep_delim_pattern'. The default value is an empty string ("")
///     which means that delimiters will not be kept as an output token (default="").
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<RegexTokenizerOperation> RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "",
                                                        bool with_offsets = false);
#endif
/// \brief Tokenize a scalar token or 1-D tokens to tokens by sentencepiece.
/// \param[in] vocab A SentencePieceVocab object.
/// \param[in] out_type The type of the output.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
    const std::shared_ptr<SentencePieceVocab> &vocab, mindspore::dataset::SPieceTokenizerOutType out_type);

/// \brief Tokenize a scalar token or 1-D tokens to tokens by sentencepiece.
/// \param[in] vocab_path The path to the vocab model file.
/// \param[in] out_type The type of the output.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
    const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type);

/// \brief TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension
///     axis is a slice of data starting at the corresponding position, with a specified width.
/// \param[in] width The width of the window. It must be an integer and greater than zero.
/// \param[in] axis The axis along which the sliding window is computed (default=0); only 0 and -1 are supported
///     for now.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const int32_t axis = 0);

/// \brief Tensor operation to convert every element of a string tensor to a number.
///     Strings are cast according to the rules specified in the following links:
///     https://en.cppreference.com/w/cpp/string/basic_string/stof,
///     https://en.cppreference.com/w/cpp/string/basic_string/stoul,
///     except that any strings which represent negative numbers cannot be cast to an unsigned integer type.
/// \param[in] data_type DataType of the tensor to be cast to. Must be a numeric type.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<ToNumberOperation> ToNumber(const DataType data_type);

/// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length.
/// \param[in] max_length Maximum length required.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<TruncateSequencePairOperation> TruncateSequencePair(int32_t max_length);

/// \brief Tokenize a scalar tensor of UTF-8 string to Unicode characters.
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<UnicodeCharTokenizerOperation> UnicodeCharTokenizer(bool with_offsets = false);

#ifndef _WIN32
/// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
/// \notes UnicodeScriptTokenizer is not supported on Windows platform yet.
/// \param[in] keep_whitespace Whether or not to emit whitespace tokens (default=false).
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<UnicodeScriptTokenizerOperation> UnicodeScriptTokenizer(bool keep_whitespace = false,
                                                                        bool with_offsets = false);

/// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces.
/// \notes WhitespaceTokenizer is not supported on Windows platform yet.
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets = false);
#endif
  231. /* ####################################### Derived TensorOperation classes ################################# */
  232. #ifndef _WIN32
/// \brief TensorOperation for BasicTokenizer; created via the BasicTokenizer() factory function declared above,
///     which also documents the parameters.
class BasicTokenizerOperation : public TensorOperation {
 public:
  BasicTokenizerOperation(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
                          bool preserve_unused_token, bool with_offsets);

  ~BasicTokenizerOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kBasicTokenizerOperation; }

 private:
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};
/// \brief TensorOperation for BertTokenizer; created via the BertTokenizer() factory function declared above,
///     which also documents the parameters.
class BertTokenizerOperation : public TensorOperation {
 public:
  BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
                         int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case,
                         bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
                         bool with_offsets);

  ~BertTokenizerOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kBertTokenizerOperation; }

 private:
  std::shared_ptr<Vocab> vocab_;
  std::string suffix_indicator_;
  int32_t max_bytes_per_token_;
  std::string unknown_token_;
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};
/// \brief TensorOperation for CaseFold; created via the CaseFold() factory function declared above.
///     Takes no parameters.
class CaseFoldOperation : public TensorOperation {
 public:
  CaseFoldOperation() = default;

  ~CaseFoldOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the operation's parameters for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kCaseFoldOperation; }
};
  277. #endif
/// \brief TensorOperation for JiebaTokenizer; created via the JiebaTokenizer() factory function declared above,
///     which also documents the parameters.
class JiebaTokenizerOperation : public TensorOperation {
 public:
  explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode,
                                   bool with_offsets);

  ~JiebaTokenizerOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments (e.g. the dictionary paths) for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kJiebaTokenizerOperation; }

 private:
  std::string hmm_path_;
  std::string mp_path_;
  JiebaMode mode_;
  bool with_offsets_;
};
/// \brief TensorOperation for Lookup; created via the Lookup() factory function declared above,
///     which also documents the parameters.
class LookupOperation : public TensorOperation {
 public:
  explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
                           const DataType &data_type);

  ~LookupOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kLookupOperation; }

 private:
  std::shared_ptr<Vocab> vocab_;
  std::string unknown_token_;
  // Id for unknown_token; not a constructor argument, so presumably resolved from vocab_
  // during validation — confirm against the .cc implementation.
  int32_t default_id_;
  DataType data_type_;
};
/// \brief TensorOperation for Ngram; created via the Ngram() factory function declared above,
///     which also documents the parameters.
class NgramOperation : public TensorOperation {
 public:
  explicit NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
                          const std::pair<std::string, int32_t> &right_pad, const std::string &separator);

  ~NgramOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments (e.g. that ngram sizes are positive) for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kNgramOperation; }

 private:
  std::vector<int32_t> ngrams_;
  std::pair<std::string, int32_t> left_pad_;   // {pad_token, pad_width} for the left side
  std::pair<std::string, int32_t> right_pad_;  // {pad_token, pad_width} for the right side
  std::string separator_;
};
  320. #ifndef _WIN32
/// \brief TensorOperation for NormalizeUTF8; created via the NormalizeUTF8() factory function declared above,
///     which also documents the parameters.
class NormalizeUTF8Operation : public TensorOperation {
 public:
  explicit NormalizeUTF8Operation(NormalizeForm normalize_form);

  ~NormalizeUTF8Operation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kNormalizeUTF8Operation; }

 private:
  NormalizeForm normalize_form_;
};
/// \brief TensorOperation for RegexReplace; created via the RegexReplace() factory function declared above,
///     which also documents the parameters.
class RegexReplaceOperation : public TensorOperation {
 public:
  RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all);

  ~RegexReplaceOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kRegexReplaceOperation; }

 private:
  std::string pattern_;
  std::string replace_;
  bool replace_all_;
};
/// \brief TensorOperation for RegexTokenizer; created via the RegexTokenizer() factory function declared above,
///     which also documents the parameters.
class RegexTokenizerOperation : public TensorOperation {
 public:
  explicit RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, bool with_offsets);

  ~RegexTokenizerOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kRegexTokenizerOperation; }

 private:
  std::string delim_pattern_;
  std::string keep_delim_pattern_;
  bool with_offsets_;
};
  355. #endif
/// \brief TensorOperation for SentencePieceTokenizer; created via either SentencePieceTokenizer() factory
///     overload declared above (from an in-memory SentencePieceVocab or from a vocab model file path).
class SentencePieceTokenizerOperation : public TensorOperation {
 public:
  SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type);

  SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type);

  ~SentencePieceTokenizerOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kSentencepieceTokenizerOperation; }

 private:
  std::shared_ptr<Vocab> vocab_;
  std::string vocab_path_;
  // Presumably records which constructor was used (in-memory vocab vs. file) — confirm in the .cc implementation.
  SPieceTokenizerLoadType load_type_;
  SPieceTokenizerOutType out_type_;
};
/// \brief TensorOperation for SlidingWindow; created via the SlidingWindow() factory function declared above,
///     which also documents the parameters.
class SlidingWindowOperation : public TensorOperation {
 public:
  explicit SlidingWindowOperation(const int32_t width, const int32_t axis);

  ~SlidingWindowOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments (e.g. that width is positive) for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kSlidingWindowOperation; }

 private:
  int32_t width_;
  int32_t axis_;
};
/// \brief TensorOperation for ToNumber; created via the ToNumber() factory function declared above,
///     which also documents the parameters.
class ToNumberOperation : public TensorOperation {
 public:
  explicit ToNumberOperation(DataType data_type);

  ~ToNumberOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments (e.g. that data_type is numeric) for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kToNumberOperation; }

 private:
  DataType data_type_;
};
/// \brief TensorOperation for TruncateSequencePair; created via the TruncateSequencePair() factory function
///     declared above, which also documents the parameters.
class TruncateSequencePairOperation : public TensorOperation {
 public:
  explicit TruncateSequencePairOperation(int32_t max_length);

  ~TruncateSequencePairOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kTruncateSequencePairOperation; }

 private:
  int32_t max_length_;
};
/// \brief TensorOperation for UnicodeCharTokenizer; created via the UnicodeCharTokenizer() factory function
///     declared above, which also documents the parameters.
class UnicodeCharTokenizerOperation : public TensorOperation {
 public:
  explicit UnicodeCharTokenizerOperation(bool with_offsets);

  ~UnicodeCharTokenizerOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kUnicodeCharTokenizerOperation; }

 private:
  bool with_offsets_;
};
  411. #ifndef _WIN32
/// \brief TensorOperation for UnicodeScriptTokenizer; created via the UnicodeScriptTokenizer() factory function
///     declared above, which also documents the parameters.
class UnicodeScriptTokenizerOperation : public TensorOperation {
 public:
  explicit UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets);

  ~UnicodeScriptTokenizerOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kUnicodeScriptTokenizerOperation; }

 private:
  bool keep_whitespace_;
  bool with_offsets_;
};
/// \brief TensorOperation for WhitespaceTokenizer; created via the WhitespaceTokenizer() factory function
///     declared above, which also documents the parameters.
class WhitespaceTokenizerOperation : public TensorOperation {
 public:
  explicit WhitespaceTokenizerOperation(bool with_offsets);

  ~WhitespaceTokenizerOperation() = default;

  /// \brief Build the underlying TensorOp instance.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Check the constructor arguments for validity.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kWhitespaceTokenizerOperation; }

 private:
  bool with_offsets_;
};
  433. #endif
  434. } // namespace text
  435. } // namespace dataset
  436. } // namespace mindspore
  437. #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_