You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

text.h 19 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463
  1. /**
  2. * Copyright 2020-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_
  17. #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_
  18. #include <memory>
  19. #include <optional>
  20. #include <string>
  21. #include <utility>
  22. #include <vector>
  23. #include "include/api/status.h"
  24. #include "minddata/dataset/include/constants.h"
  25. #include "minddata/dataset/include/transforms.h"
  26. namespace mindspore {
  27. namespace dataset {
  28. class Vocab;
  29. class SentencePieceVocab;
  30. class TensorOperation;
  31. // Transform operations for text
  32. namespace text {
  33. #ifndef _WIN32
/// \brief Tokenize a scalar tensor of UTF-8 string by specific rules.
/// \notes BasicTokenizer is not supported on Windows platform yet.
class BasicTokenizer : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations on the input
  ///   text to fold the text to lower case and strip accent characters. If false, only apply the
  ///   NormalizeUTF8('normalization_form' mode) operation on the input text (default=false).
  /// \param[in] keep_whitespace If true, whitespace will be kept in the output tokens (default=false).
  /// \param[in] normalize_form Used to specify a specific normalize mode. This is only effective when 'lower_case' is
  ///   false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
  /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]',
  ///   '[MASK]' (default=true).
  /// \param[in] with_offsets Whether or not to output the offsets of the tokens (default=false).
  explicit BasicTokenizer(bool lower_case = false, bool keep_whitespace = false,
                          const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true,
                          bool with_offsets = false);

  /// \brief Destructor
  ~BasicTokenizer() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};
/// \brief Tokenizer used for Bert text process.
/// \notes BertTokenizer is not supported on Windows platform yet.
class BertTokenizer : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] vocab A Vocab object.
  /// \param[in] suffix_indicator Used to show that the subword is the last part of a word (default='##').
  /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
  /// \param[in] unknown_token When a token cannot be found: if 'unknown_token' is an empty string, return the token
  ///   directly, else return the string specified (default='[UNK]').
  /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations on the input
  ///   text to fold the text to lower case and strip accent characters. If false, only apply the
  ///   NormalizeUTF8('normalization_form' mode) operation on the input text (default=false).
  /// \param[in] keep_whitespace If true, whitespace will be kept in the output tokens (default=false).
  /// \param[in] normalize_form Used to specify a specific normalize mode. This is only effective when 'lower_case' is
  ///   false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
  /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]',
  ///   '[MASK]' (default=true).
  /// \param[in] with_offsets Whether or not to output the offsets of the tokens (default=false).
  explicit BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##",
                         int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]",
                         bool lower_case = false, bool keep_whitespace = false,
                         const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true,
                         bool with_offsets = false);

  /// \brief Destructor
  ~BertTokenizer() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  std::shared_ptr<Vocab> vocab_;
  std::string suffix_indicator_;
  int32_t max_bytes_per_token_;
  std::string unknown_token_;
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};
/// \brief Apply case fold operation on UTF-8 string tensor.
class CaseFold : public TensorTransform {
 public:
  /// \brief Constructor.
  CaseFold();

  /// \brief Destructor
  ~CaseFold() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;
};
  118. #endif
/// \brief Tokenize Chinese string into words based on dictionary.
/// \notes The integrity of the HMMSegment algorithm and MPSegment algorithm files must be confirmed.
class JiebaTokenizer : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] hmm_path Dictionary file used by the HMMSegment algorithm. The dictionary can be obtained on the
  ///   official website of cppjieba.
  /// \param[in] mp_path Dictionary file used by the MPSegment algorithm. The dictionary can be obtained on the
  ///   official website of cppjieba.
  /// \param[in] mode Valid values can be any of [JiebaMode.kMP, JiebaMode.kHMM, JiebaMode.kMIX]
  ///   (default=JiebaMode.kMIX).
  ///   - JiebaMode.kMP, tokenize with MPSegment algorithm.
  ///   - JiebaMode.kHMM, tokenize with Hidden Markov Model Segment algorithm.
  ///   - JiebaMode.kMIX, tokenize with a mix of MPSegment and HMMSegment algorithm.
  /// \param[in] with_offsets Whether or not to output the offsets of the tokens (default=false).
  explicit JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
                          const JiebaMode &mode = JiebaMode::kMix, bool with_offsets = false);

  /// \brief Destructor
  ~JiebaTokenizer() = default;

  /// \brief Add a user-defined word to the tokenizer's dictionary.
  /// \param[in] word The word to be added to the dictionary.
  /// \param[in] freq The frequency of the word to be added (default=0; presumably 0 means use a default
  ///   frequency — confirm against the implementation).
  /// \return Status error code, returns OK if no error is encountered.
  Status AddWord(const std::string &word, int64_t freq = 0);

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  std::string hmm_path_;
  std::string mp_path_;
  JiebaMode mode_;
  bool with_offsets_;
  // Words added via AddWord, kept as (word, frequency) pairs.
  std::vector<std::pair<std::string, int64_t>> words_list_;
};
/// \brief Look up a word into an id according to the input vocabulary table.
class Lookup : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] vocab A Vocab object.
  /// \param[in] unknown_token Word to use for lookup if the word being looked up is out of vocabulary (OOV).
  ///   If unknown_token itself is OOV, a runtime error will be thrown. If unknown_token is {}, no unknown_token
  ///   is specified for words that are out of vocabulary (default={}).
  /// \param[in] data_type Type of the tensor after lookup, typically int32.
  explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token = {},
                  const std::string &data_type = "int32");

  /// \brief Destructor
  ~Lookup() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  std::shared_ptr<Vocab> vocab_;
  std::optional<std::string> unknown_token_;
  std::string data_type_;
};
/// \brief TensorOp to generate n-gram from a 1-D string Tensor.
class Ngram : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] ngrams A vector of positive integers. For example, if ngrams={4, 3}, then the result
  ///   would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
  ///   an n-gram, an empty string will be returned.
  /// \param[in] left_pad {"pad_token", pad_width}. Padding performed on the left side of the sequence. pad_width will
  ///   be capped at n-1. left_pad=("_", 2) would pad the left side of the sequence with "__" (default={"", 0}).
  /// \param[in] right_pad {"pad_token", pad_width}. Padding performed on the right side of the sequence. pad_width
  ///   will be capped at n-1. right_pad=("-", 2) would pad the right side of the sequence with "--" (default={"", 0}).
  /// \param[in] separator Symbol used to join strings together (default=" ").
  explicit Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad = {"", 0},
                 const std::pair<std::string, int32_t> &right_pad = {"", 0}, const std::string &separator = " ");

  /// \brief Destructor
  ~Ngram() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  std::vector<int32_t> ngrams_;
  std::pair<std::string, int32_t> left_pad_;
  std::pair<std::string, int32_t> right_pad_;
  std::string separator_;
};
  197. #ifndef _WIN32
/// \brief Apply normalize operation on UTF-8 string tensor.
class NormalizeUTF8 : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone, NormalizeForm::kNfc,
  ///   NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd] (default=NormalizeForm::kNfkc).
  ///   See http://unicode.org/reports/tr15/ for details.
  ///   - NormalizeForm::kNone, do nothing for the input string tensor.
  ///   - NormalizeForm::kNfc, normalize with Normalization Form C.
  ///   - NormalizeForm::kNfkc, normalize with Normalization Form KC.
  ///   - NormalizeForm::kNfd, normalize with Normalization Form D.
  ///   - NormalizeForm::kNfkd, normalize with Normalization Form KD.
  explicit NormalizeUTF8(NormalizeForm normalize_form = NormalizeForm::kNfkc);

  /// \brief Destructor
  ~NormalizeUTF8() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  NormalizeForm normalize_form_;
};
/// \brief Replace UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
class RegexReplace : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] pattern The regex expression pattern.
  /// \param[in] replace The string to replace a matched element with.
  /// \param[in] replace_all Confirm whether to replace all. If false, only replace the first matched element;
  ///   if true, replace all matched elements (default=true).
  explicit RegexReplace(std::string pattern, std::string replace, bool replace_all = true);

  /// \brief Destructor
  ~RegexReplace() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  std::string pattern_;
  std::string replace_;
  bool replace_all_;
};
/// \brief Tokenize a scalar tensor of UTF-8 string by regex expression pattern.
class RegexTokenizer : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] delim_pattern The pattern of regex delimiters.
  /// \param[in] keep_delim_pattern The string matched by 'delim_pattern' can be kept as a token if it can also be
  ///   matched by 'keep_delim_pattern'. The default value is an empty string ("")
  ///   which means that delimiters will not be kept as an output token (default="").
  /// \param[in] with_offsets Whether or not to output the offsets of the tokens (default=false).
  explicit RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "", bool with_offsets = false);

  /// \brief Destructor
  ~RegexTokenizer() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  std::string delim_pattern_;
  std::string keep_delim_pattern_;
  bool with_offsets_;
};
  262. #endif
  263. /// \brief Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
  264. class SentencePieceTokenizer : public TensorTransform {
  265. public:
  266. /// \brief Constructor.
  267. /// \param[in] vocab a SentencePieceVocab object.
  268. /// \param[in] out_type The type of output.
  269. SentencePieceTokenizer(const std::shared_ptr<SentencePieceVocab> &vocab,
  270. mindspore::dataset::SPieceTokenizerOutType out_typee);
  271. /// \brief Constructor.
  272. /// \param[in] vocab_path vocab model file path.
  273. /// \param[in] out_type The type of output.
  274. SentencePieceTokenizer(const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type);
  275. /// \brief Destructor
  276. ~SentencePieceTokenizer() = default;
  277. protected:
  278. /// \brief Function to convert TensorTransform object into a TensorOperation object.
  279. /// \return Shared pointer to TensorOperation object.
  280. std::shared_ptr<TensorOperation> Parse() override;
  281. private:
  282. std::shared_ptr<SentencePieceVocab> vocab_;
  283. std::string vocab_path_;
  284. SPieceTokenizerLoadType load_type_;
  285. SPieceTokenizerOutType out_type_;
  286. };
/// \brief TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension
///   axis is a slice of data starting at the corresponding position, with a specified width.
class SlidingWindow : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] width The width of the window. It must be an integer and greater than zero.
  /// \param[in] axis The axis along which the sliding window is computed (default=0). Axis only supports
  ///   0 or -1 for now.
  explicit SlidingWindow(const int32_t width, const int32_t axis = 0);

  /// \brief Destructor
  ~SlidingWindow() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  int32_t width_;
  int32_t axis_;
};
/// \brief Tensor operation to convert every element of a string tensor to a number.
///   Strings are cast according to the rules specified in the following links:
///   https://en.cppreference.com/w/cpp/string/basic_string/stof,
///   https://en.cppreference.com/w/cpp/string/basic_string/stoul,
///   except that any strings which represent negative numbers cannot be cast to an unsigned integer type.
class ToNumber : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] data_type Type of the tensor to be cast to. Must be a numeric type.
  explicit ToNumber(const std::string &data_type);

  /// \brief Destructor
  ~ToNumber() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  std::string data_type_;
};
/// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length.
class TruncateSequencePair : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] max_length Maximum length required.
  explicit TruncateSequencePair(int32_t max_length);

  /// \brief Destructor
  ~TruncateSequencePair() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  int32_t max_length_;
};
/// \brief Tokenize a scalar tensor of UTF-8 string to Unicode characters.
class UnicodeCharTokenizer : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] with_offsets Whether or not to output the offsets of the tokens (default=false).
  explicit UnicodeCharTokenizer(bool with_offsets = false);

  /// \brief Destructor
  ~UnicodeCharTokenizer() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  bool with_offsets_;
};
  355. #ifndef _WIN32
/// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
class UnicodeScriptTokenizer : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] keep_whitespace Whether or not to emit whitespace tokens (default=false).
  /// \param[in] with_offsets Whether or not to output the offsets of the tokens (default=false).
  explicit UnicodeScriptTokenizer(bool keep_whitespace = false, bool with_offsets = false);

  /// \brief Destructor
  ~UnicodeScriptTokenizer() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  bool keep_whitespace_;
  bool with_offsets_;
};
/// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces.
class WhitespaceTokenizer : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] with_offsets Whether or not to output the offsets of the tokens (default=false).
  explicit WhitespaceTokenizer(bool with_offsets = false);

  /// \brief Destructor
  ~WhitespaceTokenizer() = default;

 protected:
  /// \brief Function to convert TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  bool with_offsets_;
};
  388. #endif
  389. } // namespace text
  390. } // namespace dataset
  391. } // namespace mindspore
  392. #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_