You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

text.h 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. /**
  2. * Copyright 2020-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_
  17. #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_
  18. #include <memory>
  19. #include <optional>
  20. #include <string>
  21. #include <utility>
  22. #include <vector>
  23. #include "include/api/status.h"
  24. #include "minddata/dataset/include/constants.h"
  25. #include "minddata/dataset/include/transforms.h"
  26. // FIXME - This internal IR header will be removed when external API classes are provided
  27. #include "minddata/dataset/text/ir/kernels/text_ir.h"
  28. namespace mindspore {
  29. namespace dataset {
  30. class Vocab;
  31. class SentencePieceVocab;
  32. // Transform operations for text
  33. namespace text {
  34. // Text Op classes (in alphabetical order)
  35. #ifndef _WIN32
  36. class BasicTokenizerOperation;
  37. class BertTokenizerOperation;
  38. class CaseFoldOperation;
  39. #endif
  40. class JiebaTokenizerOperation;
  41. class LookupOperation;
  42. class NgramOperation;
  43. #ifndef _WIN32
  44. class NormalizeUTF8Operation;
  45. class RegexReplaceOperation;
  46. class RegexTokenizerOperation;
  47. #endif
  48. class SentencePieceTokenizerOperation;
  49. class SlidingWindowOperation;
  50. class ToNumberOperation;
  51. class TruncateSequencePairOperation;
  52. class UnicodeCharTokenizerOperation;
  53. #ifndef _WIN32
  54. class UnicodeScriptTokenizerOperation;
  55. class WhitespaceTokenizerOperation;
  56. #endif
  57. #ifndef _WIN32
  58. /// \brief Tokenize a scalar tensor of UTF-8 string by specific rules.
  59. /// \notes BasicTokenizer is not supported on Windows platform yet.
  60. /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation on input text to
  61. /// fold the text to lower case and strip accents characters. If false, only apply NormalizeUTF8('normalization_form'
  62. /// mode) operation on input text (default=false).
  63. /// \param[in] keep_whitespace If true, the whitespace will be kept in out tokens (default=false).
  64. /// \param[in] normalize_form Used to specify a specific normalize mode. This is only effective when 'lower_case' is
  65. /// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
  66. /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]',
  67. /// '[MASK]' (default=true).
  68. /// \param[in] with_offsets If or not output offsets of tokens (default=false).
  69. /// \return Shared pointer to the current TensorOperation.
  70. std::shared_ptr<BasicTokenizerOperation> BasicTokenizer(bool lower_case = false, bool keep_whitespace = false,
  71. const NormalizeForm normalize_form = NormalizeForm::kNone,
  72. bool preserve_unused_token = true, bool with_offsets = false);
  73. /// \brief Tokenizer used for Bert text process.
  74. /// \notes BertTokenizer is not supported on Windows platform yet.
  75. /// \param[in] vocab A Vocab object.
  76. /// \param[in] suffix_indicator Used to show that the subword is the last part of a word (default='##').
  77. /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
  78. /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty
  79. /// string, else return the string specified(default='[UNK]').
  80. /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8(NFD mode), RegexReplace operation on input text to
  81. /// fold the text to lower case and strip accents characters. If false, only apply NormalizeUTF8('normalization_form'
  82. /// mode) operation on input text (default=false).
  83. /// \param[in] keep_whitespace If true, the whitespace will be kept in out tokens (default=false).
  84. /// \param[in] normalize_form Used to specify a specific normalize mode. This is only effective when 'lower_case' is
  85. /// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
  86. /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]',
  87. /// '[MASK]' (default=true).
  88. /// \param[in] with_offsets If or not output offsets of tokens (default=false).
  89. /// \return Shared pointer to the current TensorOperation.
  90. std::shared_ptr<BertTokenizerOperation> BertTokenizer(const std::shared_ptr<Vocab> &vocab,
  91. const std::string &suffix_indicator = "##",
  92. int32_t max_bytes_per_token = 100,
  93. const std::string &unknown_token = "[UNK]",
  94. bool lower_case = false, bool keep_whitespace = false,
  95. const NormalizeForm normalize_form = NormalizeForm::kNone,
  96. bool preserve_unused_token = true, bool with_offsets = false);
  97. /// \brief Apply case fold operation on UTF-8 string tensor.
  98. /// \return Shared pointer to the current TensorOperation.
  99. std::shared_ptr<CaseFoldOperation> CaseFold();
  100. #endif
  101. /// \brief Tokenize Chinese string into words based on dictionary.
  102. /// \notes The integrity of the HMMSEgment algorithm and MPSegment algorithm files must be confirmed.
  103. /// \param[in] hmm_path Dictionary file is used by HMMSegment algorithm. The dictionary can be obtained on the
  104. /// official website of cppjieba.
  105. /// \param[in] mp_path Dictionary file is used by MPSegment algorithm. The dictionary can be obtained on the
  106. /// official website of cppjieba.
  107. /// \param[in] mode Valid values can be any of [JiebaMode.MP, JiebaMode.HMM, JiebaMode.MIX](default=JiebaMode.MIX).
  108. /// - JiebaMode.kMP, tokenize with MPSegment algorithm.
  109. /// - JiebaMode.kHMM, tokenize with Hiddel Markov Model Segment algorithm.
  110. /// - JiebaMode.kMIX, tokenize with a mix of MPSegment and HMMSegment algorithm.
  111. /// \param[in] with_offsets If or not output offsets of tokens (default=false).
  112. /// \return Shared pointer to the current TensorOperation.
  113. std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
  114. const JiebaMode &mode = JiebaMode::kMix,
  115. bool with_offsets = false);
  116. /// \brief Look up a word into an id according to the input vocabulary table.
  117. /// \param[in] vocab a Vocab object.
  118. /// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov).
  119. /// If unknown_token is oov, runtime error will be thrown. If unknown_token is {}, which means that not to
  120. // specify unknown_token when word being out of Vocabulary (default={}).
  121. /// \param[in] data_type type of the tensor after lookup, typically int32.
  122. /// \return Shared pointer to the current TensorOperation.
  123. std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab,
  124. const std::optional<std::string> &unknown_token = {},
  125. const std::string &data_type = "int32");
  126. /// \brief TensorOp to generate n-gram from a 1-D string Tensor.
  127. /// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result
  128. /// would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
  129. /// for a n-gram, an empty string will be returned.
  130. /// \param[in] left_pad {"pad_token", pad_width}. Padding performed on left side of the sequence. pad_width will
  131. /// be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__" (default={"", 0}}).
  132. /// \param[in] right_pad {"pad_token", pad_width}. Padding performed on right side of the sequence.pad_width will
  133. /// be capped at n-1. right_pad=("-":2) would pad right side of the sequence with "--" (default={"", 0}}).
  134. /// \param[in] separator Symbol used to join strings together (default=" ").
  135. /// \return Shared pointer to the current TensorOperation.
  136. std::shared_ptr<NgramOperation> Ngram(const std::vector<int32_t> &ngrams,
  137. const std::pair<std::string, int32_t> &left_pad = {"", 0},
  138. const std::pair<std::string, int32_t> &right_pad = {"", 0},
  139. const std::string &separator = " ");
  140. #ifndef _WIN32
  141. /// \brief Apply normalize operation on UTF-8 string tensor.
  142. /// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone,NormalizeForm::kNfc,
  143. /// NormalizeForm::kNfkc,
  144. /// NormalizeForm::kNfd, NormalizeForm::kNfkd](default=NormalizeForm::kNfkc).
  145. /// See http://unicode.org/reports/tr15/ for details.
  146. /// - NormalizeForm.NONE, do nothing for input string tensor.
  147. /// - NormalizeForm.NFC, normalize with Normalization Form C.
  148. /// - NormalizeForm.NFKC, normalize with Normalization Form KC.
  149. /// - NormalizeForm.NFD, normalize with Normalization Form D.
  150. /// - NormalizeForm.NFKD, normalize with Normalization Form KD.
  151. /// \return Shared pointer to the current TensorOperation.
  152. std::shared_ptr<NormalizeUTF8Operation> NormalizeUTF8(NormalizeForm normalize_form = NormalizeForm::kNfkc);
  153. /// \brief Replace UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
  154. /// \param[in] pattern The regex expression patterns.
  155. /// \param[in] replace The string to replace matched element.
  156. /// \param[in] replace_all Confirm whether to replace all. If false, only replace first matched element;
  157. /// if true, replace all matched elements (default=true).
  158. /// \return Shared pointer to the current TensorOperation.
  159. std::shared_ptr<RegexReplaceOperation> RegexReplace(std::string pattern, std::string replace, bool replace_all = true);
  160. /// \brief Tokenize a scalar tensor of UTF-8 string by regex expression pattern.
  161. /// \param[in] delim_pattern The pattern of regex delimiters.
  162. /// \param[in] keep_delim_pattern The string matched by 'delim_pattern' can be kept as a token if it can be
  163. /// matched by 'keep_delim_pattern'. The default value is an empty string ("")
  164. /// which means that delimiters will not be kept as an output token (default="").
  165. /// \param[in] with_offsets If or not output offsets of tokens (default=false).
  166. /// \return Shared pointer to the current TensorOperation.
  167. std::shared_ptr<RegexTokenizerOperation> RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "",
  168. bool with_offsets = false);
  169. #endif
  170. /// \brief Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
  171. /// \param[in] vocab a SentencePieceVocab object.
  172. /// \param[in] out_type The type of output.
  173. /// \return Shared pointer to the current TensorOperation.
  174. std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
  175. const std::shared_ptr<SentencePieceVocab> &vocab, mindspore::dataset::SPieceTokenizerOutType out_type);
  176. /// \brief Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
  177. /// \param[in] vocab_path vocab model file path.
  178. /// \param[in] out_type The type of output.
  179. /// \return Shared pointer to the current TensorOperation.
  180. std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
  181. const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type);
  182. /// \brief TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension
  183. /// axis is a slice of data starting at the corresponding position, with a specified width.
  184. /// \param[in] width The width of the window. It must be an integer and greater than zero.
  185. /// \param[in] axis The axis along which the sliding window is computed (default=0), axis support 0 or -1 only
  186. /// for now.
  187. /// \return Shared pointer to the current TensorOperation.
  188. std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const int32_t axis = 0);
  189. /// \brief Tensor operation to convert every element of a string tensor to a number.
  190. /// Strings are casted according to the rules specified in the following links:
  191. /// https://en.cppreference.com/w/cpp/string/basic_string/stof,
  192. /// https://en.cppreference.com/w/cpp/string/basic_string/stoul,
  193. /// except that any strings which represent negative numbers cannot be cast to an unsigned integer type.
  194. /// \param[in] data_type of the tensor to be casted to. Must be a numeric type.
  195. /// \return Shared pointer to the current TensorOperation.
  196. std::shared_ptr<ToNumberOperation> ToNumber(const std::string &data_type);
  197. /// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length.
  198. /// \param[in] max_length Maximum length required.
  199. /// \return Shared pointer to the current TensorOperation.
  200. std::shared_ptr<TruncateSequencePairOperation> TruncateSequencePair(int32_t max_length);
  201. /// \brief Tokenize a scalar tensor of UTF-8 string to Unicode characters.
  202. /// \param[in] with_offsets If or not output offsets of tokens (default=false).
  203. /// \return Shared pointer to the current TensorOperation.
  204. std::shared_ptr<UnicodeCharTokenizerOperation> UnicodeCharTokenizer(bool with_offsets = false);
  205. #ifndef _WIN32
  206. /// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
  207. /// \param[in] keep_whitespace If or not emit whitespace tokens (default=false).
  208. /// \param[in] with_offsets If or not output offsets of tokens (default=false).
  209. /// \return Shared pointer to the current TensorOperation.
  210. std::shared_ptr<UnicodeScriptTokenizerOperation> UnicodeScriptTokenizer(bool keep_whitespace = false,
  211. bool with_offsets = false);
  212. /// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces.
  213. /// \param[in] with_offsets If or not output offsets of tokens (default=false).
  214. /// \return Shared pointer to the current TensorOperation.
  215. std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets = false);
  216. #endif
  217. } // namespace text
  218. } // namespace dataset
  219. } // namespace mindspore
  220. #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_