You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

text.h 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_
  17. #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_
  18. #include <memory>
  19. #include <string>
  20. #include <utility>
  21. #include <vector>
  22. #include "mindspore/ccsrc/minddata/dataset/core/data_type.h"
  23. #include "minddata/dataset/core/constants.h"
  24. #include "minddata/dataset/include/transforms.h"
  25. #include "minddata/dataset/util/status.h"
  26. #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
  27. #include "minddata/dataset/text/sentence_piece_vocab.h"
  28. #include "minddata/dataset/text/vocab.h"
  29. namespace mindspore {
  30. namespace dataset {
  31. // Transform operations for text
  32. namespace text {
// Registered operation-name strings, one per text TensorOperation class below;
// each class's Name() method returns the matching constant (kept in alphabetical order).
constexpr char kCaseFoldOperation[] = "CaseFold";
constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer";
constexpr char kLookupOperation[] = "Lookup";
constexpr char kNgramOperation[] = "Ngram";
constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8";
constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer";
constexpr char kSlidingWindowOperation[] = "SlidingWindow";
constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer";
constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer";
// Forward declarations of the text Op classes (in alphabetical order).
// Classes inside #ifndef _WIN32 guards are not built on Windows.
#ifndef _WIN32
class CaseFoldOperation;
#endif
class JiebaTokenizerOperation;
class LookupOperation;
class NgramOperation;
#ifndef _WIN32
class NormalizeUTF8Operation;
#endif
class SentencePieceTokenizerOperation;
class SlidingWindowOperation;
class UnicodeCharTokenizerOperation;
#ifndef _WIN32
class UnicodeScriptTokenizerOperation;
class WhitespaceTokenizerOperation;
#endif
#ifndef _WIN32
/// \brief Apply case fold operation on UTF-8 string tensor.
/// Not available on Windows.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<CaseFoldOperation> CaseFold();
#endif
/// \brief Tokenize a Chinese string into words based on a dictionary.
/// \param[in] hmm_path Dictionary file used by the HMMSegment algorithm. The dictionary can be obtained on the
///   official website of cppjieba.
/// \param[in] mp_path Dictionary file used by the MPSegment algorithm. The dictionary can be obtained on the
///   official website of cppjieba.
/// \param[in] mode Valid values can be any of [JiebaMode.MP, JiebaMode.HMM, JiebaMode.MIX] (default=JiebaMode.MIX).
///   - JiebaMode.MP, tokenize with MPSegment algorithm.
///   - JiebaMode.HMM, tokenize with Hidden Markov Model Segment algorithm.
///   - JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithms.
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
                                                        const JiebaMode &mode = JiebaMode::kMix,
                                                        bool with_offsets = false);
/// \brief Lookup operator that looks up a word to an id.
/// \param[in] vocab A Vocab object.
/// \param[in] unknown_token Word to use for lookup if the word being looked up is out of vocabulary (oov).
///   If unknown_token is itself oov, a runtime error will be thrown.
/// \param[in] data_type Type of the tensor after lookup, typically int32.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
                                        const mindspore::dataset::DataType &data_type = DataType("int32"));
/// \brief TensorOp to generate n-grams from a 1-D string Tensor.
/// \param[in] ngrams A vector of positive integers. For example, if ngrams={4, 3}, then the result
///   would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
///   an n-gram, an empty string will be returned.
/// \param[in] left_pad {"pad_token", pad_width}. Padding performed on the left side of the sequence. pad_width will
///   be capped at n-1. left_pad=("_", 2) would pad the left side of the sequence with "__" (default={"", 0}).
/// \param[in] right_pad {"pad_token", pad_width}. Padding performed on the right side of the sequence. pad_width will
///   be capped at n-1. right_pad=("-", 2) would pad the right side of the sequence with "--" (default={"", 0}).
/// \param[in] separator Symbol used to join strings together (default=" ").
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<NgramOperation> Ngram(const std::vector<int32_t> &ngrams,
                                      const std::pair<std::string, int32_t> &left_pad = {"", 0},
                                      const std::pair<std::string, int32_t> &right_pad = {"", 0},
                                      const std::string &separator = " ");
#ifndef _WIN32
/// \brief Apply normalize operation on UTF-8 string tensor.
/// Not available on Windows.
/// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone, NormalizeForm::kNfc,
///   NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd] (default=NormalizeForm::kNfkc).
///   See http://unicode.org/reports/tr15/ for details.
///   - NormalizeForm.NONE, do nothing for input string tensor.
///   - NormalizeForm.NFC, normalize with Normalization Form C.
///   - NormalizeForm.NFKC, normalize with Normalization Form KC.
///   - NormalizeForm.NFD, normalize with Normalization Form D.
///   - NormalizeForm.NFKD, normalize with Normalization Form KD.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<NormalizeUTF8Operation> NormalizeUTF8(NormalizeForm normalize_form = NormalizeForm::kNfkc);
#endif
/// \brief Tokenize a scalar token or 1-D tokens to tokens by sentencepiece, using an in-memory vocab.
/// \param[in] vocab A SentencePieceVocab object.
/// \param[in] out_type The type of the output tokens.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
  const std::shared_ptr<SentencePieceVocab> &vocab, mindspore::dataset::SPieceTokenizerOutType out_type);
/// \brief Tokenize a scalar token or 1-D tokens to tokens by sentencepiece, using a vocab model file.
/// \param[in] vocab_path Path to the vocab model file.
/// \param[in] out_type The type of the output tokens.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
  const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type);
/// \brief TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension
///   axis is a slice of data starting at the corresponding position, with a specified width.
/// \param[in] width The width of the window. It must be an integer and greater than zero.
/// \param[in] axis The axis along which the sliding window is computed (default=0). Only axis 0 or -1 is
///   supported for now.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const int32_t axis = 0);
/// \brief Tokenize a scalar tensor of UTF-8 string to Unicode characters.
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<UnicodeCharTokenizerOperation> UnicodeCharTokenizer(bool with_offsets = false);
#ifndef _WIN32
/// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
/// Not available on Windows.
/// \param[in] keep_whitespace Whether or not to emit whitespace tokens (default=false).
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<UnicodeScriptTokenizerOperation> UnicodeScriptTokenizer(bool keep_whitespace = false,
                                                                        bool with_offsets = false);
/// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C-defined whitespaces.
/// Not available on Windows.
/// \param[in] with_offsets Whether or not to output the offsets of tokens (default=false).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets = false);
#endif
  151. /* ####################################### Derived TensorOperation classes ################################# */
#ifndef _WIN32
/// \brief Operation that applies case folding to a UTF-8 string tensor. Built via the CaseFold() factory.
class CaseFoldOperation : public TensorOperation {
 public:
  CaseFoldOperation() = default;

  ~CaseFoldOperation() = default;

  /// \brief Build the runtime TensorOp for this operation.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Validate parameters (CaseFold takes none).
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kCaseFoldOperation; }
};
#endif
/// \brief Operation that tokenizes a Chinese string into words using cppjieba dictionaries.
///   Built via the JiebaTokenizer() factory.
class JiebaTokenizerOperation : public TensorOperation {
 public:
  /// \brief Constructor.
  /// \param[in] hmm_path Path to the HMMSegment dictionary file.
  /// \param[in] mp_path Path to the MPSegment dictionary file.
  /// \param[in] mode Tokenization mode (MP, HMM or MIX).
  /// \param[in] with_offsets Whether or not to output the offsets of tokens.
  explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode,
                                   bool with_offsets);

  ~JiebaTokenizerOperation() = default;

  /// \brief Build the runtime TensorOp for this operation.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Validate the constructor parameters.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kJiebaTokenizerOperation; }

 private:
  std::string hmm_path_;
  std::string mp_path_;
  JiebaMode mode_;
  bool with_offsets_;
};
/// \brief Operation that looks up a word to an id using a Vocab. Built via the Lookup() factory.
class LookupOperation : public TensorOperation {
 public:
  /// \brief Constructor.
  /// \param[in] vocab A Vocab object.
  /// \param[in] unknown_token Word to use for lookup when a word is out of vocabulary.
  /// \param[in] data_type Type of the tensor after lookup.
  explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
                           const DataType &data_type);

  ~LookupOperation() = default;

  /// \brief Build the runtime TensorOp for this operation.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Validate the constructor parameters.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kLookupOperation; }

 private:
  std::shared_ptr<Vocab> vocab_;
  std::string unknown_token_;
  // NOTE(review): presumably the id resolved for unknown_token_ (likely set during ValidateParams) — confirm in the .cc.
  int32_t default_id_;
  DataType data_type_;
};
/// \brief Operation that generates n-grams from a 1-D string tensor. Built via the Ngram() factory.
class NgramOperation : public TensorOperation {
 public:
  /// \brief Constructor.
  /// \param[in] ngrams Vector of positive n-gram sizes.
  /// \param[in] left_pad {"pad_token", pad_width} applied to the left side of the sequence.
  /// \param[in] right_pad {"pad_token", pad_width} applied to the right side of the sequence.
  /// \param[in] separator Symbol used to join strings together.
  explicit NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
                          const std::pair<std::string, int32_t> &right_pad, const std::string &separator);

  ~NgramOperation() = default;

  /// \brief Build the runtime TensorOp for this operation.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Validate the constructor parameters.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kNgramOperation; }

 private:
  std::vector<int32_t> ngrams_;
  std::pair<std::string, int32_t> left_pad_;
  std::pair<std::string, int32_t> right_pad_;
  std::string separator_;
};
#ifndef _WIN32
/// \brief Operation that normalizes a UTF-8 string tensor. Built via the NormalizeUTF8() factory.
class NormalizeUTF8Operation : public TensorOperation {
 public:
  /// \brief Constructor.
  /// \param[in] normalize_form Unicode normalization form to apply (kNone/kNfc/kNfkc/kNfd/kNfkd).
  explicit NormalizeUTF8Operation(NormalizeForm normalize_form);

  ~NormalizeUTF8Operation() = default;

  /// \brief Build the runtime TensorOp for this operation.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Validate the constructor parameters.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kNormalizeUTF8Operation; }

 private:
  NormalizeForm normalize_form_;
};
#endif
/// \brief Operation that tokenizes strings with sentencepiece. Built via the SentencePieceTokenizer() factories,
///   from either an in-memory vocab or a vocab model file.
class SentencePieceTokenizerOperation : public TensorOperation {
 public:
  /// \brief Constructor taking an in-memory SentencePieceVocab.
  SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type);

  /// \brief Constructor taking a path to a vocab model file.
  SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type);

  ~SentencePieceTokenizerOperation() = default;

  /// \brief Build the runtime TensorOp for this operation.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Validate the constructor parameters.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kSentencepieceTokenizerOperation; }

 private:
  std::shared_ptr<SentencePieceVocab> vocab_;
  std::string vocab_path_;
  // NOTE(review): presumably records which constructor was used (vocab object vs. model file) — confirm in the .cc.
  SPieceTokenizerLoadType load_type_;
  SPieceTokenizerOutType out_type_;
};
/// \brief Operation that slices a 1-D tensor into overlapping windows. Built via the SlidingWindow() factory.
class SlidingWindowOperation : public TensorOperation {
 public:
  /// \brief Constructor.
  /// \param[in] width Width of the window; must be greater than zero.
  /// \param[in] axis Axis along which the sliding window is computed (0 or -1 only for now).
  explicit SlidingWindowOperation(const int32_t width, const int32_t axis);

  ~SlidingWindowOperation() = default;

  /// \brief Build the runtime TensorOp for this operation.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Validate the constructor parameters.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kSlidingWindowOperation; }

 private:
  int32_t width_;
  int32_t axis_;
};
/// \brief Operation that tokenizes a UTF-8 string into Unicode characters.
///   Built via the UnicodeCharTokenizer() factory.
class UnicodeCharTokenizerOperation : public TensorOperation {
 public:
  /// \brief Constructor.
  /// \param[in] with_offsets Whether or not to output the offsets of tokens.
  explicit UnicodeCharTokenizerOperation(bool with_offsets);

  ~UnicodeCharTokenizerOperation() = default;

  /// \brief Build the runtime TensorOp for this operation.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Validate the constructor parameters.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kUnicodeCharTokenizerOperation; }

 private:
  bool with_offsets_;
};
#ifndef _WIN32
/// \brief Operation that tokenizes a UTF-8 string on Unicode script boundaries.
///   Built via the UnicodeScriptTokenizer() factory.
class UnicodeScriptTokenizerOperation : public TensorOperation {
 public:
  /// \brief Constructor.
  /// \param[in] keep_whitespace Whether or not to emit whitespace tokens.
  /// \param[in] with_offsets Whether or not to output the offsets of tokens.
  explicit UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets);

  ~UnicodeScriptTokenizerOperation() = default;

  /// \brief Build the runtime TensorOp for this operation.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Validate the constructor parameters.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kUnicodeScriptTokenizerOperation; }

 private:
  bool keep_whitespace_;
  bool with_offsets_;
};
/// \brief Operation that tokenizes a UTF-8 string on ICU4C-defined whitespaces.
///   Built via the WhitespaceTokenizer() factory.
class WhitespaceTokenizerOperation : public TensorOperation {
 public:
  /// \brief Constructor.
  /// \param[in] with_offsets Whether or not to output the offsets of tokens.
  explicit WhitespaceTokenizerOperation(bool with_offsets);

  ~WhitespaceTokenizerOperation() = default;

  /// \brief Build the runtime TensorOp for this operation.
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Validate the constructor parameters.
  Status ValidateParams() override;

  /// \brief Registered name of this operation.
  std::string Name() const override { return kWhitespaceTokenizerOperation; }

 private:
  bool with_offsets_;
};
#endif
  274. } // namespace text
  275. } // namespace dataset
  276. } // namespace mindspore
  277. #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TEXT_H_