You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

text.cc 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. /**
  2. * Copyright 2020-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <unistd.h>
  17. #include "minddata/dataset/include/text.h"
  18. #include "minddata/dataset/text/ir/kernels/text_ir.h"
  19. namespace mindspore {
  20. namespace dataset {
  21. // Transform operations for text.
  22. namespace text {
  23. // FUNCTIONS TO CREATE TEXT OPERATIONS
  24. // (In alphabetical order)
  25. #ifndef _WIN32
  26. // BasicTokenizer
  27. struct BasicTokenizer::Data {
  28. Data(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
  29. bool with_offsets)
  30. : lower_case_(lower_case),
  31. keep_whitespace_(keep_whitespace),
  32. normalize_form_(normalize_form),
  33. preserve_unused_token_(preserve_unused_token),
  34. with_offsets_(with_offsets) {}
  35. bool lower_case_;
  36. bool keep_whitespace_;
  37. NormalizeForm normalize_form_;
  38. bool preserve_unused_token_;
  39. bool with_offsets_;
  40. };
  41. BasicTokenizer::BasicTokenizer(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
  42. bool preserve_unused_token, bool with_offsets)
  43. : data_(std::make_shared<Data>(lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets)) {}
  44. std::shared_ptr<TensorOperation> BasicTokenizer::Parse() {
  45. return std::make_shared<BasicTokenizerOperation>(data_->lower_case_, data_->keep_whitespace_, data_->normalize_form_,
  46. data_->preserve_unused_token_, data_->with_offsets_);
  47. }
  48. // BertTokenizer
  49. struct BertTokenizer::Data {
  50. Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
  51. const std::vector<char> &unknown_token, bool lower_case, bool keep_whitespace,
  52. const NormalizeForm normalize_form, bool preserve_unused_token, bool with_offsets)
  53. : vocab_(vocab),
  54. suffix_indicator_(CharToString(suffix_indicator)),
  55. max_bytes_per_token_(max_bytes_per_token),
  56. unknown_token_(CharToString(unknown_token)),
  57. lower_case_(lower_case),
  58. keep_whitespace_(keep_whitespace),
  59. normalize_form_(normalize_form),
  60. preserve_unused_token_(preserve_unused_token),
  61. with_offsets_(with_offsets) {}
  62. std::shared_ptr<Vocab> vocab_;
  63. std::string suffix_indicator_;
  64. int32_t max_bytes_per_token_;
  65. std::string unknown_token_;
  66. bool lower_case_;
  67. bool keep_whitespace_;
  68. NormalizeForm normalize_form_;
  69. bool preserve_unused_token_;
  70. bool with_offsets_;
  71. };
  72. BertTokenizer::BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
  73. int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool lower_case,
  74. bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
  75. bool with_offsets)
  76. : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case,
  77. keep_whitespace, normalize_form, preserve_unused_token, with_offsets)) {}
  78. std::shared_ptr<TensorOperation> BertTokenizer::Parse() {
  79. return std::make_shared<BertTokenizerOperation>(
  80. data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->lower_case_,
  81. data_->keep_whitespace_, data_->normalize_form_, data_->preserve_unused_token_, data_->with_offsets_);
  82. }
  83. // CaseFold
  84. CaseFold::CaseFold() {}
  85. std::shared_ptr<TensorOperation> CaseFold::Parse() { return std::make_shared<CaseFoldOperation>(); }
  86. #endif
  87. // JiebaTokenizer
  88. struct JiebaTokenizer::Data {
  89. Data(const std::vector<char> &hmm_path, const std::vector<char> &mp_path, const JiebaMode &mode, bool with_offsets)
  90. : hmm_path_(CharToString(hmm_path)),
  91. mp_path_(CharToString(mp_path)),
  92. mode_(mode),
  93. with_offsets_(with_offsets),
  94. words_list_({}) {}
  95. std::string hmm_path_;
  96. std::string mp_path_;
  97. JiebaMode mode_;
  98. bool with_offsets_;
  99. std::vector<std::pair<std::string, int64_t>> words_list_;
  100. };
  101. JiebaTokenizer::JiebaTokenizer(const std::vector<char> &hmm_path, const std::vector<char> &mp_path,
  102. const JiebaMode &mode, bool with_offsets)
  103. : data_(std::make_shared<Data>(hmm_path, mp_path, mode, with_offsets)) {}
  104. std::shared_ptr<TensorOperation> JiebaTokenizer::Parse() {
  105. std::shared_ptr<JiebaTokenizerOperation> jieba_tokenizer =
  106. std::make_shared<JiebaTokenizerOperation>(data_->hmm_path_, data_->mp_path_, data_->mode_, data_->with_offsets_);
  107. for (auto &word : data_->words_list_) {
  108. Status rc = jieba_tokenizer->AddWord(word.first, word.second);
  109. if (rc.IsError()) {
  110. MS_LOG(ERROR) << rc;
  111. return {};
  112. }
  113. }
  114. return jieba_tokenizer;
  115. }
  116. Status JiebaTokenizer::AddWord(const std::string &word, int64_t freq) {
  117. if (word.empty()) {
  118. std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided.";
  119. MS_LOG(ERROR) << err_msg;
  120. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  121. }
  122. if (freq < 0) {
  123. std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0.";
  124. MS_LOG(ERROR) << err_msg;
  125. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  126. }
  127. data_->words_list_.emplace_back(word, freq);
  128. return Status::OK();
  129. }
  130. // Lookup
  131. struct Lookup::Data {
  132. Data(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
  133. const std::vector<char> &data_type)
  134. : vocab_(vocab), unknown_token_(OptionalCharToString(unknown_token)), data_type_(CharToString(data_type)) {}
  135. std::shared_ptr<Vocab> vocab_;
  136. std::optional<std::string> unknown_token_;
  137. std::string data_type_;
  138. };
  139. Lookup::Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
  140. const std::vector<char> &data_type)
  141. : data_(std::make_shared<Data>(vocab, unknown_token, data_type)) {}
  142. std::shared_ptr<TensorOperation> Lookup::Parse() {
  143. return std::make_shared<LookupOperation>(data_->vocab_, data_->unknown_token_, data_->data_type_);
  144. }
  145. // Ngram
  146. struct Ngram::Data {
  147. Data(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
  148. const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator)
  149. : ngrams_(ngrams),
  150. left_pad_(PairCharToString(left_pad)),
  151. right_pad_(PairCharToString(right_pad)),
  152. separator_(CharToString(separator)) {}
  153. std::vector<int32_t> ngrams_;
  154. std::pair<std::string, int32_t> left_pad_;
  155. std::pair<std::string, int32_t> right_pad_;
  156. std::string separator_;
  157. };
  158. Ngram::Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
  159. const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator)
  160. : data_(std::make_shared<Data>(ngrams, left_pad, right_pad, separator)) {}
  161. std::shared_ptr<TensorOperation> Ngram::Parse() {
  162. return std::make_shared<NgramOperation>(data_->ngrams_, data_->left_pad_, data_->right_pad_, data_->separator_);
  163. }
  164. #ifndef _WIN32
  165. // NormalizeUTF8
  166. struct NormalizeUTF8::Data {
  167. explicit Data(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}
  168. NormalizeForm normalize_form_;
  169. };
  170. NormalizeUTF8::NormalizeUTF8(NormalizeForm normalize_form) : data_(std::make_shared<Data>(normalize_form)) {}
  171. std::shared_ptr<TensorOperation> NormalizeUTF8::Parse() {
  172. return std::make_shared<NormalizeUTF8Operation>(data_->normalize_form_);
  173. }
  174. // RegexReplace
  175. struct RegexReplace::Data {
  176. Data(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all)
  177. : pattern_(CharToString(pattern)), replace_(CharToString(replace)), replace_all_(replace_all) {}
  178. std::string pattern_;
  179. std::string replace_;
  180. bool replace_all_;
  181. };
  182. RegexReplace::RegexReplace(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all)
  183. : data_(std::make_shared<Data>(pattern, replace, replace_all)) {}
  184. std::shared_ptr<TensorOperation> RegexReplace::Parse() {
  185. return std::make_shared<RegexReplaceOperation>(data_->pattern_, data_->replace_, data_->replace_all_);
  186. }
  187. // RegexTokenizer
  188. struct RegexTokenizer::Data {
  189. Data(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern, bool with_offsets)
  190. : delim_pattern_(CharToString(delim_pattern)),
  191. keep_delim_pattern_(CharToString(keep_delim_pattern)),
  192. with_offsets_(with_offsets) {}
  193. std::string delim_pattern_;
  194. std::string keep_delim_pattern_;
  195. bool with_offsets_;
  196. };
  197. RegexTokenizer::RegexTokenizer(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern,
  198. bool with_offsets)
  199. : data_(std::make_shared<Data>(delim_pattern, keep_delim_pattern, with_offsets)) {}
  200. std::shared_ptr<TensorOperation> RegexTokenizer::Parse() {
  201. return std::make_shared<RegexTokenizerOperation>(data_->delim_pattern_, data_->keep_delim_pattern_,
  202. data_->with_offsets_);
  203. }
  204. #endif
  205. // SentencePieceTokenizer
  206. struct SentencePieceTokenizer::Data {
  207. Data(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type)
  208. : vocab_(vocab), out_type_(out_type) {}
  209. Data(const std::vector<char> &vocab_path, SPieceTokenizerOutType out_type)
  210. : vocab_path_(CharToString(vocab_path)), out_type_(out_type) {}
  211. std::shared_ptr<SentencePieceVocab> vocab_;
  212. std::string vocab_path_;
  213. SPieceTokenizerLoadType load_type_;
  214. SPieceTokenizerOutType out_type_;
  215. };
  216. SentencePieceTokenizer::SentencePieceTokenizer(const std::shared_ptr<SentencePieceVocab> &vocab,
  217. SPieceTokenizerOutType out_type)
  218. : data_(std::make_shared<Data>(vocab, out_type)) {}
  219. SentencePieceTokenizer::SentencePieceTokenizer(const std::vector<char> &vocab_path, SPieceTokenizerOutType out_type)
  220. : data_(std::make_shared<Data>(vocab_path, out_type)) {}
  221. std::shared_ptr<TensorOperation> SentencePieceTokenizer::Parse() {
  222. if (data_->vocab_ != nullptr) {
  223. return std::make_shared<SentencePieceTokenizerOperation>(data_->vocab_, data_->out_type_);
  224. } else {
  225. return std::make_shared<SentencePieceTokenizerOperation>(data_->vocab_path_, data_->out_type_);
  226. }
  227. }
  228. // SlidingWindow
  229. struct SlidingWindow::Data {
  230. Data(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {}
  231. int32_t width_;
  232. int32_t axis_;
  233. };
  234. SlidingWindow::SlidingWindow(const int32_t width, const int32_t axis) : data_(std::make_shared<Data>(width, axis)) {}
  235. std::shared_ptr<TensorOperation> SlidingWindow::Parse() {
  236. return std::make_shared<SlidingWindowOperation>(data_->width_, data_->axis_);
  237. }
  238. // ToNumber
  239. struct ToNumber::Data {
  240. explicit Data(const std::vector<char> &data_type) : data_type_(CharToString(data_type)) {}
  241. std::string data_type_;
  242. };
  243. ToNumber::ToNumber(const std::vector<char> &data_type) : data_(std::make_shared<Data>(data_type)) {}
  244. std::shared_ptr<TensorOperation> ToNumber::Parse() { return std::make_shared<ToNumberOperation>(data_->data_type_); }
  245. // TruncateSequencePair
  246. struct TruncateSequencePair::Data {
  247. explicit Data(int32_t max_length) : max_length_(max_length) {}
  248. int32_t max_length_;
  249. };
  250. TruncateSequencePair::TruncateSequencePair(int32_t max_length) : data_(std::make_shared<Data>(max_length)) {}
  251. std::shared_ptr<TensorOperation> TruncateSequencePair::Parse() {
  252. return std::make_shared<TruncateSequencePairOperation>(data_->max_length_);
  253. }
  254. // UnicodeCharTokenizer
  255. struct UnicodeCharTokenizer::Data {
  256. explicit Data(bool with_offsets) : with_offsets_(with_offsets) {}
  257. bool with_offsets_;
  258. };
  259. UnicodeCharTokenizer::UnicodeCharTokenizer(bool with_offsets) : data_(std::make_shared<Data>(with_offsets)) {}
  260. std::shared_ptr<TensorOperation> UnicodeCharTokenizer::Parse() {
  261. return std::make_shared<UnicodeCharTokenizerOperation>(data_->with_offsets_);
  262. }
  263. #ifndef _WIN32
  264. // UnicodeScriptTokenizer
  265. struct UnicodeScriptTokenizer::Data {
  266. Data(bool keep_whitespace, bool with_offsets) : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
  267. bool keep_whitespace_;
  268. bool with_offsets_;
  269. };
  270. UnicodeScriptTokenizer::UnicodeScriptTokenizer(bool keep_whitespace, bool with_offsets)
  271. : data_(std::make_shared<Data>(keep_whitespace, with_offsets)) {}
  272. std::shared_ptr<TensorOperation> UnicodeScriptTokenizer::Parse() {
  273. return std::make_shared<UnicodeScriptTokenizerOperation>(data_->keep_whitespace_, data_->with_offsets_);
  274. }
  275. // WhitespaceTokenizer
  276. struct WhitespaceTokenizer::Data {
  277. explicit Data(bool with_offsets) : with_offsets_(with_offsets) {}
  278. bool with_offsets_;
  279. };
  280. WhitespaceTokenizer::WhitespaceTokenizer(bool with_offsets) : data_(std::make_shared<Data>(with_offsets)) {}
  281. std::shared_ptr<TensorOperation> WhitespaceTokenizer::Parse() {
  282. return std::make_shared<WhitespaceTokenizerOperation>(data_->with_offsets_);
  283. }
  284. #endif
  285. } // namespace text
  286. } // namespace dataset
  287. } // namespace mindspore