You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

text.cc 17 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
  1. /**
  2. * Copyright 2020-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <unistd.h>
  17. #include <fstream>
  18. #include <regex>
  19. #include "minddata/dataset/include/text.h"
  20. #include "minddata/dataset/text/ir/kernels/text_ir.h"
  21. namespace mindspore {
  22. namespace dataset {
  23. // Transform operations for text.
  24. namespace text {
  25. // FUNCTIONS TO CREATE TEXT OPERATIONS
  26. // (In alphabetical order)
  27. #ifndef _WIN32
  28. // BasicTokenizer
  29. struct BasicTokenizer::Data {
  30. Data(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
  31. bool with_offsets)
  32. : lower_case_(lower_case),
  33. keep_whitespace_(keep_whitespace),
  34. normalize_form_(normalize_form),
  35. preserve_unused_token_(preserve_unused_token),
  36. with_offsets_(with_offsets) {}
  37. bool lower_case_;
  38. bool keep_whitespace_;
  39. NormalizeForm normalize_form_;
  40. bool preserve_unused_token_;
  41. bool with_offsets_;
  42. };
  43. BasicTokenizer::BasicTokenizer(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
  44. bool preserve_unused_token, bool with_offsets)
  45. : data_(std::make_shared<Data>(lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets)) {}
  46. std::shared_ptr<TensorOperation> BasicTokenizer::Parse() {
  47. return std::make_shared<BasicTokenizerOperation>(data_->lower_case_, data_->keep_whitespace_, data_->normalize_form_,
  48. data_->preserve_unused_token_, data_->with_offsets_);
  49. }
  50. // BertTokenizer
  51. struct BertTokenizer::Data {
  52. Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
  53. const std::vector<char> &unknown_token, bool lower_case, bool keep_whitespace,
  54. const NormalizeForm normalize_form, bool preserve_unused_token, bool with_offsets)
  55. : vocab_(vocab),
  56. suffix_indicator_(CharToString(suffix_indicator)),
  57. max_bytes_per_token_(max_bytes_per_token),
  58. unknown_token_(CharToString(unknown_token)),
  59. lower_case_(lower_case),
  60. keep_whitespace_(keep_whitespace),
  61. normalize_form_(normalize_form),
  62. preserve_unused_token_(preserve_unused_token),
  63. with_offsets_(with_offsets) {}
  64. std::shared_ptr<Vocab> vocab_;
  65. std::string suffix_indicator_;
  66. int32_t max_bytes_per_token_;
  67. std::string unknown_token_;
  68. bool lower_case_;
  69. bool keep_whitespace_;
  70. NormalizeForm normalize_form_;
  71. bool preserve_unused_token_;
  72. bool with_offsets_;
  73. };
  74. BertTokenizer::BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
  75. int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool lower_case,
  76. bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
  77. bool with_offsets)
  78. : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case,
  79. keep_whitespace, normalize_form, preserve_unused_token, with_offsets)) {}
  80. std::shared_ptr<TensorOperation> BertTokenizer::Parse() {
  81. return std::make_shared<BertTokenizerOperation>(
  82. data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->lower_case_,
  83. data_->keep_whitespace_, data_->normalize_form_, data_->preserve_unused_token_, data_->with_offsets_);
  84. }
  85. // CaseFold
  86. CaseFold::CaseFold() {}
  87. std::shared_ptr<TensorOperation> CaseFold::Parse() { return std::make_shared<CaseFoldOperation>(); }
  88. #endif
  89. // JiebaTokenizer
  90. struct JiebaTokenizer::Data {
  91. Data(const std::vector<char> &hmm_path, const std::vector<char> &mp_path, const JiebaMode &mode, bool with_offsets)
  92. : hmm_path_(CharToString(hmm_path)),
  93. mp_path_(CharToString(mp_path)),
  94. mode_(mode),
  95. with_offsets_(with_offsets),
  96. words_list_({}) {}
  97. std::string hmm_path_;
  98. std::string mp_path_;
  99. JiebaMode mode_;
  100. bool with_offsets_;
  101. std::vector<std::pair<std::string, int64_t>> words_list_;
  102. };
  103. JiebaTokenizer::JiebaTokenizer(const std::vector<char> &hmm_path, const std::vector<char> &mp_path,
  104. const JiebaMode &mode, bool with_offsets)
  105. : data_(std::make_shared<Data>(hmm_path, mp_path, mode, with_offsets)) {}
  106. std::shared_ptr<TensorOperation> JiebaTokenizer::Parse() {
  107. std::shared_ptr<JiebaTokenizerOperation> jieba_tokenizer =
  108. std::make_shared<JiebaTokenizerOperation>(data_->hmm_path_, data_->mp_path_, data_->mode_, data_->with_offsets_);
  109. for (auto &word : data_->words_list_) {
  110. Status rc = jieba_tokenizer->AddWord(word.first, word.second);
  111. if (rc.IsError()) {
  112. MS_LOG(ERROR) << rc;
  113. return {};
  114. }
  115. }
  116. return jieba_tokenizer;
  117. }
  118. Status JiebaTokenizer::AddWordChar(const std::vector<char> &word, int64_t freq) {
  119. if (word.empty()) {
  120. std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided.";
  121. MS_LOG(ERROR) << err_msg;
  122. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  123. }
  124. if (freq < 0) {
  125. std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0.";
  126. MS_LOG(ERROR) << err_msg;
  127. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  128. }
  129. data_->words_list_.emplace_back(CharToString(word), freq);
  130. return Status::OK();
  131. }
  132. Status JiebaTokenizer::AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict) {
  133. for (auto &word_freq_pair : user_dict) {
  134. RETURN_IF_NOT_OK(AddWordChar(word_freq_pair.first, word_freq_pair.second));
  135. }
  136. return Status::OK();
  137. }
  138. Status JiebaTokenizer::AddDictChar(const std::vector<char> &file_path) {
  139. std::vector<std::pair<std::string, int64_t>> user_dict;
  140. RETURN_IF_NOT_OK(ParserFile(CharToString(file_path), &user_dict));
  141. RETURN_IF_NOT_OK(AddDictChar(PairStringInt64ToPairCharInt64(user_dict)));
  142. return Status::OK();
  143. }
  144. Status JiebaTokenizer::ParserFile(const std::string &file_path,
  145. std::vector<std::pair<std::string, int64_t>> *const user_dict) {
  146. std::ifstream ifs(file_path);
  147. if (!ifs) {
  148. std::string err_msg = "JiebaTokenizer : Fail to load dictionary from the input file, check the file path.";
  149. MS_LOG(ERROR) << err_msg;
  150. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  151. }
  152. std::string line;
  153. while (std::getline(ifs, line)) {
  154. if (line.empty()) {
  155. continue;
  156. }
  157. std::regex regex("^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$");
  158. std::smatch tokens;
  159. std::regex_match(line, tokens, regex);
  160. if (std::regex_match(line, tokens, regex)) {
  161. if (tokens.size() == 2) {
  162. user_dict->emplace_back(tokens.str(1), 0);
  163. } else if (tokens.size() == 3) {
  164. user_dict->emplace_back(tokens.str(1), strtoll(tokens.str(2).c_str(), NULL, 0));
  165. } else {
  166. continue;
  167. }
  168. } else {
  169. continue;
  170. }
  171. }
  172. MS_LOG(INFO) << "JiebaTokenizer::AddDict: The size of user input dictionary is: " << user_dict->size();
  173. MS_LOG(INFO) << "Valid rows in input dictionary (Maximum of first 10 rows are shown.):";
  174. for (std::size_t i = 0; i != user_dict->size(); ++i) {
  175. if (i >= 10) break;
  176. MS_LOG(INFO) << user_dict->at(i).first << " " << user_dict->at(i).second;
  177. }
  178. return Status::OK();
  179. }
  180. // Lookup
  181. struct Lookup::Data {
  182. Data(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
  183. const std::vector<char> &data_type)
  184. : vocab_(vocab), unknown_token_(OptionalCharToString(unknown_token)), data_type_(CharToString(data_type)) {}
  185. std::shared_ptr<Vocab> vocab_;
  186. std::optional<std::string> unknown_token_;
  187. std::string data_type_;
  188. };
  189. Lookup::Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
  190. const std::vector<char> &data_type)
  191. : data_(std::make_shared<Data>(vocab, unknown_token, data_type)) {}
  192. std::shared_ptr<TensorOperation> Lookup::Parse() {
  193. return std::make_shared<LookupOperation>(data_->vocab_, data_->unknown_token_, data_->data_type_);
  194. }
  195. // Ngram
  196. struct Ngram::Data {
  197. Data(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
  198. const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator)
  199. : ngrams_(ngrams),
  200. left_pad_(PairCharToString(left_pad)),
  201. right_pad_(PairCharToString(right_pad)),
  202. separator_(CharToString(separator)) {}
  203. std::vector<int32_t> ngrams_;
  204. std::pair<std::string, int32_t> left_pad_;
  205. std::pair<std::string, int32_t> right_pad_;
  206. std::string separator_;
  207. };
  208. Ngram::Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
  209. const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator)
  210. : data_(std::make_shared<Data>(ngrams, left_pad, right_pad, separator)) {}
  211. std::shared_ptr<TensorOperation> Ngram::Parse() {
  212. return std::make_shared<NgramOperation>(data_->ngrams_, data_->left_pad_, data_->right_pad_, data_->separator_);
  213. }
  214. #ifndef _WIN32
  215. // NormalizeUTF8
  216. struct NormalizeUTF8::Data {
  217. explicit Data(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}
  218. NormalizeForm normalize_form_;
  219. };
  220. NormalizeUTF8::NormalizeUTF8(NormalizeForm normalize_form) : data_(std::make_shared<Data>(normalize_form)) {}
  221. std::shared_ptr<TensorOperation> NormalizeUTF8::Parse() {
  222. return std::make_shared<NormalizeUTF8Operation>(data_->normalize_form_);
  223. }
  224. // RegexReplace
  225. struct RegexReplace::Data {
  226. Data(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all)
  227. : pattern_(CharToString(pattern)), replace_(CharToString(replace)), replace_all_(replace_all) {}
  228. std::string pattern_;
  229. std::string replace_;
  230. bool replace_all_;
  231. };
  232. RegexReplace::RegexReplace(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all)
  233. : data_(std::make_shared<Data>(pattern, replace, replace_all)) {}
  234. std::shared_ptr<TensorOperation> RegexReplace::Parse() {
  235. return std::make_shared<RegexReplaceOperation>(data_->pattern_, data_->replace_, data_->replace_all_);
  236. }
  237. // RegexTokenizer
  238. struct RegexTokenizer::Data {
  239. Data(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern, bool with_offsets)
  240. : delim_pattern_(CharToString(delim_pattern)),
  241. keep_delim_pattern_(CharToString(keep_delim_pattern)),
  242. with_offsets_(with_offsets) {}
  243. std::string delim_pattern_;
  244. std::string keep_delim_pattern_;
  245. bool with_offsets_;
  246. };
  247. RegexTokenizer::RegexTokenizer(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern,
  248. bool with_offsets)
  249. : data_(std::make_shared<Data>(delim_pattern, keep_delim_pattern, with_offsets)) {}
  250. std::shared_ptr<TensorOperation> RegexTokenizer::Parse() {
  251. return std::make_shared<RegexTokenizerOperation>(data_->delim_pattern_, data_->keep_delim_pattern_,
  252. data_->with_offsets_);
  253. }
  254. #endif
  255. // SentencePieceTokenizer
  256. struct SentencePieceTokenizer::Data {
  257. Data(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type)
  258. : vocab_(vocab), out_type_(out_type) {}
  259. Data(const std::vector<char> &vocab_path, SPieceTokenizerOutType out_type)
  260. : vocab_path_(CharToString(vocab_path)), out_type_(out_type) {}
  261. std::shared_ptr<SentencePieceVocab> vocab_;
  262. std::string vocab_path_;
  263. SPieceTokenizerOutType out_type_;
  264. };
  265. SentencePieceTokenizer::SentencePieceTokenizer(const std::shared_ptr<SentencePieceVocab> &vocab,
  266. SPieceTokenizerOutType out_type)
  267. : data_(std::make_shared<Data>(vocab, out_type)) {}
  268. SentencePieceTokenizer::SentencePieceTokenizer(const std::vector<char> &vocab_path, SPieceTokenizerOutType out_type)
  269. : data_(std::make_shared<Data>(vocab_path, out_type)) {}
  270. std::shared_ptr<TensorOperation> SentencePieceTokenizer::Parse() {
  271. if (data_->vocab_ != nullptr) {
  272. return std::make_shared<SentencePieceTokenizerOperation>(data_->vocab_, data_->out_type_);
  273. } else {
  274. return std::make_shared<SentencePieceTokenizerOperation>(data_->vocab_path_, data_->out_type_);
  275. }
  276. }
  277. // SlidingWindow
  278. struct SlidingWindow::Data {
  279. Data(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {}
  280. int32_t width_;
  281. int32_t axis_;
  282. };
  283. SlidingWindow::SlidingWindow(const int32_t width, const int32_t axis) : data_(std::make_shared<Data>(width, axis)) {}
  284. std::shared_ptr<TensorOperation> SlidingWindow::Parse() {
  285. return std::make_shared<SlidingWindowOperation>(data_->width_, data_->axis_);
  286. }
  287. // ToNumber
  288. struct ToNumber::Data {
  289. explicit Data(const std::vector<char> &data_type) : data_type_(CharToString(data_type)) {}
  290. std::string data_type_;
  291. };
  292. ToNumber::ToNumber(const std::vector<char> &data_type) : data_(std::make_shared<Data>(data_type)) {}
  293. std::shared_ptr<TensorOperation> ToNumber::Parse() { return std::make_shared<ToNumberOperation>(data_->data_type_); }
  294. // TruncateSequencePair
  295. struct TruncateSequencePair::Data {
  296. explicit Data(int32_t max_length) : max_length_(max_length) {}
  297. int32_t max_length_;
  298. };
  299. TruncateSequencePair::TruncateSequencePair(int32_t max_length) : data_(std::make_shared<Data>(max_length)) {}
  300. std::shared_ptr<TensorOperation> TruncateSequencePair::Parse() {
  301. return std::make_shared<TruncateSequencePairOperation>(data_->max_length_);
  302. }
  303. // UnicodeCharTokenizer
  304. struct UnicodeCharTokenizer::Data {
  305. explicit Data(bool with_offsets) : with_offsets_(with_offsets) {}
  306. bool with_offsets_;
  307. };
  308. UnicodeCharTokenizer::UnicodeCharTokenizer(bool with_offsets) : data_(std::make_shared<Data>(with_offsets)) {}
  309. std::shared_ptr<TensorOperation> UnicodeCharTokenizer::Parse() {
  310. return std::make_shared<UnicodeCharTokenizerOperation>(data_->with_offsets_);
  311. }
  312. // WordpieceTokenizer
  313. struct WordpieceTokenizer::Data {
  314. Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
  315. const std::vector<char> &unknown_token, bool with_offsets)
  316. : vocab_(vocab),
  317. suffix_indicator_(CharToString(suffix_indicator)),
  318. max_bytes_per_token_(max_bytes_per_token),
  319. unknown_token_(CharToString(unknown_token)),
  320. with_offsets_(with_offsets) {}
  321. std::shared_ptr<Vocab> vocab_;
  322. std::string suffix_indicator_;
  323. int32_t max_bytes_per_token_;
  324. std::string unknown_token_;
  325. bool with_offsets_;
  326. };
  327. WordpieceTokenizer::WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
  328. int32_t max_bytes_per_token, const std::vector<char> &unknown_token,
  329. bool with_offsets)
  330. : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets)) {}
  331. std::shared_ptr<TensorOperation> WordpieceTokenizer::Parse() {
  332. return std::make_shared<WordpieceTokenizerOperation>(
  333. data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->with_offsets_);
  334. }
  335. #ifndef _WIN32
  336. // UnicodeScriptTokenizer
  337. struct UnicodeScriptTokenizer::Data {
  338. Data(bool keep_whitespace, bool with_offsets) : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
  339. bool keep_whitespace_;
  340. bool with_offsets_;
  341. };
  342. UnicodeScriptTokenizer::UnicodeScriptTokenizer(bool keep_whitespace, bool with_offsets)
  343. : data_(std::make_shared<Data>(keep_whitespace, with_offsets)) {}
  344. std::shared_ptr<TensorOperation> UnicodeScriptTokenizer::Parse() {
  345. return std::make_shared<UnicodeScriptTokenizerOperation>(data_->keep_whitespace_, data_->with_offsets_);
  346. }
  347. // WhitespaceTokenizer
  348. struct WhitespaceTokenizer::Data {
  349. explicit Data(bool with_offsets) : with_offsets_(with_offsets) {}
  350. bool with_offsets_;
  351. };
  352. WhitespaceTokenizer::WhitespaceTokenizer(bool with_offsets) : data_(std::make_shared<Data>(with_offsets)) {}
  353. std::shared_ptr<TensorOperation> WhitespaceTokenizer::Parse() {
  354. return std::make_shared<WhitespaceTokenizerOperation>(data_->with_offsets_);
  355. }
  356. #endif
  357. } // namespace text
  358. } // namespace dataset
  359. } // namespace mindspore