You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

text.cc 18 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462
  1. /**
  2. * Copyright 2020-2022 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "minddata/dataset/include/dataset/text.h"
  17. #include <unistd.h>
  18. #include <fstream>
  19. #include <regex>
  20. #include "minddata/dataset/core/type_id.h"
  21. #include "minddata/dataset/text/ir/kernels/text_ir.h"
  22. #include "mindspore/core/ir/dtype/type_id.h"
  23. #include "utils/file_utils.h"
  24. namespace mindspore {
  25. namespace dataset {
  26. // Transform operations for text.
  27. namespace text {
// Expected std::smatch sizes when parsing a user-dictionary line (full match + groups).
constexpr size_t size_two = 2;
constexpr size_t size_three = 3;
// 1-based sub-match indices: group 1 is the word, group 2 is its frequency.
constexpr int64_t value_one = 1;
constexpr int64_t value_two = 2;
// Cap on how many parsed dictionary rows are echoed to the INFO log.
constexpr size_t kMaxLoggedRows = 10;
  33. // FUNCTIONS TO CREATE TEXT OPERATIONS
  34. // (In alphabetical order)
  35. #ifndef _WIN32
  36. // BasicTokenizer
  37. struct BasicTokenizer::Data {
  38. Data(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
  39. bool with_offsets)
  40. : lower_case_(lower_case),
  41. keep_whitespace_(keep_whitespace),
  42. normalize_form_(normalize_form),
  43. preserve_unused_token_(preserve_unused_token),
  44. with_offsets_(with_offsets) {}
  45. bool lower_case_;
  46. bool keep_whitespace_;
  47. NormalizeForm normalize_form_;
  48. bool preserve_unused_token_;
  49. bool with_offsets_;
  50. };
  51. BasicTokenizer::BasicTokenizer(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
  52. bool preserve_unused_token, bool with_offsets)
  53. : data_(std::make_shared<Data>(lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets)) {}
  54. std::shared_ptr<TensorOperation> BasicTokenizer::Parse() {
  55. return std::make_shared<BasicTokenizerOperation>(data_->lower_case_, data_->keep_whitespace_, data_->normalize_form_,
  56. data_->preserve_unused_token_, data_->with_offsets_);
  57. }
  58. // BertTokenizer
  59. struct BertTokenizer::Data {
  60. Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
  61. const std::vector<char> &unknown_token, bool lower_case, bool keep_whitespace,
  62. const NormalizeForm normalize_form, bool preserve_unused_token, bool with_offsets)
  63. : vocab_(vocab),
  64. suffix_indicator_(CharToString(suffix_indicator)),
  65. max_bytes_per_token_(max_bytes_per_token),
  66. unknown_token_(CharToString(unknown_token)),
  67. lower_case_(lower_case),
  68. keep_whitespace_(keep_whitespace),
  69. normalize_form_(normalize_form),
  70. preserve_unused_token_(preserve_unused_token),
  71. with_offsets_(with_offsets) {}
  72. std::shared_ptr<Vocab> vocab_;
  73. std::string suffix_indicator_;
  74. int32_t max_bytes_per_token_;
  75. std::string unknown_token_;
  76. bool lower_case_;
  77. bool keep_whitespace_;
  78. NormalizeForm normalize_form_;
  79. bool preserve_unused_token_;
  80. bool with_offsets_;
  81. };
  82. BertTokenizer::BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
  83. int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool lower_case,
  84. bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
  85. bool with_offsets)
  86. : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case,
  87. keep_whitespace, normalize_form, preserve_unused_token, with_offsets)) {}
  88. std::shared_ptr<TensorOperation> BertTokenizer::Parse() {
  89. return std::make_shared<BertTokenizerOperation>(
  90. data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->lower_case_,
  91. data_->keep_whitespace_, data_->normalize_form_, data_->preserve_unused_token_, data_->with_offsets_);
  92. }
  93. // CaseFold
// CaseFold takes no configuration, so the defaulted constructor suffices.
CaseFold::CaseFold() = default;
// Create the IR node that performs the case-fold transform.
std::shared_ptr<TensorOperation> CaseFold::Parse() { return std::make_shared<CaseFoldOperation>(); }
  96. // FilterWikipediaXML
  97. FilterWikipediaXML::FilterWikipediaXML() {}
  98. std::shared_ptr<TensorOperation> FilterWikipediaXML::Parse() { return std::make_shared<FilterWikipediaXMLOperation>(); }
  99. #endif
  100. // JiebaTokenizer
  101. struct JiebaTokenizer::Data {
  102. Data(const std::vector<char> &hmm_path, const std::vector<char> &mp_path, const JiebaMode &mode, bool with_offsets)
  103. : hmm_path_(CharToString(hmm_path)),
  104. mp_path_(CharToString(mp_path)),
  105. mode_(mode),
  106. with_offsets_(with_offsets),
  107. words_list_({}) {}
  108. std::string hmm_path_;
  109. std::string mp_path_;
  110. JiebaMode mode_;
  111. bool with_offsets_;
  112. std::vector<std::pair<std::string, int64_t>> words_list_;
  113. };
  114. JiebaTokenizer::JiebaTokenizer(const std::vector<char> &hmm_path, const std::vector<char> &mp_path,
  115. const JiebaMode &mode, bool with_offsets)
  116. : data_(std::make_shared<Data>(hmm_path, mp_path, mode, with_offsets)) {}
  117. std::shared_ptr<TensorOperation> JiebaTokenizer::Parse() {
  118. std::shared_ptr<JiebaTokenizerOperation> jieba_tokenizer =
  119. std::make_shared<JiebaTokenizerOperation>(data_->hmm_path_, data_->mp_path_, data_->mode_, data_->with_offsets_);
  120. for (auto &word : data_->words_list_) {
  121. Status rc = jieba_tokenizer->AddWord(word.first, word.second);
  122. if (rc.IsError()) {
  123. MS_LOG(ERROR) << rc;
  124. return {};
  125. }
  126. }
  127. return jieba_tokenizer;
  128. }
  129. Status JiebaTokenizer::AddWordChar(const std::vector<char> &word, int64_t freq) {
  130. if (word.empty()) {
  131. std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided.";
  132. LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  133. }
  134. if (freq < 0) {
  135. std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0.";
  136. LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  137. }
  138. (void)data_->words_list_.emplace_back(CharToString(word), freq);
  139. return Status::OK();
  140. }
  141. Status JiebaTokenizer::AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict) {
  142. for (auto &word_freq_pair : user_dict) {
  143. RETURN_IF_NOT_OK(AddWordChar(word_freq_pair.first, word_freq_pair.second));
  144. }
  145. return Status::OK();
  146. }
  147. Status JiebaTokenizer::AddDictChar(const std::vector<char> &file_path) {
  148. std::vector<std::pair<std::string, int64_t>> user_dict;
  149. RETURN_IF_NOT_OK(ParserFile(CharToString(file_path), &user_dict));
  150. RETURN_IF_NOT_OK(AddDictChar(PairStringInt64ToPairCharInt64(user_dict)));
  151. return Status::OK();
  152. }
  153. Status JiebaTokenizer::ParserFile(const std::string &file_path,
  154. std::vector<std::pair<std::string, int64_t>> *const user_dict) {
  155. RETURN_UNEXPECTED_IF_NULL(user_dict);
  156. auto realpath = FileUtils::GetRealPath(file_path.c_str());
  157. if (!realpath.has_value()) {
  158. std::string err_msg = "Get real path failed, path: " + file_path;
  159. LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  160. }
  161. std::ifstream ifs(realpath.value());
  162. if (!ifs) {
  163. std::string err_msg = "JiebaTokenizer : Fail to load dictionary from the input file, check the file path.";
  164. LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  165. }
  166. std::string line;
  167. while (std::getline(ifs, line)) {
  168. if (line.empty()) {
  169. continue;
  170. }
  171. std::regex regex("^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$");
  172. std::smatch tokens;
  173. if (std::regex_match(line, tokens, regex)) {
  174. if (tokens.size() == size_two) {
  175. (void)user_dict->emplace_back(tokens.str(value_one), 0);
  176. } else if (tokens.size() == size_three) {
  177. (void)user_dict->emplace_back(tokens.str(value_one), strtoll(tokens.str(value_two).c_str(), nullptr, 0));
  178. } else {
  179. continue;
  180. }
  181. } else {
  182. continue;
  183. }
  184. }
  185. ifs.close();
  186. MS_LOG(INFO) << "JiebaTokenizer::AddDict: The size of user input dictionary is: " << user_dict->size();
  187. MS_LOG(INFO) << "Valid rows in input dictionary (Maximum of first 10 rows are shown.):";
  188. for (std::size_t i = 0; i != user_dict->size(); ++i) {
  189. if (i >= kMaxLoggedRows) break;
  190. MS_LOG(INFO) << user_dict->at(i).first << " " << user_dict->at(i).second;
  191. }
  192. return Status::OK();
  193. }
  194. // Lookup
  195. struct Lookup::Data {
  196. Data(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
  197. mindspore::DataType data_type)
  198. : vocab_(vocab), data_type_(dataset::MSTypeToDEType(static_cast<TypeId>(data_type))) {
  199. if (unknown_token == std::nullopt) {
  200. unknown_token_ = std::nullopt;
  201. } else {
  202. unknown_token_ = std::string(unknown_token->begin(), unknown_token->end());
  203. }
  204. }
  205. std::shared_ptr<Vocab> vocab_;
  206. std::optional<std::string> unknown_token_;
  207. dataset::DataType data_type_;
  208. };
  209. Lookup::Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
  210. mindspore::DataType data_type)
  211. : data_(std::make_shared<Data>(vocab, unknown_token, data_type)) {
  212. data_->data_type_ = dataset::MSTypeToDEType(static_cast<TypeId>(data_type));
  213. }
  214. std::shared_ptr<TensorOperation> Lookup::Parse() {
  215. return std::make_shared<LookupOperation>(data_->vocab_, data_->unknown_token_, data_->data_type_);
  216. }
  217. // Ngram
  218. struct Ngram::Data {
  219. Data(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
  220. const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator)
  221. : ngrams_(ngrams),
  222. left_pad_(PairCharToString(left_pad)),
  223. right_pad_(PairCharToString(right_pad)),
  224. separator_(CharToString(separator)) {}
  225. std::vector<int32_t> ngrams_;
  226. std::pair<std::string, int32_t> left_pad_;
  227. std::pair<std::string, int32_t> right_pad_;
  228. std::string separator_;
  229. };
  230. Ngram::Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
  231. const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator)
  232. : data_(std::make_shared<Data>(ngrams, left_pad, right_pad, separator)) {}
  233. std::shared_ptr<TensorOperation> Ngram::Parse() {
  234. return std::make_shared<NgramOperation>(data_->ngrams_, data_->left_pad_, data_->right_pad_, data_->separator_);
  235. }
  236. #ifndef _WIN32
  237. // NormalizeUTF8
  238. struct NormalizeUTF8::Data {
  239. explicit Data(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}
  240. NormalizeForm normalize_form_;
  241. };
  242. NormalizeUTF8::NormalizeUTF8(NormalizeForm normalize_form) : data_(std::make_shared<Data>(normalize_form)) {}
  243. std::shared_ptr<TensorOperation> NormalizeUTF8::Parse() {
  244. return std::make_shared<NormalizeUTF8Operation>(data_->normalize_form_);
  245. }
  246. // RegexReplace
  247. struct RegexReplace::Data {
  248. Data(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all)
  249. : pattern_(CharToString(pattern)), replace_(CharToString(replace)), replace_all_(replace_all) {}
  250. std::string pattern_;
  251. std::string replace_;
  252. bool replace_all_;
  253. };
  254. RegexReplace::RegexReplace(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all)
  255. : data_(std::make_shared<Data>(pattern, replace, replace_all)) {}
  256. std::shared_ptr<TensorOperation> RegexReplace::Parse() {
  257. return std::make_shared<RegexReplaceOperation>(data_->pattern_, data_->replace_, data_->replace_all_);
  258. }
  259. // RegexTokenizer
  260. struct RegexTokenizer::Data {
  261. Data(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern, bool with_offsets)
  262. : delim_pattern_(CharToString(delim_pattern)),
  263. keep_delim_pattern_(CharToString(keep_delim_pattern)),
  264. with_offsets_(with_offsets) {}
  265. std::string delim_pattern_;
  266. std::string keep_delim_pattern_;
  267. bool with_offsets_;
  268. };
  269. RegexTokenizer::RegexTokenizer(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern,
  270. bool with_offsets)
  271. : data_(std::make_shared<Data>(delim_pattern, keep_delim_pattern, with_offsets)) {}
  272. std::shared_ptr<TensorOperation> RegexTokenizer::Parse() {
  273. return std::make_shared<RegexTokenizerOperation>(data_->delim_pattern_, data_->keep_delim_pattern_,
  274. data_->with_offsets_);
  275. }
  276. #endif
  277. // SentencePieceTokenizer
  278. struct SentencePieceTokenizer::Data {
  279. Data(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type)
  280. : vocab_(vocab), vocab_path_(""), out_type_(out_type) {}
  281. Data(const std::vector<char> &vocab_path, SPieceTokenizerOutType out_type)
  282. : vocab_(nullptr), vocab_path_(CharToString(vocab_path)), out_type_(out_type) {}
  283. std::shared_ptr<SentencePieceVocab> vocab_;
  284. std::string vocab_path_;
  285. SPieceTokenizerOutType out_type_;
  286. };
  287. SentencePieceTokenizer::SentencePieceTokenizer(const std::shared_ptr<SentencePieceVocab> &vocab,
  288. SPieceTokenizerOutType out_type)
  289. : data_(std::make_shared<Data>(vocab, out_type)) {}
  290. SentencePieceTokenizer::SentencePieceTokenizer(const std::vector<char> &vocab_path, SPieceTokenizerOutType out_type)
  291. : data_(std::make_shared<Data>(vocab_path, out_type)) {}
  292. std::shared_ptr<TensorOperation> SentencePieceTokenizer::Parse() {
  293. if (data_->vocab_ != nullptr) {
  294. return std::make_shared<SentencePieceTokenizerOperation>(data_->vocab_, data_->out_type_);
  295. } else {
  296. return std::make_shared<SentencePieceTokenizerOperation>(data_->vocab_path_, data_->out_type_);
  297. }
  298. }
  299. // SlidingWindow
  300. struct SlidingWindow::Data {
  301. Data(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {}
  302. int32_t width_;
  303. int32_t axis_;
  304. };
  305. SlidingWindow::SlidingWindow(const int32_t width, const int32_t axis) : data_(std::make_shared<Data>(width, axis)) {}
  306. std::shared_ptr<TensorOperation> SlidingWindow::Parse() {
  307. return std::make_shared<SlidingWindowOperation>(data_->width_, data_->axis_);
  308. }
  309. // ToNumber
  310. struct ToNumber::Data {
  311. dataset::DataType data_type_;
  312. };
  313. ToNumber::ToNumber(mindspore::DataType data_type) : data_(std::make_shared<Data>()) {
  314. data_->data_type_ = dataset::MSTypeToDEType(static_cast<TypeId>(data_type));
  315. }
  316. std::shared_ptr<TensorOperation> ToNumber::Parse() { return std::make_shared<ToNumberOperation>(data_->data_type_); }
  317. // ToVectors
  318. struct ToVectors::Data {
  319. Data(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init, bool lower_case_backup)
  320. : vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {}
  321. std::shared_ptr<Vectors> vectors_;
  322. std::vector<float> unk_init_;
  323. bool lower_case_backup_;
  324. };
  325. ToVectors::ToVectors(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init,
  326. bool lower_case_backup)
  327. : data_(std::make_shared<Data>(vectors, unk_init, lower_case_backup)) {}
  328. std::shared_ptr<TensorOperation> ToVectors::Parse() {
  329. return std::make_shared<ToVectorsOperation>(data_->vectors_, data_->unk_init_, data_->lower_case_backup_);
  330. }
  331. // TruncateSequencePair
  332. struct TruncateSequencePair::Data {
  333. explicit Data(int32_t max_length) : max_length_(max_length) {}
  334. int32_t max_length_;
  335. };
  336. TruncateSequencePair::TruncateSequencePair(int32_t max_length) : data_(std::make_shared<Data>(max_length)) {}
  337. std::shared_ptr<TensorOperation> TruncateSequencePair::Parse() {
  338. return std::make_shared<TruncateSequencePairOperation>(data_->max_length_);
  339. }
  340. // UnicodeCharTokenizer
  341. struct UnicodeCharTokenizer::Data {
  342. explicit Data(bool with_offsets) : with_offsets_(with_offsets) {}
  343. bool with_offsets_;
  344. };
  345. UnicodeCharTokenizer::UnicodeCharTokenizer(bool with_offsets) : data_(std::make_shared<Data>(with_offsets)) {}
  346. std::shared_ptr<TensorOperation> UnicodeCharTokenizer::Parse() {
  347. return std::make_shared<UnicodeCharTokenizerOperation>(data_->with_offsets_);
  348. }
  349. // WordpieceTokenizer
  350. struct WordpieceTokenizer::Data {
  351. Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
  352. const std::vector<char> &unknown_token, bool with_offsets)
  353. : vocab_(vocab),
  354. suffix_indicator_(CharToString(suffix_indicator)),
  355. max_bytes_per_token_(max_bytes_per_token),
  356. unknown_token_(CharToString(unknown_token)),
  357. with_offsets_(with_offsets) {}
  358. std::shared_ptr<Vocab> vocab_;
  359. std::string suffix_indicator_;
  360. int32_t max_bytes_per_token_;
  361. std::string unknown_token_;
  362. bool with_offsets_;
  363. };
  364. WordpieceTokenizer::WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
  365. int32_t max_bytes_per_token, const std::vector<char> &unknown_token,
  366. bool with_offsets)
  367. : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets)) {}
  368. std::shared_ptr<TensorOperation> WordpieceTokenizer::Parse() {
  369. return std::make_shared<WordpieceTokenizerOperation>(
  370. data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->with_offsets_);
  371. }
  372. #ifndef _WIN32
  373. // UnicodeScriptTokenizer
  374. struct UnicodeScriptTokenizer::Data {
  375. Data(bool keep_whitespace, bool with_offsets) : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
  376. bool keep_whitespace_;
  377. bool with_offsets_;
  378. };
  379. UnicodeScriptTokenizer::UnicodeScriptTokenizer(bool keep_whitespace, bool with_offsets)
  380. : data_(std::make_shared<Data>(keep_whitespace, with_offsets)) {}
  381. std::shared_ptr<TensorOperation> UnicodeScriptTokenizer::Parse() {
  382. return std::make_shared<UnicodeScriptTokenizerOperation>(data_->keep_whitespace_, data_->with_offsets_);
  383. }
  384. // WhitespaceTokenizer
  385. struct WhitespaceTokenizer::Data {
  386. explicit Data(bool with_offsets) : with_offsets_(with_offsets) {}
  387. bool with_offsets_;
  388. };
  389. WhitespaceTokenizer::WhitespaceTokenizer(bool with_offsets) : data_(std::make_shared<Data>(with_offsets)) {}
  390. std::shared_ptr<TensorOperation> WhitespaceTokenizer::Parse() {
  391. return std::make_shared<WhitespaceTokenizerOperation>(data_->with_offsets_);
  392. }
  393. #endif
  394. } // namespace text
  395. } // namespace dataset
  396. } // namespace mindspore