You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

vocab.cc 8.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <fstream>
  17. #include <unordered_set>
  18. #include <unordered_map>
  19. #include <utility>
  20. #include <algorithm>
  21. #include "minddata/dataset/text/vocab.h"
  22. #ifndef ENABLE_ANDROID
  23. #include "utils/log_adapter.h"
  24. #else
  25. #include "mindspore/lite/src/common/log_adapter.h"
  26. #endif
  27. namespace mindspore {
  28. namespace dataset {
  29. Vocab::Vocab(std::unordered_map<WordType, WordIdType> word2id) { word2id_ = std::move(word2id); }
  30. WordIdType Vocab::Lookup(const WordType &word) const {
  31. auto itr = word2id_.find(word);
  32. return itr == word2id_.end() ? kNoTokenExists : itr->second;
  33. }
  34. #ifdef ENABLE_PYTHON
  35. Status Vocab::BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special,
  36. std::shared_ptr<Vocab> *vocab) {
  37. // check of duplication on both words and special_tokens will be performed in python
  38. // special_tokens and words both need to be unique, and shouldn't overlap
  39. std::unordered_map<WordType, WordIdType> word2id;
  40. // if special is added in front, normal words id will start from number of special tokens
  41. WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
  42. for (auto word : words) {
  43. word2id[py::str(word)] = word_id++;
  44. }
  45. word_id = prepend_special ? 0 : word2id.size();
  46. for (auto special_token : special_tokens) {
  47. word2id[py::str(special_token)] = word_id++;
  48. }
  49. *vocab = std::make_shared<Vocab>(std::move(word2id));
  50. return Status::OK();
  51. }
  52. Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr<Vocab> *vocab) {
  53. std::unordered_map<WordType, WordIdType> word2id;
  54. for (auto p : words) {
  55. word2id[py::str(p.first)] = py::reinterpret_borrow<py::int_>(p.second);
  56. }
  57. *vocab = std::make_shared<Vocab>(std::move(word2id));
  58. return Status::OK();
  59. }
  60. #endif
  61. void Vocab::append_word(const std::string &word) {
  62. if (word2id_.find(word) == word2id_.end()) {
  63. word2id_[word] = word2id_.size();
  64. }
  65. }
  66. Status Vocab::BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
  67. std::shared_ptr<Vocab> *vocab) {
  68. // Validate parameters and build map
  69. std::unordered_map<WordType, WordIdType> word2id;
  70. for (auto p : words) {
  71. if (p.second < 0) {
  72. MS_LOG(ERROR) << "index can not be negetive, but got " << p.second;
  73. RETURN_STATUS_UNEXPECTED("index can not be negetive, but got " + std::to_string(p.second));
  74. }
  75. word2id[p.first] = p.second;
  76. }
  77. *vocab = std::make_shared<Vocab>(std::move(word2id));
  78. return Status::OK();
  79. }
  80. Status Vocab::BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
  81. bool prepend_special, std::shared_ptr<Vocab> *vocab) {
  82. std::unordered_map<WordType, WordIdType> word2id;
  83. // if special is added in front, normal words id will start from number of special tokens
  84. WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
  85. for (auto word : words) {
  86. if (word2id.find(word) != word2id.end()) {
  87. MS_LOG(ERROR) << "word_list contains duplicate word: " + word + ".";
  88. RETURN_STATUS_UNEXPECTED("word_list contains duplicate word: " + word + ".");
  89. }
  90. word2id[word] = word_id++;
  91. }
  92. word_id = prepend_special ? 0 : word2id.size();
  93. for (auto special_token : special_tokens) {
  94. if (word2id.find(special_token) != word2id.end()) {
  95. MS_LOG(ERROR) << "special_tokens and word_list contain duplicate word: " + special_token + ".";
  96. RETURN_STATUS_UNEXPECTED("special_tokens and word_list contain duplicate word: " + special_token + ".");
  97. }
  98. word2id[special_token] = word_id++;
  99. }
  100. *vocab = std::make_shared<Vocab>(std::move(word2id));
  101. return Status::OK();
  102. }
  103. Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size,
  104. const std::vector<WordType> &special_tokens, bool prepend_special,
  105. std::shared_ptr<Vocab> *vocab) {
  106. // Validate parameters
  107. if (path.empty()) {
  108. MS_LOG(ERROR) << "vocab file path is not set!";
  109. RETURN_STATUS_UNEXPECTED("vocab file path is not set!");
  110. }
  111. if (vocab_size < 0 && vocab_size != -1) {
  112. MS_LOG(ERROR) << "vocab_size shoule be either -1 or positive integer, but got " << vocab_size;
  113. RETURN_STATUS_UNEXPECTED("vocab_size shoule be either -1 or positive integer, but got " +
  114. std::to_string(vocab_size));
  115. }
  116. std::string duplicate_sp;
  117. for (const WordType &sp : special_tokens) {
  118. if (std::count(special_tokens.begin(), special_tokens.end(), sp) > 1) {
  119. if (duplicate_sp.find(sp) == std::string::npos) {
  120. duplicate_sp = duplicate_sp.empty() ? duplicate_sp + sp : duplicate_sp + ", " + sp;
  121. }
  122. }
  123. }
  124. if (!duplicate_sp.empty()) {
  125. MS_LOG(ERROR) << "special_tokens contains duplicate word: " << duplicate_sp;
  126. RETURN_STATUS_UNEXPECTED("special_tokens contains duplicate word: " + duplicate_sp);
  127. }
  128. std::unordered_set<std::string> specials;
  129. // used to check that words in file don't contain any special token that already exists
  130. for (auto word : special_tokens) {
  131. specials.insert(word);
  132. }
  133. WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
  134. std::unordered_map<WordType, WordIdType> word2id;
  135. std::fstream handle(path, std::ios::in);
  136. if (!handle.good() || !handle.is_open()) {
  137. MS_LOG(ERROR) << "fail to open:" + path;
  138. RETURN_STATUS_UNEXPECTED("fail to open:" + path);
  139. }
  140. std::string word;
  141. while (std::getline(handle, word)) {
  142. if (!delimiter.empty()) {
  143. // if delimiter is not found, find_first_of would return std::string::npos which is -1
  144. word = word.substr(0, word.find_first_of(delimiter));
  145. }
  146. if (word2id.find(word) != word2id.end()) {
  147. MS_LOG(ERROR) << "word_list contains duplicate word:" + word + ".";
  148. RETURN_STATUS_UNEXPECTED("word_list contains duplicate word:" + word + ".");
  149. }
  150. if (specials.find(word) != specials.end()) {
  151. MS_LOG(ERROR) << "special_tokens and word_list contain duplicate word: " << word;
  152. RETURN_STATUS_UNEXPECTED("special_tokens and word_list contain duplicate word: " + word);
  153. }
  154. word2id[word] = word_id++;
  155. // break if enough row is read, if vocab_size is smaller than 0
  156. if (word2id.size() == vocab_size) break;
  157. }
  158. word_id = prepend_special ? 0 : word2id.size();
  159. for (auto special_token : special_tokens) {
  160. word2id[special_token] = word_id++;
  161. }
  162. *vocab = std::make_shared<Vocab>(std::move(word2id));
  163. return Status::OK();
  164. }
  165. Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
  166. const py::list &special_tokens, bool prepend_special, std::shared_ptr<Vocab> *vocab) {
  167. // python validator checks special_tokens doesn't contain any duplicate words
  168. std::unordered_set<std::string> specials;
  169. // used to check that words in file don't contain any special token that already exists
  170. for (auto word : special_tokens) {
  171. specials.insert(py::str(word));
  172. }
  173. WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
  174. std::unordered_map<WordType, WordIdType> word2id;
  175. std::fstream handle(path, std::ios::in);
  176. CHECK_FAIL_RETURN_UNEXPECTED(handle.good() && handle.is_open(), "fail to open:" + path);
  177. std::string word;
  178. while (std::getline(handle, word)) {
  179. if (!delimiter.empty()) {
  180. // if delimiter is not found, find_first_of would return std::string::npos which is -1
  181. word = word.substr(0, word.find_first_of(delimiter));
  182. }
  183. CHECK_FAIL_RETURN_UNEXPECTED(word2id.find(word) == word2id.end(), "duplicate word:" + word + ".");
  184. CHECK_FAIL_RETURN_UNEXPECTED(specials.find(word) == specials.end(), word + " is already in special_tokens.");
  185. word2id[word] = word_id++;
  186. // break if enough row is read, if vocab_size is smaller than 0
  187. if (word2id.size() == vocab_size) break;
  188. }
  189. word_id = prepend_special ? 0 : word2id.size();
  190. for (auto special_token : special_tokens) {
  191. word2id[py::str(special_token)] = word_id++;
  192. }
  193. *vocab = std::make_shared<Vocab>(std::move(word2id));
  194. return Status::OK();
  195. }
// Sentinel id returned by Vocab::Lookup() when the queried word is not in the vocabulary.
const WordIdType Vocab::kNoTokenExists = -1;
  197. } // namespace dataset
  198. } // namespace mindspore