You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

vocab.cc 8.5 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <fstream>
  17. #include <unordered_set>
  18. #include <unordered_map>
  19. #include <utility>
  20. #include <algorithm>
  21. #include "minddata/dataset/text/vocab.h"
  22. #ifndef ENABLE_ANDROID
  23. #include "utils/log_adapter.h"
  24. #else
  25. #include "mindspore/lite/src/common/log_adapter.h"
  26. #endif
  27. namespace mindspore {
  28. namespace dataset {
  29. Vocab::Vocab(std::unordered_map<WordType, WordIdType> word2id) { word2id_ = std::move(word2id); }
  30. WordIdType Vocab::Lookup(const WordType &word) const {
  31. auto itr = word2id_.find(word);
  32. return itr == word2id_.end() ? kNoTokenExists : itr->second;
  33. }
#ifdef ENABLE_PYTHON
// Build a Vocab from a python list of words plus a list of special tokens.
// @param words - normal words; ids are assigned in list order
// @param special_tokens - tokens such as <pad>/<unk> to include in the vocab
// @param prepend_special - if true, special tokens take ids [0, n) and the words
//                          follow; otherwise words start at 0 and specials are appended
// @param vocab - output shared pointer holding the constructed Vocab
// @return Status - always OK (no validation performed here)
Status Vocab::BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special,
                              std::shared_ptr<Vocab> *vocab) {
  // check of duplication on both words and special_tokens will be performed in python
  // special_tokens and words both need to be unique, and shouldn't overlap
  std::unordered_map<WordType, WordIdType> word2id;
  // if special is added in front, normal words id will start from number of special tokens
  WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
  for (auto word : words) {
    word2id[py::str(word)] = word_id++;
  }
  // special tokens take the remaining id range: the leading one when prepended,
  // the trailing one (starting at word2id.size()) otherwise
  word_id = prepend_special ? 0 : word2id.size();
  for (auto special_token : special_tokens) {
    word2id[py::str(special_token)] = word_id++;
  }
  *vocab = std::make_shared<Vocab>(std::move(word2id));
  return Status::OK();
}
// Build a Vocab from a python dict mapping word -> id.
// Ids are taken verbatim from the dict values; no validation is performed here.
// @param words - python dict of {word: id}
// @param vocab - output shared pointer holding the constructed Vocab
// @return Status - always OK
Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr<Vocab> *vocab) {
  std::unordered_map<WordType, WordIdType> word2id;
  for (auto p : words) {
    word2id[py::str(p.first)] = py::reinterpret_borrow<py::int_>(p.second);
  }
  *vocab = std::make_shared<Vocab>(std::move(word2id));
  return Status::OK();
}
#endif
  61. void Vocab::append_word(const std::string &word) {
  62. if (word2id_.find(word) == word2id_.end()) {
  63. word2id_[word] = word2id_.size();
  64. }
  65. }
  66. Status Vocab::BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
  67. std::shared_ptr<Vocab> *vocab) {
  68. // Validate parameters and build map
  69. std::unordered_map<WordType, WordIdType> word2id;
  70. for (auto p : words) {
  71. if (p.second < 0) {
  72. RETURN_STATUS_UNEXPECTED("from_dict: index can not be negetive, but got " + std::to_string(p.second));
  73. }
  74. word2id[p.first] = p.second;
  75. }
  76. *vocab = std::make_shared<Vocab>(std::move(word2id));
  77. return Status::OK();
  78. }
  79. Status Vocab::BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
  80. bool prepend_special, std::shared_ptr<Vocab> *vocab) {
  81. std::unordered_map<WordType, WordIdType> word2id;
  82. // if special is added in front, normal words id will start from number of special tokens
  83. WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
  84. for (auto word : words) {
  85. if (word2id.find(word) != word2id.end()) {
  86. RETURN_STATUS_UNEXPECTED("from_list: word_list contains duplicate word: " + word + ".");
  87. }
  88. word2id[word] = word_id++;
  89. }
  90. word_id = prepend_special ? 0 : word2id.size();
  91. for (auto special_token : special_tokens) {
  92. if (word2id.find(special_token) != word2id.end()) {
  93. RETURN_STATUS_UNEXPECTED(
  94. "from_list: "
  95. "special_tokens and word_list contain duplicate word: " +
  96. special_token + ".");
  97. }
  98. word2id[special_token] = word_id++;
  99. }
  100. *vocab = std::make_shared<Vocab>(std::move(word2id));
  101. return Status::OK();
  102. }
  103. Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size,
  104. const std::vector<WordType> &special_tokens, bool prepend_special,
  105. std::shared_ptr<Vocab> *vocab) {
  106. // Validate parameters
  107. if (path.empty()) {
  108. RETURN_STATUS_UNEXPECTED("from_file: vocab file path is not set!");
  109. }
  110. if (vocab_size < 0 && vocab_size != -1) {
  111. RETURN_STATUS_UNEXPECTED(
  112. "from_file: "
  113. "vocab_size should be either -1 or positive integer, but got " +
  114. std::to_string(vocab_size));
  115. }
  116. std::string duplicate_sp;
  117. for (const WordType &sp : special_tokens) {
  118. if (std::count(special_tokens.begin(), special_tokens.end(), sp) > 1) {
  119. if (duplicate_sp.find(sp) == std::string::npos) {
  120. duplicate_sp = duplicate_sp.empty() ? duplicate_sp + sp : duplicate_sp + ", " + sp;
  121. }
  122. }
  123. }
  124. if (!duplicate_sp.empty()) {
  125. RETURN_STATUS_UNEXPECTED(
  126. "from_file: "
  127. "special_tokens contains duplicate word: " +
  128. duplicate_sp);
  129. }
  130. std::unordered_set<std::string> specials;
  131. // used to check that words in file don't contain any special token that already exists
  132. for (auto word : special_tokens) {
  133. specials.insert(word);
  134. }
  135. WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
  136. std::unordered_map<WordType, WordIdType> word2id;
  137. std::fstream handle(path, std::ios::in);
  138. if (!handle.good() || !handle.is_open()) {
  139. RETURN_STATUS_UNEXPECTED("from_file: fail to open: " + path);
  140. }
  141. std::string word;
  142. while (std::getline(handle, word)) {
  143. if (!delimiter.empty()) {
  144. // if delimiter is not found, find_first_of would return std::string::npos which is -1
  145. word = word.substr(0, word.find_first_of(delimiter));
  146. }
  147. if (word2id.find(word) != word2id.end()) {
  148. RETURN_STATUS_UNEXPECTED("from_file: word_list contains duplicate word:" + word + ".");
  149. }
  150. if (specials.find(word) != specials.end()) {
  151. RETURN_STATUS_UNEXPECTED(
  152. "from_file: "
  153. "special_tokens and word_list contain duplicate word: " +
  154. word);
  155. }
  156. word2id[word] = word_id++;
  157. // break if enough row is read, if vocab_size is smaller than 0
  158. if (word2id.size() == vocab_size) break;
  159. }
  160. word_id = prepend_special ? 0 : word2id.size();
  161. for (auto special_token : special_tokens) {
  162. word2id[special_token] = word_id++;
  163. }
  164. *vocab = std::make_shared<Vocab>(std::move(word2id));
  165. return Status::OK();
  166. }
// Build a Vocab from a delimited text file, one word per line (python front-end path).
// @param path - vocab file path
// @param delimiter - when non-empty, only the text before the first delimiter
//                    character on each line is kept
// @param vocab_size - number of words to read from the file, or -1 for all
// @param special_tokens - tokens (e.g. <pad>, <unk>) added to the vocab;
//                         uniqueness is validated on the python side
// @param prepend_special - if true specials take ids [0, n) and file words follow;
//                          otherwise file words start at 0 and specials are appended
// @param vocab - output shared pointer holding the constructed Vocab
// @return Status - error on unopenable file or duplicate words
Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
                            const py::list &special_tokens, bool prepend_special, std::shared_ptr<Vocab> *vocab) {
  // python validator checks special_tokens doesn't contain any duplicate words
  std::unordered_set<std::string> specials;
  // used to check that words in file don't contain any special token that already exists
  for (auto word : special_tokens) {
    specials.insert(py::str(word));
  }
  WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
  std::unordered_map<WordType, WordIdType> word2id;
  std::fstream handle(path, std::ios::in);
  CHECK_FAIL_RETURN_UNEXPECTED(handle.good() && handle.is_open(), "from_file: fail to open:" + path);
  std::string word;
  while (std::getline(handle, word)) {
    if (!delimiter.empty()) {
      // if delimiter is not found, find_first_of would return std::string::npos which is -1
      word = word.substr(0, word.find_first_of(delimiter));
    }
    CHECK_FAIL_RETURN_UNEXPECTED(word2id.find(word) == word2id.end(), "from_file: duplicate word:" + word + ".");
    CHECK_FAIL_RETURN_UNEXPECTED(specials.find(word) == specials.end(),
                                 "from_file: " + word + " is already in special_tokens.");
    word2id[word] = word_id++;
    // break if enough row is read, if vocab_size is smaller than 0
    if (word2id.size() == vocab_size) break;
  }
  // special tokens take the remaining id range: leading when prepended, trailing otherwise
  word_id = prepend_special ? 0 : word2id.size();
  for (auto special_token : special_tokens) {
    word2id[py::str(special_token)] = word_id++;
  }
  *vocab = std::make_shared<Vocab>(std::move(word2id));
  return Status::OK();
}
// Sentinel id returned by Lookup when a word is not in the vocab.
const WordIdType Vocab::kNoTokenExists = -1;
  200. } // namespace dataset
  201. } // namespace mindspore