You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

text.cc 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
  16. #include <unistd.h>
  17. #include "minddata/dataset/include/text.h"
  18. #include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
  19. #include "minddata/dataset/text/kernels/lookup_op.h"
  20. #include "minddata/dataset/text/kernels/ngram_op.h"
  21. #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
  22. #include "minddata/dataset/text/kernels/sliding_window_op.h"
  23. #ifndef _WIN32
  24. #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
  25. #endif
  26. #include "minddata/dataset/util/path.h"
  27. namespace mindspore {
  28. namespace dataset {
  29. // Transform operations for text.
  30. namespace text {
  31. // FUNCTIONS TO CREATE TEXT OPERATIONS
  32. // (In alphabetical order)
  33. std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
  34. const JiebaMode &mode, bool with_offsets) {
  35. auto op = std::make_shared<JiebaTokenizerOperation>(hmm_path, mp_path, mode, with_offsets);
  36. return op->ValidateParams() ? op : nullptr;
  37. }
  38. std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
  39. const DataType &data_type) {
  40. auto op = std::make_shared<LookupOperation>(vocab, unknown_token, data_type);
  41. return op->ValidateParams() ? op : nullptr;
  42. }
  43. std::shared_ptr<NgramOperation> Ngram(const std::vector<int32_t> &ngrams,
  44. const std::pair<std::string, int32_t> &left_pad,
  45. const std::pair<std::string, int32_t> &right_pad, const std::string &separator) {
  46. auto op = std::make_shared<NgramOperation>(ngrams, left_pad, right_pad, separator);
  47. return op->ValidateParams() ? op : nullptr;
  48. }
  49. std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
  50. const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type) {
  51. auto op = std::make_shared<SentencePieceTokenizerOperation>(vocab, out_type);
  52. return op->ValidateParams() ? op : nullptr;
  53. }
  54. std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(const std::string &vocab_path,
  55. SPieceTokenizerOutType out_type) {
  56. auto op = std::make_shared<SentencePieceTokenizerOperation>(vocab_path, out_type);
  57. return op->ValidateParams() ? op : nullptr;
  58. }
  59. std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const int32_t axis) {
  60. auto op = std::make_shared<SlidingWindowOperation>(width, axis);
  61. return op->ValidateParams() ? op : nullptr;
  62. }
  63. #ifndef _WIN32
  64. std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets) {
  65. auto op = std::make_shared<WhitespaceTokenizerOperation>(with_offsets);
  66. return op->ValidateParams() ? op : nullptr;
  67. }
  68. #endif
  69. /* ####################################### Validator Functions ############################################ */
  70. // Helper function to validate tokenizer directory parameter
  71. Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file) {
  72. if (tokenizer_file.empty()) {
  73. std::string err_msg = tokenizer_name + ": tokenizer_file is not specified.";
  74. MS_LOG(ERROR) << err_msg;
  75. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  76. }
  77. Path file(tokenizer_file);
  78. if (!file.Exists()) {
  79. std::string err_msg = tokenizer_name + ": tokenizer_file: [" + tokenizer_file + "] is an invalid directory path.";
  80. MS_LOG(ERROR) << err_msg;
  81. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  82. }
  83. if (access(tokenizer_file.c_str(), R_OK) == -1) {
  84. std::string err_msg = tokenizer_name + ": No access to specified tokenizer path: " + tokenizer_file;
  85. MS_LOG(ERROR) << err_msg;
  86. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  87. }
  88. return Status::OK();
  89. }
  90. /* ####################################### Derived TensorOperation classes ################################# */
  91. // (In alphabetical order)
  92. // JiebaTokenizerOperation
  93. JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path,
  94. const JiebaMode &mode, bool with_offsets)
  95. : hmm_path_(hmm_path), mp_path_(mp_path), mode_(mode), with_offsets_(with_offsets) {}
  96. Status JiebaTokenizerOperation::ValidateParams() {
  97. if (hmm_path_.empty()) {
  98. std::string err_msg = "JiebaTokenizer: The dict of HMMSegment in cppjieba is not provided.";
  99. MS_LOG(ERROR) << err_msg;
  100. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  101. }
  102. if (mp_path_.empty()) {
  103. std::string err_msg = "JiebaTokenizer: The dict of MPSegment in cppjieba is not provided.";
  104. MS_LOG(ERROR) << err_msg;
  105. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  106. }
  107. RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_));
  108. RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", mp_path_));
  109. return Status::OK();
  110. }
  111. std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() {
  112. std::shared_ptr<JiebaTokenizerOp> tensor_op =
  113. std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_);
  114. return tensor_op;
  115. }
  116. // LookupOperation
  117. LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
  118. const DataType &data_type)
  119. : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}
  120. Status LookupOperation::ValidateParams() {
  121. if (vocab_ == nullptr) {
  122. std::string err_msg = "Lookup: vocab object type is incorrect or null.";
  123. MS_LOG(ERROR) << err_msg;
  124. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  125. }
  126. default_id_ = vocab_->Lookup(unknown_token_);
  127. if (default_id_ == Vocab::kNoTokenExists) {
  128. std::string err_msg = "Lookup: " + unknown_token_ + " doesn't exist in vocab.";
  129. MS_LOG(ERROR) << err_msg;
  130. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  131. }
  132. return Status::OK();
  133. }
  134. std::shared_ptr<TensorOp> LookupOperation::Build() {
  135. std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, data_type_);
  136. return tensor_op;
  137. }
  138. // NgramOperation
  139. NgramOperation::NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
  140. const std::pair<std::string, int32_t> &right_pad, const std::string &separator)
  141. : ngrams_(ngrams), left_pad_(left_pad), right_pad_(right_pad), separator_(separator) {}
  142. Status NgramOperation::ValidateParams() {
  143. if (ngrams_.size() == 0) {
  144. std::string err_msg = "Ngram : Container cannot be empty.";
  145. MS_LOG(ERROR) << err_msg;
  146. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  147. } else {
  148. for (int32_t i = 0; i < ngrams_.size(); ++i) {
  149. if (ngrams_[i] <= 0) {
  150. std::string err_msg =
  151. "Ngram : The value of ngrams vector must be greater than 0: " + std::to_string(ngrams_[i]);
  152. MS_LOG(ERROR) << err_msg;
  153. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  154. }
  155. }
  156. }
  157. if (left_pad_.second < 0) {
  158. std::string err_msg =
  159. "Ngram : The second parameter pad_width in left_pad vector must be greater than or equal to 0: " +
  160. std::to_string(left_pad_.second);
  161. MS_LOG(ERROR) << err_msg;
  162. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  163. }
  164. if (right_pad_.second < 0) {
  165. std::string err_msg =
  166. "Ngram : The second parameter pad_width in right_pad vector must be greater than or equal to 0: " +
  167. std::to_string(right_pad_.second);
  168. MS_LOG(ERROR) << err_msg;
  169. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  170. }
  171. return Status::OK();
  172. }
  173. std::shared_ptr<TensorOp> NgramOperation::Build() {
  174. int32_t l_len = left_pad_.second;
  175. int32_t r_len = right_pad_.second;
  176. std::string l_pad = left_pad_.first;
  177. std::string r_pad = right_pad_.first;
  178. std::shared_ptr<NgramOp> tensor_op = std::make_shared<NgramOp>(ngrams_, l_len, r_len, l_pad, r_pad, separator_);
  179. return tensor_op;
  180. }
  181. // SentencePieceTokenizerOperation
  182. SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
  183. SPieceTokenizerOutType out_type)
  184. : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {}
  185. SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::string &vocab_path,
  186. SPieceTokenizerOutType out_type)
  187. : vocab_(nullptr), vocab_path_(vocab_path), load_type_(SPieceTokenizerLoadType::kFile), out_type_(out_type) {}
  188. Status SentencePieceTokenizerOperation::ValidateParams() {
  189. if (load_type_ == SPieceTokenizerLoadType::kModel) {
  190. if (vocab_ == nullptr) {
  191. std::string err_msg = "SentencePieceTokenizer: vocab object type is incorrect or null.";
  192. MS_LOG(ERROR) << err_msg;
  193. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  194. }
  195. } else {
  196. Path vocab_file(vocab_path_);
  197. if (!vocab_file.Exists() || vocab_file.IsDirectory()) {
  198. std::string err_msg = "SentencePieceTokenizer : vocab file: [" + vocab_path_ + "] is invalid or does not exist.";
  199. MS_LOG(ERROR) << err_msg;
  200. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  201. }
  202. if (access(vocab_file.toString().c_str(), R_OK) == -1) {
  203. std::string err_msg = "SentencePieceTokenizer : no access to specified dataset file: " + vocab_path_;
  204. MS_LOG(ERROR) << err_msg;
  205. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  206. }
  207. }
  208. return Status::OK();
  209. }
  210. std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() {
  211. std::shared_ptr<SentencePieceTokenizerOp> tensor_op;
  212. if (load_type_ == SPieceTokenizerLoadType::kModel) {
  213. tensor_op = std::make_shared<SentencePieceTokenizerOp>(vocab_, load_type_, out_type_);
  214. } else {
  215. Path vocab_file(vocab_path_);
  216. std::string model_path = vocab_file.ParentPath();
  217. std::string model_filename = vocab_file.Basename();
  218. tensor_op = std::make_shared<SentencePieceTokenizerOp>(model_path, model_filename, load_type_, out_type_);
  219. }
  220. return tensor_op;
  221. }
  222. // SlidingWindowOperation
  223. SlidingWindowOperation::SlidingWindowOperation(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {}
  224. Status SlidingWindowOperation::ValidateParams() {
  225. if (width_ < 1) {
  226. std::string err_msg =
  227. "SlidingWindow : The parameter width must be greater than or equal to 1: " + std::to_string(width_);
  228. MS_LOG(ERROR) << err_msg;
  229. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  230. }
  231. return Status::OK();
  232. }
  233. std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
  234. std::shared_ptr<SlidingWindowOp> tensor_op = std::make_shared<SlidingWindowOp>(static_cast<uint32_t>(width_), axis_);
  235. return tensor_op;
  236. }
  237. #ifndef _WIN32
  238. // WhitespaceTokenizerOperation
  239. WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}
  240. Status WhitespaceTokenizerOperation::ValidateParams() { return Status::OK(); }
  241. std::shared_ptr<TensorOp> WhitespaceTokenizerOperation::Build() {
  242. std::shared_ptr<WhitespaceTokenizerOp> tensor_op = std::make_shared<WhitespaceTokenizerOp>(with_offsets_);
  243. return tensor_op;
  244. }
  245. #endif
  246. } // namespace text
  247. } // namespace dataset
  248. } // namespace mindspore