You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

text.cc 15 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <unistd.h>
  17. #include "minddata/dataset/include/text.h"
  18. #ifndef _WIN32
  19. #include "minddata/dataset/text/kernels/case_fold_op.h"
  20. #endif
  21. #include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
  22. #include "minddata/dataset/text/kernels/lookup_op.h"
  23. #include "minddata/dataset/text/kernels/ngram_op.h"
  24. #ifndef _WIN32
  25. #include "minddata/dataset/text/kernels/normalize_utf8_op.h"
  26. #endif
  27. #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
  28. #include "minddata/dataset/text/kernels/sliding_window_op.h"
  29. #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
  30. #ifndef _WIN32
  31. #include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
  32. #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
  33. #endif
  34. #include "minddata/dataset/util/path.h"
  35. namespace mindspore {
  36. namespace dataset {
  37. // Transform operations for text.
  38. namespace text {
  39. // FUNCTIONS TO CREATE TEXT OPERATIONS
  40. // (In alphabetical order)
  41. #ifndef _WIN32
  42. std::shared_ptr<CaseFoldOperation> CaseFold() {
  43. auto op = std::make_shared<CaseFoldOperation>();
  44. return op->ValidateParams() ? op : nullptr;
  45. }
  46. #endif
  47. std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
  48. const JiebaMode &mode, bool with_offsets) {
  49. auto op = std::make_shared<JiebaTokenizerOperation>(hmm_path, mp_path, mode, with_offsets);
  50. return op->ValidateParams() ? op : nullptr;
  51. }
  52. std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
  53. const DataType &data_type) {
  54. auto op = std::make_shared<LookupOperation>(vocab, unknown_token, data_type);
  55. return op->ValidateParams() ? op : nullptr;
  56. }
  57. std::shared_ptr<NgramOperation> Ngram(const std::vector<int32_t> &ngrams,
  58. const std::pair<std::string, int32_t> &left_pad,
  59. const std::pair<std::string, int32_t> &right_pad, const std::string &separator) {
  60. auto op = std::make_shared<NgramOperation>(ngrams, left_pad, right_pad, separator);
  61. return op->ValidateParams() ? op : nullptr;
  62. }
  63. #ifndef _WIN32
  64. std::shared_ptr<NormalizeUTF8Operation> NormalizeUTF8(NormalizeForm normalize_form) {
  65. auto op = std::make_shared<NormalizeUTF8Operation>(normalize_form);
  66. return op->ValidateParams() ? op : nullptr;
  67. }
  68. #endif
  69. std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
  70. const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type) {
  71. auto op = std::make_shared<SentencePieceTokenizerOperation>(vocab, out_type);
  72. return op->ValidateParams() ? op : nullptr;
  73. }
  74. std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(const std::string &vocab_path,
  75. SPieceTokenizerOutType out_type) {
  76. auto op = std::make_shared<SentencePieceTokenizerOperation>(vocab_path, out_type);
  77. return op->ValidateParams() ? op : nullptr;
  78. }
  79. std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const int32_t axis) {
  80. auto op = std::make_shared<SlidingWindowOperation>(width, axis);
  81. return op->ValidateParams() ? op : nullptr;
  82. }
  83. std::shared_ptr<UnicodeCharTokenizerOperation> UnicodeCharTokenizer(bool with_offsets) {
  84. auto op = std::make_shared<UnicodeCharTokenizerOperation>(with_offsets);
  85. return op->ValidateParams() ? op : nullptr;
  86. }
  87. #ifndef _WIN32
  88. std::shared_ptr<UnicodeScriptTokenizerOperation> UnicodeScriptTokenizer(bool keep_whitespace, bool with_offsets) {
  89. auto op = std::make_shared<UnicodeScriptTokenizerOperation>(keep_whitespace, with_offsets);
  90. return op->ValidateParams() ? op : nullptr;
  91. }
  92. std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets) {
  93. auto op = std::make_shared<WhitespaceTokenizerOperation>(with_offsets);
  94. return op->ValidateParams() ? op : nullptr;
  95. }
  96. #endif
  97. /* ####################################### Validator Functions ############################################ */
  98. // Helper function to validate tokenizer directory parameter
  99. Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file) {
  100. if (tokenizer_file.empty()) {
  101. std::string err_msg = tokenizer_name + ": tokenizer_file is not specified.";
  102. MS_LOG(ERROR) << err_msg;
  103. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  104. }
  105. Path file(tokenizer_file);
  106. if (!file.Exists()) {
  107. std::string err_msg = tokenizer_name + ": tokenizer_file: [" + tokenizer_file + "] is an invalid directory path.";
  108. MS_LOG(ERROR) << err_msg;
  109. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  110. }
  111. if (access(tokenizer_file.c_str(), R_OK) == -1) {
  112. std::string err_msg = tokenizer_name + ": No access to specified tokenizer path: " + tokenizer_file;
  113. MS_LOG(ERROR) << err_msg;
  114. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  115. }
  116. return Status::OK();
  117. }
  118. /* ####################################### Derived TensorOperation classes ################################# */
  119. // (In alphabetical order)
  120. #ifndef _WIN32
  121. // CaseFoldOperation
  122. Status CaseFoldOperation::ValidateParams() { return Status::OK(); }
  123. std::shared_ptr<TensorOp> CaseFoldOperation::Build() {
  124. std::shared_ptr<CaseFoldOp> tensor_op = std::make_shared<CaseFoldOp>();
  125. return tensor_op;
  126. }
  127. #endif
  128. // JiebaTokenizerOperation
  129. JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path,
  130. const JiebaMode &mode, bool with_offsets)
  131. : hmm_path_(hmm_path), mp_path_(mp_path), mode_(mode), with_offsets_(with_offsets) {}
  132. Status JiebaTokenizerOperation::ValidateParams() {
  133. if (hmm_path_.empty()) {
  134. std::string err_msg = "JiebaTokenizer: The dict of HMMSegment in cppjieba is not provided.";
  135. MS_LOG(ERROR) << err_msg;
  136. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  137. }
  138. if (mp_path_.empty()) {
  139. std::string err_msg = "JiebaTokenizer: The dict of MPSegment in cppjieba is not provided.";
  140. MS_LOG(ERROR) << err_msg;
  141. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  142. }
  143. RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_));
  144. RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", mp_path_));
  145. return Status::OK();
  146. }
  147. std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() {
  148. std::shared_ptr<JiebaTokenizerOp> tensor_op =
  149. std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_);
  150. return tensor_op;
  151. }
  152. // LookupOperation
  153. LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
  154. const DataType &data_type)
  155. : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}
  156. Status LookupOperation::ValidateParams() {
  157. if (vocab_ == nullptr) {
  158. std::string err_msg = "Lookup: vocab object type is incorrect or null.";
  159. MS_LOG(ERROR) << err_msg;
  160. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  161. }
  162. default_id_ = vocab_->Lookup(unknown_token_);
  163. if (default_id_ == Vocab::kNoTokenExists) {
  164. std::string err_msg = "Lookup: " + unknown_token_ + " doesn't exist in vocab.";
  165. MS_LOG(ERROR) << err_msg;
  166. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  167. }
  168. return Status::OK();
  169. }
  170. std::shared_ptr<TensorOp> LookupOperation::Build() {
  171. std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, data_type_);
  172. return tensor_op;
  173. }
  174. // NgramOperation
  175. NgramOperation::NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
  176. const std::pair<std::string, int32_t> &right_pad, const std::string &separator)
  177. : ngrams_(ngrams), left_pad_(left_pad), right_pad_(right_pad), separator_(separator) {}
  178. Status NgramOperation::ValidateParams() {
  179. if (ngrams_.size() == 0) {
  180. std::string err_msg = "Ngram : Container cannot be empty.";
  181. MS_LOG(ERROR) << err_msg;
  182. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  183. } else {
  184. for (int32_t i = 0; i < ngrams_.size(); ++i) {
  185. if (ngrams_[i] <= 0) {
  186. std::string err_msg =
  187. "Ngram : The value of ngrams vector must be greater than 0: " + std::to_string(ngrams_[i]);
  188. MS_LOG(ERROR) << err_msg;
  189. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  190. }
  191. }
  192. }
  193. if (left_pad_.second < 0) {
  194. std::string err_msg =
  195. "Ngram : The second parameter pad_width in left_pad vector must be greater than or equal to 0: " +
  196. std::to_string(left_pad_.second);
  197. MS_LOG(ERROR) << err_msg;
  198. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  199. }
  200. if (right_pad_.second < 0) {
  201. std::string err_msg =
  202. "Ngram : The second parameter pad_width in right_pad vector must be greater than or equal to 0: " +
  203. std::to_string(right_pad_.second);
  204. MS_LOG(ERROR) << err_msg;
  205. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  206. }
  207. return Status::OK();
  208. }
  209. std::shared_ptr<TensorOp> NgramOperation::Build() {
  210. int32_t l_len = left_pad_.second;
  211. int32_t r_len = right_pad_.second;
  212. std::string l_pad = left_pad_.first;
  213. std::string r_pad = right_pad_.first;
  214. std::shared_ptr<NgramOp> tensor_op = std::make_shared<NgramOp>(ngrams_, l_len, r_len, l_pad, r_pad, separator_);
  215. return tensor_op;
  216. }
  217. #ifndef _WIN32
  218. // NormalizeUTF8Operation
  219. NormalizeUTF8Operation::NormalizeUTF8Operation(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}
  220. Status NormalizeUTF8Operation::ValidateParams() { return Status::OK(); }
  221. std::shared_ptr<TensorOp> NormalizeUTF8Operation::Build() {
  222. std::shared_ptr<NormalizeUTF8Op> tensor_op = std::make_shared<NormalizeUTF8Op>(normalize_form_);
  223. return tensor_op;
  224. }
  225. #endif
  226. // SentencePieceTokenizerOperation
  227. SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
  228. SPieceTokenizerOutType out_type)
  229. : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {}
  230. SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::string &vocab_path,
  231. SPieceTokenizerOutType out_type)
  232. : vocab_(nullptr), vocab_path_(vocab_path), load_type_(SPieceTokenizerLoadType::kFile), out_type_(out_type) {}
  233. Status SentencePieceTokenizerOperation::ValidateParams() {
  234. if (load_type_ == SPieceTokenizerLoadType::kModel) {
  235. if (vocab_ == nullptr) {
  236. std::string err_msg = "SentencePieceTokenizer: vocab object type is incorrect or null.";
  237. MS_LOG(ERROR) << err_msg;
  238. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  239. }
  240. } else {
  241. Path vocab_file(vocab_path_);
  242. if (!vocab_file.Exists() || vocab_file.IsDirectory()) {
  243. std::string err_msg = "SentencePieceTokenizer : vocab file: [" + vocab_path_ + "] is invalid or does not exist.";
  244. MS_LOG(ERROR) << err_msg;
  245. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  246. }
  247. if (access(vocab_file.toString().c_str(), R_OK) == -1) {
  248. std::string err_msg = "SentencePieceTokenizer : no access to specified dataset file: " + vocab_path_;
  249. MS_LOG(ERROR) << err_msg;
  250. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  251. }
  252. }
  253. return Status::OK();
  254. }
  255. std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() {
  256. std::shared_ptr<SentencePieceTokenizerOp> tensor_op;
  257. if (load_type_ == SPieceTokenizerLoadType::kModel) {
  258. tensor_op = std::make_shared<SentencePieceTokenizerOp>(vocab_, load_type_, out_type_);
  259. } else {
  260. Path vocab_file(vocab_path_);
  261. std::string model_path = vocab_file.ParentPath();
  262. std::string model_filename = vocab_file.Basename();
  263. tensor_op = std::make_shared<SentencePieceTokenizerOp>(model_path, model_filename, load_type_, out_type_);
  264. }
  265. return tensor_op;
  266. }
  267. // SlidingWindowOperation
  268. SlidingWindowOperation::SlidingWindowOperation(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {}
  269. Status SlidingWindowOperation::ValidateParams() {
  270. if (width_ < 1) {
  271. std::string err_msg =
  272. "SlidingWindow : The parameter width must be greater than or equal to 1: " + std::to_string(width_);
  273. MS_LOG(ERROR) << err_msg;
  274. RETURN_STATUS_SYNTAX_ERROR(err_msg);
  275. }
  276. return Status::OK();
  277. }
  278. std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
  279. std::shared_ptr<SlidingWindowOp> tensor_op = std::make_shared<SlidingWindowOp>(static_cast<uint32_t>(width_), axis_);
  280. return tensor_op;
  281. }
  282. // UnicodeCharTokenizerOperation
  283. UnicodeCharTokenizerOperation::UnicodeCharTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}
  284. Status UnicodeCharTokenizerOperation::ValidateParams() { return Status::OK(); }
  285. std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() {
  286. std::shared_ptr<UnicodeCharTokenizerOp> tensor_op = std::make_shared<UnicodeCharTokenizerOp>(with_offsets_);
  287. return tensor_op;
  288. }
  289. #ifndef _WIN32
  290. // UnicodeScriptTokenizerOperation
  291. UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets)
  292. : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
  293. Status UnicodeScriptTokenizerOperation::ValidateParams() { return Status::OK(); }
  294. std::shared_ptr<TensorOp> UnicodeScriptTokenizerOperation::Build() {
  295. std::shared_ptr<UnicodeScriptTokenizerOp> tensor_op =
  296. std::make_shared<UnicodeScriptTokenizerOp>(keep_whitespace_, with_offsets_);
  297. return tensor_op;
  298. }
  299. // WhitespaceTokenizerOperation
  300. WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}
  301. Status WhitespaceTokenizerOperation::ValidateParams() { return Status::OK(); }
  302. std::shared_ptr<TensorOp> WhitespaceTokenizerOperation::Build() {
  303. std::shared_ptr<WhitespaceTokenizerOp> tensor_op = std::make_shared<WhitespaceTokenizerOp>(with_offsets_);
  304. return tensor_op;
  305. }
  306. #endif
  307. } // namespace text
  308. } // namespace dataset
  309. } // namespace mindspore