You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

tokenizer_op_test.cc 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <memory>
  17. #include <string>
  18. #include <string_view>
  19. #include "common/common.h"
  20. #include "minddata/dataset/text/kernels/basic_tokenizer_op.h"
  21. #include "minddata/dataset/text/kernels/case_fold_op.h"
  22. #include "minddata/dataset/text/kernels/normalize_utf8_op.h"
  23. #include "minddata/dataset/text/kernels/regex_replace_op.h"
  24. #include "minddata/dataset/text/kernels/regex_tokenizer_op.h"
  25. #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
  26. #include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
  27. #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
  28. #include "gtest/gtest.h"
  29. #include "utils/log_adapter.h"
  30. using namespace mindspore::dataset;
  31. class MindDataTestTokenizerOp : public UT::Common {
  32. public:
  33. void CheckEqual(const std::shared_ptr<Tensor> &o,
  34. const std::vector<dsize_t> &index,
  35. const std::string &expect) {
  36. std::string_view str;
  37. Status s = o->GetItemAt(&str, index);
  38. EXPECT_TRUE(s.IsOk());
  39. EXPECT_EQ(str, expect);
  40. }
  41. };
  42. TEST_F(MindDataTestTokenizerOp, TestUnicodeCharTokenizerOp) {
  43. MS_LOG(INFO) << "Doing TestUnicodeCharTokenizerOp.";
  44. std::unique_ptr<UnicodeCharTokenizerOp> op(new UnicodeCharTokenizerOp(true));
  45. std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Hello World!");
  46. TensorRow output;
  47. Status s = op->Compute(TensorRow(0, {input}), &output);
  48. EXPECT_TRUE(s.IsOk());
  49. EXPECT_EQ(output[0]->Size(), 12);
  50. EXPECT_EQ(output[0]->Rank(), 1);
  51. MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString();
  52. CheckEqual(output[0], {0}, "H");
  53. CheckEqual(output[0], {1}, "e");
  54. CheckEqual(output[0], {2}, "l");
  55. CheckEqual(output[0], {3}, "l");
  56. CheckEqual(output[0], {4}, "o");
  57. CheckEqual(output[0], {5}, " ");
  58. CheckEqual(output[0], {6}, "W");
  59. CheckEqual(output[0], {7}, "o");
  60. CheckEqual(output[0], {8}, "r");
  61. CheckEqual(output[0], {9}, "l");
  62. CheckEqual(output[0], {10}, "d");
  63. CheckEqual(output[0], {11}, "!");
  64. input = std::make_shared<Tensor>("中国 你好!");
  65. output.clear();
  66. s = op->Compute(TensorRow(0, {input}), &output);
  67. EXPECT_TRUE(s.IsOk());
  68. EXPECT_EQ(output[0]->Size(), 6);
  69. EXPECT_EQ(output[0]->Rank(), 1);
  70. MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString();
  71. CheckEqual(output[0], {0}, "中");
  72. CheckEqual(output[0], {1}, "国");
  73. CheckEqual(output[0], {2}, " ");
  74. CheckEqual(output[0], {3}, "你");
  75. CheckEqual(output[0], {4}, "好");
  76. CheckEqual(output[0], {5}, "!");
  77. input = std::make_shared<Tensor>("中");
  78. output.clear();
  79. s = op->Compute(TensorRow(0, {input}), &output);
  80. EXPECT_TRUE(s.IsOk());
  81. EXPECT_EQ(output[0]->Size(), 1);
  82. EXPECT_EQ(output[0]->Rank(), 1);
  83. MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString();
  84. CheckEqual(output[0], {0}, "中");
  85. input = std::make_shared<Tensor>("H");
  86. output.clear();
  87. s = op->Compute(TensorRow(0, {input}), &output);
  88. EXPECT_TRUE(s.IsOk());
  89. EXPECT_EQ(output[0]->Size(), 1);
  90. EXPECT_EQ(output[0]->Rank(), 1);
  91. MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString();
  92. CheckEqual(output[0], {0}, "H");
  93. input = std::make_shared<Tensor>(" ");
  94. output.clear();
  95. s = op->Compute(TensorRow(0, {input}), &output);
  96. EXPECT_TRUE(s.IsOk());
  97. EXPECT_EQ(output[0]->Size(), 2);
  98. EXPECT_EQ(output[0]->Rank(), 1);
  99. MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString();
  100. CheckEqual(output[0], {0}, " ");
  101. CheckEqual(output[0], {1}, " ");
  102. input = std::make_shared<Tensor>("");
  103. output.clear();
  104. s = op->Compute(TensorRow(0, {input}), &output);
  105. EXPECT_TRUE(s.IsOk());
  106. EXPECT_EQ(output[0]->Size(), 1);
  107. EXPECT_EQ(output[0]->Rank(), 1);
  108. MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString();
  109. CheckEqual(output[0], {0}, "");
  110. }
  111. TEST_F(MindDataTestTokenizerOp, TestWhitespaceTokenizerOp) {
  112. MS_LOG(INFO) << "Doing TestWhitespaceTokenizerOp.";
  113. std::unique_ptr<WhitespaceTokenizerOp> op(new WhitespaceTokenizerOp(true));
  114. std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China.");
  115. TensorRow output;
  116. Status s = op->Compute(TensorRow(0, {input}), &output);
  117. EXPECT_TRUE(s.IsOk());
  118. EXPECT_EQ(output[0]->Size(), 3);
  119. EXPECT_EQ(output[0]->Rank(), 1);
  120. MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString();
  121. CheckEqual(output[0], {0}, "Welcome");
  122. CheckEqual(output[0], {1}, "to");
  123. CheckEqual(output[0], {2}, "China.");
  124. input = std::make_shared<Tensor>(" hello");
  125. output.clear();
  126. s = op->Compute(TensorRow(0, {input}), &output);
  127. EXPECT_TRUE(s.IsOk());
  128. EXPECT_EQ(output[0]->Size(), 1);
  129. EXPECT_EQ(output[0]->Rank(), 1);
  130. MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString();
  131. CheckEqual(output[0], {0}, "hello");
  132. input = std::make_shared<Tensor>("hello");
  133. output.clear();
  134. s = op->Compute(TensorRow(0, {input}), &output);
  135. EXPECT_TRUE(s.IsOk());
  136. EXPECT_EQ(output[0]->Size(), 1);
  137. EXPECT_EQ(output[0]->Rank(), 1);
  138. MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString();
  139. CheckEqual(output[0], {0}, "hello");
  140. input = std::make_shared<Tensor>("hello ");
  141. output.clear();
  142. s = op->Compute(TensorRow(0, {input}), &output);
  143. EXPECT_TRUE(s.IsOk());
  144. EXPECT_EQ(output[0]->Size(), 1);
  145. EXPECT_EQ(output[0]->Rank(), 1);
  146. MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString();
  147. CheckEqual(output[0], {0}, "hello");
  148. input = std::make_shared<Tensor>(" ");
  149. output.clear();
  150. s = op->Compute(TensorRow(0, {input}), &output);
  151. EXPECT_TRUE(s.IsOk());
  152. EXPECT_EQ(output[0]->Size(), 1);
  153. EXPECT_EQ(output[0]->Rank(), 1);
  154. MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString();
  155. CheckEqual(output[0], {0}, "");
  156. }
  157. TEST_F(MindDataTestTokenizerOp, TestUnicodeScriptTokenizer) {
  158. MS_LOG(INFO) << "Doing TestUnicodeScriptTokenizer.";
  159. std::unique_ptr<UnicodeScriptTokenizerOp> keep_whitespace_op(new UnicodeScriptTokenizerOp(true, true));
  160. std::unique_ptr<UnicodeScriptTokenizerOp> skip_whitespace_op(new UnicodeScriptTokenizerOp(false, true));
  161. std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
  162. TensorRow output;
  163. Status s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  164. EXPECT_TRUE(s.IsOk());
  165. EXPECT_EQ(output[0]->Size(), 10);
  166. EXPECT_EQ(output[0]->Rank(), 1);
  167. MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString();
  168. CheckEqual(output[0], {0}, "Welcome");
  169. CheckEqual(output[0], {1}, " ");
  170. CheckEqual(output[0], {2}, "to");
  171. CheckEqual(output[0], {3}, " ");
  172. CheckEqual(output[0], {4}, "China");
  173. CheckEqual(output[0], {5}, ".");
  174. CheckEqual(output[0], {6}, " \n ");
  175. CheckEqual(output[0], {7}, "中国");
  176. CheckEqual(output[0], {8}, "\t");
  177. CheckEqual(output[0], {9}, "北京");
  178. output.clear();
  179. s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output);
  180. EXPECT_TRUE(s.IsOk());
  181. EXPECT_EQ(output[0]->Size(), 6);
  182. EXPECT_EQ(output[0]->Rank(), 1);
  183. MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString();
  184. CheckEqual(output[0], {0}, "Welcome");
  185. CheckEqual(output[0], {1}, "to");
  186. CheckEqual(output[0], {2}, "China");
  187. CheckEqual(output[0], {3}, ".");
  188. CheckEqual(output[0], {4}, "中国");
  189. CheckEqual(output[0], {5}, "北京");
  190. input = std::make_shared<Tensor>(" Welcome to 中国. ");
  191. output.clear();
  192. s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output);
  193. EXPECT_TRUE(s.IsOk());
  194. EXPECT_EQ(output[0]->Size(), 4);
  195. EXPECT_EQ(output[0]->Rank(), 1);
  196. MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString();
  197. CheckEqual(output[0], {0}, "Welcome");
  198. CheckEqual(output[0], {1}, "to");
  199. CheckEqual(output[0], {2}, "中国");
  200. CheckEqual(output[0], {3}, ".");
  201. output.clear();
  202. s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  203. EXPECT_TRUE(s.IsOk());
  204. EXPECT_EQ(output[0]->Size(), 8);
  205. EXPECT_EQ(output[0]->Rank(), 1);
  206. MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString();
  207. CheckEqual(output[0], {0}, " ");
  208. CheckEqual(output[0], {1}, "Welcome");
  209. CheckEqual(output[0], {2}, " ");
  210. CheckEqual(output[0], {3}, "to");
  211. CheckEqual(output[0], {4}, " ");
  212. CheckEqual(output[0], {5}, "中国");
  213. CheckEqual(output[0], {6}, ".");
  214. CheckEqual(output[0], {7}, " ");
  215. input = std::make_shared<Tensor>("Hello");
  216. output.clear();
  217. s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  218. EXPECT_TRUE(s.IsOk());
  219. EXPECT_EQ(output[0]->Size(), 1);
  220. EXPECT_EQ(output[0]->Rank(), 1);
  221. MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString();
  222. CheckEqual(output[0], {0}, "Hello");
  223. input = std::make_shared<Tensor>("H");
  224. output.clear();
  225. s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  226. EXPECT_TRUE(s.IsOk());
  227. EXPECT_EQ(output[0]->Size(), 1);
  228. EXPECT_EQ(output[0]->Rank(), 1);
  229. MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString();
  230. CheckEqual(output[0], {0}, "H");
  231. input = std::make_shared<Tensor>("");
  232. output.clear();
  233. s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  234. EXPECT_TRUE(s.IsOk());
  235. EXPECT_EQ(output[0]->Size(), 1);
  236. EXPECT_EQ(output[0]->Rank(), 1);
  237. MS_LOG(INFO) << "Out tensor7: " << output[0]->ToString();
  238. CheckEqual(output[0], {0}, "");
  239. input = std::make_shared<Tensor>("Hello中国Hello世界");
  240. output.clear();
  241. s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk());
  242. EXPECT_EQ(output[0]->Size(), 4);
  243. EXPECT_EQ(output[0]->Rank(), 1);
  244. MS_LOG(INFO) << "Out tensor8: " << output[0]->ToString();
  245. CheckEqual(output[0], {0}, "Hello");
  246. CheckEqual(output[0], {1}, "中国");
  247. CheckEqual(output[0], {2}, "Hello");
  248. CheckEqual(output[0], {3}, "世界");
  249. input = std::make_shared<Tensor>(" ");
  250. output.clear();
  251. s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  252. EXPECT_TRUE(s.IsOk());
  253. EXPECT_EQ(output[0]->Size(), 1);
  254. EXPECT_EQ(output[0]->Rank(), 1);
  255. MS_LOG(INFO) << "Out tensor10: " << output[0]->ToString();
  256. CheckEqual(output[0], {0}, " ");
  257. input = std::make_shared<Tensor>(" ");
  258. output.clear();
  259. s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output);
  260. EXPECT_TRUE(s.IsOk());
  261. EXPECT_EQ(output[0]->Size(), 1);
  262. EXPECT_EQ(output[0]->Rank(), 1);
  263. MS_LOG(INFO) << "Out tensor11: " << output[0]->ToString();
  264. CheckEqual(output[0], {0}, "");
  265. }
  266. TEST_F(MindDataTestTokenizerOp, TestCaseFold) {
  267. MS_LOG(INFO) << "Doing TestCaseFold.";
  268. std::unique_ptr<CaseFoldOp> case_fold_op(new CaseFoldOp());
  269. std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
  270. std::shared_ptr<Tensor> output;
  271. Status s = case_fold_op->Compute(input, &output);
  272. EXPECT_TRUE(s.IsOk());
  273. EXPECT_EQ(output->Size(), 1);
  274. EXPECT_EQ(output->Rank(), 0);
  275. MS_LOG(INFO) << "Out tensor1: " << output->ToString();
  276. CheckEqual(output, {}, "welcome to china. \n 中国\t北京");
  277. }
  278. TEST_F(MindDataTestTokenizerOp, TestNormalize) {
  279. MS_LOG(INFO) << "Doing TestNormalize.";
  280. std::unique_ptr<NormalizeUTF8Op> nfc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfc));
  281. std::unique_ptr<NormalizeUTF8Op> nfkc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkc));
  282. std::unique_ptr<NormalizeUTF8Op> nfd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfd));
  283. std::unique_ptr<NormalizeUTF8Op> nfkd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkd));
  284. std::shared_ptr<Tensor> input = std::make_shared<Tensor>("ṩ");
  285. std::shared_ptr<Tensor> output;
  286. Status s = nfc_normalize_op->Compute(input, &output);
  287. EXPECT_TRUE(s.IsOk());
  288. MS_LOG(INFO) << "NFC str:" << output->ToString();
  289. nfkc_normalize_op->Compute(input, &output);
  290. EXPECT_TRUE(s.IsOk());
  291. MS_LOG(INFO) << "NFKC str:" << output->ToString();
  292. nfd_normalize_op->Compute(input, &output);
  293. EXPECT_TRUE(s.IsOk());
  294. MS_LOG(INFO) << "NFD str:" << output->ToString();
  295. nfkd_normalize_op->Compute(input, &output);
  296. EXPECT_TRUE(s.IsOk());
  297. MS_LOG(INFO) << "NFKD str:" << output->ToString();
  298. }
  299. TEST_F(MindDataTestTokenizerOp, TestRegexReplace) {
  300. MS_LOG(INFO) << "Doing TestRegexReplace.";
  301. std::unique_ptr<RegexReplaceOp> regex_replace_op(new RegexReplaceOp("\\s+", "_", true));
  302. std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
  303. std::shared_ptr<Tensor> output;
  304. Status s = regex_replace_op->Compute(input, &output);
  305. EXPECT_TRUE(s.IsOk());
  306. EXPECT_EQ(output->Size(), 1);
  307. EXPECT_EQ(output->Rank(), 0);
  308. MS_LOG(INFO) << "Out tensor1: " << output->ToString();
  309. CheckEqual(output, {}, "Welcome_to_China._中国_北京");
  310. }
  311. TEST_F(MindDataTestTokenizerOp, TestRegexTokenizer) {
  312. MS_LOG(INFO) << "Doing TestRegexTokenizerOp.";
  313. std::unique_ptr<RegexTokenizerOp> regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", "", true));
  314. std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
  315. TensorRow output;
  316. Status s = regex_tokenizer_op->Compute(TensorRow(0, {input}), &output);
  317. EXPECT_TRUE(s.IsOk());
  318. }
  319. TEST_F(MindDataTestTokenizerOp, TestBasicTokenizer) {
  320. MS_LOG(INFO) << "Doing TestBasicTokenizer.";
  321. //bool lower_case, bool keep_whitespace,
  322. // NormalizeForm normalization_form, bool preserve_unused_token
  323. std::unique_ptr<BasicTokenizerOp> basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false,
  324. true));
  325. std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. 中国\t北京");
  326. TensorRow output;
  327. Status s = basic_tokenizer->Compute(TensorRow(0, {input}), &output);
  328. EXPECT_TRUE(s.IsOk());
  329. }