You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

tokenizer_op_test.cc 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <memory>
  17. #include <string>
  18. #include <string_view>
  19. #include "common/common.h"
  20. #include "minddata/dataset/text/kernels/basic_tokenizer_op.h"
  21. #include "minddata/dataset/text/kernels/case_fold_op.h"
  22. #include "minddata/dataset/text/kernels/normalize_utf8_op.h"
  23. #include "minddata/dataset/text/kernels/regex_replace_op.h"
  24. #include "minddata/dataset/text/kernels/regex_tokenizer_op.h"
  25. #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
  26. #include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
  27. #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
  28. #include "gtest/gtest.h"
  29. #include "utils/log_adapter.h"
  30. using namespace mindspore::dataset;
  31. class MindDataTestTokenizerOp : public UT::Common {
  32. public:
  33. void CheckEqual(const std::shared_ptr<Tensor> &o,
  34. const std::vector<dsize_t> &index,
  35. const std::string &expect) {
  36. std::string_view str;
  37. Status s = o->GetItemAt(&str, index);
  38. EXPECT_TRUE(s.IsOk());
  39. EXPECT_EQ(str, expect);
  40. }
  41. };
  42. TEST_F(MindDataTestTokenizerOp, TestUnicodeCharTokenizerOp) {
  43. MS_LOG(INFO) << "Doing TestUnicodeCharTokenizerOp.";
  44. std::unique_ptr<UnicodeCharTokenizerOp> op(new UnicodeCharTokenizerOp(true));
  45. std::shared_ptr<Tensor> input;
  46. Tensor::CreateScalar<std::string>("Hello World!", &input); TensorRow output;
  47. Status s = op->Compute(TensorRow(0, {input}), &output);
  48. EXPECT_TRUE(s.IsOk());
  49. EXPECT_EQ(output[0]->Size(), 12);
  50. EXPECT_EQ(output[0]->Rank(), 1);
  51. MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString();
  52. CheckEqual(output[0], {0}, "H");
  53. CheckEqual(output[0], {1}, "e");
  54. CheckEqual(output[0], {2}, "l");
  55. CheckEqual(output[0], {3}, "l");
  56. CheckEqual(output[0], {4}, "o");
  57. CheckEqual(output[0], {5}, " ");
  58. CheckEqual(output[0], {6}, "W");
  59. CheckEqual(output[0], {7}, "o");
  60. CheckEqual(output[0], {8}, "r");
  61. CheckEqual(output[0], {9}, "l");
  62. CheckEqual(output[0], {10}, "d");
  63. CheckEqual(output[0], {11}, "!");
  64. Tensor::CreateScalar<std::string>("中国 你好!", &input);
  65. output.clear();
  66. s = op->Compute(TensorRow(0, {input}), &output);
  67. EXPECT_TRUE(s.IsOk());
  68. EXPECT_EQ(output[0]->Size(), 6);
  69. EXPECT_EQ(output[0]->Rank(), 1);
  70. MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString();
  71. CheckEqual(output[0], {0}, "中");
  72. CheckEqual(output[0], {1}, "国");
  73. CheckEqual(output[0], {2}, " ");
  74. CheckEqual(output[0], {3}, "你");
  75. CheckEqual(output[0], {4}, "好");
  76. CheckEqual(output[0], {5}, "!");
  77. Tensor::CreateScalar<std::string>("中", &input);
  78. output.clear();
  79. s = op->Compute(TensorRow(0, {input}), &output);
  80. EXPECT_TRUE(s.IsOk());
  81. EXPECT_EQ(output[0]->Size(), 1);
  82. EXPECT_EQ(output[0]->Rank(), 1);
  83. MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString();
  84. CheckEqual(output[0], {0}, "中");
  85. Tensor::CreateScalar<std::string>("H", &input);
  86. output.clear();
  87. s = op->Compute(TensorRow(0, {input}), &output);
  88. EXPECT_TRUE(s.IsOk());
  89. EXPECT_EQ(output[0]->Size(), 1);
  90. EXPECT_EQ(output[0]->Rank(), 1);
  91. MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString();
  92. CheckEqual(output[0], {0}, "H");
  93. Tensor::CreateScalar<std::string>(" ", &input);
  94. output.clear();
  95. s = op->Compute(TensorRow(0, {input}), &output);
  96. EXPECT_TRUE(s.IsOk());
  97. EXPECT_EQ(output[0]->Size(), 2);
  98. EXPECT_EQ(output[0]->Rank(), 1);
  99. MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString();
  100. CheckEqual(output[0], {0}, " ");
  101. CheckEqual(output[0], {1}, " ");
  102. Tensor::CreateScalar<std::string>("", &input);
  103. output.clear();
  104. s = op->Compute(TensorRow(0, {input}), &output);
  105. EXPECT_TRUE(s.IsOk());
  106. EXPECT_EQ(output[0]->Size(), 1);
  107. EXPECT_EQ(output[0]->Rank(), 1);
  108. MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString();
  109. CheckEqual(output[0], {0}, "");
  110. }
  111. TEST_F(MindDataTestTokenizerOp, TestWhitespaceTokenizerOp) {
  112. MS_LOG(INFO) << "Doing TestWhitespaceTokenizerOp.";
  113. std::unique_ptr<WhitespaceTokenizerOp> op(new WhitespaceTokenizerOp(true));
  114. std::shared_ptr<Tensor> input;
  115. Tensor::CreateScalar<std::string>("Welcome to China.", &input); TensorRow output;
  116. Status s = op->Compute(TensorRow(0, {input}), &output);
  117. EXPECT_TRUE(s.IsOk());
  118. EXPECT_EQ(output[0]->Size(), 3);
  119. EXPECT_EQ(output[0]->Rank(), 1);
  120. MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString();
  121. CheckEqual(output[0], {0}, "Welcome");
  122. CheckEqual(output[0], {1}, "to");
  123. CheckEqual(output[0], {2}, "China.");
  124. Tensor::CreateScalar<std::string>(" hello", &input);
  125. output.clear();
  126. s = op->Compute(TensorRow(0, {input}), &output);
  127. EXPECT_TRUE(s.IsOk());
  128. EXPECT_EQ(output[0]->Size(), 1);
  129. EXPECT_EQ(output[0]->Rank(), 1);
  130. MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString();
  131. CheckEqual(output[0], {0}, "hello");
  132. Tensor::CreateScalar<std::string>("hello", &input);
  133. output.clear();
  134. s = op->Compute(TensorRow(0, {input}), &output);
  135. EXPECT_TRUE(s.IsOk());
  136. EXPECT_EQ(output[0]->Size(), 1);
  137. EXPECT_EQ(output[0]->Rank(), 1);
  138. MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString();
  139. CheckEqual(output[0], {0}, "hello");
  140. Tensor::CreateScalar<std::string>("hello ", &input);
  141. output.clear();
  142. s = op->Compute(TensorRow(0, {input}), &output);
  143. EXPECT_TRUE(s.IsOk());
  144. EXPECT_EQ(output[0]->Size(), 1);
  145. EXPECT_EQ(output[0]->Rank(), 1);
  146. MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString();
  147. CheckEqual(output[0], {0}, "hello");
  148. Tensor::CreateScalar<std::string>(" ", &input);
  149. output.clear();
  150. s = op->Compute(TensorRow(0, {input}), &output);
  151. EXPECT_TRUE(s.IsOk());
  152. EXPECT_EQ(output[0]->Size(), 1);
  153. EXPECT_EQ(output[0]->Rank(), 1);
  154. MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString();
  155. CheckEqual(output[0], {0}, "");
  156. }
  157. TEST_F(MindDataTestTokenizerOp, TestUnicodeScriptTokenizer) {
  158. MS_LOG(INFO) << "Doing TestUnicodeScriptTokenizer.";
  159. std::unique_ptr<UnicodeScriptTokenizerOp> keep_whitespace_op(new UnicodeScriptTokenizerOp(true, true));
  160. std::unique_ptr<UnicodeScriptTokenizerOp> skip_whitespace_op(new UnicodeScriptTokenizerOp(false, true));
  161. std::shared_ptr<Tensor> input;
  162. Tensor::CreateScalar<std::string>("Welcome to China. \n 中国\t北京", &input);
  163. TensorRow output;
  164. Status s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  165. EXPECT_TRUE(s.IsOk());
  166. EXPECT_EQ(output[0]->Size(), 10);
  167. EXPECT_EQ(output[0]->Rank(), 1);
  168. MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString();
  169. CheckEqual(output[0], {0}, "Welcome");
  170. CheckEqual(output[0], {1}, " ");
  171. CheckEqual(output[0], {2}, "to");
  172. CheckEqual(output[0], {3}, " ");
  173. CheckEqual(output[0], {4}, "China");
  174. CheckEqual(output[0], {5}, ".");
  175. CheckEqual(output[0], {6}, " \n ");
  176. CheckEqual(output[0], {7}, "中国");
  177. CheckEqual(output[0], {8}, "\t");
  178. CheckEqual(output[0], {9}, "北京");
  179. output.clear();
  180. s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output);
  181. EXPECT_TRUE(s.IsOk());
  182. EXPECT_EQ(output[0]->Size(), 6);
  183. EXPECT_EQ(output[0]->Rank(), 1);
  184. MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString();
  185. CheckEqual(output[0], {0}, "Welcome");
  186. CheckEqual(output[0], {1}, "to");
  187. CheckEqual(output[0], {2}, "China");
  188. CheckEqual(output[0], {3}, ".");
  189. CheckEqual(output[0], {4}, "中国");
  190. CheckEqual(output[0], {5}, "北京");
  191. Tensor::CreateScalar<std::string>(" Welcome to 中国. ", &input);
  192. output.clear();
  193. s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk());
  194. EXPECT_EQ(output[0]->Size(), 4);
  195. EXPECT_EQ(output[0]->Rank(), 1);
  196. MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString();
  197. CheckEqual(output[0], {0}, "Welcome");
  198. CheckEqual(output[0], {1}, "to");
  199. CheckEqual(output[0], {2}, "中国");
  200. CheckEqual(output[0], {3}, ".");
  201. output.clear();
  202. s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  203. EXPECT_TRUE(s.IsOk());
  204. EXPECT_EQ(output[0]->Size(), 8);
  205. EXPECT_EQ(output[0]->Rank(), 1);
  206. MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString();
  207. CheckEqual(output[0], {0}, " ");
  208. CheckEqual(output[0], {1}, "Welcome");
  209. CheckEqual(output[0], {2}, " ");
  210. CheckEqual(output[0], {3}, "to");
  211. CheckEqual(output[0], {4}, " ");
  212. CheckEqual(output[0], {5}, "中国");
  213. CheckEqual(output[0], {6}, ".");
  214. CheckEqual(output[0], {7}, " ");
  215. Tensor::CreateScalar<std::string>("Hello", &input);
  216. output.clear();
  217. s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk());
  218. EXPECT_EQ(output[0]->Size(), 1);
  219. EXPECT_EQ(output[0]->Rank(), 1);
  220. MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString();
  221. CheckEqual(output[0], {0}, "Hello");
  222. Tensor::CreateScalar<std::string>("H", &input);
  223. output.clear();
  224. s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk());
  225. EXPECT_EQ(output[0]->Size(), 1);
  226. EXPECT_EQ(output[0]->Rank(), 1);
  227. MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString();
  228. CheckEqual(output[0], {0}, "H");
  229. Tensor::CreateScalar<std::string>("", &input);
  230. output.clear();
  231. s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  232. EXPECT_TRUE(s.IsOk());
  233. EXPECT_EQ(output[0]->Size(), 1);
  234. EXPECT_EQ(output[0]->Rank(), 1);
  235. MS_LOG(INFO) << "Out tensor7: " << output[0]->ToString();
  236. CheckEqual(output[0], {0}, "");
  237. Tensor::CreateScalar<std::string>("Hello中国Hello世界", &input);
  238. output.clear();
  239. s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); EXPECT_EQ(output[0]->Size(), 4);
  240. EXPECT_EQ(output[0]->Rank(), 1);
  241. MS_LOG(INFO) << "Out tensor8: " << output[0]->ToString();
  242. CheckEqual(output[0], {0}, "Hello");
  243. CheckEqual(output[0], {1}, "中国");
  244. CheckEqual(output[0], {2}, "Hello");
  245. CheckEqual(output[0], {3}, "世界");
  246. Tensor::CreateScalar<std::string>(" ", &input);
  247. output.clear();
  248. s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  249. EXPECT_TRUE(s.IsOk());
  250. EXPECT_EQ(output[0]->Size(), 1);
  251. EXPECT_EQ(output[0]->Rank(), 1);
  252. MS_LOG(INFO) << "Out tensor10: " << output[0]->ToString();
  253. CheckEqual(output[0], {0}, " ");
  254. Tensor::CreateScalar<std::string>(" ", &input);
  255. output.clear();
  256. s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output);
  257. EXPECT_TRUE(s.IsOk());
  258. EXPECT_EQ(output[0]->Size(), 1);
  259. EXPECT_EQ(output[0]->Rank(), 1);
  260. MS_LOG(INFO) << "Out tensor11: " << output[0]->ToString();
  261. CheckEqual(output[0], {0}, "");
  262. }
  263. TEST_F(MindDataTestTokenizerOp, TestCaseFold) {
  264. MS_LOG(INFO) << "Doing TestCaseFold.";
  265. std::unique_ptr<CaseFoldOp> case_fold_op(new CaseFoldOp());
  266. std::shared_ptr<Tensor> input;
  267. Tensor::CreateScalar<std::string>("Welcome to China. \n 中国\t北京", &input);
  268. std::shared_ptr<Tensor> output;
  269. Status s = case_fold_op->Compute(input, &output);
  270. EXPECT_TRUE(s.IsOk());
  271. EXPECT_EQ(output->Size(), 1);
  272. EXPECT_EQ(output->Rank(), 0);
  273. MS_LOG(INFO) << "Out tensor1: " << output->ToString();
  274. CheckEqual(output, {}, "welcome to china. \n 中国\t北京");
  275. }
  276. TEST_F(MindDataTestTokenizerOp, TestNormalize) {
  277. MS_LOG(INFO) << "Doing TestNormalize.";
  278. std::unique_ptr<NormalizeUTF8Op> nfc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfc));
  279. std::unique_ptr<NormalizeUTF8Op> nfkc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkc));
  280. std::unique_ptr<NormalizeUTF8Op> nfd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfd));
  281. std::unique_ptr<NormalizeUTF8Op> nfkd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkd));
  282. std::shared_ptr<Tensor> input;
  283. Tensor::CreateScalar<std::string>("ṩ", &input);
  284. std::shared_ptr<Tensor> output;
  285. Status s = nfc_normalize_op->Compute(input, &output);
  286. EXPECT_TRUE(s.IsOk());
  287. MS_LOG(INFO) << "NFC str:" << output->ToString();
  288. nfkc_normalize_op->Compute(input, &output);
  289. EXPECT_TRUE(s.IsOk());
  290. MS_LOG(INFO) << "NFKC str:" << output->ToString();
  291. nfd_normalize_op->Compute(input, &output);
  292. EXPECT_TRUE(s.IsOk());
  293. MS_LOG(INFO) << "NFD str:" << output->ToString();
  294. nfkd_normalize_op->Compute(input, &output);
  295. EXPECT_TRUE(s.IsOk());
  296. MS_LOG(INFO) << "NFKD str:" << output->ToString();
  297. }
  298. TEST_F(MindDataTestTokenizerOp, TestRegexReplace) {
  299. MS_LOG(INFO) << "Doing TestRegexReplace.";
  300. std::unique_ptr<RegexReplaceOp> regex_replace_op(new RegexReplaceOp("\\s+", "_", true));
  301. std::shared_ptr<Tensor> input;
  302. Tensor::CreateScalar<std::string>("Welcome to China. \n 中国\t北京", &input);
  303. std::shared_ptr<Tensor> output;
  304. Status s = regex_replace_op->Compute(input, &output);
  305. EXPECT_TRUE(s.IsOk());
  306. EXPECT_EQ(output->Size(), 1);
  307. EXPECT_EQ(output->Rank(), 0);
  308. MS_LOG(INFO) << "Out tensor1: " << output->ToString();
  309. CheckEqual(output, {}, "Welcome_to_China._中国_北京");
  310. }
  311. TEST_F(MindDataTestTokenizerOp, TestRegexTokenizer) {
  312. MS_LOG(INFO) << "Doing TestRegexTokenizerOp.";
  313. std::unique_ptr<RegexTokenizerOp> regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", "", true));
  314. std::shared_ptr<Tensor> input;
  315. Tensor::CreateScalar<std::string>("Welcome to China. \n 中国\t北京", &input);
  316. TensorRow output;
  317. Status s = regex_tokenizer_op->Compute(TensorRow(0, {input}), &output);
  318. EXPECT_TRUE(s.IsOk());
  319. }
  320. TEST_F(MindDataTestTokenizerOp, TestBasicTokenizer) {
  321. MS_LOG(INFO) << "Doing TestBasicTokenizer.";
  322. // bool lower_case, bool keep_whitespace,
  323. // NormalizeForm normalization_form, bool preserve_unused_token
  324. std::unique_ptr<BasicTokenizerOp> basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false,true));
  325. std::shared_ptr<Tensor> input;
  326. Tensor::CreateScalar<std::string>("Welcome to China. 中国\t北京", &input);
  327. TensorRow output;
  328. Status s = basic_tokenizer->Compute(TensorRow(0, {input}), &output);
  329. EXPECT_TRUE(s.IsOk());
  330. }