You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

c_api_text_test.cc 23 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <memory>
  17. #include <vector>
  18. #include <string>
  19. #include "common/common.h"
  20. #include "minddata/dataset/include/datasets.h"
  21. #include "minddata/dataset/include/status.h"
  22. #include "minddata/dataset/include/transforms.h"
  23. #include "minddata/dataset/include/text.h"
  24. using namespace mindspore::dataset;
  25. using mindspore::dataset::DataType;
  26. using mindspore::dataset::ShuffleMode;
  27. using mindspore::dataset::Status;
  28. using mindspore::dataset::Tensor;
  29. using mindspore::dataset::Vocab;
// Test fixture for MindData C++ text-pipeline API tests. Inherits helpers
// (e.g. datasets_root_path_) from UT::DatasetOpTesting; no extra state needed.
class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};
  33. TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess) {
  34. // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is false.
  35. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess.";
  36. // Create a TextFile dataset
  37. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  38. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  39. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  40. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  41. EXPECT_NE(ds, nullptr);
  42. // Create jieba_tokenizer operation on ds
  43. std::shared_ptr<TensorOperation> jieba_tokenizer = text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp);
  44. EXPECT_NE(jieba_tokenizer, nullptr);
  45. // Create Map operation on ds
  46. ds = ds->Map({jieba_tokenizer}, {"text"});
  47. EXPECT_NE(ds, nullptr);
  48. // Create an iterator over the result of the above dataset
  49. // This will trigger the creation of the Execution Tree and launch it.
  50. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  51. EXPECT_NE(iter, nullptr);
  52. // Iterate the dataset and get each row
  53. std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  54. iter->GetNextRow(&row);
  55. std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
  56. uint64_t i = 0;
  57. while (row.size() != 0) {
  58. auto ind = row["text"];
  59. std::shared_ptr<Tensor> expected_tensor;
  60. Tensor::CreateFromVector(expected, &expected_tensor);
  61. EXPECT_EQ(*ind, *expected_tensor);
  62. iter->GetNextRow(&row);
  63. i++;
  64. }
  65. EXPECT_EQ(i, 1);
  66. // Manually terminate the pipeline
  67. iter->Stop();
  68. }
  69. TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess1) {
  70. // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kHmm and the with_offsets is false.
  71. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess1.";
  72. // Create a TextFile dataset
  73. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  74. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  75. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  76. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  77. EXPECT_NE(ds, nullptr);
  78. // Create jieba_tokenizer operation on ds
  79. std::shared_ptr<TensorOperation> jieba_tokenizer = text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kHmm);
  80. EXPECT_NE(jieba_tokenizer, nullptr);
  81. // Create Map operation on ds
  82. ds = ds->Map({jieba_tokenizer}, {"text"});
  83. EXPECT_NE(ds, nullptr);
  84. // Create an iterator over the result of the above dataset
  85. // This will trigger the creation of the Execution Tree and launch it.
  86. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  87. EXPECT_NE(iter, nullptr);
  88. // Iterate the dataset and get each row
  89. std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  90. iter->GetNextRow(&row);
  91. std::vector<std::string> expected = {"今天", "天气", "太", "好", "了", "我们", "一起", "去", "外面", "玩", "吧"};
  92. uint64_t i = 0;
  93. while (row.size() != 0) {
  94. auto ind = row["text"];
  95. std::shared_ptr<Tensor> expected_tensor;
  96. Tensor::CreateFromVector(expected, &expected_tensor);
  97. EXPECT_EQ(*ind, *expected_tensor);
  98. iter->GetNextRow(&row);
  99. i++;
  100. }
  101. EXPECT_EQ(i, 1);
  102. // Manually terminate the pipeline
  103. iter->Stop();
  104. }
  105. TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess2) {
  106. // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is true.
  107. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess2.";
  108. // Create a TextFile dataset
  109. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  110. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  111. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  112. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  113. EXPECT_NE(ds, nullptr);
  114. // Create jieba_tokenizer operation on ds
  115. std::shared_ptr<TensorOperation> jieba_tokenizer = text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp, true);
  116. EXPECT_NE(jieba_tokenizer, nullptr);
  117. // Create Map operation on ds
  118. ds = ds->Map({jieba_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
  119. {"token", "offsets_start", "offsets_limit"});
  120. EXPECT_NE(ds, nullptr);
  121. // Create an iterator over the result of the above dataset
  122. // This will trigger the creation of the Execution Tree and launch it.
  123. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  124. EXPECT_NE(iter, nullptr);
  125. // Iterate the dataset and get each row
  126. std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  127. iter->GetNextRow(&row);
  128. std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
  129. std::vector<uint32_t> expected_offsets_start = {0, 12, 21, 27, 33, 36, 42};
  130. std::vector<uint32_t> expected_offsets_limit = {12, 21, 27, 33, 36, 42, 48};
  131. uint64_t i = 0;
  132. while (row.size() != 0) {
  133. auto ind = row["offsets_start"];
  134. auto ind1 = row["offsets_limit"];
  135. auto token = row["token"];
  136. std::shared_ptr<Tensor> expected_tensor;
  137. std::shared_ptr<Tensor> expected_tensor_offsets_start;
  138. std::shared_ptr<Tensor> expected_tensor_offsets_limit;
  139. Tensor::CreateFromVector(expected, &expected_tensor);
  140. Tensor::CreateFromVector(expected_offsets_start, &expected_tensor_offsets_start);
  141. Tensor::CreateFromVector(expected_offsets_limit, &expected_tensor_offsets_limit);
  142. EXPECT_EQ(*ind, *expected_tensor_offsets_start);
  143. EXPECT_EQ(*ind1, *expected_tensor_offsets_limit);
  144. EXPECT_EQ(*token, *expected_tensor);
  145. iter->GetNextRow(&row);
  146. i++;
  147. }
  148. EXPECT_EQ(i, 1);
  149. // Manually terminate the pipeline
  150. iter->Stop();
  151. }
  152. TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail) {
  153. // Testing the incorrect parameter of JiebaTokenizer interface.
  154. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail.";
  155. // Create a TextFile dataset
  156. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  157. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  158. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  159. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  160. EXPECT_NE(ds, nullptr);
  161. // Create jieba_tokenizer operation on ds
  162. // Testing the parameter hmm_path is empty
  163. std::shared_ptr<TensorOperation> jieba_tokenizer = text::JiebaTokenizer("", mp_path, JiebaMode::kMp);
  164. EXPECT_EQ(jieba_tokenizer, nullptr);
  165. // Testing the parameter mp_path is empty
  166. std::shared_ptr<TensorOperation> jieba_tokenizer1 = text::JiebaTokenizer(hmm_path, "", JiebaMode::kMp);
  167. EXPECT_EQ(jieba_tokenizer1, nullptr);
  168. // Testing the parameter hmm_path is invalid path
  169. std::string hmm_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
  170. std::shared_ptr<TensorOperation> jieba_tokenizer2 = text::JiebaTokenizer(hmm_path_invalid, mp_path, JiebaMode::kMp);
  171. EXPECT_EQ(jieba_tokenizer2, nullptr);
  172. // Testing the parameter mp_path is invalid path
  173. std::string mp_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
  174. std::shared_ptr<TensorOperation> jieba_tokenizer3 = text::JiebaTokenizer(hmm_path, mp_path_invalid, JiebaMode::kMp);
  175. EXPECT_EQ(jieba_tokenizer3, nullptr);
  176. }
  177. TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) {
  178. // Testing the parameter of SlidingWindow interface when the axis is 0.
  179. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess.";
  180. // Create a TextFile dataset
  181. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  182. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  183. EXPECT_NE(ds, nullptr);
  184. // Create white_tokenizer operation on ds
  185. std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer();
  186. EXPECT_NE(white_tokenizer, nullptr);
  187. // Create sliding_window operation on ds
  188. std::shared_ptr<TensorOperation> sliding_window = text::SlidingWindow(3, 0);
  189. EXPECT_NE(sliding_window, nullptr);
  190. // Create Map operation on ds
  191. ds = ds->Map({white_tokenizer, sliding_window}, {"text"});
  192. EXPECT_NE(ds, nullptr);
  193. // Create an iterator over the result of the above dataset
  194. // This will trigger the creation of the Execution Tree and launch it.
  195. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  196. EXPECT_NE(iter, nullptr);
  197. // Iterate the dataset and get each row
  198. std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  199. iter->GetNextRow(&row);
  200. std::vector<std::vector<std::string>> expected = {{"This", "is", "a", "is", "a", "text", "a", "text", "file."},
  201. {"Be", "happy", "every", "happy", "every", "day."},
  202. {"Good", "luck", "to", "luck", "to", "everyone."}};
  203. uint64_t i = 0;
  204. while (row.size() != 0) {
  205. auto ind = row["text"];
  206. std::shared_ptr<Tensor> expected_tensor;
  207. int x = expected[i].size() / 3;
  208. Tensor::CreateFromVector(expected[i], TensorShape({x, 3}), &expected_tensor);
  209. EXPECT_EQ(*ind, *expected_tensor);
  210. iter->GetNextRow(&row);
  211. i++;
  212. }
  213. EXPECT_EQ(i, 3);
  214. // Manually terminate the pipeline
  215. iter->Stop();
  216. }
  217. TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess1) {
  218. // Testing the parameter of SlidingWindow interface when the axis is -1.
  219. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess1.";
  220. // Create a TextFile dataset
  221. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  222. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  223. EXPECT_NE(ds, nullptr);
  224. // Create white_tokenizer operation on ds
  225. std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer();
  226. EXPECT_NE(white_tokenizer, nullptr);
  227. // Create sliding_window operation on ds
  228. std::shared_ptr<TensorOperation> sliding_window = text::SlidingWindow(2, -1);
  229. EXPECT_NE(sliding_window, nullptr);
  230. // Create Map operation on ds
  231. ds = ds->Map({white_tokenizer, sliding_window}, {"text"});
  232. EXPECT_NE(ds, nullptr);
  233. // Create an iterator over the result of the above dataset
  234. // This will trigger the creation of the Execution Tree and launch it.
  235. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  236. EXPECT_NE(iter, nullptr);
  237. // Iterate the dataset and get each row
  238. std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  239. iter->GetNextRow(&row);
  240. std::vector<std::vector<std::string>> expected = {{"This", "is", "is", "a", "a", "text", "text", "file."},
  241. {"Be", "happy", "happy", "every", "every", "day."},
  242. {"Good", "luck", "luck", "to", "to", "everyone."}};
  243. uint64_t i = 0;
  244. while (row.size() != 0) {
  245. auto ind = row["text"];
  246. std::shared_ptr<Tensor> expected_tensor;
  247. int x = expected[i].size() / 2;
  248. Tensor::CreateFromVector(expected[i], TensorShape({x, 2}), &expected_tensor);
  249. EXPECT_EQ(*ind, *expected_tensor);
  250. iter->GetNextRow(&row);
  251. i++;
  252. }
  253. EXPECT_EQ(i, 3);
  254. // Manually terminate the pipeline
  255. iter->Stop();
  256. }
  257. TEST_F(MindDataTestPipeline, TestSlidingWindowFail) {
  258. // Testing the incorrect parameter of SlidingWindow interface.
  259. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail.";
  260. // Create a TextFile dataset
  261. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  262. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  263. EXPECT_NE(ds, nullptr);
  264. // Create sliding_window operation on ds
  265. // Testing the parameter width less than or equal to 0
  266. // The parameter axis support 0 or -1 only for now
  267. std::shared_ptr<TensorOperation> sliding_window = text::SlidingWindow(0, 0);
  268. EXPECT_EQ(sliding_window, nullptr);
  269. // Testing the parameter width less than or equal to 0
  270. // The parameter axis support 0 or -1 only for now
  271. std::shared_ptr<TensorOperation> sliding_window1 = text::SlidingWindow(-2, 0);
  272. EXPECT_EQ(sliding_window1, nullptr);
  273. }
  274. TEST_F(MindDataTestPipeline, TestNgramSuccess) {
  275. // Testing the parameter of Ngram interface.
  276. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess.";
  277. // Create a TextFile dataset
  278. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  279. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  280. EXPECT_NE(ds, nullptr);
  281. // Create white_tokenizer operation on ds
  282. std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer();
  283. EXPECT_NE(white_tokenizer, nullptr);
  284. // Create sliding_window operation on ds
  285. std::shared_ptr<TensorOperation> ngram_op = text::Ngram({2}, {"_", 1}, {"_", 1}, " ");
  286. EXPECT_NE(ngram_op, nullptr);
  287. // Create Map operation on ds
  288. ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
  289. EXPECT_NE(ds, nullptr);
  290. // Create an iterator over the result of the above dataset
  291. // This will trigger the creation of the Execution Tree and launch it.
  292. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  293. EXPECT_NE(iter, nullptr);
  294. // Iterate the dataset and get each row
  295. std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  296. iter->GetNextRow(&row);
  297. std::vector<std::vector<std::string>> expected = {{"_ This", "This is", "is a", "a text", "text file.", "file. _"},
  298. {"_ Be", "Be happy", "happy every", "every day.", "day. _"},
  299. {"_ Good", "Good luck", "luck to", "to everyone.", "everyone. _"}};
  300. uint64_t i = 0;
  301. while (row.size() != 0) {
  302. auto ind = row["text"];
  303. std::shared_ptr<Tensor> expected_tensor;
  304. int x = expected[i].size();
  305. Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
  306. EXPECT_EQ(*ind, *expected_tensor);
  307. iter->GetNextRow(&row);
  308. i++;
  309. }
  310. EXPECT_EQ(i, 3);
  311. // Manually terminate the pipeline
  312. iter->Stop();
  313. }
  314. TEST_F(MindDataTestPipeline, TestNgramSuccess1) {
  315. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess1.";
  316. // Create a TextFile dataset
  317. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  318. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  319. EXPECT_NE(ds, nullptr);
  320. // Create white_tokenizer operation on ds
  321. std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer();
  322. EXPECT_NE(white_tokenizer, nullptr);
  323. // Create sliding_window operation on ds
  324. std::shared_ptr<TensorOperation> ngram_op = text::Ngram({2, 3}, {"&", 2}, {"&", 2}, "-");
  325. EXPECT_NE(ngram_op, nullptr);
  326. // Create Map operation on ds
  327. ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
  328. EXPECT_NE(ds, nullptr);
  329. // Create an iterator over the result of the above dataset
  330. // This will trigger the creation of the Execution Tree and launch it.
  331. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  332. EXPECT_NE(iter, nullptr);
  333. // Iterate the dataset and get each row
  334. std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  335. iter->GetNextRow(&row);
  336. std::vector<std::vector<std::string>> expected = {
  337. {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a", "is-a-text",
  338. "a-text-file.", "text-file.-&", "file.-&-&"},
  339. {"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every",
  340. "happy-every-day.", "every-day.-&", "day.-&-&"},
  341. {"&-Good", "Good-luck", "luck-to", "to-everyone.", "everyone.-&", "&-&-Good", "&-Good-luck", "Good-luck-to",
  342. "luck-to-everyone.", "to-everyone.-&", "everyone.-&-&"}};
  343. uint64_t i = 0;
  344. while (row.size() != 0) {
  345. auto ind = row["text"];
  346. std::shared_ptr<Tensor> expected_tensor;
  347. int x = expected[i].size();
  348. Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
  349. EXPECT_EQ(*ind, *expected_tensor);
  350. iter->GetNextRow(&row);
  351. i++;
  352. }
  353. EXPECT_EQ(i, 3);
  354. // Manually terminate the pipeline
  355. iter->Stop();
  356. }
  357. TEST_F(MindDataTestPipeline, TestNgramFail) {
  358. // Testing the incorrect parameter of Ngram interface.
  359. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail.";
  360. // Create a TextFile dataset
  361. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  362. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  363. EXPECT_NE(ds, nullptr);
  364. // Create sliding_window operation on ds
  365. // Testing the vector of ngram is empty
  366. std::shared_ptr<TensorOperation> ngram_op = text::Ngram({});
  367. EXPECT_EQ(ngram_op, nullptr);
  368. // Testing the value of ngrams vector less than and equal to 0
  369. std::shared_ptr<TensorOperation> ngram_op1 = text::Ngram({0});
  370. EXPECT_EQ(ngram_op1, nullptr);
  371. // Testing the value of ngrams vector less than and equal to 0
  372. std::shared_ptr<TensorOperation> ngram_op2 = text::Ngram({-2});
  373. EXPECT_EQ(ngram_op2, nullptr);
  374. // Testing the second parameter pad_width in left_pad vector less than 0
  375. std::shared_ptr<TensorOperation> ngram_op3 = text::Ngram({2}, {"", -1});
  376. EXPECT_EQ(ngram_op3, nullptr);
  377. // Testing the second parameter pad_width in right_pad vector less than 0
  378. std::shared_ptr<TensorOperation> ngram_op4 = text::Ngram({2}, {"", 1}, {"", -1});
  379. EXPECT_EQ(ngram_op4, nullptr);
  380. }
  381. TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess) {
  382. // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is default.
  383. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess.";
  384. // Create a TextFile dataset
  385. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  386. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  387. EXPECT_NE(ds, nullptr);
  388. // Create white_tokenizer operation on ds
  389. std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer();
  390. EXPECT_NE(white_tokenizer, nullptr);
  391. // Create Map operation on ds
  392. ds = ds->Map({white_tokenizer}, {"text"});
  393. EXPECT_NE(ds, nullptr);
  394. // Create an iterator over the result of the above dataset
  395. // This will trigger the creation of the Execution Tree and launch it.
  396. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  397. EXPECT_NE(iter, nullptr);
  398. // Iterate the dataset and get each row
  399. std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  400. iter->GetNextRow(&row);
  401. std::vector<std::vector<std::string>> expected = {
  402. {"This", "is", "a", "text", "file."}, {"Be", "happy", "every", "day."}, {"Good", "luck", "to", "everyone."}};
  403. uint64_t i = 0;
  404. while (row.size() != 0) {
  405. auto ind = row["text"];
  406. std::shared_ptr<Tensor> expected_tensor;
  407. int x = expected[i].size();
  408. Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
  409. EXPECT_EQ(*ind, *expected_tensor);
  410. iter->GetNextRow(&row);
  411. i++;
  412. }
  413. EXPECT_EQ(i, 3);
  414. // Manually terminate the pipeline
  415. iter->Stop();
  416. }
  417. TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess1) {
  418. // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is true.
  419. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess1.";
  420. // Create a TextFile dataset
  421. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  422. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  423. EXPECT_NE(ds, nullptr);
  424. // Create white_tokenizer operation on ds
  425. std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer(true);
  426. EXPECT_NE(white_tokenizer, nullptr);
  427. // Create Map operation on ds
  428. ds = ds->Map({white_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
  429. {"token", "offsets_start", "offsets_limit"});
  430. EXPECT_NE(ds, nullptr);
  431. // Create an iterator over the result of the above dataset
  432. // This will trigger the creation of the Execution Tree and launch it.
  433. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  434. EXPECT_NE(iter, nullptr);
  435. // Iterate the dataset and get each row
  436. std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  437. iter->GetNextRow(&row);
  438. std::vector<std::vector<std::string>> expected = {
  439. {"Welcome", "to", "Beijing!"}, {"北京欢迎您!"}, {"我喜欢English!"}, {""}};
  440. std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11}, {0}, {0}, {0}};
  441. std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 19}, {18}, {17}, {0}};
  442. uint64_t i = 0;
  443. while (row.size() != 0) {
  444. auto ind = row["offsets_start"];
  445. auto ind1 = row["offsets_limit"];
  446. auto token = row["token"];
  447. std::shared_ptr<Tensor> expected_tensor;
  448. std::shared_ptr<Tensor> expected_tensor_offsets_start;
  449. std::shared_ptr<Tensor> expected_tensor_offsets_limit;
  450. int x = expected[i].size();
  451. Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
  452. Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &expected_tensor_offsets_start);
  453. Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &expected_tensor_offsets_limit);
  454. EXPECT_EQ(*ind, *expected_tensor_offsets_start);
  455. EXPECT_EQ(*ind1, *expected_tensor_offsets_limit);
  456. EXPECT_EQ(*token, *expected_tensor);
  457. iter->GetNextRow(&row);
  458. i++;
  459. }
  460. EXPECT_EQ(i, 4);
  461. // Manually terminate the pipeline
  462. iter->Stop();
  463. }