You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

c_api_dataset_vocab.cc 9.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "common/common.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/status.h"
#include "minddata/dataset/include/text.h"
#include "minddata/dataset/include/transforms.h"
  26. using namespace mindspore::dataset::api;
  27. using mindspore::dataset::ShuffleMode;
  28. using mindspore::dataset::Tensor;
  29. using mindspore::dataset::Status;
  30. using mindspore::dataset::Vocab;
  31. class MindDataTestPipeline : public UT::DatasetOpTesting {
  32. protected:
  33. };
  34. TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
  35. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOp.";
  36. // Create a TextFile dataset
  37. std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  38. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  39. EXPECT_NE(ds, nullptr);
  40. // Create a vocab from vector
  41. std::vector<std::string> list = {"home", "IS", "behind", "the", "world", "ahead", "!"};
  42. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  43. Status s = Vocab::BuildFromVector(list, {"<pad>", "<unk>"}, true, &vocab);
  44. EXPECT_EQ(s, Status::OK());
  45. // Create Lookup operation on ds
  46. std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>");
  47. EXPECT_NE(lookup, nullptr);
  48. // Create Map operation on ds
  49. ds = ds->Map({lookup}, {"text"});
  50. EXPECT_NE(ds, nullptr);
  51. // Create an iterator over the result of the above dataset
  52. // This will trigger the creation of the Execution Tree and launch it.
  53. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  54. EXPECT_NE(iter, nullptr);
  55. // Iterate the dataset and get each row
  56. std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  57. iter->GetNextRow(&row);
  58. uint64_t i = 0;
  59. std::vector<int32_t> expected = {2, 1, 4, 5, 6, 7};
  60. while (row.size() != 0) {
  61. auto ind = row["text"];
  62. MS_LOG(INFO) << ind->shape() << " " << *ind;
  63. std::shared_ptr<Tensor> expected_item;
  64. Tensor::CreateScalar(expected[i], &expected_item);
  65. EXPECT_EQ(*ind, *expected_item);
  66. iter->GetNextRow(&row);
  67. i++;
  68. }
  69. }
  70. TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) {
  71. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail1.";
  72. // Create a TextFile Dataset
  73. std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  74. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  75. EXPECT_NE(ds, nullptr);
  76. // Build vocab from vector
  77. std::vector<std::string> list = {"home", "IS", "behind", "the", "world", "ahead", "!"};
  78. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  79. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  80. EXPECT_EQ(s, Status::OK());
  81. // Create lookup op for ds
  82. // Expected failure: "<unk>" is not a word of vocab
  83. std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>");
  84. EXPECT_EQ(lookup, nullptr);
  85. }
  86. TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) {
  87. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail2.";
  88. // Vocab has nothing
  89. std::shared_ptr<Vocab> vocab;
  90. // Create lookup op
  91. // Expected failure: vocab is null
  92. std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "");
  93. EXPECT_EQ(lookup, nullptr);
  94. }
  95. TEST_F(MindDataTestPipeline, TestVocabLookupOpWithEmptyUnknownToken) {
  96. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpWithEmptyUnknownToken.";
  97. // Create a TextFile dataset
  98. std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  99. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  100. EXPECT_NE(ds, nullptr);
  101. // Create a vocab from map
  102. std::unordered_map<std::string, int32_t> dict;
  103. dict["Home"] = 3;
  104. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  105. Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
  106. EXPECT_EQ(s, Status::OK());
  107. // Create Lookup operation on ds
  108. // Expected failure: "" is not a word of vocab
  109. std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "");
  110. EXPECT_EQ(lookup, nullptr);
  111. }
  112. TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
  113. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDataset.";
  114. // Create a TextFile dataset
  115. std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  116. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  117. EXPECT_NE(ds, nullptr);
  118. // Create vocab from dataset
  119. std::shared_ptr<Vocab> vocab = ds->BuildVocab({"text"}, {0, std::numeric_limits<int64_t>::max()},
  120. std::numeric_limits<int64_t>::max(), {"<pad>", "<unk>"}, true);
  121. EXPECT_NE(vocab, nullptr);
  122. // Check if vocab has words or not
  123. int32_t home_index = vocab->Lookup("home");
  124. EXPECT_EQ(home_index, 4);
  125. // Create Lookup operation on ds
  126. std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>");
  127. EXPECT_NE(lookup, nullptr);
  128. // Create Map operation on ds
  129. ds = ds->Map({lookup}, {"text"});
  130. EXPECT_NE(ds, nullptr);
  131. // Create an iterator over the result of the above dataset
  132. // This will trigger the creation of the Execution Tree and launch it.
  133. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  134. EXPECT_NE(iter, nullptr);
  135. // Iterate the dataset and get each row
  136. std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  137. iter->GetNextRow(&row);
  138. uint64_t i = 0;
  139. std::vector<int32_t> expected = {4, 5, 3, 6, 7, 2};
  140. while (row.size() != 0) {
  141. auto ind = row["text"];
  142. MS_LOG(INFO) << ind->shape() << " " << *ind;
  143. std::shared_ptr<Tensor> expected_item;
  144. Tensor::CreateScalar(expected[i], &expected_item);
  145. EXPECT_EQ(*ind, *expected_item);
  146. iter->GetNextRow(&row);
  147. i++;
  148. }
  149. }
  150. TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
  151. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetDefault.";
  152. // Create a TextFile dataset
  153. std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  154. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  155. EXPECT_NE(ds, nullptr);
  156. // Create vocab from dataset
  157. std::shared_ptr<Vocab> vocab = ds->BuildVocab();
  158. EXPECT_NE(vocab, nullptr);
  159. // Check if vocab has words or not
  160. int32_t home_index = vocab->Lookup("home");
  161. EXPECT_EQ(home_index, 2);
  162. // Create Lookup operation on ds
  163. std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "home");
  164. EXPECT_NE(lookup, nullptr);
  165. // Create Map operation on ds
  166. ds = ds->Map({lookup});
  167. EXPECT_NE(ds, nullptr);
  168. // Create an iterator over the result of the above dataset
  169. // This will trigger the creation of the Execution Tree and launch it.
  170. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  171. EXPECT_NE(iter, nullptr);
  172. // Iterate the dataset and get each row
  173. std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  174. iter->GetNextRow(&row);
  175. uint64_t i = 0;
  176. std::vector<int32_t> expected = {2, 3, 1, 4, 5, 0};
  177. while (row.size() != 0) {
  178. auto ind = row["text"];
  179. MS_LOG(INFO) << ind->shape() << " " << *ind;
  180. std::shared_ptr<Tensor> expected_item;
  181. Tensor::CreateScalar(expected[i], &expected_item);
  182. EXPECT_EQ(*ind, *expected_item);
  183. iter->GetNextRow(&row);
  184. i++;
  185. }
  186. }
  187. TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail1) {
  188. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetFail1.";
  189. // Create a TextFile dataset
  190. std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  191. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  192. EXPECT_NE(ds, nullptr);
  193. // Create vocab from dataset
  194. // Expected failure: top_k can not be negative
  195. std::shared_ptr<Vocab> vocab = ds->BuildVocab({"text"}, {0, std::numeric_limits<int64_t>::max()},
  196. -2, {"<pad>", "<unk>"}, true);
  197. EXPECT_EQ(vocab, nullptr);
  198. }
  199. TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail2) {
  200. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetFail2.";
  201. // Create a TextFile dataset
  202. std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  203. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  204. EXPECT_NE(ds, nullptr);
  205. // Create vocab from dataset
  206. // Expected failure: requency_range [a,b] should be 0 <= a <= b
  207. std::shared_ptr<Vocab> vocab = ds->BuildVocab({"text"}, {4, 1},
  208. std::numeric_limits<int64_t>::max(), {"<pad>", "<unk>"}, true);
  209. EXPECT_EQ(vocab, nullptr);
  210. }