You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

shuffle_op_test.cc 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. /**
  2. * Copyright 2019-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "minddata/dataset/core/client.h"
  17. #include "common/common.h"
  18. #include "utils/ms_utils.h"
  19. #include "gtest/gtest.h"
  20. #include "utils/log_adapter.h"
  21. #include <memory>
  22. #include <vector>
  23. #include <iostream>
  24. #include "minddata/dataset/util/random.h"
  25. #include "minddata/dataset/engine/jagged_connector.h"
  26. namespace common = mindspore::common;
  27. using namespace mindspore::dataset;
  28. using mindspore::LogStream;
  29. using mindspore::ExceptionType::NoExceptionType;
  30. using mindspore::MsLogLevel::INFO;
  31. class MindDataTestShuffleOp : public UT::DatasetOpTesting {};
  32. // Test info:
  33. // - Dataset from testDataset1 has 10 rows, 2 columns.
  34. // - RowsPerBuffer buffer setting of 2 divides evenly into total rows.
  35. // - Shuffle size is multiple of rows per buffer.
  36. //
  37. // Tree: shuffle over TFReader
  38. //
  39. // ShuffleOp
  40. // |
  41. // TFReaderOp
  42. //
  43. TEST_F(MindDataTestShuffleOp, TestShuffleBasic1) {
  44. Status rc;
  45. MS_LOG(INFO) << "UT test TestShuffleBasic1.";
  46. // Start with an empty execution tree
  47. auto my_tree = std::make_shared<ExecutionTree>();
  48. std::string dataset_path;
  49. dataset_path = datasets_root_path_ + "/testDataset1/testDataset1.data";
  50. std::shared_ptr<ConfigManager> config_manager = GlobalContext::config_manager();
  51. auto op_connector_size = config_manager->op_connector_size();
  52. std::vector<std::string> columns_to_load = {};
  53. std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
  54. std::vector<std::string> files = {dataset_path};
  55. std::shared_ptr<TFReaderOp> my_tfreader_op = std::make_shared<TFReaderOp>(
  56. 1, 16, 0, files, std::move(schema), op_connector_size, columns_to_load, false, 1, 0, false);
  57. rc = my_tfreader_op->Init();
  58. EXPECT_TRUE(rc.IsOk());
  59. rc = my_tree->AssociateNode(my_tfreader_op);
  60. EXPECT_TRUE(rc.IsOk());
  61. uint32_t shuffle_seed = GetSeed();
  62. std::shared_ptr<ShuffleOp> my_shuffle_op = std::make_shared<ShuffleOp>(4, shuffle_seed, op_connector_size, true);
  63. rc = my_tree->AssociateNode(my_shuffle_op);
  64. EXPECT_TRUE(rc.IsOk());
  65. // Set children/root layout.
  66. rc = my_shuffle_op->AddChild(my_tfreader_op);
  67. EXPECT_TRUE(rc.IsOk());
  68. rc = my_tree->AssignRoot(my_shuffle_op);
  69. EXPECT_TRUE(rc.IsOk());
  70. MS_LOG(INFO) << "Launching tree and begin iteration.";
  71. rc = my_tree->Prepare();
  72. EXPECT_TRUE(rc.IsOk());
  73. rc = my_tree->Launch();
  74. EXPECT_TRUE(rc.IsOk());
  75. // Start the loop of reading tensors from our pipeline
  76. DatasetIterator di(my_tree);
  77. TensorRow tensor_list;
  78. rc = di.FetchNextTensorRow(&tensor_list);
  79. EXPECT_TRUE(rc.IsOk());
  80. int row_count = 0;
  81. while (!tensor_list.empty()) {
  82. MS_LOG(INFO) << "Row display for row #: " << row_count << ".";
  83. // Display the tensor by calling the printer on it
  84. for (int i = 0; i < tensor_list.size(); i++) {
  85. std::ostringstream ss;
  86. ss << "(" << tensor_list[i] << "): " << *tensor_list[i] << std::endl;
  87. MS_LOG(INFO) << "Tensor print: " << ss.str() << ".";
  88. }
  89. rc = di.FetchNextTensorRow(&tensor_list);
  90. EXPECT_TRUE(rc.IsOk());
  91. row_count++;
  92. }
  93. ASSERT_EQ(row_count, 10);
  94. }
  95. // Test info:
  96. // - Dataset from testDataset1 has 10 rows, 2 columns.
  97. // - RowsPerBuffer buffer setting of 3 does not divide evenly into total rows, thereby causing
  98. // partially filled buffers.
  99. // - Shuffle size is not a multiple of rows per buffer.
  100. // - User has provided a non-default seed value.
  101. //
  102. // Tree: shuffle over TFReader
  103. //
  104. // ShuffleOp
  105. // |
  106. // TFReaderOp
  107. //
  108. TEST_F(MindDataTestShuffleOp, TestShuffleBasic2) {
  109. Status rc;
  110. MS_LOG(INFO) << "UT test TestShuffleBasic2.";
  111. // Start with an empty execution tree
  112. auto my_tree = std::make_shared<ExecutionTree>();
  113. std::string dataset_path;
  114. dataset_path = datasets_root_path_ + "/testDataset1/testDataset1.data";
  115. std::shared_ptr<ConfigManager> config_manager = GlobalContext::config_manager();
  116. int32_t op_connector_size = config_manager->op_connector_size();
  117. std::vector<std::string> columns_to_load = {};
  118. std::vector<std::string> files = {dataset_path};
  119. std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
  120. std::shared_ptr<TFReaderOp> my_tfreader_op = std::make_shared<TFReaderOp>(
  121. 1, 16, 0, files, std::move(schema), op_connector_size, columns_to_load, false, 1, 0, false);
  122. rc = my_tfreader_op->Init();
  123. EXPECT_TRUE(rc.IsOk());
  124. rc = my_tree->AssociateNode(my_tfreader_op);
  125. EXPECT_TRUE(rc.IsOk());
  126. std::shared_ptr<ShuffleOp> my_shuffle_op = std::make_shared<ShuffleOp>(4, 100, op_connector_size, true);
  127. rc = my_tree->AssociateNode(my_shuffle_op);
  128. EXPECT_TRUE(rc.IsOk());
  129. // Set children/root layout.
  130. rc = my_shuffle_op->AddChild(my_tfreader_op);
  131. EXPECT_TRUE(rc.IsOk());
  132. rc = my_tree->AssignRoot(my_shuffle_op);
  133. EXPECT_TRUE(rc.IsOk());
  134. MS_LOG(INFO) << "Launching tree and begin iteration.";
  135. rc = my_tree->Prepare();
  136. EXPECT_TRUE(rc.IsOk());
  137. rc = my_tree->Launch();
  138. EXPECT_TRUE(rc.IsOk());
  139. // Start the loop of reading tensors from our pipeline
  140. DatasetIterator di(my_tree);
  141. TensorRow tensor_list;
  142. rc = di.FetchNextTensorRow(&tensor_list);
  143. EXPECT_TRUE(rc.IsOk());
  144. int row_count = 0;
  145. while (!tensor_list.empty()) {
  146. MS_LOG(INFO) << "Row display for row #: " << row_count << ".";
  147. // Display the tensor by calling the printer on it
  148. for (int i = 0; i < tensor_list.size(); i++) {
  149. std::ostringstream ss;
  150. ss << "(" << tensor_list[i] << "): " << *tensor_list[i] << std::endl;
  151. MS_LOG(INFO) << "Tensor print: " << ss.str() << ".";
  152. }
  153. rc = di.FetchNextTensorRow(&tensor_list);
  154. EXPECT_TRUE(rc.IsOk());
  155. row_count++;
  156. }
  157. ASSERT_EQ(row_count, 10);
  158. }
  159. // Test info:
  160. // - Dataset from testDataset1 has 10 rows, 2 columns.
  161. // - RowsPerBuffer buffer setting of 3 does not divide evenly into total rows, thereby causing
  162. // partially filled buffers
  163. // - Shuffle size captures the entire dataset size (actually sets a value that is larger than the
  164. // number of rows in the dataset).
  165. //
  166. // Tree: shuffle over TFReader
  167. //
  168. // ShuffleOp
  169. // |
  170. // TFReaderOp
  171. //
  172. TEST_F(MindDataTestShuffleOp, TestShuffleBasic3) {
  173. Status rc;
  174. MS_LOG(INFO) << "UT test TestShuffleBasic3.";
  175. // Start with an empty execution tree
  176. auto my_tree = std::make_shared<ExecutionTree>();
  177. std::string dataset_path;
  178. dataset_path = datasets_root_path_ + "/testDataset1/testDataset1.data";
  179. std::shared_ptr<ConfigManager> config_manager = GlobalContext::config_manager();
  180. auto op_connector_size = config_manager->op_connector_size();
  181. std::vector<std::string> columns_to_load = {};
  182. std::vector<std::string> files = {dataset_path};
  183. std::shared_ptr<TFReaderOp> my_tfreader_op = std::make_shared<TFReaderOp>(
  184. 1, 16, 0, files, std::make_unique<DataSchema>(), op_connector_size, columns_to_load, false, 1, 0, false);
  185. rc = my_tfreader_op->Init();
  186. EXPECT_TRUE(rc.IsOk());
  187. my_tree->AssociateNode(my_tfreader_op);
  188. uint32_t shuffle_seed = GetSeed();
  189. std::shared_ptr<ShuffleOp> my_shuffle_op = std::make_shared<ShuffleOp>(100, shuffle_seed, op_connector_size, true);
  190. rc = my_tree->AssociateNode(my_shuffle_op);
  191. EXPECT_TRUE(rc.IsOk());
  192. // Set children/root layout.
  193. rc = my_shuffle_op->AddChild(my_tfreader_op);
  194. EXPECT_TRUE(rc.IsOk());
  195. rc = my_tree->AssignRoot(my_shuffle_op);
  196. EXPECT_TRUE(rc.IsOk());
  197. MS_LOG(INFO) << "Launching tree and begin iteration.";
  198. rc = my_tree->Prepare();
  199. EXPECT_TRUE(rc.IsOk());
  200. rc = my_tree->Launch();
  201. EXPECT_TRUE(rc.IsOk());
  202. // Start the loop of reading tensors from our pipeline
  203. DatasetIterator di(my_tree);
  204. TensorRow tensor_list;
  205. rc = di.FetchNextTensorRow(&tensor_list);
  206. EXPECT_TRUE(rc.IsOk());
  207. int row_count = 0;
  208. while (!tensor_list.empty()) {
  209. MS_LOG(INFO) << "Row display for row #: " << row_count << ".";
  210. // Display the tensor by calling the printer on it
  211. for (int i = 0; i < tensor_list.size(); i++) {
  212. std::ostringstream ss;
  213. ss << "(" << tensor_list[i] << "): " << *tensor_list[i] << std::endl;
  214. MS_LOG(INFO) << "Tensor print: " << common::SafeCStr(ss.str()) << ".";
  215. }
  216. rc = di.FetchNextTensorRow(&tensor_list);
  217. EXPECT_TRUE(rc.IsOk());
  218. row_count++;
  219. }
  220. ASSERT_EQ(row_count, 10);
  221. }
  222. // Test info:
  223. // - Dataset from testDataset1 has 10 rows, 2 columns.
  224. // - RowsPerBuffer buffer setting of 3 does not divide evenly into total rows thereby causing
  225. // partially filled buffers
  226. // - Shuffle size is not a multiple of rows per buffer.
  227. // - shuffle seed is given, and subsequent epochs will change the seed each time.
  228. // - Repeat count of 2
  229. //
  230. // Tree: Repeat over shuffle over TFReader
  231. //
  232. // Repeat
  233. // |
  234. // shuffle
  235. // |
  236. // TFReaderOp
  237. //
  238. TEST_F(MindDataTestShuffleOp, TestRepeatShuffle) {
  239. Status rc;
  240. MS_LOG(INFO) << "UT test TestRepeatShuffle.";
  241. // Start with an empty execution tree
  242. auto my_tree = std::make_shared<ExecutionTree>();
  243. std::string dataset_path;
  244. dataset_path = datasets_root_path_ + "/testDataset1/testDataset1.data";
  245. std::shared_ptr<ConfigManager> config_manager = GlobalContext::config_manager();
  246. int32_t op_connector_size = config_manager->op_connector_size();
  247. std::vector<std::string> columns_to_load = {};
  248. std::vector<std::string> files = {dataset_path};
  249. std::shared_ptr<TFReaderOp> my_tfreader_op = std::make_shared<TFReaderOp>(
  250. 2, 16, 0, files, std::make_unique<DataSchema>(), op_connector_size, columns_to_load, false, 1, 0, false);
  251. rc = my_tfreader_op->Init();
  252. EXPECT_TRUE(rc.IsOk());
  253. rc = my_tree->AssociateNode(my_tfreader_op);
  254. EXPECT_TRUE(rc.IsOk());
  255. std::shared_ptr<ShuffleOp> my_shuffle_op = std::make_shared<ShuffleOp>(4, 100, op_connector_size, true);
  256. rc = my_tree->AssociateNode(my_shuffle_op);
  257. EXPECT_TRUE(rc.IsOk());
  258. uint32_t num_repeats = 2;
  259. std::shared_ptr<RepeatOp> my_repeat_op = std::make_shared<RepeatOp>(num_repeats);
  260. rc = my_tree->AssociateNode(my_repeat_op);
  261. EXPECT_TRUE(rc.IsOk());
  262. // Set children/root layout.
  263. my_shuffle_op->set_total_repeats(num_repeats);
  264. my_shuffle_op->set_num_repeats_per_epoch(num_repeats);
  265. rc = my_repeat_op->AddChild(my_shuffle_op);
  266. EXPECT_TRUE(rc.IsOk());
  267. my_tfreader_op->set_total_repeats(num_repeats);
  268. my_tfreader_op->set_num_repeats_per_epoch(num_repeats);
  269. rc = my_shuffle_op->AddChild(my_tfreader_op);
  270. EXPECT_TRUE(rc.IsOk());
  271. rc = my_tree->AssignRoot(my_repeat_op);
  272. EXPECT_TRUE(rc.IsOk());
  273. MS_LOG(INFO) << "Launching tree and begin iteration.";
  274. rc = my_tree->Prepare();
  275. EXPECT_TRUE(rc.IsOk());
  276. rc = my_tree->Launch();
  277. EXPECT_TRUE(rc.IsOk());
  278. // Start the loop of reading tensors from our pipeline
  279. DatasetIterator di(my_tree);
  280. TensorRow tensor_list;
  281. rc = di.FetchNextTensorRow(&tensor_list);
  282. EXPECT_TRUE(rc.IsOk());
  283. int row_count = 0;
  284. while (!tensor_list.empty()) {
  285. MS_LOG(INFO) << "Row display for row #: " << row_count << ".";
  286. // Display the tensor by calling the printer on it
  287. for (int i = 0; i < tensor_list.size(); i++) {
  288. std::ostringstream ss;
  289. ss << *tensor_list[i] << std::endl;
  290. MS_LOG(INFO) << "Tensor print: " << common::SafeCStr(ss.str()) << ".";
  291. }
  292. rc = di.FetchNextTensorRow(&tensor_list);
  293. EXPECT_TRUE(rc.IsOk());
  294. row_count++;
  295. }
  296. ASSERT_EQ(row_count, 20);
  297. }