/**
 * Copyright 2019-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "minddata/dataset/core/client.h"
#include "common/common.h"
#include "utils/ms_utils.h"
#include "gtest/gtest.h"
#include "utils/log_adapter.h"
// NOTE(review): the next three directives carry no header name here; they look like
// angle-bracket standard-library includes whose names were lost. Confirm against the
// repository copy before building.
#include
#include
#include
#include "minddata/dataset/util/random.h"
#include "minddata/dataset/engine/jagged_connector.h"

namespace common = mindspore::common;

using namespace mindspore::dataset;
using mindspore::LogStream;
using mindspore::ExceptionType::NoExceptionType;
using mindspore::MsLogLevel::INFO;

// Fixture shared by every shuffle test in this file. It declares no members of its own;
// the tests read datasets_root_path_ through it (presumably inherited from
// UT::DatasetOpTesting - confirm in common/common.h).
class MindDataTestShuffleOp : public UT::DatasetOpTesting {};

// Test info:
// - Dataset from testDataset1 has 10 rows, 2 columns.
// - RowsPerBuffer buffer setting of 2 divides evenly into total rows.
// - Shuffle size is multiple of rows per buffer.
//
// Tree: shuffle over TFReader
//
//     ShuffleOp
//         |
//    TFReaderOp
//
// Builds the two-node tree above, drains it through a DatasetIterator and checks that
// exactly 10 rows are produced. Every pipeline step is checked with a fatal assertion so
// that a failed step ends the test immediately instead of running on a broken tree.
TEST_F(MindDataTestShuffleOp, TestShuffleBasic1) {
  MS_LOG(INFO) << "UT test TestShuffleBasic1.";

  // Start with an empty execution tree
  auto my_tree = std::make_shared<ExecutionTree>();

  const std::string dataset_path = datasets_root_path_ + "/testDataset1/testDataset1.data";
  auto config_manager = GlobalContext::config_manager();
  auto op_connector_size = config_manager->op_connector_size();

  // Leaf node: TFReader over the single data file, with an empty column selection.
  std::vector<std::string> columns_to_load = {};
  std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
  std::vector<std::string> files = {dataset_path};
  auto my_tfreader_op = std::make_shared<TFReaderOp>(1, 16, 0, files, std::move(schema), op_connector_size,
                                                     columns_to_load, false, 1, 0, false);
  ASSERT_TRUE(my_tfreader_op->Init().IsOk());
  ASSERT_TRUE(my_tree->AssociateNode(my_tfreader_op).IsOk());

  // Shuffle node: shuffle size 4, seeded from GetSeed().
  const uint32_t shuffle_seed = GetSeed();
  auto my_shuffle_op = std::make_shared<ShuffleOp>(4, shuffle_seed, op_connector_size, true);
  ASSERT_TRUE(my_tree->AssociateNode(my_shuffle_op).IsOk());

  // Set children/root layout.
  ASSERT_TRUE(my_shuffle_op->AddChild(my_tfreader_op).IsOk());
  ASSERT_TRUE(my_tree->AssignRoot(my_shuffle_op).IsOk());

  MS_LOG(INFO) << "Launching tree and begin iteration.";
  ASSERT_TRUE(my_tree->Prepare().IsOk());
  ASSERT_TRUE(my_tree->Launch().IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator di(my_tree);
  TensorRow tensor_list;
  ASSERT_TRUE(di.FetchNextTensorRow(&tensor_list).IsOk());

  int row_count = 0;
  // An empty row ends the loop.
  while (!tensor_list.empty()) {
    MS_LOG(INFO) << "Row display for row #: " << row_count << ".";

    // Display the tensor by calling the printer on it
    for (size_t i = 0; i < tensor_list.size(); ++i) {
      std::ostringstream ss;
      ss << "(" << tensor_list[i] << "): " << *tensor_list[i] << '\n';
      MS_LOG(INFO) << "Tensor print: " << ss.str() << ".";
    }
    // Fatal on failure: a failed fetch must not leave the loop spinning on a stale row.
    ASSERT_TRUE(di.FetchNextTensorRow(&tensor_list).IsOk());
    row_count++;
  }

  ASSERT_EQ(row_count, 10);
}

// Test info:
// - Dataset from testDataset1 has 10 rows, 2 columns.
// - RowsPerBuffer buffer setting of 3 does not divide evenly into total rows, thereby causing
//   partially filled buffers.
// - Shuffle size is not a multiple of rows per buffer.
// - User has provided a non-default seed value.
//
// Tree: shuffle over TFReader
//
//     ShuffleOp
//         |
//    TFReaderOp
//
// Same two-node tree as the basic case, but the shuffle op is given the fixed seed 100
// rather than one from GetSeed(). Expects exactly 10 rows. Every pipeline step is checked
// with a fatal assertion so that a failed step ends the test immediately.
TEST_F(MindDataTestShuffleOp, TestShuffleBasic2) {
  MS_LOG(INFO) << "UT test TestShuffleBasic2.";

  // Start with an empty execution tree
  auto my_tree = std::make_shared<ExecutionTree>();

  const std::string dataset_path = datasets_root_path_ + "/testDataset1/testDataset1.data";
  auto config_manager = GlobalContext::config_manager();
  int32_t op_connector_size = config_manager->op_connector_size();

  // Leaf node: TFReader over the single data file, with an empty column selection.
  std::vector<std::string> columns_to_load = {};
  std::vector<std::string> files = {dataset_path};
  std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
  auto my_tfreader_op = std::make_shared<TFReaderOp>(1, 16, 0, files, std::move(schema), op_connector_size,
                                                     columns_to_load, false, 1, 0, false);
  ASSERT_TRUE(my_tfreader_op->Init().IsOk());
  ASSERT_TRUE(my_tree->AssociateNode(my_tfreader_op).IsOk());

  // Shuffle node: shuffle size 4 with the user-provided seed 100.
  auto my_shuffle_op = std::make_shared<ShuffleOp>(4, 100, op_connector_size, true);
  ASSERT_TRUE(my_tree->AssociateNode(my_shuffle_op).IsOk());

  // Set children/root layout.
  ASSERT_TRUE(my_shuffle_op->AddChild(my_tfreader_op).IsOk());
  ASSERT_TRUE(my_tree->AssignRoot(my_shuffle_op).IsOk());

  MS_LOG(INFO) << "Launching tree and begin iteration.";
  ASSERT_TRUE(my_tree->Prepare().IsOk());
  ASSERT_TRUE(my_tree->Launch().IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator di(my_tree);
  TensorRow tensor_list;
  ASSERT_TRUE(di.FetchNextTensorRow(&tensor_list).IsOk());

  int row_count = 0;
  // An empty row ends the loop.
  while (!tensor_list.empty()) {
    MS_LOG(INFO) << "Row display for row #: " << row_count << ".";

    // Display the tensor by calling the printer on it
    for (size_t i = 0; i < tensor_list.size(); ++i) {
      std::ostringstream ss;
      ss << "(" << tensor_list[i] << "): " << *tensor_list[i] << '\n';
      MS_LOG(INFO) << "Tensor print: " << ss.str() << ".";
    }
    // Fatal on failure: a failed fetch must not leave the loop spinning on a stale row.
    ASSERT_TRUE(di.FetchNextTensorRow(&tensor_list).IsOk());
    row_count++;
  }

  ASSERT_EQ(row_count, 10);
}

// Test info:
// - Dataset from testDataset1 has 10 rows, 2 columns.
// - RowsPerBuffer buffer setting of 3 does not divide evenly into total rows, thereby causing
//   partially filled buffers.
// - Shuffle size captures the entire dataset size (actually sets a value that is larger than the
//   amount of rows in the dataset).
//
// Tree: shuffle over TFReader
//
//     ShuffleOp
//         |
//    TFReaderOp
//
// Same two-node tree, with a shuffle size of 100, which exceeds the 10 rows in the dataset.
// Expects exactly 10 rows. Every pipeline step is checked with a fatal assertion so that a
// failed step ends the test immediately.
TEST_F(MindDataTestShuffleOp, TestShuffleBasic3) {
  MS_LOG(INFO) << "UT test TestShuffleBasic3.";

  // Start with an empty execution tree
  auto my_tree = std::make_shared<ExecutionTree>();

  const std::string dataset_path = datasets_root_path_ + "/testDataset1/testDataset1.data";
  auto config_manager = GlobalContext::config_manager();
  auto op_connector_size = config_manager->op_connector_size();

  // Leaf node: TFReader over the single data file, with an empty column selection.
  std::vector<std::string> columns_to_load = {};
  std::vector<std::string> files = {dataset_path};
  auto my_tfreader_op = std::make_shared<TFReaderOp>(1, 16, 0, files, std::make_unique<DataSchema>(),
                                                     op_connector_size, columns_to_load, false, 1, 0, false);
  ASSERT_TRUE(my_tfreader_op->Init().IsOk());
  // The status of this call used to be dropped; it is now checked like every other step.
  ASSERT_TRUE(my_tree->AssociateNode(my_tfreader_op).IsOk());

  // Shuffle node: shuffle size 100, seeded from GetSeed().
  const uint32_t shuffle_seed = GetSeed();
  auto my_shuffle_op = std::make_shared<ShuffleOp>(100, shuffle_seed, op_connector_size, true);
  ASSERT_TRUE(my_tree->AssociateNode(my_shuffle_op).IsOk());

  // Set children/root layout.
  ASSERT_TRUE(my_shuffle_op->AddChild(my_tfreader_op).IsOk());
  ASSERT_TRUE(my_tree->AssignRoot(my_shuffle_op).IsOk());

  MS_LOG(INFO) << "Launching tree and begin iteration.";
  ASSERT_TRUE(my_tree->Prepare().IsOk());
  ASSERT_TRUE(my_tree->Launch().IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator di(my_tree);
  TensorRow tensor_list;
  ASSERT_TRUE(di.FetchNextTensorRow(&tensor_list).IsOk());

  int row_count = 0;
  // An empty row ends the loop.
  while (!tensor_list.empty()) {
    MS_LOG(INFO) << "Row display for row #: " << row_count << ".";

    // Display the tensor by calling the printer on it
    for (size_t i = 0; i < tensor_list.size(); ++i) {
      std::ostringstream ss;
      ss << "(" << tensor_list[i] << "): " << *tensor_list[i] << '\n';
      MS_LOG(INFO) << "Tensor print: " << common::SafeCStr(ss.str()) << ".";
    }
    // Fatal on failure: a failed fetch must not leave the loop spinning on a stale row.
    ASSERT_TRUE(di.FetchNextTensorRow(&tensor_list).IsOk());
    row_count++;
  }

  ASSERT_EQ(row_count, 10);
}

// Test info:
// - Dataset from testDataset1 has 10 rows, 2 columns.
// - RowsPerBuffer buffer setting of 3 does not divide evenly into total rows thereby causing
//   partially filled buffers
// - Shuffle size is not a multiple of rows per buffer.
// - shuffle seed is given, and subsequent epochs will change the seed each time.
// - Repeat count of 2
//
// Tree: Repeat over shuffle over TFReader
//
//     Repeat
//       |
//     shuffle
//       |
//    TFReaderOp
//
// Builds the three-node tree above with a repeat count of 2 and checks that 20 rows come
// out. Every pipeline step is checked with a fatal assertion so that a failed step ends
// the test immediately.
TEST_F(MindDataTestShuffleOp, TestRepeatShuffle) {
  MS_LOG(INFO) << "UT test TestRepeatShuffle.";

  // Start with an empty execution tree
  auto my_tree = std::make_shared<ExecutionTree>();

  const std::string dataset_path = datasets_root_path_ + "/testDataset1/testDataset1.data";
  auto config_manager = GlobalContext::config_manager();
  int32_t op_connector_size = config_manager->op_connector_size();

  // Leaf node: TFReader over the single data file, with an empty column selection.
  std::vector<std::string> columns_to_load = {};
  std::vector<std::string> files = {dataset_path};
  auto my_tfreader_op = std::make_shared<TFReaderOp>(2, 16, 0, files, std::make_unique<DataSchema>(),
                                                     op_connector_size, columns_to_load, false, 1, 0, false);
  ASSERT_TRUE(my_tfreader_op->Init().IsOk());
  ASSERT_TRUE(my_tree->AssociateNode(my_tfreader_op).IsOk());

  // Shuffle node: shuffle size 4 with the fixed seed 100.
  auto my_shuffle_op = std::make_shared<ShuffleOp>(4, 100, op_connector_size, true);
  ASSERT_TRUE(my_tree->AssociateNode(my_shuffle_op).IsOk());

  // Root node: repeat the pipeline below it.
  const uint32_t num_repeats = 2;
  auto my_repeat_op = std::make_shared<RepeatOp>(num_repeats);
  ASSERT_TRUE(my_tree->AssociateNode(my_repeat_op).IsOk());

  // Set children/root layout.
  // Both ops beneath the repeat are given the repeat count before being attached.
  my_shuffle_op->set_total_repeats(num_repeats);
  my_shuffle_op->set_num_repeats_per_epoch(num_repeats);
  ASSERT_TRUE(my_repeat_op->AddChild(my_shuffle_op).IsOk());

  my_tfreader_op->set_total_repeats(num_repeats);
  my_tfreader_op->set_num_repeats_per_epoch(num_repeats);
  ASSERT_TRUE(my_shuffle_op->AddChild(my_tfreader_op).IsOk());

  ASSERT_TRUE(my_tree->AssignRoot(my_repeat_op).IsOk());

  MS_LOG(INFO) << "Launching tree and begin iteration.";
  ASSERT_TRUE(my_tree->Prepare().IsOk());
  ASSERT_TRUE(my_tree->Launch().IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator di(my_tree);
  TensorRow tensor_list;
  ASSERT_TRUE(di.FetchNextTensorRow(&tensor_list).IsOk());

  int row_count = 0;
  // An empty row ends the loop.
  while (!tensor_list.empty()) {
    MS_LOG(INFO) << "Row display for row #: " << row_count << ".";

    // Display the tensor by calling the printer on it
    for (size_t i = 0; i < tensor_list.size(); ++i) {
      std::ostringstream ss;
      ss << *tensor_list[i] << '\n';
      MS_LOG(INFO) << "Tensor print: " << common::SafeCStr(ss.str()) << ".";
    }
    // Fatal on failure: a failed fetch must not leave the loop spinning on a stale row.
    ASSERT_TRUE(di.FetchNextTensorRow(&tensor_list).IsOk());
    row_count++;
  }

  // 10 dataset rows (per the test info) x 2 repeats.
  ASSERT_EQ(row_count, 20);
}