From: @ziruiwu Reviewed-by: @robingrosman,@nsyca Signed-off-by: @nsyca tags/v1.1.0
| @@ -91,11 +91,17 @@ PYBIND_REGISTER(CocoOp, 1, ([](const py::module *m) { | |||
| PYBIND_REGISTER(ImageFolderOp, 1, ([](const py::module *m) { | |||
| (void)py::class_<ImageFolderOp, DatasetOp, std::shared_ptr<ImageFolderOp>>(*m, "ImageFolderOp") | |||
| .def_static("get_num_rows_and_classes", [](const std::string &path) { | |||
| int64_t count = 0, num_classes = 0; | |||
| THROW_IF_ERROR( | |||
| ImageFolderOp::CountRowsAndClasses(path, std::set<std::string>{}, &count, &num_classes)); | |||
| return py::make_tuple(count, num_classes); | |||
| .def_static("get_num_rows", | |||
| [](const std::string &path) { | |||
| int64_t count = 0; | |||
| THROW_IF_ERROR(ImageFolderOp::CountRowsAndClasses(path, {}, &count, nullptr, {})); | |||
| return count; | |||
| }) | |||
| .def_static("get_num_classes", [](const std::string &path, | |||
| const std::map<std::string, int32_t> class_index) { | |||
| int64_t num_classes = 0; | |||
| THROW_IF_ERROR(ImageFolderOp::CountRowsAndClasses(path, {}, nullptr, &num_classes, class_index)); | |||
| return num_classes; | |||
| }); | |||
| })); | |||
| @@ -15,7 +15,7 @@ | |||
| */ | |||
| #include "minddata/dataset/engine/datasetops/source/image_folder_op.h" | |||
| #include <fstream> | |||
| #include <iomanip> | |||
| #include <unordered_set> | |||
| #include "utils/ms_utils.h" | |||
| #include "minddata/dataset/core/config_manager.h" | |||
| #include "minddata/dataset/core/tensor_shape.h" | |||
| @@ -280,7 +280,7 @@ Status ImageFolderOp::GetClassIds(std::map<int32_t, std::vector<int64_t>> *cls_i | |||
| RETURN_STATUS_UNEXPECTED("No images found in dataset, please check if Op read images successfully or not."); | |||
| } else { | |||
| RETURN_STATUS_UNEXPECTED( | |||
| "Map for storaging image-index pair is nullptr or has been set in other place," | |||
| "Map containing image-index pair is nullptr or has been set in other place," | |||
| "it must be empty before using GetClassIds."); | |||
| } | |||
| } | |||
| @@ -294,14 +294,14 @@ Status ImageFolderOp::GetClassIds(std::map<int32_t, std::vector<int64_t>> *cls_i | |||
| } | |||
| // Worker Entry for pre-scanning all the folders and do the 1st level shuffle | |||
| // Worker pull a file name from mFoldernameQueue (which is a Queue), walks all the images under that foldername | |||
| // Worker pull a file name from folder_name_queue_ (which is a Queue), walks all the images under that foldername | |||
| // After walking is complete, sort all the file names (relative path to all jpeg files under the same directory) | |||
| // (Sort is automatically conducted using a set which is implemented using a Red-Black Tree) | |||
| // Add the sorted filenames into a queue. Then make a pair (foldername, queue<filenames>*), | |||
| // foldername is used for 2nd level sorting. | |||
| // FYI: 1st level sorting: sort all images under the same directory. | |||
| // FYI: 2nd level sorting: sort all folder names | |||
| // push this pair to mImagenameQueue (which is again a Queue) | |||
| // push this pair to image_name_queue (which is again a Queue) | |||
| Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) { | |||
| TaskManager::FindMe()->Post(); | |||
| std::string folder_name; | |||
| @@ -334,7 +334,7 @@ Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) { | |||
| return Status::OK(); | |||
| } | |||
| // This helper function recursively walks all foldernames, and send each foldername to mFoldernameQueue | |||
| // This helper function recursively walks all folder_paths, and send each foldername to folder_name_queue_ | |||
| // if mRecursive == false, don't go into folder of folders | |||
| Status ImageFolderOp::RecursiveWalkFolder(Path *dir) { | |||
| std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(dir); | |||
| @@ -355,7 +355,7 @@ Status ImageFolderOp::RecursiveWalkFolder(Path *dir) { | |||
| } | |||
| // A thread that calls RecursiveWalkFolder | |||
| Status ImageFolderOp::startAsyncWalk() { | |||
| Status ImageFolderOp::StartAsyncWalk() { | |||
| TaskManager::FindMe()->Post(); | |||
| Path dir(folder_path_); | |||
| if (dir.Exists() == false || dir.IsDirectory() == false) { | |||
| @@ -363,8 +363,8 @@ Status ImageFolderOp::startAsyncWalk() { | |||
| } | |||
| dirname_offset_ = folder_path_.length(); | |||
| RETURN_IF_NOT_OK(RecursiveWalkFolder(&dir)); | |||
| // send out num_workers_ end signal to mFoldernameQueue, 1 for each worker. | |||
| // Upon receiving end Signal, worker quits and set another end Signal to mImagenameQueue. | |||
| // send out num_workers_ end signal to folder_name_queue_, 1 for each worker. | |||
| // Upon receiving end Signal, worker quits and set another end Signal to image_name_queue. | |||
| for (int32_t ind = 0; ind < num_workers_; ++ind) { | |||
| RETURN_IF_NOT_OK(folder_name_queue_->EmplaceBack("")); // end signal | |||
| } | |||
| @@ -372,19 +372,17 @@ Status ImageFolderOp::startAsyncWalk() { | |||
| } | |||
| Status ImageFolderOp::LaunchThreadsAndInitOp() { | |||
| if (tree_ == nullptr) { | |||
| RETURN_STATUS_UNEXPECTED("Pipeline init failed, Execution tree not set."); | |||
| } | |||
| RETURN_UNEXPECTED_IF_NULL(tree_); | |||
| // Registers QueueList and individual Queues for interrupt services | |||
| RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks())); | |||
| RETURN_IF_NOT_OK(folder_name_queue_->Register(tree_->AllTasks())); | |||
| RETURN_IF_NOT_OK(image_name_queue_->Register(tree_->AllTasks())); | |||
| RETURN_IF_NOT_OK(wait_for_workers_post_.Register(tree_->AllTasks())); | |||
| // The following code launch 3 threads group | |||
| // 1) A thread that walks all folders and push the folder names to a util:Queue mFoldernameQueue. | |||
| // 2) Workers that pull foldername from mFoldernameQueue, walk it and return the sorted images to mImagenameQueue | |||
| // 1) A thread that walks all folders and push the folder names to a util:Queue folder_name_queue_. | |||
| // 2) Workers that pull foldername from folder_name_queue_, walk it and return the sorted images to image_name_queue | |||
| // 3) Launch main workers that load DataBuffers by reading all images | |||
| RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("walk dir", std::bind(&ImageFolderOp::startAsyncWalk, this))); | |||
| RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("walk dir", std::bind(&ImageFolderOp::StartAsyncWalk, this))); | |||
| RETURN_IF_NOT_OK( | |||
| tree_->LaunchWorkers(num_workers_, std::bind(&ImageFolderOp::PrescanWorkerEntry, this, std::placeholders::_1))); | |||
| RETURN_IF_NOT_OK( | |||
| @@ -397,42 +395,53 @@ Status ImageFolderOp::LaunchThreadsAndInitOp() { | |||
| } | |||
| Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const std::set<std::string> &exts, int64_t *num_rows, | |||
| int64_t *num_classes, int64_t dev_id, int64_t num_dev) { | |||
| int64_t *num_classes, std::map<std::string, int32_t> class_index) { | |||
| Path dir(path); | |||
| std::string err_msg = ""; | |||
| int64_t row_cnt = 0; | |||
| err_msg += (dir.Exists() == false || dir.IsDirectory() == false) | |||
| ? "Invalid parameter, image folde path is invalid or not set, path: " + path | |||
| ? "Invalid parameter, image folder path is invalid or not set, path: " + path | |||
| : ""; | |||
| err_msg += | |||
| (num_classes == nullptr || num_rows == nullptr) ? "Invalid parameter, num_class or num_rows cannot be null.\n" : ""; | |||
| err_msg += (dev_id >= num_dev || num_dev <= 0) | |||
| ? "Invalid parameter, num_shard must be greater than shard_id and greater than 0, got num_shard: " + | |||
| std::to_string(num_dev) + ", shard_id: " + std::to_string(dev_id) + ".\n" | |||
| : ""; | |||
| (num_classes == nullptr && num_rows == nullptr) ? "Invalid parameter, num_class and num_rows are null.\n" : ""; | |||
| if (err_msg.empty() == false) { | |||
| RETURN_STATUS_UNEXPECTED(err_msg); | |||
| } | |||
| std::queue<std::string> foldernames; | |||
| std::queue<std::string> folder_paths; | |||
| std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(&dir); | |||
| std::unordered_set<std::string> folder_names; | |||
| while (dir_itr->hasNext()) { | |||
| Path subdir = dir_itr->next(); | |||
| if (subdir.IsDirectory()) { | |||
| foldernames.push(subdir.toString()); | |||
| folder_paths.push(subdir.toString()); | |||
| if (!class_index.empty()) folder_names.insert(subdir.Basename()); | |||
| } | |||
| } | |||
| (*num_classes) = foldernames.size(); | |||
| while (foldernames.empty() == false) { | |||
| Path subdir(foldernames.front()); | |||
| if (num_classes != nullptr) { | |||
| // if class index is empty, get everything on disk | |||
| if (class_index.empty()) { | |||
| *num_classes = folder_paths.size(); | |||
| } else { | |||
| for (const auto &p : class_index) { | |||
| CHECK_FAIL_RETURN_UNEXPECTED(folder_names.find(p.first) != folder_names.end(), | |||
| "folder: " + p.first + " doesn't exist in " + path + " ."); | |||
| } | |||
| (*num_classes) = class_index.size(); | |||
| } | |||
| } | |||
| // return here if only num_class is needed | |||
| RETURN_OK_IF_TRUE(num_rows == nullptr); | |||
| while (folder_paths.empty() == false) { | |||
| Path subdir(folder_paths.front()); | |||
| dir_itr = Path::DirIterator::OpenDirectory(&subdir); | |||
| while (dir_itr->hasNext()) { | |||
| if (exts.empty() || exts.find(subdir.Extension()) != exts.end()) { | |||
| ++row_cnt; | |||
| } | |||
| } | |||
| foldernames.pop(); | |||
| folder_paths.pop(); | |||
| } | |||
| (*num_rows) = (row_cnt / num_dev) + (row_cnt % num_dev == 0 ? 0 : 1); | |||
| (*num_rows) = row_cnt; | |||
| return Status::OK(); | |||
| } | |||
| @@ -460,9 +469,12 @@ Status ImageFolderOp::GetDatasetSize(int64_t *dataset_size) { | |||
| *dataset_size = dataset_size_; | |||
| return Status::OK(); | |||
| } | |||
| int64_t sample_size, num_rows, num_classes; | |||
| int64_t sample_size, num_rows; | |||
| num_rows = num_rows_; | |||
| if (num_rows_ <= 0) RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, &num_classes)); | |||
| if (num_rows_ <= 0) { | |||
| // GetDatasetSize will not be impacted by class_index_ | |||
| RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, nullptr, {})); | |||
| } | |||
| sample_size = sampler_->GetNumSamples(); | |||
| *dataset_size = sample_size > 0 ? std::min(num_rows, sample_size) : num_rows; | |||
| dataset_size_ = *dataset_size; | |||
| @@ -475,8 +487,7 @@ Status ImageFolderOp::GetNumClasses(int64_t *num_classes) { | |||
| *num_classes = num_classes_; | |||
| return Status::OK(); | |||
| } | |||
| int64_t num_rows = num_rows_; | |||
| RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, num_classes)); | |||
| RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, nullptr, num_classes, class_index_)); | |||
| num_classes_ = *num_classes; | |||
| return Status::OK(); | |||
| } | |||
| @@ -205,7 +205,7 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp { | |||
| // returned by this function may not be consistent with what image_folder_op is going to return | |||
| // use this at your own risk! | |||
| static Status CountRowsAndClasses(const std::string &path, const std::set<std::string> &exts, int64_t *num_rows, | |||
| int64_t *num_classes, int64_t dev_id = 0, int64_t num_dev = 1); | |||
| int64_t *num_classes, std::map<std::string, int32_t> class_index); | |||
| // Base-class override for NodePass visitor acceptor. | |||
| // @param p - Pointer to the NodePass to be accepted. | |||
| @@ -251,7 +251,7 @@ class ImageFolderOp : public ParallelOp, public RandomAccessOp { | |||
| // start walking of all dirs | |||
| // @return | |||
| Status startAsyncWalk(); | |||
| Status StartAsyncWalk(); | |||
| // Called first when function is called | |||
| // @return | |||
| @@ -81,14 +81,16 @@ std::vector<std::shared_ptr<DatasetOp>> BatchNode::Build() { | |||
| std::vector<std::shared_ptr<DatasetOp>> node_ops; | |||
| #ifdef ENABLE_PYTHON | |||
| node_ops.push_back(std::make_shared<BatchOp>(batch_size_, drop_remainder_, pad_, connector_que_size_, num_workers_, | |||
| in_col_names_, out_col_names_, batch_size_func_, batch_map_func_, | |||
| pad_map_)); | |||
| // need to insert a project when per_batch_func changes the number of columns | |||
| // if col_order_ isn't empty, then a project node needs to be attached after batch node. (same as map) | |||
| // this means project_node needs to be the parent of batch_node. this means node_ops = [project_node, batch_node] | |||
| if (!col_order_.empty()) { | |||
| auto project_op = std::make_shared<ProjectOp>(col_order_); | |||
| node_ops.push_back(project_op); | |||
| } | |||
| node_ops.push_back(std::make_shared<BatchOp>(batch_size_, drop_remainder_, pad_, connector_que_size_, num_workers_, | |||
| in_col_names_, out_col_names_, batch_size_func_, batch_map_func_, | |||
| pad_map_)); | |||
| #else | |||
| node_ops.push_back(std::make_shared<BatchOp>(batch_size_, drop_remainder_, pad_, connector_que_size_, num_workers_, | |||
| in_col_names_, pad_map_)); | |||
| @@ -2891,7 +2891,7 @@ class ImageFolderDataset(MappableDataset): | |||
| Number, number of batches. | |||
| """ | |||
| if self.dataset_size is None: | |||
| num_rows = ImageFolderOp.get_num_rows_and_classes(self.dataset_dir)[0] | |||
| num_rows = ImageFolderOp.get_num_rows(self.dataset_dir) | |||
| self.dataset_size = get_num_rows(num_rows, self.num_shards) | |||
| rows_from_sampler = self._get_sampler_dataset_size() | |||
| if rows_from_sampler is not None and rows_from_sampler < self.dataset_size: | |||
| @@ -2905,7 +2905,8 @@ class ImageFolderDataset(MappableDataset): | |||
| Return: | |||
| Number, number of classes. | |||
| """ | |||
| return ImageFolderOp.get_num_rows_and_classes(self.dataset_dir)[1] | |||
| class_index = self.class_indexing if self.class_indexing else {} | |||
| return ImageFolderOp.get_num_classes(self.dataset_dir, class_index) | |||
| def is_shuffled(self): | |||
| if self.shuffle_level is None: | |||
| @@ -65,7 +65,9 @@ TEST_F(MindDataTestPipeline, TestSaveCifar10AndLoad) { | |||
| std::string temp_file = datasets_root_path_ + "/testCifar10Data/mind.mind"; | |||
| std::string temp_file_db = datasets_root_path_ + "/testCifar10Data/mind.mind.db"; | |||
| bool rc = ds->Save(temp_file); | |||
| EXPECT_EQ(rc, true); | |||
| // if save fails, no need to continue the execution | |||
| // save could fail if temp_file already exists | |||
| ASSERT_EQ(rc, true); | |||
| // Stage 3: Load dataset from file output by stage 2 | |||
| // Create a MindData Dataset | |||
| @@ -304,4 +304,22 @@ TEST_F(MindDataTestPipeline, TestMnistFailWithNullSamplerFail) { | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| // Expect failure: invalid Mnist input, sampler cannot be nullptr | |||
| EXPECT_EQ(iter, nullptr); | |||
| } | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestImageFolderClassIndexDatasetSize) { | |||
| std::string folder_path = datasets_root_path_ + "/testPK/data"; | |||
| std::map<std::string, int32_t> class_index; | |||
| class_index["class1"] = 111; | |||
| class_index["class2"] = 333; | |||
| auto ds = ImageFolder(folder_path, false, RandomSampler(), {}, class_index); | |||
| EXPECT_EQ(ds->GetNumClasses(), 2); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestImageFolderClassIndexDatasetSizeFail) { | |||
| std::string folder_path = datasets_root_path_ + "/testPK/data"; | |||
| std::map<std::string, int32_t> class_index; | |||
| class_index["class1"] = 111; | |||
| class_index["wrong class"] = 333; | |||
| auto ds = ImageFolder(folder_path, false, RandomSampler(), {}, class_index); | |||
| EXPECT_EQ(ds->GetNumClasses(), -1); | |||
| } | |||
| @@ -38,9 +38,9 @@ | |||
| namespace common = mindspore::common; | |||
| using namespace mindspore::dataset; | |||
| using mindspore::MsLogLevel::ERROR; | |||
| using mindspore::ExceptionType::NoExceptionType; | |||
| using mindspore::LogStream; | |||
| using mindspore::ExceptionType::NoExceptionType; | |||
| using mindspore::MsLogLevel::ERROR; | |||
| std::shared_ptr<BatchOp> Batch(int batch_size = 1, bool drop = false, int rows_per_buf = 2); | |||
| @@ -54,14 +54,17 @@ std::shared_ptr<ImageFolderOp> ImageFolder(int64_t num_works, int64_t rows, int6 | |||
| std::shared_ptr<ImageFolderOp> so; | |||
| ImageFolderOp::Builder builder; | |||
| Status rc = builder.SetNumWorkers(num_works) | |||
| .SetImageFolderDir(path) | |||
| .SetRowsPerBuffer(rows) | |||
| .SetOpConnectorSize(conns) | |||
| .SetExtensions({".jpg", ".JPEG"}) | |||
| .SetSampler(std::move(sampler)) | |||
| .SetClassIndex(map) | |||
| .SetDecode(decode) | |||
| .Build(&so); | |||
| .SetImageFolderDir(path) | |||
| .SetRowsPerBuffer(rows) | |||
| .SetOpConnectorSize(conns) | |||
| .SetExtensions({".jpg", ".JPEG"}) | |||
| .SetSampler(std::move(sampler)) | |||
| .SetClassIndex(map) | |||
| .SetDecode(decode) | |||
| .Build(&so); | |||
| if (rc.IsError()) { | |||
| MS_LOG(ERROR) << "Fail to build ImageFolderOp: " << rc.ToString() << "\n"; | |||
| } | |||
| return so; | |||
| } | |||
| @@ -166,9 +169,9 @@ TEST_F(MindDataTestImageFolderSampler, TestSequentialImageFolderWithRepeatBatch) | |||
| auto tree = Build({ImageFolder(16, 2, 32, folder_path, false), Repeat(2), Batch(11)}); | |||
| tree->Prepare(); | |||
| int32_t res[4][11] = {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |||
| {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, | |||
| {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, | |||
| {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}}; | |||
| {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, | |||
| {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, | |||
| {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}}; | |||
| Status rc = tree->Launch(); | |||
| if (rc.IsError()) { | |||
| MS_LOG(ERROR) << "Return code error detected during tree launch: " << common::SafeCStr(rc.ToString()) << "."; | |||
| @@ -184,7 +187,7 @@ TEST_F(MindDataTestImageFolderSampler, TestSequentialImageFolderWithRepeatBatch) | |||
| Create1DTensor(&label, 11, reinterpret_cast<unsigned char *>(res[i % 4]), DataType::DE_INT32); | |||
| EXPECT_TRUE((*label) == (*tensor_map["label"])); | |||
| MS_LOG(DEBUG) << "row: " << i << " " << tensor_map["image"]->shape() << " (*label):" << (*label) | |||
| << " *tensor_map[label]: " << *tensor_map["label"] << std::endl; | |||
| << " *tensor_map[label]: " << *tensor_map["label"] << std::endl; | |||
| i++; | |||
| di.GetNextAsMap(&tensor_map); | |||
| } | |||
| @@ -373,8 +376,8 @@ TEST_F(MindDataTestImageFolderSampler, TestImageFolderDecode) { | |||
| while (tensor_map.size() != 0) { | |||
| tensor_map["label"]->GetItemAt<int32_t>(&label, {}); | |||
| EXPECT_TRUE(label == res[i / 11]); | |||
| EXPECT_TRUE( | |||
| tensor_map["image"]->shape() == TensorShape({2268, 4032, 3})); // verify shapes are correct after decode | |||
| EXPECT_TRUE(tensor_map["image"]->shape() == | |||
| TensorShape({2268, 4032, 3})); // verify shapes are correct after decode | |||
| MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "\n"; | |||
| i++; | |||
| di.GetNextAsMap(&tensor_map); | |||
| @@ -158,13 +158,23 @@ def test_imagefolder(): | |||
| assert data.get_dataset_size() == 10 | |||
| assert data.num_classes() == 4 | |||
| data = ds.ImageFolderDataset("../data/dataset/testPK/data/", class_indexing={"class1": 1, "class2": 22}) | |||
| assert data.num_classes() == 2 | |||
| data = ds.ImageFolderDataset("../data/dataset/testPK/data/", class_indexing={"class1": 1, "wrong name": 22}) | |||
| err_msg = "" | |||
| try: | |||
| data.num_classes() | |||
| except RuntimeError as e: | |||
| err_msg = str(e) | |||
| assert "wrong name doesn't exist" in err_msg | |||
| if __name__ == '__main__': | |||
| # test_compare_v1_and_2() | |||
| # test_imagefolder() | |||
| # test_manifest() | |||
| test_manifest() | |||
| test_case1() | |||
| # test_case2() | |||
| # test_case3() | |||
| # test_case4() | |||
| # test_case5() | |||
| test_case2() | |||
| test_case3() | |||
| test_case4() | |||
| test_case5() | |||
| test_imagefolder() | |||