|
|
@@ -15,7 +15,7 @@ |
|
|
*/ |
|
|
*/ |
|
|
#include "minddata/dataset/engine/datasetops/source/image_folder_op.h" |
|
|
#include "minddata/dataset/engine/datasetops/source/image_folder_op.h" |
|
|
#include <fstream> |
|
|
#include <fstream> |
|
|
#include <iomanip> |
|
|
|
|
|
|
|
|
#include <unordered_set> |
|
|
#include "utils/ms_utils.h" |
|
|
#include "utils/ms_utils.h" |
|
|
#include "minddata/dataset/core/config_manager.h" |
|
|
#include "minddata/dataset/core/config_manager.h" |
|
|
#include "minddata/dataset/core/tensor_shape.h" |
|
|
#include "minddata/dataset/core/tensor_shape.h" |
|
|
@@ -280,7 +280,7 @@ Status ImageFolderOp::GetClassIds(std::map<int32_t, std::vector<int64_t>> *cls_i |
|
|
RETURN_STATUS_UNEXPECTED("No images found in dataset, please check if Op read images successfully or not."); |
|
|
RETURN_STATUS_UNEXPECTED("No images found in dataset, please check if Op read images successfully or not."); |
|
|
} else { |
|
|
} else { |
|
|
RETURN_STATUS_UNEXPECTED( |
|
|
RETURN_STATUS_UNEXPECTED( |
|
|
"Map for storaging image-index pair is nullptr or has been set in other place," |
|
|
|
|
|
|
|
|
"Map containing image-index pair is nullptr or has been set in other place," |
|
|
"it must be empty before using GetClassIds."); |
|
|
"it must be empty before using GetClassIds."); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
@@ -294,14 +294,14 @@ Status ImageFolderOp::GetClassIds(std::map<int32_t, std::vector<int64_t>> *cls_i |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
// Worker Entry for pre-scanning all the folders and do the 1st level shuffle |
|
|
// Worker Entry for pre-scanning all the folders and do the 1st level shuffle |
|
|
// Worker pull a file name from mFoldernameQueue (which is a Queue), walks all the images under that foldername |
|
|
|
|
|
|
|
|
// Worker pulls a folder name from folder_name_queue_ (which is a Queue), walks all the images under that foldername |
|
|
// After walking is complete, sort all the file names (relative path to all jpeg files under the same directory ) |
|
|
// After walking is complete, sort all the file names (relative path to all jpeg files under the same directory ) |
|
|
// (Sort is automatically conducted using a set which is implemented using a Red-Black Tree) |
|
|
// (Sort is automatically conducted using a set which is implemented using a Red-Black Tree) |
|
|
// Add the sorted filenames in to a queue. Then make a pair (foldername, queue<filenames>*), |
|
|
// Add the sorted filenames in to a queue. Then make a pair (foldername, queue<filenames>*), |
|
|
// foldername is used for 2nd level sorting. |
|
|
// foldername is used for 2nd level sorting. |
|
|
// FYI: 1st level sorting: sort all images under the same directory. |
|
|
// FYI: 1st level sorting: sort all images under the same directory. |
|
|
// FYI: 2nd level sorting: sort all folder names |
|
|
// FYI: 2nd level sorting: sort all folder names |
|
|
// push this pair to mImagenameQueue (which is again a Queue) |
|
|
|
|
|
|
|
|
// push this pair to image_name_queue_ (which is again a Queue) |
|
|
Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) { |
|
|
Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) { |
|
|
TaskManager::FindMe()->Post(); |
|
|
TaskManager::FindMe()->Post(); |
|
|
std::string folder_name; |
|
|
std::string folder_name; |
|
|
@@ -334,7 +334,7 @@ Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) { |
|
|
return Status::OK(); |
|
|
return Status::OK(); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
// This helper function recursively walks all foldernames, and send each foldername to mFoldernameQueue |
|
|
|
|
|
|
|
|
// This helper function recursively walks all folder_paths, and sends each foldername to folder_name_queue_ |
|
|
// if mRecursive == false, don't go into folder of folders |
|
|
// if mRecursive == false, don't go into folder of folders |
|
|
Status ImageFolderOp::RecursiveWalkFolder(Path *dir) { |
|
|
Status ImageFolderOp::RecursiveWalkFolder(Path *dir) { |
|
|
std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(dir); |
|
|
std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(dir); |
|
|
@@ -355,7 +355,7 @@ Status ImageFolderOp::RecursiveWalkFolder(Path *dir) { |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
// A thread that calls RecursiveWalkFolder |
|
|
// A thread that calls RecursiveWalkFolder |
|
|
Status ImageFolderOp::startAsyncWalk() { |
|
|
|
|
|
|
|
|
Status ImageFolderOp::StartAsyncWalk() { |
|
|
TaskManager::FindMe()->Post(); |
|
|
TaskManager::FindMe()->Post(); |
|
|
Path dir(folder_path_); |
|
|
Path dir(folder_path_); |
|
|
if (dir.Exists() == false || dir.IsDirectory() == false) { |
|
|
if (dir.Exists() == false || dir.IsDirectory() == false) { |
|
|
@@ -363,8 +363,8 @@ Status ImageFolderOp::startAsyncWalk() { |
|
|
} |
|
|
} |
|
|
dirname_offset_ = folder_path_.length(); |
|
|
dirname_offset_ = folder_path_.length(); |
|
|
RETURN_IF_NOT_OK(RecursiveWalkFolder(&dir)); |
|
|
RETURN_IF_NOT_OK(RecursiveWalkFolder(&dir)); |
|
|
// send out num_workers_ end signal to mFoldernameQueue, 1 for each worker. |
|
|
|
|
|
// Upon receiving end Signal, worker quits and set another end Signal to mImagenameQueue. |
|
|
|
|
|
|
|
|
// Send out num_workers_ end signals to folder_name_queue_, 1 for each worker. |
|
|
|
|
|
// Upon receiving the end signal, a worker quits and sets another end signal to image_name_queue_. |
|
|
for (int32_t ind = 0; ind < num_workers_; ++ind) { |
|
|
for (int32_t ind = 0; ind < num_workers_; ++ind) { |
|
|
RETURN_IF_NOT_OK(folder_name_queue_->EmplaceBack("")); // end signal |
|
|
RETURN_IF_NOT_OK(folder_name_queue_->EmplaceBack("")); // end signal |
|
|
} |
|
|
} |
|
|
@@ -372,19 +372,17 @@ Status ImageFolderOp::startAsyncWalk() { |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
Status ImageFolderOp::LaunchThreadsAndInitOp() { |
|
|
Status ImageFolderOp::LaunchThreadsAndInitOp() { |
|
|
if (tree_ == nullptr) { |
|
|
|
|
|
RETURN_STATUS_UNEXPECTED("Pipeline init failed, Execution tree not set."); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
RETURN_UNEXPECTED_IF_NULL(tree_); |
|
|
// Registers QueueList and individual Queues for interrupt services |
|
|
// Registers QueueList and individual Queues for interrupt services |
|
|
RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks())); |
|
|
RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks())); |
|
|
RETURN_IF_NOT_OK(folder_name_queue_->Register(tree_->AllTasks())); |
|
|
RETURN_IF_NOT_OK(folder_name_queue_->Register(tree_->AllTasks())); |
|
|
RETURN_IF_NOT_OK(image_name_queue_->Register(tree_->AllTasks())); |
|
|
RETURN_IF_NOT_OK(image_name_queue_->Register(tree_->AllTasks())); |
|
|
RETURN_IF_NOT_OK(wait_for_workers_post_.Register(tree_->AllTasks())); |
|
|
RETURN_IF_NOT_OK(wait_for_workers_post_.Register(tree_->AllTasks())); |
|
|
// The following code launches 3 thread groups |
|
|
// The following code launches 3 thread groups |
|
|
// 1) A thread that walks all folders and push the folder names to a util:Queue mFoldernameQueue. |
|
|
|
|
|
// 2) Workers that pull foldername from mFoldernameQueue, walk it and return the sorted images to mImagenameQueue |
|
|
|
|
|
|
|
|
// 1) A thread that walks all folders and pushes the folder names to a util:Queue folder_name_queue_. |
|
|
|
|
|
// 2) Workers that pull foldername from folder_name_queue_, walk it and return the sorted images to image_name_queue_ |
|
|
// 3) Launch main workers that load DataBuffers by reading all images |
|
|
// 3) Launch main workers that load DataBuffers by reading all images |
|
|
RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("walk dir", std::bind(&ImageFolderOp::startAsyncWalk, this))); |
|
|
|
|
|
|
|
|
RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("walk dir", std::bind(&ImageFolderOp::StartAsyncWalk, this))); |
|
|
RETURN_IF_NOT_OK( |
|
|
RETURN_IF_NOT_OK( |
|
|
tree_->LaunchWorkers(num_workers_, std::bind(&ImageFolderOp::PrescanWorkerEntry, this, std::placeholders::_1))); |
|
|
tree_->LaunchWorkers(num_workers_, std::bind(&ImageFolderOp::PrescanWorkerEntry, this, std::placeholders::_1))); |
|
|
RETURN_IF_NOT_OK( |
|
|
RETURN_IF_NOT_OK( |
|
|
@@ -397,42 +395,53 @@ Status ImageFolderOp::LaunchThreadsAndInitOp() { |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const std::set<std::string> &exts, int64_t *num_rows, |
|
|
Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const std::set<std::string> &exts, int64_t *num_rows, |
|
|
int64_t *num_classes, int64_t dev_id, int64_t num_dev) { |
|
|
|
|
|
|
|
|
int64_t *num_classes, std::map<std::string, int32_t> class_index) { |
|
|
Path dir(path); |
|
|
Path dir(path); |
|
|
std::string err_msg = ""; |
|
|
std::string err_msg = ""; |
|
|
int64_t row_cnt = 0; |
|
|
int64_t row_cnt = 0; |
|
|
err_msg += (dir.Exists() == false || dir.IsDirectory() == false) |
|
|
err_msg += (dir.Exists() == false || dir.IsDirectory() == false) |
|
|
? "Invalid parameter, image folde path is invalid or not set, path: " + path |
|
|
|
|
|
|
|
|
? "Invalid parameter, image folder path is invalid or not set, path: " + path |
|
|
: ""; |
|
|
: ""; |
|
|
err_msg += |
|
|
err_msg += |
|
|
(num_classes == nullptr || num_rows == nullptr) ? "Invalid parameter, num_class or num_rows cannot be null.\n" : ""; |
|
|
|
|
|
err_msg += (dev_id >= num_dev || num_dev <= 0) |
|
|
|
|
|
? "Invalid parameter, num_shard must be greater than shard_id and greater than 0, got num_shard: " + |
|
|
|
|
|
std::to_string(num_dev) + ", shard_id: " + std::to_string(dev_id) + ".\n" |
|
|
|
|
|
: ""; |
|
|
|
|
|
|
|
|
(num_classes == nullptr && num_rows == nullptr) ? "Invalid parameter, num_class and num_rows are null.\n" : ""; |
|
|
if (err_msg.empty() == false) { |
|
|
if (err_msg.empty() == false) { |
|
|
RETURN_STATUS_UNEXPECTED(err_msg); |
|
|
RETURN_STATUS_UNEXPECTED(err_msg); |
|
|
} |
|
|
} |
|
|
std::queue<std::string> foldernames; |
|
|
|
|
|
|
|
|
std::queue<std::string> folder_paths; |
|
|
std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(&dir); |
|
|
std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(&dir); |
|
|
|
|
|
std::unordered_set<std::string> folder_names; |
|
|
while (dir_itr->hasNext()) { |
|
|
while (dir_itr->hasNext()) { |
|
|
Path subdir = dir_itr->next(); |
|
|
Path subdir = dir_itr->next(); |
|
|
if (subdir.IsDirectory()) { |
|
|
if (subdir.IsDirectory()) { |
|
|
foldernames.push(subdir.toString()); |
|
|
|
|
|
|
|
|
folder_paths.push(subdir.toString()); |
|
|
|
|
|
if (!class_index.empty()) folder_names.insert(subdir.Basename()); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
(*num_classes) = foldernames.size(); |
|
|
|
|
|
while (foldernames.empty() == false) { |
|
|
|
|
|
Path subdir(foldernames.front()); |
|
|
|
|
|
|
|
|
if (num_classes != nullptr) { |
|
|
|
|
|
// if class index is empty, get everything on disk |
|
|
|
|
|
if (class_index.empty()) { |
|
|
|
|
|
*num_classes = folder_paths.size(); |
|
|
|
|
|
} else { |
|
|
|
|
|
for (const auto &p : class_index) { |
|
|
|
|
|
CHECK_FAIL_RETURN_UNEXPECTED(folder_names.find(p.first) != folder_names.end(), |
|
|
|
|
|
"folder: " + p.first + " doesn't exist in " + path + " ."); |
|
|
|
|
|
} |
|
|
|
|
|
(*num_classes) = class_index.size(); |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
// return here if only num_class is needed |
|
|
|
|
|
RETURN_OK_IF_TRUE(num_rows == nullptr); |
|
|
|
|
|
while (folder_paths.empty() == false) { |
|
|
|
|
|
Path subdir(folder_paths.front()); |
|
|
dir_itr = Path::DirIterator::OpenDirectory(&subdir); |
|
|
dir_itr = Path::DirIterator::OpenDirectory(&subdir); |
|
|
while (dir_itr->hasNext()) { |
|
|
while (dir_itr->hasNext()) { |
|
|
if (exts.empty() || exts.find(subdir.Extension()) != exts.end()) { |
|
|
if (exts.empty() || exts.find(subdir.Extension()) != exts.end()) { |
|
|
++row_cnt; |
|
|
++row_cnt; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
foldernames.pop(); |
|
|
|
|
|
|
|
|
folder_paths.pop(); |
|
|
} |
|
|
} |
|
|
(*num_rows) = (row_cnt / num_dev) + (row_cnt % num_dev == 0 ? 0 : 1); |
|
|
|
|
|
|
|
|
(*num_rows) = row_cnt; |
|
|
return Status::OK(); |
|
|
return Status::OK(); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
@@ -460,9 +469,12 @@ Status ImageFolderOp::GetDatasetSize(int64_t *dataset_size) { |
|
|
*dataset_size = dataset_size_; |
|
|
*dataset_size = dataset_size_; |
|
|
return Status::OK(); |
|
|
return Status::OK(); |
|
|
} |
|
|
} |
|
|
int64_t sample_size, num_rows, num_classes; |
|
|
|
|
|
|
|
|
int64_t sample_size, num_rows; |
|
|
num_rows = num_rows_; |
|
|
num_rows = num_rows_; |
|
|
if (num_rows_ <= 0) RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, &num_classes)); |
|
|
|
|
|
|
|
|
if (num_rows_ <= 0) { |
|
|
|
|
|
// GetDatasetSize will not be impacted by class_index_ |
|
|
|
|
|
RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, nullptr, {})); |
|
|
|
|
|
} |
|
|
sample_size = sampler_->GetNumSamples(); |
|
|
sample_size = sampler_->GetNumSamples(); |
|
|
*dataset_size = sample_size > 0 ? std::min(num_rows, sample_size) : num_rows; |
|
|
*dataset_size = sample_size > 0 ? std::min(num_rows, sample_size) : num_rows; |
|
|
dataset_size_ = *dataset_size; |
|
|
dataset_size_ = *dataset_size; |
|
|
@@ -475,8 +487,7 @@ Status ImageFolderOp::GetNumClasses(int64_t *num_classes) { |
|
|
*num_classes = num_classes_; |
|
|
*num_classes = num_classes_; |
|
|
return Status::OK(); |
|
|
return Status::OK(); |
|
|
} |
|
|
} |
|
|
int64_t num_rows = num_rows_; |
|
|
|
|
|
RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, &num_rows, num_classes)); |
|
|
|
|
|
|
|
|
RETURN_IF_NOT_OK(CountRowsAndClasses(folder_path_, extensions_, nullptr, num_classes, class_index_)); |
|
|
num_classes_ = *num_classes; |
|
|
num_classes_ = *num_classes; |
|
|
return Status::OK(); |
|
|
return Status::OK(); |
|
|
} |
|
|
} |
|
|
|