Merge pull request !4772 from EricZ/ms-albumtags/v1.0.0
| @@ -393,7 +393,7 @@ build_mindspore() | |||
| CMAKE_VERBOSE="--verbose" | |||
| fi | |||
| cmake --build . --target package ${CMAKE_VERBOSE} -j$THREAD_NUM | |||
| echo "success to build mindspore project!" | |||
| echo "success building mindspore project!" | |||
| } | |||
| checkndk() { | |||
| @@ -21,6 +21,7 @@ | |||
| #include "minddata/dataset/include/transforms.h" | |||
| #include "minddata/dataset/engine/dataset_iterator.h" | |||
| // Source dataset headers (in alphabetical order) | |||
| #include "minddata/dataset/engine/datasetops/source/album_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/celeba_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/cifar_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/clue_op.h" | |||
| @@ -117,6 +118,15 @@ std::shared_ptr<SchemaObj> Schema(const std::string &schema_file) { | |||
| // FUNCTIONS TO CREATE DATASETS FOR LEAF-NODE DATASETS | |||
| // (In alphabetical order) | |||
| // Function to create a AlbumDataset. | |||
| std::shared_ptr<AlbumDataset> Album(const std::string &dataset_dir, const std::string &data_schema, | |||
| const std::vector<std::string> &column_names, bool decode, | |||
| const std::shared_ptr<SamplerObj> &sampler) { | |||
| auto ds = std::make_shared<AlbumDataset>(dataset_dir, data_schema, column_names, decode, sampler); | |||
| return ds->ValidateParams() ? ds : nullptr; | |||
| } | |||
| // Function to create a CelebADataset. | |||
| std::shared_ptr<CelebADataset> CelebA(const std::string &dataset_dir, const std::string &dataset_type, | |||
| const std::shared_ptr<SamplerObj> &sampler, bool decode, | |||
| @@ -687,6 +697,49 @@ bool ValidateDatasetShardParams(const std::string &dataset_name, int32_t num_sha | |||
| // DERIVED DATASET CLASSES LEAF-NODE DATASETS | |||
| // (In alphabetical order) | |||
| // Constructor for AlbumDataset | |||
| AlbumDataset::AlbumDataset(const std::string &dataset_dir, const std::string &data_schema, | |||
| const std::vector<std::string> &column_names, bool decode, | |||
| const std::shared_ptr<SamplerObj> &sampler) | |||
| : dataset_dir_(dataset_dir), | |||
| schema_path_(data_schema), | |||
| column_names_(column_names), | |||
| decode_(decode), | |||
| sampler_(sampler) {} | |||
| bool AlbumDataset::ValidateParams() { | |||
| if (!ValidateDatasetDirParam("AlbumDataset", dataset_dir_)) { | |||
| return false; | |||
| } | |||
| if (!ValidateDatasetFilesParam("AlbumDataset", {schema_path_})) { | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| // Function to build AlbumDataset | |||
| std::vector<std::shared_ptr<DatasetOp>> AlbumDataset::Build() { | |||
| // A vector containing shared pointer to the Dataset Ops that this object will create | |||
| std::vector<std::shared_ptr<DatasetOp>> node_ops; | |||
| // If user does not specify Sampler, create a default sampler, i.e., RandomSampler. | |||
| if (sampler_ == nullptr) { | |||
| sampler_ = CreateDefaultSampler(); | |||
| } | |||
| auto schema = std::make_unique<DataSchema>(); | |||
| RETURN_EMPTY_IF_ERROR(schema->LoadSchemaFile(schema_path_, column_names_)); | |||
| // Argument that is not exposed to user in the API. | |||
| std::set<std::string> extensions = {}; | |||
| node_ops.push_back(std::make_shared<AlbumOp>(num_workers_, rows_per_buffer_, dataset_dir_, connector_que_size_, | |||
| decode_, extensions, std::move(schema), std::move(sampler_->Build()))); | |||
| return node_ops; | |||
| } | |||
| // Constructor for CelebADataset | |||
| CelebADataset::CelebADataset(const std::string &dataset_dir, const std::string &dataset_type, | |||
| const std::shared_ptr<SamplerObj> &sampler, const bool &decode, | |||
| @@ -13,6 +13,7 @@ set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES | |||
| text_file_op.cc | |||
| clue_op.cc | |||
| csv_op.cc | |||
| album_op.cc | |||
| ) | |||
| set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES | |||
| @@ -0,0 +1,508 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "minddata/dataset/engine/datasetops/source/album_op.h" | |||
| #include <fstream> | |||
| #include <iomanip> | |||
| #include "minddata/dataset/core/config_manager.h" | |||
| #include "minddata/dataset/core/tensor_shape.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h" | |||
| #include "minddata/dataset/engine/db_connector.h" | |||
| #include "minddata/dataset/engine/execution_tree.h" | |||
| #include "minddata/dataset/engine/opt/pass.h" | |||
| #include "minddata/dataset/kernels/image/image_utils.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| AlbumOp::Builder::Builder() : builder_decode_(false), builder_sampler_(nullptr), builder_schema_file_("") { | |||
| std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager(); | |||
| builder_num_workers_ = cfg->num_parallel_workers(); | |||
| builder_rows_per_buffer_ = cfg->rows_per_buffer(); | |||
| builder_op_connector_size_ = cfg->op_connector_size(); | |||
| } | |||
| Status AlbumOp::Builder::Build(std::shared_ptr<AlbumOp> *ptr) { | |||
| RETURN_IF_NOT_OK(SanityCheck()); | |||
| if (builder_sampler_ == nullptr) { | |||
| int64_t num_samples = 0; // default num samples of 0 means to sample entire set of data | |||
| int64_t start_index = 0; | |||
| builder_sampler_ = std::make_shared<SequentialSampler>(start_index, num_samples); | |||
| } | |||
| builder_schema_ = std::make_unique<DataSchema>(); | |||
| Path schema_file(builder_schema_file_); | |||
| if (builder_schema_file_ == "" || !schema_file.Exists()) { | |||
| RETURN_STATUS_UNEXPECTED("Schema not provided"); | |||
| } else { | |||
| MS_LOG(INFO) << "Schema file provided: " << builder_schema_file_ << "."; | |||
| builder_schema_->LoadSchemaFile(builder_schema_file_, builder_columns_to_load_); | |||
| } | |||
| *ptr = std::make_shared<AlbumOp>(builder_num_workers_, builder_rows_per_buffer_, builder_dir_, | |||
| builder_op_connector_size_, builder_decode_, builder_extensions_, | |||
| std::move(builder_schema_), std::move(builder_sampler_)); | |||
| return Status::OK(); | |||
| } | |||
| Status AlbumOp::Builder::SanityCheck() { | |||
| Path dir(builder_dir_); | |||
| std::string err_msg; | |||
| err_msg += dir.IsDirectory() == false ? "Album path is invalid or not set\n" : ""; | |||
| err_msg += builder_num_workers_ <= 0 ? "Num of parallel workers is set to 0\n" : ""; | |||
| return err_msg.empty() ? Status::OK() : Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg); | |||
| } | |||
| AlbumOp::AlbumOp(int32_t num_wkrs, int32_t rows_per_buffer, std::string file_dir, int32_t queue_size, bool do_decode, | |||
| const std::set<std::string> &exts, std::unique_ptr<DataSchema> data_schema, | |||
| std::shared_ptr<Sampler> sampler) | |||
| : ParallelOp(num_wkrs, queue_size), | |||
| rows_per_buffer_(rows_per_buffer), | |||
| folder_path_(file_dir), | |||
| decode_(do_decode), | |||
| extensions_(exts), | |||
| data_schema_(std::move(data_schema)), | |||
| sampler_(std::move(sampler)), | |||
| row_cnt_(0), | |||
| buf_cnt_(0), | |||
| sampler_ind_(0), | |||
| dirname_offset_(0) { | |||
| // Set the column name map (base class field) | |||
| for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) { | |||
| column_name_id_map_[data_schema_->column(i).name()] = i; | |||
| } | |||
| io_block_queues_.Init(num_workers_, queue_size); | |||
| } | |||
| // Helper function for string comparison | |||
| bool StrComp(const std::string &a, const std::string &b) { | |||
| // returns 1 if string a is alphabetically | |||
| // less than string b | |||
| // quite similar to strcmp operation | |||
| return a < b; | |||
| } | |||
| // Single thread to go through the folder directory and gets all file names | |||
| // calculate numRows then return | |||
| Status AlbumOp::PrescanEntry() { | |||
| Path folder(folder_path_); | |||
| dirname_offset_ = folder_path_.length(); | |||
| std::shared_ptr<Path::DirIterator> dirItr = Path::DirIterator::OpenDirectory(&folder); | |||
| if (folder.Exists() == false || dirItr == nullptr) { | |||
| RETURN_STATUS_UNEXPECTED("Error unable to open: " + folder_path_); | |||
| } | |||
| MS_LOG(INFO) << "Album folder Path found: " << folder_path_ << "."; | |||
| while (dirItr->hasNext()) { | |||
| Path file = dirItr->next(); | |||
| if (extensions_.empty() || extensions_.find(file.Extension()) != extensions_.end()) { | |||
| (void)image_rows_.push_back(file.toString().substr(dirname_offset_)); | |||
| } else { | |||
| MS_LOG(INFO) << "Album operator unsupported file found: " << file.toString() | |||
| << ", extension: " << file.Extension() << "."; | |||
| } | |||
| } | |||
| std::sort(image_rows_.begin(), image_rows_.end(), StrComp); | |||
| num_rows_ = image_rows_.size(); | |||
| return Status::OK(); | |||
| } | |||
| // Main logic, Register Queue with TaskGroup, launch all threads and do the functor's work | |||
| Status AlbumOp::operator()() { | |||
| RETURN_IF_NOT_OK(this->PrescanEntry()); | |||
| RETURN_IF_NOT_OK(LaunchThreadsAndInitOp()); | |||
| std::unique_ptr<DataBuffer> sampler_buffer; | |||
| RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); | |||
| while (true) { // each iterator is 1 epoch | |||
| std::vector<int64_t> keys; | |||
| keys.reserve(rows_per_buffer_); | |||
| while (sampler_buffer->eoe() == false) { | |||
| TensorRow sample_row; | |||
| RETURN_IF_NOT_OK(sampler_buffer->PopRow(&sample_row)); | |||
| std::shared_ptr<Tensor> sample_ids = sample_row[0]; | |||
| for (auto itr = sample_ids->begin<int64_t>(); itr != sample_ids->end<int64_t>(); ++itr) { | |||
| if ((*itr) >= num_rows_) continue; // index out of bound, skipping | |||
| keys.push_back(*itr); | |||
| row_cnt_++; | |||
| if (row_cnt_ % rows_per_buffer_ == 0) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[buf_cnt_++ % num_workers_]->Add(std::make_unique<IOBlock>(keys, IOBlock::kDeIoBlockNone))); | |||
| keys.clear(); | |||
| } | |||
| } | |||
| RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); | |||
| } | |||
| if (keys.empty() == false) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(keys, IOBlock::kDeIoBlockNone))); | |||
| } | |||
| if (IsLastIteration()) { | |||
| std::unique_ptr<IOBlock> eoe_block = std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe); | |||
| std::unique_ptr<IOBlock> eof_block = std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof); | |||
| RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eoe_block))); | |||
| RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eof_block))); | |||
| for (int32_t i = 0; i < num_workers_; ++i) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[i]->Add(std::make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone))); | |||
| } | |||
| return Status::OK(); | |||
| } else { // not the last repeat. Sleep master thread, wait for the wake-up from reset | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| RETURN_IF_NOT_OK(wp_.Wait()); // Master thread goes to sleep after it has made all the IOBlocks | |||
| wp_.Clear(); | |||
| RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer)); | |||
| } | |||
| UpdateRepeatAndEpochCounter(); | |||
| } | |||
| } | |||
| // contains the main logic of pulling a IOBlock from IOBlockQueue, load a buffer and push the buffer to out_connector_ | |||
| // IMPORTANT: 1 IOBlock produces 1 DataBuffer | |||
| Status AlbumOp::WorkerEntry(int32_t worker_id) { | |||
| TaskManager::FindMe()->Post(); | |||
| int64_t buffer_id = worker_id; | |||
| std::unique_ptr<IOBlock> io_block; | |||
| RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block)); | |||
| while (io_block != nullptr) { | |||
| if (io_block->eoe() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE))); | |||
| buffer_id = worker_id; | |||
| } else if (io_block->eof() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF))); | |||
| } else { | |||
| std::vector<int64_t> keys; | |||
| RETURN_IF_NOT_OK(io_block->GetKeys(&keys)); | |||
| if (keys.empty() == true) return Status::OK(); // empty key is a quit signal for workers | |||
| std::unique_ptr<DataBuffer> db = std::make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| RETURN_IF_NOT_OK(LoadBuffer(keys, &db)); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(db))); | |||
| buffer_id += num_workers_; | |||
| } | |||
| RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block)); | |||
| } | |||
| RETURN_STATUS_UNEXPECTED("Unexpected nullptr received in worker"); | |||
| } | |||
| // Only support JPEG/PNG/GIF/BMP | |||
| // Optimization: Could take in a tensor | |||
| Status AlbumOp::CheckImageType(const std::string &file_name, bool *valid) { | |||
| std::ifstream file_handle; | |||
| constexpr int read_num = 3; | |||
| *valid = false; | |||
| file_handle.open(file_name, std::ios::binary | std::ios::in); | |||
| if (!file_handle.is_open()) { | |||
| RETURN_STATUS_UNEXPECTED("Can not open image file " + file_name); | |||
| } | |||
| unsigned char file_type[read_num]; | |||
| (void)file_handle.read(reinterpret_cast<char *>(file_type), read_num); | |||
| if (file_handle.fail()) { | |||
| file_handle.close(); | |||
| RETURN_STATUS_UNEXPECTED("Read image file failed " + file_name); | |||
| } | |||
| file_handle.close(); | |||
| if (file_type[0] == 0xff && file_type[1] == 0xd8 && file_type[2] == 0xff) { | |||
| // Normal JPEGs start with \xff\xd8\xff\xe0 | |||
| // JPEG with EXIF stats with \xff\xd8\xff\xe1 | |||
| // Use \xff\xd8\xff to cover both. | |||
| *valid = true; | |||
| } else if (file_type[0] == 0x89 && file_type[1] == 0x50 && file_type[2] == 0x4e) { | |||
| // It's a PNG | |||
| *valid = true; | |||
| } else if (file_type[0] == 0x47 && file_type[1] == 0x49 && file_type[2] == 0x46) { | |||
| // It's a GIF | |||
| *valid = true; | |||
| } else if (file_type[0] == 0x42 && file_type[1] == 0x4d) { | |||
| // It's a BMP | |||
| *valid = true; | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| Status AlbumOp::LoadImageTensor(const std::string &image_file_path, uint32_t col_num, TensorRow *row) { | |||
| std::shared_ptr<Tensor> image; | |||
| std::ifstream fs; | |||
| fs.open(image_file_path, std::ios::binary | std::ios::in); | |||
| if (fs.fail()) { | |||
| MS_LOG(INFO) << "Image file not found:" << image_file_path << "."; | |||
| // If file doesn't exist, we don't flag this as error in input check, simply skip | |||
| return Status::OK(); | |||
| } | |||
| MS_LOG(INFO) << "Image file found: " << image_file_path << "."; | |||
| // check that the file is an image before decoding | |||
| bool valid = false; | |||
| RETURN_IF_NOT_OK(CheckImageType(image_file_path, &valid)); | |||
| RETURN_IF_NOT_OK(Tensor::CreateFromFile(image_file_path, &image)); | |||
| if (decode_ && valid) { | |||
| Status rc = Decode(image, &image); | |||
| if (rc.IsError()) { | |||
| std::string err = "Fail to decode image:" + image_file_path; | |||
| RETURN_STATUS_UNEXPECTED(err); | |||
| } | |||
| } | |||
| row->push_back(std::move(image)); | |||
| return Status::OK(); | |||
| } | |||
| Status AlbumOp::LoadStringArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) { | |||
| std::vector<std::string> data = json_obj; | |||
| MS_LOG(INFO) << "String array label found: " << data << "."; | |||
| std::shared_ptr<Tensor> label; | |||
| RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &label)); | |||
| row->push_back(std::move(label)); | |||
| return Status::OK(); | |||
| } | |||
| Status AlbumOp::LoadStringTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) { | |||
| std::string data = json_obj; | |||
| // now we iterate over the elements in json | |||
| MS_LOG(INFO) << "String label found: " << data << "."; | |||
| std::shared_ptr<Tensor> label; | |||
| RETURN_IF_NOT_OK(Tensor::CreateScalar<std::string>(data, &label)); | |||
| row->push_back(std::move(label)); | |||
| return Status::OK(); | |||
| } | |||
| Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) { | |||
| std::shared_ptr<Tensor> label; | |||
| // consider templating this function to handle all ints | |||
| if (data_schema_->column(col_num).type() == DataType(DataType::DE_INT64)) { | |||
| std::vector<int64_t> data; | |||
| // Iterate over the integer list and add those values to the output shape tensor | |||
| auto items = json_obj.items(); | |||
| using it_type = decltype(items.begin()); | |||
| (void)std::transform(items.begin(), items.end(), std::back_inserter(data), [](it_type j) { return j.value(); }); | |||
| RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &label)); | |||
| } else if (data_schema_->column(col_num).type() == DataType(DataType::DE_INT32)) { | |||
| std::vector<int32_t> data; | |||
| // Iterate over the integer list and add those values to the output shape tensor | |||
| auto items = json_obj.items(); | |||
| using it_type = decltype(items.begin()); | |||
| (void)std::transform(items.begin(), items.end(), std::back_inserter(data), [](it_type j) { return j.value(); }); | |||
| MS_LOG(INFO) << "Int array found: " << data << "."; | |||
| RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &label)); | |||
| } else { | |||
| RETURN_STATUS_UNEXPECTED("Error in Load Int Tensor"); | |||
| } | |||
| row->push_back(std::move(label)); | |||
| return Status::OK(); | |||
| } | |||
| Status AlbumOp::LoadIDTensor(const std::string &file, uint32_t col_num, TensorRow *row) { | |||
| if (data_schema_->column(col_num).type() == DataType(DataType::DE_STRING)) { | |||
| std::shared_ptr<Tensor> id; | |||
| RETURN_IF_NOT_OK(Tensor::CreateScalar<std::string>(file, &id)); | |||
| row->push_back(std::move(id)); | |||
| return Status::OK(); | |||
| } | |||
| // hack to get the file name without extension, the 1 is to get rid of the backslash character | |||
| int64_t image_id = std::atoi(file.substr(1, file.find(".")).c_str()); | |||
| std::shared_ptr<Tensor> id; | |||
| RETURN_IF_NOT_OK(Tensor::CreateScalar<int64_t>(image_id, &id)); | |||
| MS_LOG(INFO) << "File ID " << image_id << "."; | |||
| row->push_back(std::move(id)); | |||
| return Status::OK(); | |||
| } | |||
| Status AlbumOp::LoadEmptyTensor(uint32_t col_num, TensorRow *row) { | |||
| // hack to get the file name without extension, the 1 is to get rid of the backslash character | |||
| std::shared_ptr<Tensor> empty_tensor; | |||
| RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({}), data_schema_->column(col_num).type(), &empty_tensor)); | |||
| row->push_back(std::move(empty_tensor)); | |||
| return Status::OK(); | |||
| } | |||
| // Loads a tensor with float value, issue with float64, we don't have reverse look up to the type | |||
| // So we actually have to check what type we want to fill the tensor with. | |||
| // Float64 doesn't work with reinterpret cast here. Otherwise we limit the float in the schema to | |||
| // only be float32, seems like a weird limitation to impose | |||
| Status AlbumOp::LoadFloatTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) { | |||
| std::shared_ptr<Tensor> float_tensor; | |||
| if (data_schema_->column(col_num).type() == DataType(DataType::DE_FLOAT64)) { | |||
| double data = json_obj; | |||
| MS_LOG(INFO) << "double found: " << json_obj << "."; | |||
| RETURN_IF_NOT_OK(Tensor::CreateScalar<double>(data, &float_tensor)); | |||
| } else if (data_schema_->column(col_num).type() == DataType(DataType::DE_FLOAT32)) { | |||
| float data = json_obj; | |||
| RETURN_IF_NOT_OK(Tensor::CreateScalar<float>(data, &float_tensor)); | |||
| MS_LOG(INFO) << "float found: " << json_obj << "."; | |||
| } | |||
| row->push_back(std::move(float_tensor)); | |||
| return Status::OK(); | |||
| } | |||
| // Load 1 TensorRow (image,label) using 1 ImageColumns. 1 function call produces 1 TensorTow in a DataBuffer | |||
| // possible optimization: the helper functions of LoadTensorRow should be optimized | |||
| // to take a reference to a column descriptor? | |||
| Status AlbumOp::LoadTensorRow(const std::string &file, TensorRow *row) { | |||
| // testing here is to just print out file path | |||
| (*row) = {}; | |||
| MS_LOG(INFO) << "Image row file: " << file << "."; | |||
| std::ifstream file_handle(folder_path_ + file); | |||
| if (!file_handle.is_open()) { | |||
| RETURN_STATUS_UNEXPECTED("Json file " + folder_path_ + file + " can not open."); | |||
| } | |||
| std::string line; | |||
| while (getline(file_handle, line)) { | |||
| try { | |||
| nlohmann::json js = nlohmann::json::parse(line); | |||
| MS_LOG(INFO) << "This Line: " << line << "."; | |||
| // note if take a schema here, then we have to iterate over all column descriptors in schema and check for key | |||
| // get columns in schema: | |||
| int32_t columns = data_schema_->NumColumns(); | |||
| // loop over each column descriptor, this can optimized by swtich cases | |||
| for (int32_t i = 0; i < columns; i++) { | |||
| // special case to handle | |||
| if (data_schema_->column(i).name() == "id") { | |||
| // id is internal, special case to load from file | |||
| RETURN_IF_NOT_OK(LoadIDTensor(file, i, row)); | |||
| continue; | |||
| } | |||
| // find if key does not exist, insert placeholder nullptr if not found | |||
| if (js.find(data_schema_->column(i).name()) == js.end()) { | |||
| // iterator not found, push nullptr as placeholder | |||
| MS_LOG(INFO) << "Pushing empty tensor for column: " << data_schema_->column(i).name() << "."; | |||
| RETURN_IF_NOT_OK(LoadEmptyTensor(i, row)); | |||
| continue; | |||
| } | |||
| nlohmann::json column_value = js.at(data_schema_->column(i).name()); | |||
| MS_LOG(INFO) << "This column is: " << data_schema_->column(i).name() << "."; | |||
| bool is_array = column_value.is_array(); | |||
| // load single string | |||
| if (column_value.is_string() && data_schema_->column(i).type() == DataType(DataType::DE_STRING)) { | |||
| RETURN_IF_NOT_OK(LoadStringTensor(column_value, i, row)); | |||
| continue; | |||
| } | |||
| // load string array | |||
| if (is_array && data_schema_->column(i).type() == DataType(DataType::DE_STRING)) { | |||
| RETURN_IF_NOT_OK(LoadStringArrayTensor(column_value, i, row)); | |||
| continue; | |||
| } | |||
| // load image file | |||
| if (column_value.is_string() && data_schema_->column(i).type() != DataType(DataType::DE_STRING)) { | |||
| std::string image_file_path = column_value; | |||
| RETURN_IF_NOT_OK(LoadImageTensor(image_file_path, i, row)); | |||
| continue; | |||
| } | |||
| // load float array | |||
| if (!is_array && (data_schema_->column(i).type() == DataType(DataType::DE_FLOAT32) || | |||
| data_schema_->column(i).type() == DataType(DataType::DE_FLOAT64))) { | |||
| RETURN_IF_NOT_OK(LoadFloatTensor(column_value, i, row)); | |||
| continue; | |||
| } | |||
| // int array | |||
| if (is_array && (data_schema_->column(i).type() == DataType(DataType::DE_INT64) || | |||
| data_schema_->column(i).type() == DataType(DataType::DE_INT32))) { | |||
| RETURN_IF_NOT_OK(LoadIntArrayTensor(column_value, i, row)); | |||
| continue; | |||
| } else { | |||
| MS_LOG(WARNING) << "Value type for column: " << data_schema_->column(i).name() << " is not supported."; | |||
| continue; | |||
| } | |||
| } | |||
| } catch (const std::exception &err) { | |||
| file_handle.close(); | |||
| RETURN_STATUS_UNEXPECTED("Parse Json file failed"); | |||
| } | |||
| } | |||
| file_handle.close(); | |||
| return Status::OK(); | |||
| } | |||
| // Looping over LoadTensorRow to make 1 DataBuffer. 1 function call produces 1 buffer | |||
| Status AlbumOp::LoadBuffer(const std::vector<int64_t> &keys, std::unique_ptr<DataBuffer> *db) { | |||
| std::unique_ptr<TensorQTable> deq = std::make_unique<TensorQTable>(); | |||
| TensorRow trow; | |||
| for (const int64_t &key : keys) { | |||
| RETURN_IF_NOT_OK(this->LoadTensorRow(image_rows_[key], &trow)); | |||
| deq->push_back(std::move(trow)); | |||
| } | |||
| (*db)->set_tensor_table(std::move(deq)); | |||
| return Status::OK(); | |||
| } | |||
| void AlbumOp::Print(std::ostream &out, bool show_all) const { | |||
| // Always show the id and name as first line regardless if this summary or detailed print | |||
| out << "(" << std::setw(2) << operator_id_ << ") <AlbumOp>:"; | |||
| if (!show_all) { | |||
| // Call the super class for displaying any common 1-liner info | |||
| ParallelOp::Print(out, show_all); | |||
| // Then show any custom derived-internal 1-liner info for this op | |||
| out << "\n"; | |||
| } else { | |||
| // Call the super class for displaying any common detailed info | |||
| ParallelOp::Print(out, show_all); | |||
| // Then show any custom derived-internal stuff | |||
| out << "\nNumber of rows:" << num_rows_ << "\nAlbum directory: " << folder_path_ << "\n\n"; | |||
| } | |||
| } | |||
| // Reset Sampler and wakeup Master thread (functor) | |||
| Status AlbumOp::Reset() { | |||
| RETURN_IF_NOT_OK(sampler_->ResetSampler()); | |||
| row_cnt_ = 0; | |||
| wp_.Set(); // wake up master thread after reset is done | |||
| return Status::OK(); | |||
| } | |||
| // hand shake with Sampler, allow Sampler to call RandomAccessOp's functions to get NumRows | |||
| Status AlbumOp::InitSampler() { | |||
| RETURN_IF_NOT_OK(sampler_->HandshakeRandomAccessOp(this)); | |||
| return Status::OK(); | |||
| } | |||
| Status AlbumOp::LaunchThreadsAndInitOp() { | |||
| RETURN_UNEXPECTED_IF_NULL(tree_); | |||
| // registers QueueList and individual Queues for interrupt services | |||
| RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks())); | |||
| RETURN_IF_NOT_OK(wp_.Register(tree_->AllTasks())); | |||
| // launch main workers that load DataBuffers by reading all images | |||
| RETURN_IF_NOT_OK(tree_->LaunchWorkers(num_workers_, std::bind(&AlbumOp::WorkerEntry, this, std::placeholders::_1))); | |||
| TaskManager::FindMe()->Post(); | |||
| RETURN_IF_NOT_OK(this->InitSampler()); // pass numRows to Sampler | |||
| return Status::OK(); | |||
| } | |||
| // Visitor accept method for NodePass | |||
| Status AlbumOp::Accept(NodePass *p, bool *modified) { | |||
| // Downcast shared pointer then call visitor | |||
| return p->RunOnNode(shared_from_base<AlbumOp>(), modified); | |||
| } | |||
| Status AlbumOp::ComputeColMap() { | |||
| // Set the column name map (base class field) | |||
| if (column_name_id_map_.empty()) { | |||
| for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) { | |||
| column_name_id_map_[data_schema_->column(i).name()] = i; | |||
| } | |||
| } else { | |||
| MS_LOG(WARNING) << "Column name map is already set!"; | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,298 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_ALBUM_OP_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_ALBUM_OP_H_ | |||
| #include <deque> | |||
| #include <memory> | |||
| #include <queue> | |||
| #include <string> | |||
| #include <algorithm> | |||
| #include <map> | |||
| #include <set> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include <unordered_map> | |||
| #include "minddata/dataset/core/tensor.h" | |||
| #include "minddata/dataset/engine/data_buffer.h" | |||
| #include "minddata/dataset/engine/data_schema.h" | |||
| #include "minddata/dataset/engine/datasetops/parallel_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/io_block.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" | |||
| #include "minddata/dataset/util/path.h" | |||
| #include "minddata/dataset/util/queue.h" | |||
| #include "minddata/dataset/util/services.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| #include "minddata/dataset/util/wait_post.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| // Forward declares | |||
| template <typename T> | |||
| class Queue; | |||
| // Define row information as a list of file objects to read | |||
| using FolderImages = std::shared_ptr<std::pair<std::string, std::queue<std::string>>>; | |||
| /// \class AlbumOp album_op.h | |||
| class AlbumOp : public ParallelOp, public RandomAccessOp { | |||
| public: | |||
| class Builder { | |||
| public: | |||
| /// \brief Constructor for Builder class of AlbumOp | |||
| Builder(); | |||
| /// \brief Destructor. | |||
| ~Builder() = default; | |||
| /// \brief Setter method | |||
| /// \param[in] rows_per_buffer | |||
| /// \return Builder setter method returns reference to the builder | |||
| Builder &SetRowsPerBuffer(int32_t rows_per_buffer) { | |||
| builder_rows_per_buffer_ = rows_per_buffer; | |||
| return *this; | |||
| } | |||
| /// \brief Setter method | |||
| /// \param[in] size | |||
| /// \return Builder setter method returns reference to the builder | |||
| Builder &SetOpConnectorSize(int32_t size) { | |||
| builder_op_connector_size_ = size; | |||
| return *this; | |||
| } | |||
| /// \brief Setter method | |||
| /// \param[in] exts - file extensions to be read | |||
| /// \return Builder setter method returns reference to the builder | |||
| Builder &SetExtensions(const std::set<std::string> &exts) { | |||
| builder_extensions_ = exts; | |||
| return *this; | |||
| } | |||
| /// \brief Setter method | |||
| /// \param[in] do_decode | |||
| /// \return Builder setter method returns reference to the builder | |||
| Builder &SetDecode(bool do_decode) { | |||
| builder_decode_ = do_decode; | |||
| return *this; | |||
| } | |||
| /// \brief Setter method | |||
| /// \param[in] num_workers | |||
| /// \return Builder setter method returns reference to the builder | |||
| Builder &SetNumWorkers(int32_t num_workers) { | |||
| builder_num_workers_ = num_workers; | |||
| return *this; | |||
| } | |||
| /// \brief Setter method | |||
| /// \param[in] sampler | |||
| /// \return Builder setter method returns reference to the builder | |||
| Builder &SetSampler(std::shared_ptr<Sampler> sampler) { | |||
| builder_sampler_ = std::move(sampler); | |||
| return *this; | |||
| } | |||
| /// \brief Setter method | |||
| /// \param[in] dir - dataset directory | |||
| /// \return Builder setter method returns reference to the builder | |||
| Builder &SetAlbumDir(const std::string &dir) { | |||
| builder_dir_ = dir; | |||
| return *this; | |||
| } | |||
| /// \brief Setter method | |||
| /// \param[in] file - schema file to load | |||
| /// \return Builder setter method returns reference to the builder | |||
| Builder &SetSchemaFile(const std::string &file) { | |||
| builder_schema_file_ = file; | |||
| return *this; | |||
| } | |||
| /// \brief Setter method | |||
| /// \param[in] columns - input columns | |||
| /// \return Builder setter method returns reference to the builder | |||
| Builder &SetColumnsToLoad(const std::vector<std::string> &columns) { | |||
| builder_columns_to_load_ = columns; | |||
| return *this; | |||
| } | |||
| /// \brief Check validity of input args | |||
| /// \return - The error code return | |||
| Status SanityCheck(); | |||
| /// \brief The builder "build" method creates the final object. | |||
| /// \param[inout] std::shared_ptr<AlbumOp> *op - DatasetOp | |||
| /// \return - The error code return | |||
| Status Build(std::shared_ptr<AlbumOp> *op); | |||
| private: | |||
| bool builder_decode_; | |||
| std::vector<std::string> builder_columns_to_load_; | |||
| std::string builder_dir_; | |||
| std::string builder_schema_file_; | |||
| int32_t builder_num_workers_; | |||
| int32_t builder_rows_per_buffer_; | |||
| int32_t builder_op_connector_size_; | |||
| std::set<std::string> builder_extensions_; | |||
| std::shared_ptr<Sampler> builder_sampler_; | |||
| std::unique_ptr<DataSchema> builder_schema_; | |||
| }; | |||
| /// \brief Constructor | |||
| /// \param[in] num_wkrs - Num of workers reading images in parallel | |||
| /// \param[in] rows_per_buffer Number of images (rows) in each buffer | |||
| /// \param[in] file_dir - directory of Album | |||
| /// \param[in] queue_size - connector size | |||
| /// \param[in] do_decode - decode image files | |||
| /// \param[in] exts - set of file extensions to read, if empty, read everything under the dir | |||
| /// \param[in] data_schema - schema of dataset | |||
| /// \param[in] sampler - sampler tells AlbumOp what to read | |||
| AlbumOp(int32_t num_wkrs, int32_t rows_per_buffer, std::string file_dir, int32_t queue_size, bool do_decode, | |||
| const std::set<std::string> &exts, std::unique_ptr<DataSchema> data_schema, std::shared_ptr<Sampler> sampler); | |||
| /// \brief Destructor. | |||
| ~AlbumOp() = default; | |||
| /// \brief Initialize AlbumOp related var, calls the function to walk all files | |||
| /// \return - The error code return | |||
| Status PrescanEntry(); | |||
| /// \brief Worker thread pulls a number of IOBlock from IOBlock Queue, make a buffer and push it to Connector | |||
| /// \param[in] int32_t workerId - id of each worker | |||
| /// \return Status - The error code return | |||
| Status WorkerEntry(int32_t worker_id) override; | |||
| /// \brief Main Loop of AlbumOp | |||
| /// Master thread: Fill IOBlockQueue, then goes to sleep | |||
| /// Worker thread: pulls IOBlock from IOBlockQueue, work on it then put buffer to mOutConnector | |||
| /// \return Status - The error code return | |||
| Status operator()() override; | |||
| /// \brief A print method typically used for debugging | |||
| /// \param[in] out | |||
| /// \param[in] show_all | |||
| void Print(std::ostream &out, bool show_all) const override; | |||
| /// \brief Check if image ia valid.Only support JPEG/PNG/GIF/BMP | |||
| /// This function could be optimized to return the tensor to reduce open/closing files | |||
| /// \return Status - The error code return | |||
| Status CheckImageType(const std::string &file_name, bool *valid); | |||
| // Base-class override for NodePass visitor acceptor. | |||
| // @param p - Pointer to the NodePass to be accepted. | |||
| // @param modified - Whether this node visit modified the pipeline. | |||
| // @return - Status of the node visit. | |||
| Status Accept(NodePass *p, bool *modified) override; | |||
| // Op name getter | |||
| // @return Name of the current Op | |||
| std::string Name() const override { return "AlbumOp"; } | |||
| private: | |||
| /// \brief Initialize Sampler, calls sampler->Init() within | |||
| /// \return Status The error code return | |||
| Status InitSampler(); | |||
| /// \brief Load image to tensor row | |||
| /// \param[in] image_file Image name of file | |||
| /// \param[in] col_num Column num in schema | |||
| /// \param[inout] row Tensor row to push to | |||
| /// \return Status The error code return | |||
| Status LoadImageTensor(const std::string &image_file, uint32_t col_num, TensorRow *row); | |||
| /// \brief Load vector of ints to tensor, append tensor to tensor row | |||
| /// \param[in] json_obj Json object containing multi-dimensional label | |||
| /// \param[in] col_num Column num in schema | |||
| /// \param[inout] row Tensor row to push to | |||
| /// \return Status The error code return | |||
| Status LoadIntArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row); | |||
| /// \brief Load string array into a tensor, append tensor to tensor row | |||
| /// \param[in] json_obj Json object containing string tensor | |||
| /// \param[in] col_num Column num in schema | |||
| /// \param[inout] row Tensor row to push to | |||
| /// \return Status The error code return | |||
| Status LoadStringArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row); | |||
| /// \brief Load string into a tensor, append tensor to tensor row | |||
| /// \param[in] json_obj Json object containing string tensor | |||
| /// \param[in] col_num Column num in schema | |||
| /// \param[inout] row Tensor row to push to | |||
| /// \return Status The error code return | |||
| Status LoadStringTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row); | |||
| /// \brief Load float value to tensor row | |||
| /// \param[in] json_obj Json object containing float | |||
| /// \param[in] col_num Column num in schema | |||
| /// \param[inout] row Tensor row to push to | |||
| /// \return Status The error code return | |||
| Status LoadFloatTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row); | |||
| /// \brief Load emtpy tensor to tensor row | |||
| /// \param[in] col_num Column num in schema | |||
| /// \param[inout] row Tensor row to push to | |||
| /// \return Status The error code return | |||
| Status LoadEmptyTensor(uint32_t col_num, TensorRow *row); | |||
| /// \brief Load id from file name to tensor row | |||
| /// \param[in] file The file name to get ID from | |||
| /// \param[in] col_num Column num in schema | |||
| /// \param[inout] row Tensor row to push to | |||
| /// \return Status The error code return | |||
| Status LoadIDTensor(const std::string &file, uint32_t col_num, TensorRow *row); | |||
| /// \brief Load a tensor row according to a json file | |||
| /// \param[in] ImageColumns file Json file location | |||
| /// \param[inout] TensorRow row Json content stored into a tensor row | |||
| /// \return Status The error code return | |||
| Status LoadTensorRow(const std::string &file, TensorRow *row); | |||
| /// \param[in] const std::vector<int64_t> &keys Keys in ioblock | |||
| /// \param[inout] std::unique_ptr<DataBuffer> db Databuffer to push to | |||
| /// \return Status The error code return | |||
| Status LoadBuffer(const std::vector<int64_t> &keys, std::unique_ptr<DataBuffer> *db); | |||
| /// \brief Called first when function is called | |||
| /// \return The error code return | |||
| Status LaunchThreadsAndInitOp(); | |||
| /// \brief reset Op | |||
| /// \return Status The error code return | |||
| Status Reset() override; | |||
| // Private function for computing the assignment of the column name map. | |||
| // @return - Status | |||
| Status ComputeColMap() override; | |||
| int32_t rows_per_buffer_; | |||
| std::string folder_path_; // directory of image folder | |||
| bool decode_; | |||
| std::set<std::string> extensions_; // extensions allowed | |||
| std::unordered_map<std::string, int32_t> col_name_map_; | |||
| std::unique_ptr<DataSchema> data_schema_; | |||
| std::shared_ptr<Sampler> sampler_; | |||
| int64_t row_cnt_; | |||
| int64_t buf_cnt_; | |||
| int64_t sampler_ind_; | |||
| int64_t dirname_offset_; | |||
| WaitPost wp_; | |||
| std::vector<std::string> image_rows_; | |||
| QueueList<std::unique_ptr<IOBlock>> io_block_queues_; // queues of IOBlocks | |||
| }; | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_ALBUM_OP_H_ | |||
| @@ -134,7 +134,6 @@ Status ImageFolderOp::operator()() { | |||
| TensorRow sample_row; | |||
| RETURN_IF_NOT_OK(sampler_buffer->PopRow(&sample_row)); | |||
| std::shared_ptr<Tensor> sample_ids = sample_row[0]; | |||
| if (sample_ids->type() != DataType(DataType::DE_INT64)) RETURN_STATUS_UNEXPECTED("Sampler Tensor isn't int64"); | |||
| for (auto itr = sample_ids->begin<int64_t>(); itr != sample_ids->end<int64_t>(); ++itr) { | |||
| if ((*itr) >= num_rows_) continue; // index out of bound, skipping | |||
| keys.push_back(*itr); | |||
| @@ -30,6 +30,7 @@ | |||
| #include "minddata/dataset/engine/datasetops/repeat_op.h" | |||
| #include "minddata/dataset/engine/datasetops/skip_op.h" | |||
| #include "minddata/dataset/engine/datasetops/shuffle_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/album_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/celeba_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/cifar_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/coco_op.h" | |||
| @@ -199,6 +200,11 @@ Status NodePass::RunOnNode(std::shared_ptr<ImageFolderOp> node, bool *modified) | |||
| return RunOnNode(std::static_pointer_cast<DatasetOp>(node), modified); | |||
| } | |||
| Status NodePass::RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) { | |||
| // Fallback to base class visitor by default | |||
| return RunOnNode(std::static_pointer_cast<DatasetOp>(node), modified); | |||
| } | |||
| Status NodePass::RunOnNode(std::shared_ptr<CacheOp> node, bool *modified) { | |||
| // Fallback to base class visitor by default | |||
| return RunOnNode(std::static_pointer_cast<DatasetOp>(node), modified); | |||
| @@ -49,6 +49,8 @@ class FilterOp; | |||
| class GeneratorOp; | |||
| #endif | |||
| class AlbumOp; | |||
| class RandomDataOp; | |||
| class RepeatOp; | |||
| @@ -178,6 +180,8 @@ class NodePass : public Pass { | |||
| virtual Status RunOnNode(std::shared_ptr<RandomDataOp> node, bool *modified); | |||
| virtual Status RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified); | |||
| virtual Status RunOnNode(std::shared_ptr<TakeOp> node, bool *modified); | |||
| virtual Status RunOnNode(std::shared_ptr<ZipOp> node, bool *modified); | |||
| @@ -21,6 +21,7 @@ | |||
| #include "minddata/dataset/engine/datasetops/cache_lookup_op.h" | |||
| #include "minddata/dataset/engine/datasetops/cache_merge_op.h" | |||
| #include "minddata/dataset/engine/datasetops/cache_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/album_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/celeba_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/cifar_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/coco_op.h" | |||
| @@ -152,6 +153,11 @@ Status CacheTransformPass::CachePass::RunOnNode(std::shared_ptr<ImageFolderOp> n | |||
| return MappableCacheLeafSetup(std::static_pointer_cast<DatasetOp>(node)); | |||
| } | |||
| // Perform leaf node cache transform identification | |||
| Status CacheTransformPass::CachePass::RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) { | |||
| return MappableCacheLeafSetup(std::static_pointer_cast<DatasetOp>(node)); | |||
| } | |||
| // Perform leaf node cache transform identification | |||
| Status CacheTransformPass::CachePass::RunOnNode(std::shared_ptr<MnistOp> node, bool *modified) { | |||
| return MappableCacheLeafSetup(std::static_pointer_cast<DatasetOp>(node)); | |||
| @@ -79,6 +79,12 @@ class CacheTransformPass : public TreePass { | |||
| /// \return Status The error code return | |||
| Status RunOnNode(std::shared_ptr<ImageFolderOp> node, bool *modified) override; | |||
| /// \brief Perform leaf node cache tranform identifications | |||
| /// \param[in] node The node being visited | |||
| /// \param[inout] modified Indicator if the node was changed at all | |||
| /// \return Status The error code return | |||
| Status RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) override; | |||
| /// \brief Perform leaf node cache tranform identifications | |||
| /// \param[in] node The node being visited | |||
| /// \param[inout] modified Indicator if the node was changed at all | |||
| @@ -111,5 +111,11 @@ Status PrinterPass::RunOnNode(std::shared_ptr<ImageFolderOp> node, bool *modifie | |||
| std::cout << "Visiting ImageFolderOp" << '\n'; | |||
| return Status::OK(); | |||
| } | |||
| Status PrinterPass::RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) { | |||
| *modified = false; | |||
| std::cout << "Visiting ImageFolderOp" << '\n'; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -58,6 +58,8 @@ class PrinterPass : public NodePass { | |||
| Status RunOnNode(std::shared_ptr<DeviceQueueOp> node, bool *modified) override; | |||
| Status RunOnNode(std::shared_ptr<ImageFolderOp> node, bool *modified) override; | |||
| Status RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) override; | |||
| }; | |||
| } // namespace dataset | |||
| @@ -48,6 +48,7 @@ class TensorOperation; | |||
| class SchemaObj; | |||
| class SamplerObj; | |||
| // Datasets classes (in alphabetical order) | |||
| class AlbumDataset; | |||
| class CelebADataset; | |||
| class Cifar10Dataset; | |||
| class Cifar100Dataset; | |||
| @@ -79,13 +80,27 @@ class ZipDataset; | |||
| /// \return Shared pointer to the current schema | |||
| std::shared_ptr<SchemaObj> Schema(const std::string &schema_file = ""); | |||
| /// \brief Function to create an AlbumDataset | |||
| /// \notes The generated dataset is specified through setting a schema | |||
| /// \param[in] dataset_dir Path to the root directory that contains the dataset | |||
| /// \param[in] data_schema Path to dataset schema file | |||
| /// \param[in] column_names Column names used to specify columns to load, if empty, will read all columns. | |||
| /// (default = {}) | |||
| /// \param[in] decode the option to decode the images in dataset (default = false) | |||
| /// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, | |||
| /// A `RandomSampler` will be used to randomly iterate the entire dataset (default = nullptr) | |||
| /// \return Shared pointer to the current Dataset | |||
| std::shared_ptr<AlbumDataset> Album(const std::string &dataset_dir, const std::string &data_schema, | |||
| const std::vector<std::string> &column_names = {}, bool decode = false, | |||
| const std::shared_ptr<SamplerObj> &sampler = nullptr); | |||
| /// \brief Function to create a CelebADataset | |||
| /// \notes The generated dataset has two columns ['image', 'attr']. | |||
| // The type of the image tensor is uint8. The attr tensor is uint32 and one hot type. | |||
| // The type of the image tensor is uint8. The attr tensor is uint32 and one hot type. | |||
| /// \param[in] dataset_dir Path to the root directory that contains the dataset. | |||
| /// \param[in] dataset_type One of 'all', 'train', 'valid' or 'test'. | |||
| /// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` | |||
| /// will be used to randomly iterate the entire dataset | |||
| /// will be used to randomly iterate the entire dataset | |||
| /// \param[in] decode Decode the images after reading (default=false). | |||
| /// \param[in] extensions Set of file extensions to be included in the dataset (default={}). | |||
| /// \return Shared pointer to the current Dataset | |||
| @@ -97,7 +112,7 @@ std::shared_ptr<CelebADataset> CelebA(const std::string &dataset_dir, const std: | |||
| /// \notes The generated dataset has two columns ['image', 'label'] | |||
| /// \param[in] dataset_dir Path to the root directory that contains the dataset | |||
| /// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` | |||
| /// will be used to randomly iterate the entire dataset | |||
| /// will be used to randomly iterate the entire dataset | |||
| /// \return Shared pointer to the current Dataset | |||
| std::shared_ptr<Cifar10Dataset> Cifar10(const std::string &dataset_dir, | |||
| const std::shared_ptr<SamplerObj> &sampler = nullptr); | |||
| @@ -106,7 +121,7 @@ std::shared_ptr<Cifar10Dataset> Cifar10(const std::string &dataset_dir, | |||
| /// \notes The generated dataset has three columns ['image', 'coarse_label', 'fine_label'] | |||
| /// \param[in] dataset_dir Path to the root directory that contains the dataset | |||
| /// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` | |||
| /// will be used to randomly iterate the entire dataset | |||
| /// will be used to randomly iterate the entire dataset | |||
| /// \return Shared pointer to the current Dataset | |||
| std::shared_ptr<Cifar100Dataset> Cifar100(const std::string &dataset_dir, | |||
| const std::shared_ptr<SamplerObj> &sampler = nullptr); | |||
| @@ -114,19 +129,19 @@ std::shared_ptr<Cifar100Dataset> Cifar100(const std::string &dataset_dir, | |||
| /// \brief Function to create a CLUEDataset | |||
| /// \notes The generated dataset has a variable number of columns depending on the task and usage | |||
| /// \param[in] dataset_files List of files to be read to search for a pattern of files. The list | |||
| /// will be sorted in a lexicographical order. | |||
| /// will be sorted in a lexicographical order. | |||
| /// \param[in] task The kind of task, one of "AFQMC", "TNEWS", "IFLYTEK", "CMNLI", "WSC" and "CSL" (default="AFQMC"). | |||
| /// \param[in] usage Be used to "train", "test" or "eval" data (default="train"). | |||
| /// \param[in] num_samples The number of samples to be included in the dataset. | |||
| /// (Default = 0 means all samples.) | |||
| /// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode::kGlobal) | |||
| /// Can be any of: | |||
| /// ShuffleMode::kFalse - No shuffling is performed. | |||
| /// ShuffleMode::kFiles - Shuffle files only. | |||
| /// ShuffleMode::kGlobal - Shuffle both the files and samples. | |||
| /// (Default = 0 means all samples.) | |||
| /// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode.kGlobal) | |||
| /// Can be any of: | |||
| /// ShuffleMode.kFalse - No shuffling is performed. | |||
| /// ShuffleMode.kFiles - Shuffle files only. | |||
| /// ShuffleMode.kGlobal - Shuffle both the files and samples. | |||
| /// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1) | |||
| /// \param[in] shard_id The shard ID within num_shards. This argument should be | |||
| /// specified only when num_shards is also specified. (Default = 0) | |||
| /// specified only when num_shards is also specified. (Default = 0) | |||
| /// \return Shared pointer to the current CLUEDataset | |||
| std::shared_ptr<CLUEDataset> CLUE(const std::vector<std::string> &dataset_files, const std::string &task = "AFQMC", | |||
| const std::string &usage = "train", int64_t num_samples = 0, | |||
| @@ -135,19 +150,19 @@ std::shared_ptr<CLUEDataset> CLUE(const std::vector<std::string> &dataset_files, | |||
| /// \brief Function to create a CocoDataset | |||
| /// \notes The generated dataset has multi-columns : | |||
| /// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32], | |||
| /// ['iscrowd', dtype=uint32]]. | |||
| /// - task='Stuff', column: [['image', dtype=uint8], ['segmentation',dtype=float32], ['iscrowd', dtype=uint32]]. | |||
| /// - task='Keypoint', column: [['image', dtype=uint8], ['keypoints', dtype=float32], | |||
| /// ['num_keypoints', dtype=uint32]]. | |||
| /// - task='Panoptic', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32], | |||
| /// ['iscrowd', dtype=uint32], ['area', dtype=uitn32]]. | |||
| /// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32], | |||
| /// ['iscrowd', dtype=uint32]]. | |||
| /// - task='Stuff', column: [['image', dtype=uint8], ['segmentation',dtype=float32], ['iscrowd', dtype=uint32]]. | |||
| /// - task='Keypoint', column: [['image', dtype=uint8], ['keypoints', dtype=float32], | |||
| /// ['num_keypoints', dtype=uint32]]. | |||
| /// - task='Panoptic', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32], | |||
| /// ['iscrowd', dtype=uint32], ['area', dtype=uitn32]]. | |||
| /// \param[in] dataset_dir Path to the root directory that contains the dataset | |||
| /// \param[in] annotation_file Path to the annotation json | |||
| /// \param[in] task Set the task type of reading coco data, now support 'Detection'/'Stuff'/'Panoptic'/'Keypoint' | |||
| /// \param[in] decode Decode the images after reading | |||
| /// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` | |||
| /// will be used to randomly iterate the entire dataset | |||
| /// will be used to randomly iterate the entire dataset | |||
| /// \return Shared pointer to the current Dataset | |||
| std::shared_ptr<CocoDataset> Coco(const std::string &dataset_dir, const std::string &annotation_file, | |||
| const std::string &task = "Detection", const bool &decode = false, | |||
| @@ -181,12 +196,12 @@ std::shared_ptr<CSVDataset> CSV(const std::vector<std::string> &dataset_files, c | |||
| /// \brief Function to create an ImageFolderDataset | |||
| /// \notes A source dataset that reads images from a tree of directories | |||
| /// All images within one folder have the same label | |||
| /// The generated dataset has two columns ['image', 'label'] | |||
| /// All images within one folder have the same label | |||
| /// The generated dataset has two columns ['image', 'label'] | |||
| /// \param[in] dataset_dir Path to the root directory that contains the dataset | |||
| /// \param[in] decode A flag to decode in ImageFolder | |||
| /// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, | |||
| /// A `RandomSampler` will be used to randomly iterate the entire dataset | |||
| /// A `RandomSampler` will be used to randomly iterate the entire dataset | |||
| /// \param[in] extensions File extensions to be read | |||
| /// \param[in] class_indexing a class name to label map | |||
| /// \return Shared pointer to the current ImageFolderDataset | |||
| @@ -200,9 +215,9 @@ std::shared_ptr<ImageFolderDataset> ImageFolder(const std::string &dataset_dir, | |||
| /// \param[in] dataset_file The dataset file to be read | |||
| /// \param[in] usage Need "train", "eval" or "inference" data (default="train") | |||
| /// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, | |||
| /// A `RandomSampler` will be used to randomly iterate the entire dataset | |||
| /// A `RandomSampler` will be used to randomly iterate the entire dataset | |||
| /// \param[in] class_indexing A str-to-int mapping from label name to index (default={}, the folder | |||
| /// names will be sorted alphabetically and each class will be given a unique index starting from 0). | |||
| /// names will be sorted alphabetically and each class will be given a unique index starting from 0). | |||
| /// \param[in] decode Decode the images after reading (default=false). | |||
| /// \return Shared pointer to the current ManifestDataset | |||
| std::shared_ptr<ManifestDataset> Manifest(std::string dataset_file, std::string usage = "train", | |||
| @@ -214,7 +229,7 @@ std::shared_ptr<ManifestDataset> Manifest(std::string dataset_file, std::string | |||
| /// \notes The generated dataset has two columns ['image', 'label'] | |||
| /// \param[in] dataset_dir Path to the root directory that contains the dataset | |||
| /// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, | |||
| /// A `RandomSampler` will be used to randomly iterate the entire dataset | |||
| /// A `RandomSampler` will be used to randomly iterate the entire dataset | |||
| /// \return Shared pointer to the current MnistDataset | |||
| std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir, | |||
| const std::shared_ptr<SamplerObj> &sampler = nullptr); | |||
| @@ -245,17 +260,17 @@ std::shared_ptr<RandomDataset> RandomData(const int32_t &total_rows = 0, T schem | |||
| /// \brief Function to create a TextFileDataset | |||
| /// \notes The generated dataset has one column ['text'] | |||
| /// \param[in] dataset_files List of files to be read to search for a pattern of files. The list | |||
| /// will be sorted in a lexicographical order. | |||
| /// will be sorted in a lexicographical order. | |||
| /// \param[in] num_samples The number of samples to be included in the dataset. | |||
| /// (Default = 0 means all samples.) | |||
| /// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode::kGlobal) | |||
| /// Can be any of: | |||
| /// ShuffleMode::kFalse - No shuffling is performed. | |||
| /// ShuffleMode::kFiles - Shuffle files only. | |||
| /// ShuffleMode::kGlobal - Shuffle both the files and samples. | |||
| /// (Default = 0 means all samples.) | |||
| /// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode.kGlobal) | |||
| /// Can be any of: | |||
| /// ShuffleMode.kFalse - No shuffling is performed. | |||
| /// ShuffleMode.kFiles - Shuffle files only. | |||
| /// ShuffleMode.kGlobal - Shuffle both the files and samples. | |||
| /// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1) | |||
| /// \param[in] shard_id The shard ID within num_shards. This argument should be | |||
| /// specified only when num_shards is also specified. (Default = 0) | |||
| /// specified only when num_shards is also specified. (Default = 0) | |||
| /// \return Shared pointer to the current TextFileDataset | |||
| std::shared_ptr<TextFileDataset> TextFile(const std::vector<std::string> &dataset_files, int64_t num_samples = 0, | |||
| ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, | |||
| @@ -263,16 +278,16 @@ std::shared_ptr<TextFileDataset> TextFile(const std::vector<std::string> &datase | |||
| /// \brief Function to create a VOCDataset | |||
| /// \notes The generated dataset has multi-columns : | |||
| /// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['label', dtype=uint32], | |||
| /// ['difficult', dtype=uint32], ['truncate', dtype=uint32]]. | |||
| /// - task='Segmentation', column: [['image', dtype=uint8], ['target',dtype=uint8]]. | |||
| /// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['label', dtype=uint32], | |||
| /// ['difficult', dtype=uint32], ['truncate', dtype=uint32]]. | |||
| /// - task='Segmentation', column: [['image', dtype=uint8], ['target',dtype=uint8]]. | |||
| /// \param[in] dataset_dir Path to the root directory that contains the dataset | |||
| /// \param[in] task Set the task type of reading voc data, now only support "Segmentation" or "Detection" | |||
| /// \param[in] mode Set the data list txt file to be readed | |||
| /// \param[in] class_indexing A str-to-int mapping from label name to index | |||
| /// \param[in] decode Decode the images after reading | |||
| /// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` | |||
| /// will be used to randomly iterate the entire dataset | |||
| /// will be used to randomly iterate the entire dataset | |||
| /// \return Shared pointer to the current Dataset | |||
| std::shared_ptr<VOCDataset> VOC(const std::string &dataset_dir, const std::string &task = "Segmentation", | |||
| const std::string &mode = "train", | |||
| @@ -335,9 +350,9 @@ class Dataset : public std::enable_shared_from_this<Dataset> { | |||
| /// \notes Combines batch_size number of consecutive rows into batches | |||
| /// \param[in] batch_size Path to the root directory that contains the dataset | |||
| /// \param[in] drop_remainder Determines whether or not to drop the last possibly incomplete | |||
| /// batch. If true, and if there are less than batch_size rows | |||
| /// available to make the last batch, then those rows will | |||
| /// be dropped and not propagated to the next node | |||
| /// batch. If true, and if there are less than batch_size rows | |||
| /// available to make the last batch, then those rows will | |||
| /// be dropped and not propagated to the next node | |||
| /// \return Shared pointer to the current BatchDataset | |||
| std::shared_ptr<BatchDataset> Batch(int32_t batch_size, bool drop_remainder = false); | |||
| @@ -368,16 +383,16 @@ class Dataset : public std::enable_shared_from_this<Dataset> { | |||
| /// \brief Function to create a MapDataset | |||
| /// \notes Applies each operation in operations to this dataset | |||
| /// \param[in] operations Vector of operations to be applied on the dataset. Operations are | |||
| /// applied in the order they appear in this list | |||
| /// applied in the order they appear in this list | |||
| /// \param[in] input_columns Vector of the names of the columns that will be passed to the first | |||
| /// operation as input. The size of this list must match the number of | |||
| /// input columns expected by the first operator. The default input_columns | |||
| /// is the first column | |||
| /// operation as input. The size of this list must match the number of | |||
| /// input columns expected by the first operator. The default input_columns | |||
| /// is the first column | |||
| /// \param[in] output_columns Vector of names assigned to the columns outputted by the last operation | |||
| /// This parameter is mandatory if len(input_columns) != len(output_columns) | |||
| /// The size of this list must match the number of output columns of the | |||
| /// last operation. The default output_columns will have the same | |||
| /// name as the input columns, i.e., the columns will be replaced | |||
| /// This parameter is mandatory if len(input_columns) != len(output_columns) | |||
| /// The size of this list must match the number of output columns of the | |||
| /// last operation. The default output_columns will have the same | |||
| /// name as the input columns, i.e., the columns will be replaced | |||
| /// \param[in] project_columns A list of column names to project | |||
| /// \return Shared pointer to the current MapDataset | |||
| std::shared_ptr<MapDataset> Map(std::vector<std::shared_ptr<TensorOperation>> operations, | |||
| @@ -404,7 +419,7 @@ class Dataset : public std::enable_shared_from_this<Dataset> { | |||
| /// \param[in] count Number of times the dataset should be repeated | |||
| /// \return Shared pointer to the current Dataset | |||
| /// \note Repeat will return shared pointer to `Dataset` instead of `RepeatDataset` | |||
| /// due to a limitation in the current implementation | |||
| /// due to a limitation in the current implementation | |||
| std::shared_ptr<Dataset> Repeat(int32_t count = -1); | |||
| /// \brief Function to create a Shuffle Dataset | |||
| @@ -506,6 +521,31 @@ class SchemaObj { | |||
| // DERIVED DATASET CLASSES FOR LEAF-NODE DATASETS | |||
| // (In alphabetical order) | |||
| class AlbumDataset : public Dataset { | |||
| public: | |||
| /// \brief Constructor | |||
| AlbumDataset(const std::string &dataset_dir, const std::string &data_schema, | |||
| const std::vector<std::string> &column_names, bool decode, const std::shared_ptr<SamplerObj> &sampler); | |||
| /// \brief Destructor | |||
| ~AlbumDataset() = default; | |||
| /// \brief a base class override function to create a runtime dataset op object from this class | |||
| /// \return shared pointer to the newly created DatasetOp | |||
| std::vector<std::shared_ptr<DatasetOp>> Build() override; | |||
| /// \brief Parameters validation | |||
| /// \return bool true if all the params are valid | |||
| bool ValidateParams() override; | |||
| private: | |||
| std::string dataset_dir_; | |||
| std::string schema_path_; | |||
| std::vector<std::string> column_names_; | |||
| bool decode_; | |||
| std::shared_ptr<SamplerObj> sampler_; | |||
| }; | |||
| class CelebADataset : public Dataset { | |||
| public: | |||
| /// \brief Constructor | |||
| @@ -5,6 +5,7 @@ SET(DE_UT_SRCS | |||
| common/cvop_common.cc | |||
| common/bboxop_common.cc | |||
| auto_contrast_op_test.cc | |||
| album_op_test.cc | |||
| batch_op_test.cc | |||
| bit_functions_test.cc | |||
| storage_container_test.cc | |||
| @@ -101,6 +102,7 @@ SET(DE_UT_SRCS | |||
| c_api_samplers_test.cc | |||
| c_api_transforms_test.cc | |||
| c_api_dataset_ops_test.cc | |||
| c_api_dataset_album_test.cc | |||
| c_api_dataset_cifar_test.cc | |||
| c_api_dataset_clue_test.cc | |||
| c_api_dataset_coco_test.cc | |||
| @@ -0,0 +1,208 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <fstream> | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "common/common.h" | |||
| #include "minddata/dataset/core/client.h" | |||
| #include "minddata/dataset/core/global_context.h" | |||
| #include "minddata/dataset/engine/datasetops/source/album_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/distributed_sampler.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/pk_sampler.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/random_sampler.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h" | |||
| #include "minddata/dataset/util/path.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| #include "gtest/gtest.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "securec.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| #include "minddata/dataset/include/transforms.h" | |||
| using namespace mindspore::dataset; | |||
| using mindspore::MsLogLevel::ERROR; | |||
| using mindspore::ExceptionType::NoExceptionType; | |||
| using mindspore::LogStream; | |||
| std::shared_ptr<BatchOp> Batch(int batch_size = 1, bool drop = false, int rows_per_buf = 2); | |||
| std::shared_ptr<RepeatOp> Repeat(int repeat_cnt); | |||
| std::shared_ptr<ExecutionTree> Build(std::vector<std::shared_ptr<DatasetOp>> ops); | |||
| std::shared_ptr<AlbumOp> Album(int64_t num_works, int64_t rows, int64_t conns, std::string path, | |||
| bool shuf = false, std::unique_ptr<Sampler> sampler = nullptr, | |||
| bool decode = false) { | |||
| std::shared_ptr<AlbumOp> so; | |||
| AlbumOp::Builder builder; | |||
| Status rc = builder.SetNumWorkers(num_works) | |||
| .SetAlbumDir(path) | |||
| .SetRowsPerBuffer(rows) | |||
| .SetOpConnectorSize(conns) | |||
| .SetExtensions({".json"}) | |||
| .SetSampler(std::move(sampler)) | |||
| .SetDecode(decode) | |||
| .Build(&so); | |||
| return so; | |||
| } | |||
| std::shared_ptr<AlbumOp> AlbumSchema(int64_t num_works, int64_t rows, int64_t conns, std::string path, | |||
| std::string schema_file, std::vector<std::string> column_names = {}, | |||
| bool shuf = false, std::unique_ptr<Sampler> sampler = nullptr, | |||
| bool decode = false) { | |||
| std::shared_ptr<AlbumOp> so; | |||
| AlbumOp::Builder builder; | |||
| Status rc = builder.SetNumWorkers(num_works) | |||
| .SetSchemaFile(schema_file) | |||
| .SetColumnsToLoad(column_names) | |||
| .SetAlbumDir(path) | |||
| .SetRowsPerBuffer(rows) | |||
| .SetOpConnectorSize(conns) | |||
| .SetExtensions({".json"}) | |||
| .SetSampler(std::move(sampler)) | |||
| .SetDecode(decode) | |||
| .Build(&so); | |||
| return so; | |||
| } | |||
| class MindDataTestAlbum : public UT::DatasetOpTesting { | |||
| protected: | |||
| }; | |||
| TEST_F(MindDataTestAlbum, TestSequentialAlbumWithSchema) { | |||
| std::string folder_path = datasets_root_path_ + "/testAlbum/images"; | |||
| std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json"; | |||
| std::vector<std::string> column_names = {"image", "label", "id"}; | |||
| auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file, column_names, false), Repeat(2)}); | |||
| tree->Prepare(); | |||
| Status rc = tree->Launch(); | |||
| if (rc.IsError()) { | |||
| MS_LOG(ERROR) << "Return code error detected during tree launch: " << "."; | |||
| EXPECT_TRUE(false); | |||
| } else { | |||
| DatasetIterator di(tree); | |||
| TensorMap tensor_map; | |||
| di.GetNextAsMap(&tensor_map); | |||
| EXPECT_TRUE(rc.IsOk()); | |||
| uint64_t i = 0; | |||
| int32_t label = 0; | |||
| while (tensor_map.size() != 0) { | |||
| tensor_map["label"]->GetItemAt<int32_t>(&label, {}); | |||
| MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape" | |||
| << tensor_map["label"] << "\n"; | |||
| i++; | |||
| di.GetNextAsMap(&tensor_map); | |||
| } | |||
| MS_LOG(INFO) << "got rows" << i << "\n"; | |||
| EXPECT_TRUE(i == 14); | |||
| } | |||
| } | |||
| TEST_F(MindDataTestAlbum, TestSequentialAlbumWithSchemaNoOrder) { | |||
| std::string folder_path = datasets_root_path_ + "/testAlbum/images"; | |||
| std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json"; | |||
| auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file), Repeat(2)}); | |||
| tree->Prepare(); | |||
| Status rc = tree->Launch(); | |||
| if (rc.IsError()) { | |||
| MS_LOG(ERROR) << "Return code error detected during tree launch: " << "."; | |||
| EXPECT_TRUE(false); | |||
| } else { | |||
| DatasetIterator di(tree); | |||
| TensorMap tensor_map; | |||
| di.GetNextAsMap(&tensor_map); | |||
| EXPECT_TRUE(rc.IsOk()); | |||
| uint64_t i = 0; | |||
| int32_t label = 0; | |||
| while (tensor_map.size() != 0) { | |||
| tensor_map["label"]->GetItemAt<int32_t>(&label, {}); | |||
| MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape" | |||
| << tensor_map["label"] << "\n"; | |||
| i++; | |||
| di.GetNextAsMap(&tensor_map); | |||
| } | |||
| MS_LOG(INFO) << "got rows" << i << "\n"; | |||
| EXPECT_TRUE(i == 14); | |||
| } | |||
| } | |||
| TEST_F(MindDataTestAlbum, TestSequentialAlbumWithSchemaFloat) { | |||
| std::string folder_path = datasets_root_path_ + "/testAlbum/images"; | |||
| // add the priority column | |||
| std::string schema_file = datasets_root_path_ + "/testAlbum/floatSchema.json"; | |||
| auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file), Repeat(2)}); | |||
| tree->Prepare(); | |||
| Status rc = tree->Launch(); | |||
| if (rc.IsError()) { | |||
| MS_LOG(ERROR) << "Return code error detected during tree launch: " << "."; | |||
| EXPECT_TRUE(false); | |||
| } else { | |||
| DatasetIterator di(tree); | |||
| TensorMap tensor_map; | |||
| di.GetNextAsMap(&tensor_map); | |||
| EXPECT_TRUE(rc.IsOk()); | |||
| uint64_t i = 0; | |||
| int32_t label = 0; | |||
| double priority = 0; | |||
| while (tensor_map.size() != 0) { | |||
| tensor_map["label"]->GetItemAt<int32_t>(&label, {}); | |||
| tensor_map["_priority"]->GetItemAt<double>(&priority, {}); | |||
| MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape" | |||
| << tensor_map["label"] << "priority: " << priority << "\n"; | |||
| i++; | |||
| di.GetNextAsMap(&tensor_map); | |||
| } | |||
| MS_LOG(INFO) << "got rows" << i << "\n"; | |||
| EXPECT_TRUE(i == 14); | |||
| } | |||
| } | |||
| TEST_F(MindDataTestAlbum, TestSequentialAlbumWithFullSchema) { | |||
| std::string folder_path = datasets_root_path_ + "/testAlbum/images"; | |||
| // add the priority column | |||
| std::string schema_file = datasets_root_path_ + "/testAlbum/fullSchema.json"; | |||
| auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file), Repeat(2)}); | |||
| tree->Prepare(); | |||
| Status rc = tree->Launch(); | |||
| if (rc.IsError()) { | |||
| MS_LOG(ERROR) << "Return code error detected during tree launch: " << "."; | |||
| EXPECT_TRUE(false); | |||
| } else { | |||
| DatasetIterator di(tree); | |||
| TensorMap tensor_map; | |||
| di.GetNextAsMap(&tensor_map); | |||
| EXPECT_TRUE(rc.IsOk()); | |||
| uint64_t i = 0; | |||
| int32_t label = 0; | |||
| double priority = 0; | |||
| while (tensor_map.size() != 0) { | |||
| tensor_map["label"]->GetItemAt<int32_t>(&label, {}); | |||
| tensor_map["_priority"]->GetItemAt<double>(&priority, {}); | |||
| MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape" | |||
| << tensor_map["label"] << "priority: " << priority << " embedding : " << | |||
| tensor_map["_embedding"]->shape() << "\n"; | |||
| i++; | |||
| di.GetNextAsMap(&tensor_map); | |||
| } | |||
| MS_LOG(INFO) << "got rows" << i << "\n"; | |||
| EXPECT_TRUE(i == 14); | |||
| } | |||
| } | |||
| @@ -0,0 +1,136 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "common/common.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| using namespace mindspore::dataset::api; | |||
| using mindspore::dataset::Tensor; | |||
| class MindDataTestPipeline : public UT::DatasetOpTesting { | |||
| protected: | |||
| }; | |||
| TEST_F(MindDataTestPipeline, TestAlbumBasic) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumBasic."; | |||
| std::string folder_path = datasets_root_path_ + "/testAlbum/images"; | |||
| std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json"; | |||
| std::vector<std::string> column_names = {"image", "label", "id"}; | |||
| // Create a Album Dataset | |||
| std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, std::shared_ptr<Tensor>> row; | |||
| iter->GetNextRow(&row); | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| i++; | |||
| auto image = row["image"]; | |||
| MS_LOG(INFO) << "Tensor image shape: " << image->shape(); | |||
| iter->GetNextRow(&row); | |||
| } | |||
| EXPECT_EQ(i, 7); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestAlbumDecode) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumDecode."; | |||
| std::string folder_path = datasets_root_path_ + "/testAlbum/images"; | |||
| std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json"; | |||
| std::vector<std::string> column_names = {"image", "label", "id"}; | |||
| // Create a Album Dataset | |||
| std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, std::shared_ptr<Tensor>> row; | |||
| iter->GetNextRow(&row); | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| i++; | |||
| auto image = row["image"]; | |||
| auto shape = image->shape(); | |||
| MS_LOG(INFO) << "Tensor image shape size: " << shape.Size(); | |||
| MS_LOG(INFO) << "Tensor image shape: " << image->shape(); | |||
| EXPECT_GT(shape.Size(), 1); // Verify decode=true took effect | |||
| iter->GetNextRow(&row); | |||
| } | |||
| EXPECT_EQ(i, 7); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestAlbumNumSamplers) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumNumSamplers."; | |||
| std::string folder_path = datasets_root_path_ + "/testAlbum/images"; | |||
| std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json"; | |||
| std::vector<std::string> column_names = {"image", "label", "id"}; | |||
| // Create a Album Dataset | |||
| std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true, SequentialSampler(0, 1)); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, std::shared_ptr<Tensor>> row; | |||
| iter->GetNextRow(&row); | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| i++; | |||
| auto image = row["image"]; | |||
| MS_LOG(INFO) << "Tensor image shape: " << image->shape(); | |||
| iter->GetNextRow(&row); | |||
| } | |||
| EXPECT_EQ(i, 1); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestAlbumError) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumError."; | |||
| std::string folder_path = datasets_root_path_ + "/testAlbum/ima"; | |||
| std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json"; | |||
| std::vector<std::string> column_names = {"image", "label", "id"}; | |||
| // Create a Album Dataset | |||
| std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true, SequentialSampler(0, 1)); | |||
| EXPECT_EQ(ds, nullptr); | |||
| } | |||
| @@ -32,6 +32,8 @@ export GLOG_v=2 | |||
| ## prepare data for dataset & mindrecord | |||
| cp -fr $PROJECT_PATH/tests/ut/data ${PROJECT_PATH}/build/mindspore/tests/ut/cpp/ | |||
| ## prepare album dataset, uses absolute path so has to be generated | |||
| python ${PROJECT_PATH}/build/mindspore/tests/ut/cpp/data/dataset/testAlbum/gen_json.py | |||
| if [ $# -gt 0 ]; then | |||
| ./ut_tests --gtest_filter=$1 | |||
| @@ -0,0 +1 @@ | |||
| just some random stuff | |||
| @@ -0,0 +1,16 @@ | |||
| { | |||
| "columns": { | |||
| "image": { | |||
| "type": "uint8", | |||
| "rank": 1 | |||
| }, | |||
| "label" : { | |||
| "type": "string", | |||
| "rank": 1 | |||
| }, | |||
| "id" : { | |||
| "type": "int64", | |||
| "rank": 0 | |||
| } | |||
| } | |||
| } | |||
| @@ -5,7 +5,7 @@ | |||
| "rank": 1 | |||
| }, | |||
| "label" : { | |||
| "type": "int32", | |||
| "type": "string", | |||
| "rank": 1 | |||
| }, | |||
| "id" : { | |||
| @@ -5,7 +5,7 @@ | |||
| "rank": 1 | |||
| }, | |||
| "label" : { | |||
| "type": "int32", | |||
| "type": "string", | |||
| "rank": 1 | |||
| }, | |||
| "id" : { | |||
| @@ -2,21 +2,21 @@ import json | |||
| import os | |||
| def dump_json_from_dict(structure, file_name): | |||
| with open(file_name + '.json', 'w') as file_path: | |||
| json.dump(structure, file_path) | |||
| with open(file_name + '.json', 'w') as fp: | |||
| json.dump(structure, fp) | |||
| if __name__ == '__main__': | |||
| # iterate over directory | |||
| DIRECTORY = "imagefolder" | |||
| i = 0 | |||
| # iterate over DIRECTORY | |||
| DIRECTORY = os.path.dirname(os.path.realpath(__file__)) + "/original" | |||
| PARENT_DIR = os.path.dirname(DIRECTORY) | |||
| i = -1 | |||
| for filename in os.listdir(DIRECTORY): | |||
| default_dict = {} | |||
| default_dict.update(dataset='') | |||
| default_dict.update(image=(os.path.join(DIRECTORY, filename))) | |||
| default_dict.update(label=[1, 2]) | |||
| default_dict.update(image=os.path.abspath(os.path.join(DIRECTORY, filename))) | |||
| default_dict.update(label=['3', '2']) | |||
| default_dict.update(_priority=0.8) | |||
| default_dict.update(_embedding='sample.bin') | |||
| default_dict.update(_segmented_image=(os.path.join(DIRECTORY, filename))) | |||
| default_dict.update(_processed_image=(os.path.join(DIRECTORY, filename))) | |||
| default_dict.update(_embedding=os.path.abspath(os.path.join(PARENT_DIR, 'sample.bin'))) | |||
| default_dict.update(_processed_image=os.path.abspath(os.path.join(DIRECTORY, filename))) | |||
| i = i + 1 | |||
| dump_json_from_dict(default_dict, 'images/'+str(i)) | |||
| dump_json_from_dict(default_dict, PARENT_DIR + '/images/'+str(i)) | |||
| @@ -0,0 +1 @@ | |||
| {"dataset": "", "image": "original/apple_expect_decoded.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "sample.bin", "_processed_image": "original/apple_expect_decoded.jpg"} | |||
| @@ -1 +1 @@ | |||
| {"dataset": "", "image": "imagefolder/apple_expect_decoded.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_decoded.jpg", "_processed_image": "imagefolder/apple_expect_decoded.jpg"} | |||
| {"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_resize_bilinear.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_resize_bilinear.jpg"} | |||
| @@ -1 +1 @@ | |||
| {"dataset": "", "image": "imagefolder/apple_expect_resize_bilinear.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_resize_bilinear.jpg", "_processed_image": "imagefolder/apple_expect_resize_bilinear.jpg"} | |||
| {"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_changemode.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_changemode.jpg"} | |||
| @@ -1 +1 @@ | |||
| {"dataset": "", "image": "imagefolder/apple_expect_changemode.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_changemode.jpg", "_processed_image": "imagefolder/apple_expect_changemode.jpg"} | |||
| {"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_not_flip.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_not_flip.jpg"} | |||
| @@ -1 +1 @@ | |||
| {"dataset": "", "image": "imagefolder/apple_expect_not_flip.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_not_flip.jpg", "_processed_image": "imagefolder/apple_expect_not_flip.jpg"} | |||
| {"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_flipped_horizontal.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_flipped_horizontal.jpg"} | |||
| @@ -1 +1 @@ | |||
| {"dataset": "", "image": "imagefolder/apple_expect_flipped_horizontal.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_flipped_horizontal.jpg", "_processed_image": "imagefolder/apple_expect_flipped_horizontal.jpg"} | |||
| {"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_rescaled.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_rescaled.jpg"} | |||
| @@ -1 +1 @@ | |||
| {"dataset": "", "image": "imagefolder/apple_expect_rescaled.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_rescaled.jpg", "_processed_image": "imagefolder/apple_expect_rescaled.jpg"} | |||
| {"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_flipped_vertical.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_flipped_vertical.jpg"} | |||
| @@ -1 +0,0 @@ | |||
| {"dataset": "", "image": "imagefolder/apple_expect_flipped_vertical.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_flipped_vertical.jpg", "_processed_image": "imagefolder/apple_expect_flipped_vertical.jpg"} | |||