Browse Source

!4772 Adding AlbumDataset for device training

Merge pull request !4772 from EricZ/ms-album
tags/v1.0.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
0bbce9367e
51 changed files with 1366 additions and 72 deletions
  1. +1
    -1
      build.sh
  2. +53
    -0
      mindspore/ccsrc/minddata/dataset/api/datasets.cc
  3. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/CMakeLists.txt
  4. +508
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc
  5. +298
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.h
  6. +0
    -1
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc
  7. +6
    -0
      mindspore/ccsrc/minddata/dataset/engine/opt/pass.cc
  8. +4
    -0
      mindspore/ccsrc/minddata/dataset/engine/opt/pass.h
  9. +6
    -0
      mindspore/ccsrc/minddata/dataset/engine/opt/pre/cache_transform_pass.cc
  10. +6
    -0
      mindspore/ccsrc/minddata/dataset/engine/opt/pre/cache_transform_pass.h
  11. +6
    -0
      mindspore/ccsrc/minddata/dataset/engine/opt/util/printer_pass.cc
  12. +2
    -0
      mindspore/ccsrc/minddata/dataset/engine/opt/util/printer_pass.h
  13. +90
    -50
      mindspore/ccsrc/minddata/dataset/include/datasets.h
  14. +2
    -0
      tests/ut/cpp/dataset/CMakeLists.txt
  15. +208
    -0
      tests/ut/cpp/dataset/album_op_test.cc
  16. +136
    -0
      tests/ut/cpp/dataset/c_api_dataset_album_test.cc
  17. +2
    -0
      tests/ut/cpp/runtest.sh
  18. +1
    -0
      tests/ut/data/dataset/testAlbum/bin/sample.bin
  19. +16
    -0
      tests/ut/data/dataset/testAlbum/datasetSchema.json
  20. +1
    -1
      tests/ut/data/dataset/testAlbum/floatSchema.json
  21. +1
    -1
      tests/ut/data/dataset/testAlbum/fullSchema.json
  22. +11
    -11
      tests/ut/data/dataset/testAlbum/gen_json.py
  23. +1
    -0
      tests/ut/data/dataset/testAlbum/images/0.json
  24. +1
    -1
      tests/ut/data/dataset/testAlbum/images/1.json
  25. +1
    -1
      tests/ut/data/dataset/testAlbum/images/2.json
  26. +1
    -1
      tests/ut/data/dataset/testAlbum/images/3.json
  27. +1
    -1
      tests/ut/data/dataset/testAlbum/images/4.json
  28. +1
    -1
      tests/ut/data/dataset/testAlbum/images/5.json
  29. +1
    -1
      tests/ut/data/dataset/testAlbum/images/6.json
  30. +0
    -1
      tests/ut/data/dataset/testAlbum/images/7.json
  31. +0
    -0
      tests/ut/data/dataset/testAlbum/original/apple_expect_changemode.jpg
  32. +0
    -0
      tests/ut/data/dataset/testAlbum/original/apple_expect_decoded.jpg
  33. +0
    -0
      tests/ut/data/dataset/testAlbum/original/apple_expect_flipped_horizontal.jpg
  34. +0
    -0
      tests/ut/data/dataset/testAlbum/original/apple_expect_flipped_vertical.jpg
  35. +0
    -0
      tests/ut/data/dataset/testAlbum/original/apple_expect_not_flip.jpg
  36. +0
    -0
      tests/ut/data/dataset/testAlbum/original/apple_expect_rescaled.jpg
  37. +0
    -0
      tests/ut/data/dataset/testAlbum/original/apple_expect_resize_bilinear.jpg
  38. BIN
      tests/ut/data/dataset/testAlbum/processed/apple_expect_changemode.jpg
  39. BIN
      tests/ut/data/dataset/testAlbum/processed/apple_expect_decoded.jpg
  40. BIN
      tests/ut/data/dataset/testAlbum/processed/apple_expect_flipped_horizontal.jpg
  41. BIN
      tests/ut/data/dataset/testAlbum/processed/apple_expect_flipped_vertical.jpg
  42. BIN
      tests/ut/data/dataset/testAlbum/processed/apple_expect_not_flip.jpg
  43. BIN
      tests/ut/data/dataset/testAlbum/processed/apple_expect_rescaled.jpg
  44. BIN
      tests/ut/data/dataset/testAlbum/processed/apple_expect_resize_bilinear.jpg
  45. BIN
      tests/ut/data/dataset/testAlbum/segmented/apple_expect_changemode.jpg
  46. BIN
      tests/ut/data/dataset/testAlbum/segmented/apple_expect_decoded.jpg
  47. BIN
      tests/ut/data/dataset/testAlbum/segmented/apple_expect_flipped_horizontal.jpg
  48. BIN
      tests/ut/data/dataset/testAlbum/segmented/apple_expect_flipped_vertical.jpg
  49. BIN
      tests/ut/data/dataset/testAlbum/segmented/apple_expect_not_flip.jpg
  50. BIN
      tests/ut/data/dataset/testAlbum/segmented/apple_expect_rescaled.jpg
  51. BIN
      tests/ut/data/dataset/testAlbum/segmented/apple_expect_resize_bilinear.jpg

+ 1
- 1
build.sh View File

@@ -393,7 +393,7 @@ build_mindspore()
CMAKE_VERBOSE="--verbose"
fi
cmake --build . --target package ${CMAKE_VERBOSE} -j$THREAD_NUM
echo "success to build mindspore project!"
echo "success building mindspore project!"
}

checkndk() {


+ 53
- 0
mindspore/ccsrc/minddata/dataset/api/datasets.cc View File

@@ -21,6 +21,7 @@
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/engine/dataset_iterator.h"
// Source dataset headers (in alphabetical order)
#include "minddata/dataset/engine/datasetops/source/album_op.h"
#include "minddata/dataset/engine/datasetops/source/celeba_op.h"
#include "minddata/dataset/engine/datasetops/source/cifar_op.h"
#include "minddata/dataset/engine/datasetops/source/clue_op.h"
@@ -117,6 +118,15 @@ std::shared_ptr<SchemaObj> Schema(const std::string &schema_file) {
// FUNCTIONS TO CREATE DATASETS FOR LEAF-NODE DATASETS
// (In alphabetical order)

// Function to create a AlbumDataset.
std::shared_ptr<AlbumDataset> Album(const std::string &dataset_dir, const std::string &data_schema,
const std::vector<std::string> &column_names, bool decode,
const std::shared_ptr<SamplerObj> &sampler) {
auto ds = std::make_shared<AlbumDataset>(dataset_dir, data_schema, column_names, decode, sampler);

return ds->ValidateParams() ? ds : nullptr;
}

// Function to create a CelebADataset.
std::shared_ptr<CelebADataset> CelebA(const std::string &dataset_dir, const std::string &dataset_type,
const std::shared_ptr<SamplerObj> &sampler, bool decode,
@@ -687,6 +697,49 @@ bool ValidateDatasetShardParams(const std::string &dataset_name, int32_t num_sha
// DERIVED DATASET CLASSES LEAF-NODE DATASETS
// (In alphabetical order)

// Constructor for AlbumDataset
// Stores the user-supplied arguments as-is; nothing is checked here (see ValidateParams()).
// dataset_dir  - kept in dataset_dir_
// data_schema  - path of the schema file, kept in schema_path_
// column_names - kept in column_names_ and later passed to DataSchema::LoadSchemaFile by Build()
// decode       - kept in decode_ and forwarded to AlbumOp by Build()
// sampler      - kept in sampler_; may be nullptr, in which case Build() creates a default sampler
AlbumDataset::AlbumDataset(const std::string &dataset_dir, const std::string &data_schema,
const std::vector<std::string> &column_names, bool decode,
const std::shared_ptr<SamplerObj> &sampler)
: dataset_dir_(dataset_dir),
schema_path_(data_schema),
column_names_(column_names),
decode_(decode),
sampler_(sampler) {}

// Checks the user-supplied arguments: the dataset directory first, then the schema file.
// The schema check is only attempted when the directory check passed.
bool AlbumDataset::ValidateParams() {
  return ValidateDatasetDirParam("AlbumDataset", dataset_dir_) &&
         ValidateDatasetFilesParam("AlbumDataset", {schema_path_});
}

// Function to build AlbumDataset
std::vector<std::shared_ptr<DatasetOp>> AlbumDataset::Build() {
// A vector containing shared pointer to the Dataset Ops that this object will create
std::vector<std::shared_ptr<DatasetOp>> node_ops;

// If user does not specify Sampler, create a default sampler, i.e., RandomSampler.
if (sampler_ == nullptr) {
sampler_ = CreateDefaultSampler();
}

auto schema = std::make_unique<DataSchema>();
RETURN_EMPTY_IF_ERROR(schema->LoadSchemaFile(schema_path_, column_names_));

// Argument that is not exposed to user in the API.
std::set<std::string> extensions = {};

node_ops.push_back(std::make_shared<AlbumOp>(num_workers_, rows_per_buffer_, dataset_dir_, connector_que_size_,
decode_, extensions, std::move(schema), std::move(sampler_->Build())));
return node_ops;
}

// Constructor for CelebADataset
CelebADataset::CelebADataset(const std::string &dataset_dir, const std::string &dataset_type,
const std::shared_ptr<SamplerObj> &sampler, const bool &decode,


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/CMakeLists.txt View File

@@ -13,6 +13,7 @@ set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES
text_file_op.cc
clue_op.cc
csv_op.cc
album_op.cc
)

set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES


+ 508
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc View File

@@ -0,0 +1,508 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/engine/datasetops/source/album_op.h"
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/core/tensor_shape.h"
#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
#include "minddata/dataset/engine/db_connector.h"
#include "minddata/dataset/engine/execution_tree.h"
#include "minddata/dataset/engine/opt/pass.h"
#include "minddata/dataset/kernels/image/image_utils.h"

namespace mindspore {
namespace dataset {
// Builder constructor: decode off, no sampler, no schema file; worker count, rows per buffer and
// connector size are seeded from the global ConfigManager.
AlbumOp::Builder::Builder() : builder_decode_(false), builder_sampler_(nullptr), builder_schema_file_("") {
  std::shared_ptr<ConfigManager> global_cfg = GlobalContext::config_manager();
  builder_op_connector_size_ = global_cfg->op_connector_size();
  builder_rows_per_buffer_ = global_cfg->rows_per_buffer();
  builder_num_workers_ = global_cfg->num_parallel_workers();
}

// Creates the AlbumOp described by the builder settings.
// \param[out] ptr - receives the newly created op; must not be null
// \return Status - error when the sanity check fails, the schema file is missing, or the schema
//                  cannot be loaded; OK otherwise
Status AlbumOp::Builder::Build(std::shared_ptr<AlbumOp> *ptr) {
  RETURN_UNEXPECTED_IF_NULL(ptr);
  RETURN_IF_NOT_OK(SanityCheck());
  if (builder_sampler_ == nullptr) {
    const int64_t num_samples = 0;  // default num samples of 0 means to sample entire set of data
    const int64_t start_index = 0;
    builder_sampler_ = std::make_shared<SequentialSampler>(start_index, num_samples);
  }

  Path schema_file(builder_schema_file_);
  if (builder_schema_file_.empty() || !schema_file.Exists()) {
    RETURN_STATUS_UNEXPECTED("Schema not provided");
  }
  MS_LOG(INFO) << "Schema file provided: " << builder_schema_file_ << ".";

  builder_schema_ = std::make_unique<DataSchema>();
  // Propagate a schema load failure instead of building an op on top of a half-loaded schema.
  RETURN_IF_NOT_OK(builder_schema_->LoadSchemaFile(builder_schema_file_, builder_columns_to_load_));

  *ptr = std::make_shared<AlbumOp>(builder_num_workers_, builder_rows_per_buffer_, builder_dir_,
                                   builder_op_connector_size_, builder_decode_, builder_extensions_,
                                   std::move(builder_schema_), std::move(builder_sampler_));
  return Status::OK();
}

// Validates the builder arguments: the album path must be a directory and the worker count must be
// positive. All problems found are collected into one error message.
Status AlbumOp::Builder::SanityCheck() {
  Path album_dir(builder_dir_);
  std::string problems;
  if (!album_dir.IsDirectory()) {
    problems += "Album path is invalid or not set\n";
  }
  if (builder_num_workers_ <= 0) {
    problems += "Num of parallel workers is set to 0\n";
  }
  if (problems.empty()) {
    return Status::OK();
  }
  return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, problems);
}

// Constructor: stores the configuration, fills the column-name -> index map from the schema and
// initialises one IOBlock queue per worker.
AlbumOp::AlbumOp(int32_t num_wkrs, int32_t rows_per_buffer, std::string file_dir, int32_t queue_size, bool do_decode,
                 const std::set<std::string> &exts, std::unique_ptr<DataSchema> data_schema,
                 std::shared_ptr<Sampler> sampler)
    : ParallelOp(num_wkrs, queue_size),
      rows_per_buffer_(rows_per_buffer),
      folder_path_(file_dir),
      decode_(do_decode),
      extensions_(exts),
      data_schema_(std::move(data_schema)),
      sampler_(std::move(sampler)),
      row_cnt_(0),
      buf_cnt_(0),
      sampler_ind_(0),
      dirname_offset_(0) {
  // Populate the column name map (a field of the base class) in schema order.
  const int32_t total_columns = data_schema_->NumColumns();
  int32_t col_idx = 0;
  while (col_idx < total_columns) {
    column_name_id_map_[data_schema_->column(col_idx).name()] = col_idx;
    ++col_idx;
  }
  io_block_queues_.Init(num_workers_, queue_size);
}

// Ordering predicate used to sort the album file names.
// Returns true exactly when `a` compares lexicographically less than `b`.
bool StrComp(const std::string &a, const std::string &b) {
  const int ordering = a.compare(b);
  return ordering < 0;
}

// Walks the album directory once (single-threaded), records the path suffix of every accepted file,
// sorts the list and sets num_rows_ to its size.
Status AlbumOp::PrescanEntry() {
  Path album_dir(folder_path_);
  dirname_offset_ = folder_path_.length();
  std::shared_ptr<Path::DirIterator> dir_walker = Path::DirIterator::OpenDirectory(&album_dir);
  if (!album_dir.Exists() || dir_walker == nullptr) {
    RETURN_STATUS_UNEXPECTED("Error unable to open: " + folder_path_);
  }
  MS_LOG(INFO) << "Album folder Path found: " << folder_path_ << ".";

  // An empty extension set means every file in the directory is accepted.
  const bool accept_everything = extensions_.empty();
  while (dir_walker->hasNext()) {
    Path entry = dir_walker->next();
    const bool accepted = accept_everything || extensions_.count(entry.Extension()) > 0;
    if (!accepted) {
      MS_LOG(INFO) << "Album operator unsupported file found: " << entry.toString()
                   << ", extension: " << entry.Extension() << ".";
      continue;
    }
    // Keep only the part of the path that follows the album directory prefix.
    image_rows_.push_back(entry.toString().substr(dirname_offset_));
  }

  std::sort(image_rows_.begin(), image_rows_.end(), StrComp);
  num_rows_ = image_rows_.size();
  return Status::OK();
}

// Main logic of the master thread: scan the album directory, register the queues and launch the
// worker threads, then repeatedly turn the sample ids produced by the sampler into IOBlocks that are
// spread round-robin (by buf_cnt_) over the worker queues.
// Returns OK after the last iteration has been queued, or the first error Status encountered.
// NOTE(review): only ids >= num_rows_ are filtered below; negative ids are not checked here.
// NOTE(review): row_cnt_ % rows_per_buffer_ assumes rows_per_buffer_ is non-zero - confirm at callers.
Status AlbumOp::operator()() {
RETURN_IF_NOT_OK(this->PrescanEntry());
RETURN_IF_NOT_OK(LaunchThreadsAndInitOp());
std::unique_ptr<DataBuffer> sampler_buffer;
RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer));
while (true) { // each iteration of this loop is 1 epoch
std::vector<int64_t> keys;
keys.reserve(rows_per_buffer_);
// Drain sampler buffers until the sampler hands back an end-of-epoch buffer
while (sampler_buffer->eoe() == false) {
TensorRow sample_row;
RETURN_IF_NOT_OK(sampler_buffer->PopRow(&sample_row));
// The first tensor of the popped row holds the sample ids
std::shared_ptr<Tensor> sample_ids = sample_row[0];
for (auto itr = sample_ids->begin<int64_t>(); itr != sample_ids->end<int64_t>(); ++itr) {
if ((*itr) >= num_rows_) continue; // index out of bound, skipping
keys.push_back(*itr);
row_cnt_++;
// Every rows_per_buffer_ accepted ids, ship the collected keys as one IOBlock
if (row_cnt_ % rows_per_buffer_ == 0) {
RETURN_IF_NOT_OK(
io_block_queues_[buf_cnt_++ % num_workers_]->Add(std::make_unique<IOBlock>(keys, IOBlock::kDeIoBlockNone)));
keys.clear();
}
}
RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer));
}
// Ship the remaining keys that did not fill a whole buffer
if (keys.empty() == false) {
RETURN_IF_NOT_OK(
io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(keys, IOBlock::kDeIoBlockNone)));
}
if (IsLastIteration()) {
// Last iteration: queue EOE, then EOF, then one empty-key IOBlock per worker
// (WorkerEntry treats an IOBlock with no keys as its quit signal)
std::unique_ptr<IOBlock> eoe_block = std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe);
std::unique_ptr<IOBlock> eof_block = std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof);
RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eoe_block)));
RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eof_block)));
for (int32_t i = 0; i < num_workers_; ++i) {
RETURN_IF_NOT_OK(
io_block_queues_[i]->Add(std::make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone)));
}
return Status::OK();
} else { // not the last repeat. Sleep master thread, wait for the wake-up from reset
RETURN_IF_NOT_OK(
io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe)));
RETURN_IF_NOT_OK(wp_.Wait()); // Master thread goes to sleep after it has made all the IOBlocks
wp_.Clear();
RETURN_IF_NOT_OK(sampler_->GetNextSample(&sampler_buffer));
}
UpdateRepeatAndEpochCounter();
}
}

// Worker thread body: pops IOBlocks from this worker's queue, loads one DataBuffer per IOBlock and
// pushes it to out_connector_.
// IMPORTANT: 1 IOBlock produces 1 DataBuffer
// \param[in] worker_id - index of this worker; selects its IOBlock queue and its connector slot
// \return Status - OK when the quit signal (an IOBlock with no keys) is received, otherwise the
//                  first error encountered; a null IOBlock is reported as an unexpected error
Status AlbumOp::WorkerEntry(int32_t worker_id) {
TaskManager::FindMe()->Post();
// Buffer ids of one worker are worker_id, worker_id + num_workers_, worker_id + 2 * num_workers_, ...
int64_t buffer_id = worker_id;
std::unique_ptr<IOBlock> io_block;
RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block));
while (io_block != nullptr) {
if (io_block->eoe() == true) {
// Forward the end-of-epoch marker and restart the buffer id sequence
RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE)));
buffer_id = worker_id;
} else if (io_block->eof() == true) {
// Forward the end-of-file marker
RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)));
} else {
std::vector<int64_t> keys;
RETURN_IF_NOT_OK(io_block->GetKeys(&keys));
if (keys.empty() == true) return Status::OK(); // empty key is a quit signal for workers
std::unique_ptr<DataBuffer> db = std::make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone);
RETURN_IF_NOT_OK(LoadBuffer(keys, &db));
RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(db)));
buffer_id += num_workers_;
}
RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block));
}
RETURN_STATUS_UNEXPECTED("Unexpected nullptr received in worker");
}

// Sniffs the leading bytes of a file to decide whether it is a JPEG, PNG, GIF or BMP image.
// \param[in] file_name - path of the file to inspect
// \param[out] valid - set to true when a known signature is recognised, false otherwise
// \return Status - error when the file cannot be opened or its first three bytes cannot be read
Status AlbumOp::CheckImageType(const std::string &file_name, bool *valid) {
  constexpr int signature_len = 3;
  *valid = false;

  std::ifstream image_stream(file_name, std::ios::binary | std::ios::in);
  if (!image_stream.is_open()) {
    RETURN_STATUS_UNEXPECTED("Can not open image file " + file_name);
  }

  unsigned char signature[signature_len];
  (void)image_stream.read(reinterpret_cast<char *>(signature), signature_len);
  const bool read_failed = image_stream.fail();
  image_stream.close();
  if (read_failed) {
    RETURN_STATUS_UNEXPECTED("Read image file failed " + file_name);
  }

  // JPEG: plain files start with ff d8 ff e0, EXIF files with ff d8 ff e1; ff d8 ff covers both.
  const bool looks_like_jpeg = signature[0] == 0xff && signature[1] == 0xd8 && signature[2] == 0xff;
  const bool looks_like_png = signature[0] == 0x89 && signature[1] == 0x50 && signature[2] == 0x4e;
  const bool looks_like_gif = signature[0] == 0x47 && signature[1] == 0x49 && signature[2] == 0x46;
  // BMP is identified by its first two bytes only.
  const bool looks_like_bmp = signature[0] == 0x42 && signature[1] == 0x4d;

  *valid = looks_like_jpeg || looks_like_png || looks_like_gif || looks_like_bmp;
  return Status::OK();
}

// Loads the image file referenced by a column into the row.
// The raw file content becomes the tensor; it is decoded only when decode_ is set and the file
// signature is a recognised image type.
// \param[in] image_file_path - path of the image file
// \param[in] col_num - schema column index this tensor belongs to
// \param[inout] row - row the tensor is appended to
// \return Status - error when the file cannot be read or decoding fails. A file that cannot be
//                  opened is not an error: an empty placeholder tensor is appended instead.
Status AlbumOp::LoadImageTensor(const std::string &image_file_path, uint32_t col_num, TensorRow *row) {
  {
    // Only probe for existence; the stream is closed again when this scope ends.
    std::ifstream probe(image_file_path, std::ios::binary | std::ios::in);
    if (probe.fail()) {
      MS_LOG(INFO) << "Image file not found:" << image_file_path << ".";
      // A missing file is tolerated, but the row must still get one tensor per schema column,
      // otherwise every following column of this row is shifted by one.
      return LoadEmptyTensor(col_num, row);
    }
  }

  MS_LOG(INFO) << "Image file found: " << image_file_path << ".";

  // check that the file is an image before decoding
  bool valid = false;
  RETURN_IF_NOT_OK(CheckImageType(image_file_path, &valid));

  std::shared_ptr<Tensor> image;
  RETURN_IF_NOT_OK(Tensor::CreateFromFile(image_file_path, &image));
  if (decode_ && valid) {
    Status rc = Decode(image, &image);
    if (rc.IsError()) {
      std::string err = "Fail to decode image:" + image_file_path;
      RETURN_STATUS_UNEXPECTED(err);
    }
  }
  row->push_back(std::move(image));
  return Status::OK();
}

// Converts a JSON array of strings into a string tensor and appends it to the row.
// \param[in] json_obj - JSON value holding the strings
// \param[in] col_num - schema column index (not used by this loader)
// \param[inout] row - row the tensor is appended to
Status AlbumOp::LoadStringArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) {
  std::vector<std::string> labels = json_obj.get<std::vector<std::string>>();
  MS_LOG(INFO) << "String array label found: " << labels << ".";

  std::shared_ptr<Tensor> label_tensor;
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(labels, &label_tensor));
  row->push_back(std::move(label_tensor));
  return Status::OK();
}

// Converts a single JSON string into a scalar string tensor and appends it to the row.
// \param[in] json_obj - JSON value holding the string
// \param[in] col_num - schema column index (not used by this loader)
// \param[inout] row - row the tensor is appended to
Status AlbumOp::LoadStringTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) {
  std::string text = json_obj.get<std::string>();
  MS_LOG(INFO) << "String label found: " << text << ".";

  std::shared_ptr<Tensor> text_tensor;
  RETURN_IF_NOT_OK(Tensor::CreateScalar<std::string>(text, &text_tensor));
  row->push_back(std::move(text_tensor));
  return Status::OK();
}

// Converts a JSON list of integers into an int64 or int32 tensor, depending on the schema type of
// the column, and appends it to the row.
// \param[in] json_obj - JSON value holding the integers
// \param[in] col_num - schema column index, used to look up the target element type
// \param[inout] row - row the tensor is appended to
// \return Status - error when the column type is neither int64 nor int32
Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) {
  std::shared_ptr<Tensor> int_tensor;
  // One branch per supported width; a templated helper could serve both.
  if (data_schema_->column(col_num).type() == DataType(DataType::DE_INT64)) {
    std::vector<int64_t> values;
    for (const auto &element : json_obj) {
      values.push_back(element.get<int64_t>());
    }
    RETURN_IF_NOT_OK(Tensor::CreateFromVector(values, &int_tensor));
  } else if (data_schema_->column(col_num).type() == DataType(DataType::DE_INT32)) {
    std::vector<int32_t> values;
    for (const auto &element : json_obj) {
      values.push_back(element.get<int32_t>());
    }
    MS_LOG(INFO) << "Int array found: " << values << ".";
    RETURN_IF_NOT_OK(Tensor::CreateFromVector(values, &int_tensor));
  } else {
    RETURN_STATUS_UNEXPECTED("Error in Load Int Tensor");
  }
  row->push_back(std::move(int_tensor));
  return Status::OK();
}

// Builds the tensor for the internal "id" column from the row's file name.
// For a string column the file name is stored unchanged. Otherwise the leading digits of the file
// name (after its first character, up to the first '.') are parsed into an int64 id; a name without
// leading digits yields 0.
// \param[in] file - file name relative to the album directory, with its leading path separator
// \param[in] col_num - schema column index of the id column
// \param[inout] row - row the tensor is appended to
Status AlbumOp::LoadIDTensor(const std::string &file, uint32_t col_num, TensorRow *row) {
  std::shared_ptr<Tensor> id;
  if (data_schema_->column(col_num).type() == DataType(DataType::DE_STRING)) {
    RETURN_IF_NOT_OK(Tensor::CreateScalar<std::string>(file, &id));
    row->push_back(std::move(id));
    return Status::OK();
  }

  // Skip the first character (the '/' separator) and stop right before the extension dot.
  const size_t dot_pos = file.find('.');
  size_t stem_len = std::string::npos;
  if (dot_pos != std::string::npos) {
    stem_len = dot_pos > 0 ? dot_pos - 1 : 0;
  }
  const std::string stem = file.size() > 1 ? file.substr(1, stem_len) : std::string();

  // strtoll parses straight into 64 bits, so ids beyond the int range are not truncated.
  const int64_t image_id = std::strtoll(stem.c_str(), nullptr, 10);
  RETURN_IF_NOT_OK(Tensor::CreateScalar<int64_t>(image_id, &id));
  MS_LOG(INFO) << "File ID " << image_id << ".";
  row->push_back(std::move(id));
  return Status::OK();
}

// Appends a placeholder tensor for a column that has no value in the current row.
// The tensor is created with an empty shape and the element type the schema declares for the column.
// \param[in] col_num - schema column index
// \param[inout] row - row the placeholder is appended to
Status AlbumOp::LoadEmptyTensor(uint32_t col_num, TensorRow *row) {
  const TensorShape empty_shape({});
  std::shared_ptr<Tensor> placeholder;
  RETURN_IF_NOT_OK(Tensor::CreateEmpty(empty_shape, data_schema_->column(col_num).type(), &placeholder));
  row->push_back(std::move(placeholder));
  return Status::OK();
}

// Loads a scalar floating point value into the row.
// The JSON value carries no width information, so the schema type of the column decides whether a
// float64 or a float32 scalar tensor is created.
// \param[in] json_obj - JSON value holding the number
// \param[in] col_num - schema column index, used to look up the target element type
// \param[inout] row - row the tensor is appended to
// \return Status - error when the column type is neither float64 nor float32
Status AlbumOp::LoadFloatTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row) {
  std::shared_ptr<Tensor> float_tensor;
  if (data_schema_->column(col_num).type() == DataType(DataType::DE_FLOAT64)) {
    double data = json_obj;
    MS_LOG(INFO) << "double found: " << json_obj << ".";
    RETURN_IF_NOT_OK(Tensor::CreateScalar<double>(data, &float_tensor));
  } else if (data_schema_->column(col_num).type() == DataType(DataType::DE_FLOAT32)) {
    float data = json_obj;
    RETURN_IF_NOT_OK(Tensor::CreateScalar<float>(data, &float_tensor));
    MS_LOG(INFO) << "float found: " << json_obj << ".";
  } else {
    // Without this branch a null tensor would be appended to the row.
    RETURN_STATUS_UNEXPECTED("Error in Load Float Tensor");
  }
  row->push_back(std::move(float_tensor));
  return Status::OK();
}

// Loads 1 TensorRow from 1 album JSON file. 1 function call produces 1 TensorRow in a DataBuffer.
// Every line of the file is parsed as a JSON document; for each schema column one tensor is
// appended to the row, chosen by the JSON value kind and the schema type of the column:
//   column named "id"                       -> id derived from the file name
//   key missing in the JSON                 -> empty placeholder tensor
//   string value, string column             -> scalar string tensor
//   array value, string column              -> string array tensor
//   string value, non-string column         -> treated as an image file path
//   non-array value, float32/float64 column -> scalar float tensor
//   array value, int32/int64 column         -> int array tensor
//   anything else                           -> warning plus empty placeholder tensor
// \param[in] file - file name relative to the album directory
// \param[out] row - reset first, then filled
// \return Status - error when the file cannot be opened, a loader fails, or JSON handling throws
// NOTE(review): a file with several lines appends one set of columns per line to the same row.
Status AlbumOp::LoadTensorRow(const std::string &file, TensorRow *row) {
  (*row) = {};
  MS_LOG(INFO) << "Image row file: " << file << ".";

  const std::string json_path = folder_path_ + file;
  std::ifstream file_handle(json_path);
  if (!file_handle.is_open()) {
    RETURN_STATUS_UNEXPECTED("Json file " + json_path + " can not open.");
  }

  const int32_t columns = data_schema_->NumColumns();
  std::string line;
  while (getline(file_handle, line)) {
    try {
      nlohmann::json js = nlohmann::json::parse(line);
      MS_LOG(INFO) << "This Line: " << line << ".";

      // The schema drives the row layout: walk its column descriptors and look each key up.
      for (int32_t i = 0; i < columns; i++) {
        const std::string col_name = data_schema_->column(i).name();

        // id is internal, special case to load from the file name
        if (col_name == "id") {
          RETURN_IF_NOT_OK(LoadIDTensor(file, i, row));
          continue;
        }
        // key not present in this JSON document: keep the slot with an empty tensor
        if (js.find(col_name) == js.end()) {
          MS_LOG(INFO) << "Pushing empty tensor for column: " << col_name << ".";
          RETURN_IF_NOT_OK(LoadEmptyTensor(i, row));
          continue;
        }

        const nlohmann::json &column_value = js.at(col_name);
        MS_LOG(INFO) << "This column is: " << col_name << ".";

        const bool is_array = column_value.is_array();
        const bool is_string = column_value.is_string();
        const bool string_column = data_schema_->column(i).type() == DataType(DataType::DE_STRING);
        const bool float_column = data_schema_->column(i).type() == DataType(DataType::DE_FLOAT32) ||
                                  data_schema_->column(i).type() == DataType(DataType::DE_FLOAT64);
        const bool int_column = data_schema_->column(i).type() == DataType(DataType::DE_INT64) ||
                                data_schema_->column(i).type() == DataType(DataType::DE_INT32);

        if (is_string && string_column) {
          // single string
          RETURN_IF_NOT_OK(LoadStringTensor(column_value, i, row));
        } else if (is_array && string_column) {
          // string array
          RETURN_IF_NOT_OK(LoadStringArrayTensor(column_value, i, row));
        } else if (is_string) {
          // string value in a non-string column: path of an image file
          std::string image_file_path = column_value;
          RETURN_IF_NOT_OK(LoadImageTensor(image_file_path, i, row));
        } else if (!is_array && float_column) {
          // scalar float
          RETURN_IF_NOT_OK(LoadFloatTensor(column_value, i, row));
        } else if (is_array && int_column) {
          // int array
          RETURN_IF_NOT_OK(LoadIntArrayTensor(column_value, i, row));
        } else {
          MS_LOG(WARNING) << "Value type for column: " << col_name << " is not supported.";
          // Keep one tensor per schema column so the following columns stay aligned.
          RETURN_IF_NOT_OK(LoadEmptyTensor(i, row));
        }
      }
    } catch (const std::exception &err) {
      // The stream is closed by its destructor; report which file failed and why.
      RETURN_STATUS_UNEXPECTED("Parse Json file failed: " + json_path + ", " + err.what());
    }
  }
  return Status::OK();
}

// Fills 1 DataBuffer: loads one TensorRow per key and installs the resulting table in the buffer.
// \param[in] keys - indices into image_rows_ of the files to load
// \param[inout] db - buffer that receives the tensor table
Status AlbumOp::LoadBuffer(const std::vector<int64_t> &keys, std::unique_ptr<DataBuffer> *db) {
  auto row_table = std::make_unique<TensorQTable>();

  for (const int64_t &row_index : keys) {
    TensorRow loaded_row;
    RETURN_IF_NOT_OK(this->LoadTensorRow(image_rows_[row_index], &loaded_row));
    row_table->push_back(std::move(loaded_row));
  }

  (*db)->set_tensor_table(std::move(row_table));
  return Status::OK();
}

// Prints the op: the id/name header first, then the info common to all parallel ops, and in
// detailed mode (show_all) also the row count and the album directory.
void AlbumOp::Print(std::ostream &out, bool show_all) const {
  // The id and name always come first, for the summary as well as for the detailed print.
  out << "(" << std::setw(2) << operator_id_ << ") <AlbumOp>:";

  // The super class prints whatever is common to the chosen level of detail.
  ParallelOp::Print(out, show_all);

  if (show_all) {
    // Detailed print: add what is specific to this op.
    out << "\nNumber of rows:" << num_rows_ << "\nAlbum directory: " << folder_path_ << "\n\n";
    return;
  }
  // Summary print: this op adds nothing to the 1-liner, just end the line.
  out << "\n";
}

// Reset Sampler and wakeup Master thread (functor)
// Resets the sampler and the row counter, then signals wp_, which the master thread waits on in
// operator()() between iterations.
// \return Status - error from the sampler reset, OK otherwise
Status AlbumOp::Reset() {
RETURN_IF_NOT_OK(sampler_->ResetSampler());
row_cnt_ = 0;
wp_.Set(); // wake up master thread after reset is done
return Status::OK();
}

// Handshake with the Sampler: hands this op to the sampler so it can call the RandomAccessOp
// functions (e.g. to get NumRows). The handshake status is returned as-is.
Status AlbumOp::InitSampler() {
  return sampler_->HandshakeRandomAccessOp(this);
}

// Registers the IOBlock queues and the wait post with the tree's task group, launches the worker
// threads running WorkerEntry, then performs the sampler handshake.
// \return Status - error when tree_ is null or any of the steps fails
Status AlbumOp::LaunchThreadsAndInitOp() {
RETURN_UNEXPECTED_IF_NULL(tree_);
// registers QueueList and individual Queues for interrupt services
RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks()));
RETURN_IF_NOT_OK(wp_.Register(tree_->AllTasks()));
// launch main workers that load DataBuffers by reading all images
RETURN_IF_NOT_OK(tree_->LaunchWorkers(num_workers_, std::bind(&AlbumOp::WorkerEntry, this, std::placeholders::_1)));
TaskManager::FindMe()->Post();
RETURN_IF_NOT_OK(this->InitSampler()); // pass numRows to Sampler
return Status::OK();
}

// Visitor accept method for NodePass: passes this op, downcast to its concrete type, to the pass.
// \param[in] p - the pass visiting this node
// \param[out] modified - forwarded to the pass
Status AlbumOp::Accept(NodePass *p, bool *modified) {
  auto self_as_album = shared_from_base<AlbumOp>();
  return p->RunOnNode(self_as_album, modified);
}

// Fills the column name -> index map (base class field) from the schema, unless it is already
// populated, in which case only a warning is logged. Always returns OK.
Status AlbumOp::ComputeColMap() {
  if (!column_name_id_map_.empty()) {
    MS_LOG(WARNING) << "Column name map is already set!";
    return Status::OK();
  }
  for (int32_t col_idx = 0; col_idx < data_schema_->NumColumns(); ++col_idx) {
    column_name_id_map_[data_schema_->column(col_idx).name()] = col_idx;
  }
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 298
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.h View File

@@ -0,0 +1,298 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_ALBUM_OP_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_ALBUM_OP_H_

#include <deque>
#include <memory>
#include <queue>
#include <string>
#include <algorithm>
#include <map>
#include <set>
#include <utility>
#include <vector>
#include <unordered_map>
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/engine/data_buffer.h"
#include "minddata/dataset/engine/data_schema.h"
#include "minddata/dataset/engine/datasetops/parallel_op.h"
#include "minddata/dataset/engine/datasetops/source/io_block.h"
#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
#include "minddata/dataset/util/path.h"
#include "minddata/dataset/util/queue.h"
#include "minddata/dataset/util/services.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/util/wait_post.h"

namespace mindspore {
namespace dataset {
// Forward declares
template <typename T>
class Queue;

// Define row information as a list of file objects to read
using FolderImages = std::shared_ptr<std::pair<std::string, std::queue<std::string>>>;

/// \class AlbumOp album_op.h
class AlbumOp : public ParallelOp, public RandomAccessOp {
public:
class Builder {
public:
/// \brief Constructor for Builder class of AlbumOp
Builder();

/// \brief Destructor.
~Builder() = default;

/// \brief Setter method
/// \param[in] rows_per_buffer
/// \return Builder setter method returns reference to the builder
Builder &SetRowsPerBuffer(int32_t rows_per_buffer) {
builder_rows_per_buffer_ = rows_per_buffer;
return *this;
}

/// \brief Setter method
/// \param[in] size
/// \return Builder setter method returns reference to the builder
Builder &SetOpConnectorSize(int32_t size) {
builder_op_connector_size_ = size;
return *this;
}

/// \brief Setter method
/// \param[in] exts - file extensions to be read
/// \return Builder setter method returns reference to the builder
Builder &SetExtensions(const std::set<std::string> &exts) {
builder_extensions_ = exts;
return *this;
}

/// \brief Setter method
/// \param[in] do_decode
/// \return Builder setter method returns reference to the builder
Builder &SetDecode(bool do_decode) {
builder_decode_ = do_decode;
return *this;
}

/// \brief Setter method
/// \param[in] num_workers
/// \return Builder setter method returns reference to the builder
Builder &SetNumWorkers(int32_t num_workers) {
builder_num_workers_ = num_workers;
return *this;
}

/// \brief Setter method
/// \param[in] sampler
/// \return Builder setter method returns reference to the builder
Builder &SetSampler(std::shared_ptr<Sampler> sampler) {
builder_sampler_ = std::move(sampler);
return *this;
}

/// \brief Setter method
/// \param[in] dir - dataset directory
/// \return Builder setter method returns reference to the builder
Builder &SetAlbumDir(const std::string &dir) {
builder_dir_ = dir;
return *this;
}

/// \brief Setter method
/// \param[in] file - schema file to load
/// \return Builder setter method returns reference to the builder
Builder &SetSchemaFile(const std::string &file) {
builder_schema_file_ = file;
return *this;
}

/// \brief Setter method
/// \param[in] columns - input columns
/// \return Builder setter method returns reference to the builder
Builder &SetColumnsToLoad(const std::vector<std::string> &columns) {
builder_columns_to_load_ = columns;
return *this;
}

/// \brief Check validity of input args
/// \return Status - The error code return
Status SanityCheck();

/// \brief The builder "build" method creates the final object.
/// \param[inout] op - Pointer to the shared_ptr that is to hold the built AlbumOp
/// \return Status - The error code return
Status Build(std::shared_ptr<AlbumOp> *op);

private:
// Values collected by the setter methods of this builder.
bool builder_decode_;  // written by SetDecode()
std::vector<std::string> builder_columns_to_load_;  // written by SetColumnsToLoad()
std::string builder_dir_;  // written by SetAlbumDir()
std::string builder_schema_file_;  // written by SetSchemaFile()
int32_t builder_num_workers_;  // written by SetNumWorkers()
int32_t builder_rows_per_buffer_;  // written by SetRowsPerBuffer()
int32_t builder_op_connector_size_;  // written by SetOpConnectorSize()
std::set<std::string> builder_extensions_;  // written by SetExtensions()
std::shared_ptr<Sampler> builder_sampler_;  // written by SetSampler()
// NOTE(review): no setter for the schema object is visible here; presumably filled in during Build() - confirm
std::unique_ptr<DataSchema> builder_schema_;
};

/// \brief Constructor
/// \param[in] num_wkrs - Num of workers reading images in parallel
/// \param[in] rows_per_buffer - Number of images (rows) in each buffer
/// \param[in] file_dir - directory of Album
/// \param[in] queue_size - connector size
/// \param[in] do_decode - decode image files
/// \param[in] exts - set of file extensions to read, if empty, read everything under the dir
/// \param[in] data_schema - schema of dataset (passed as unique_ptr, so the caller gives up ownership)
/// \param[in] sampler - sampler tells AlbumOp what to read
AlbumOp(int32_t num_wkrs, int32_t rows_per_buffer, std::string file_dir, int32_t queue_size, bool do_decode,
const std::set<std::string> &exts, std::unique_ptr<DataSchema> data_schema, std::shared_ptr<Sampler> sampler);

/// \brief Destructor (compiler-generated).
~AlbumOp() = default;

/// \brief Initialize AlbumOp related var, calls the function to walk all files
/// \return Status - The error code return
Status PrescanEntry();

/// \brief Worker thread pulls a number of IOBlock from IOBlock Queue, make a buffer and push it to Connector
/// \param[in] worker_id - id of each worker
/// \return Status - The error code return
Status WorkerEntry(int32_t worker_id) override;

/// \brief Main Loop of AlbumOp
/// Master thread: Fill IOBlockQueue, then goes to sleep
/// Worker thread: pulls IOBlock from IOBlockQueue, work on it then put buffer to mOutConnector
/// \return Status - The error code return
Status operator()() override;

/// \brief A print method typically used for debugging
/// \param[in] out - stream to print to
/// \param[in] show_all - NOTE(review): presumably selects a more detailed printout; confirm in album_op.cc
void Print(std::ostream &out, bool show_all) const override;

/// \brief Check if image is valid. Only JPEG/PNG/GIF/BMP are supported
/// This function could be optimized to return the tensor to reduce open/closing files
/// \param[in] file_name - name of the image file to check
/// \param[out] valid - NOTE(review): presumably receives the result of the check; confirm in album_op.cc
/// \return Status - The error code return
Status CheckImageType(const std::string &file_name, bool *valid);

/// \brief Base-class override for NodePass visitor acceptor.
/// \param[in] p - Pointer to the NodePass to be accepted.
/// \param[inout] modified - Whether this node visit modified the pipeline.
/// \return Status - Status of the node visit.
Status Accept(NodePass *p, bool *modified) override;

/// \brief Op name getter
/// \return Name of the current Op (the fixed string "AlbumOp")
std::string Name() const override { return "AlbumOp"; }

private:
/// \brief Initialize Sampler, calls sampler->Init() within
/// \return Status The error code return
Status InitSampler();

/// \brief Load image to tensor row
/// \param[in] image_file Image name of file
/// \param[in] col_num Column num in schema
/// \param[inout] row Tensor row to push to
/// \return Status The error code return
Status LoadImageTensor(const std::string &image_file, uint32_t col_num, TensorRow *row);

/// \brief Load vector of ints to tensor, append tensor to tensor row
/// \param[in] json_obj Json object containing multi-dimensional label
/// \param[in] col_num Column num in schema
/// \param[inout] row Tensor row to push to
/// \return Status The error code return
Status LoadIntArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row);

/// \brief Load string array into a tensor, append tensor to tensor row
/// \param[in] json_obj Json object containing string tensor
/// \param[in] col_num Column num in schema
/// \param[inout] row Tensor row to push to
/// \return Status The error code return
Status LoadStringArrayTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row);

/// \brief Load string into a tensor, append tensor to tensor row
/// \param[in] json_obj Json object containing string tensor
/// \param[in] col_num Column num in schema
/// \param[inout] row Tensor row to push to
/// \return Status The error code return
Status LoadStringTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row);

/// \brief Load float value to tensor row
/// \param[in] json_obj Json object containing float
/// \param[in] col_num Column num in schema
/// \param[inout] row Tensor row to push to
/// \return Status The error code return
Status LoadFloatTensor(const nlohmann::json &json_obj, uint32_t col_num, TensorRow *row);

/// \brief Load empty tensor to tensor row
/// \param[in] col_num Column num in schema
/// \param[inout] row Tensor row to push to
/// \return Status The error code return
Status LoadEmptyTensor(uint32_t col_num, TensorRow *row);

/// \brief Load id from file name to tensor row
/// \param[in] file The file name to get ID from
/// \param[in] col_num Column num in schema
/// \param[inout] row Tensor row to push to
/// \return Status The error code return
Status LoadIDTensor(const std::string &file, uint32_t col_num, TensorRow *row);

/// \brief Load a tensor row according to a json file
/// \param[in] file Json file location
/// \param[inout] row Json content stored into a tensor row
/// \return Status The error code return
Status LoadTensorRow(const std::string &file, TensorRow *row);

/// \brief NOTE(review): presumably fills a DataBuffer with the rows selected by keys; confirm in album_op.cc
/// \param[in] keys Keys in ioblock
/// \param[inout] db Databuffer to push to
/// \return Status The error code return
Status LoadBuffer(const std::vector<int64_t> &keys, std::unique_ptr<DataBuffer> *db);

/// \brief Called first when function is called
/// \return Status The error code return
Status LaunchThreadsAndInitOp();

/// \brief reset Op
/// \return Status The error code return
Status Reset() override;

/// \brief Private function for computing the assignment of the column name map.
/// \return Status The error code return
Status ComputeColMap() override;

// Data members of AlbumOp.
// NOTE(review): their initialization is not visible in this header; see the constructor in album_op.cc.
int32_t rows_per_buffer_;
std::string folder_path_; // directory of image folder
bool decode_;
std::set<std::string> extensions_; // extensions allowed
std::unordered_map<std::string, int32_t> col_name_map_;
std::unique_ptr<DataSchema> data_schema_;
std::shared_ptr<Sampler> sampler_;
int64_t row_cnt_;
int64_t buf_cnt_;
int64_t sampler_ind_;
int64_t dirname_offset_;
WaitPost wp_;
std::vector<std::string> image_rows_;
QueueList<std::unique_ptr<IOBlock>> io_block_queues_; // queues of IOBlocks
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_ALBUM_OP_H_

+ 0
- 1
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc View File

@@ -134,7 +134,6 @@ Status ImageFolderOp::operator()() {
TensorRow sample_row;
RETURN_IF_NOT_OK(sampler_buffer->PopRow(&sample_row));
std::shared_ptr<Tensor> sample_ids = sample_row[0];
if (sample_ids->type() != DataType(DataType::DE_INT64)) RETURN_STATUS_UNEXPECTED("Sampler Tensor isn't int64");
for (auto itr = sample_ids->begin<int64_t>(); itr != sample_ids->end<int64_t>(); ++itr) {
if ((*itr) >= num_rows_) continue; // index out of bound, skipping
keys.push_back(*itr);


+ 6
- 0
mindspore/ccsrc/minddata/dataset/engine/opt/pass.cc View File

@@ -30,6 +30,7 @@
#include "minddata/dataset/engine/datasetops/repeat_op.h"
#include "minddata/dataset/engine/datasetops/skip_op.h"
#include "minddata/dataset/engine/datasetops/shuffle_op.h"
#include "minddata/dataset/engine/datasetops/source/album_op.h"
#include "minddata/dataset/engine/datasetops/source/celeba_op.h"
#include "minddata/dataset/engine/datasetops/source/cifar_op.h"
#include "minddata/dataset/engine/datasetops/source/coco_op.h"
@@ -199,6 +200,11 @@ Status NodePass::RunOnNode(std::shared_ptr<ImageFolderOp> node, bool *modified)
return RunOnNode(std::static_pointer_cast<DatasetOp>(node), modified);
}

Status NodePass::RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) {
  // No AlbumOp-specific handling in the base pass: upcast and defer to the generic DatasetOp visitor.
  std::shared_ptr<DatasetOp> as_dataset_op = std::static_pointer_cast<DatasetOp>(node);
  return RunOnNode(as_dataset_op, modified);
}

Status NodePass::RunOnNode(std::shared_ptr<CacheOp> node, bool *modified) {
// Fallback to base class visitor by default
return RunOnNode(std::static_pointer_cast<DatasetOp>(node), modified);


+ 4
- 0
mindspore/ccsrc/minddata/dataset/engine/opt/pass.h View File

@@ -49,6 +49,8 @@ class FilterOp;
class GeneratorOp;
#endif

class AlbumOp;

class RandomDataOp;

class RepeatOp;
@@ -178,6 +180,8 @@ class NodePass : public Pass {

virtual Status RunOnNode(std::shared_ptr<RandomDataOp> node, bool *modified);

virtual Status RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified);

virtual Status RunOnNode(std::shared_ptr<TakeOp> node, bool *modified);

virtual Status RunOnNode(std::shared_ptr<ZipOp> node, bool *modified);


+ 6
- 0
mindspore/ccsrc/minddata/dataset/engine/opt/pre/cache_transform_pass.cc View File

@@ -21,6 +21,7 @@
#include "minddata/dataset/engine/datasetops/cache_lookup_op.h"
#include "minddata/dataset/engine/datasetops/cache_merge_op.h"
#include "minddata/dataset/engine/datasetops/cache_op.h"
#include "minddata/dataset/engine/datasetops/source/album_op.h"
#include "minddata/dataset/engine/datasetops/source/celeba_op.h"
#include "minddata/dataset/engine/datasetops/source/cifar_op.h"
#include "minddata/dataset/engine/datasetops/source/coco_op.h"
@@ -152,6 +153,11 @@ Status CacheTransformPass::CachePass::RunOnNode(std::shared_ptr<ImageFolderOp> n
return MappableCacheLeafSetup(std::static_pointer_cast<DatasetOp>(node));
}

// Leaf node cache transform identification for AlbumOp: forwards the op, upcast to DatasetOp,
// to MappableCacheLeafSetup. The modified flag is not written here.
Status CacheTransformPass::CachePass::RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) {
  std::shared_ptr<DatasetOp> leaf_op = std::static_pointer_cast<DatasetOp>(node);
  return MappableCacheLeafSetup(leaf_op);
}

// Perform leaf node cache transform identification
Status CacheTransformPass::CachePass::RunOnNode(std::shared_ptr<MnistOp> node, bool *modified) {
return MappableCacheLeafSetup(std::static_pointer_cast<DatasetOp>(node));


+ 6
- 0
mindspore/ccsrc/minddata/dataset/engine/opt/pre/cache_transform_pass.h View File

@@ -79,6 +79,12 @@ class CacheTransformPass : public TreePass {
/// \return Status The error code return
Status RunOnNode(std::shared_ptr<ImageFolderOp> node, bool *modified) override;

/// \brief Perform leaf node cache transform identifications
/// \param[in] node The node being visited
/// \param[inout] modified Indicator if the node was changed at all
/// \return Status The error code return
Status RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) override;

/// \brief Perform leaf node cache transform identifications
/// \param[in] node The node being visited
/// \param[inout] modified Indicator if the node was changed at all


+ 6
- 0
mindspore/ccsrc/minddata/dataset/engine/opt/util/printer_pass.cc View File

@@ -111,5 +111,11 @@ Status PrinterPass::RunOnNode(std::shared_ptr<ImageFolderOp> node, bool *modifie
std::cout << "Visiting ImageFolderOp" << '\n';
return Status::OK();
}

/// \brief Visitor for AlbumOp nodes: prints a trace line and reports the node as unmodified.
/// \param[in] node The AlbumOp being visited (not inspected by this function)
/// \param[inout] modified Set to false by this function
/// \return Status Always Status::OK()
Status PrinterPass::RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) {
  *modified = false;
  // The trace must name the op actually visited; the text was previously copied from the ImageFolderOp overload.
  std::cout << "Visiting AlbumOp" << '\n';
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 2
- 0
mindspore/ccsrc/minddata/dataset/engine/opt/util/printer_pass.h View File

@@ -58,6 +58,8 @@ class PrinterPass : public NodePass {
Status RunOnNode(std::shared_ptr<DeviceQueueOp> node, bool *modified) override;

Status RunOnNode(std::shared_ptr<ImageFolderOp> node, bool *modified) override;

Status RunOnNode(std::shared_ptr<AlbumOp> node, bool *modified) override;
};

} // namespace dataset


+ 90
- 50
mindspore/ccsrc/minddata/dataset/include/datasets.h View File

@@ -48,6 +48,7 @@ class TensorOperation;
class SchemaObj;
class SamplerObj;
// Datasets classes (in alphabetical order)
class AlbumDataset;
class CelebADataset;
class Cifar10Dataset;
class Cifar100Dataset;
@@ -79,13 +80,27 @@ class ZipDataset;
/// \return Shared pointer to the current schema
std::shared_ptr<SchemaObj> Schema(const std::string &schema_file = "");

/// \brief Function to create an AlbumDataset
/// \notes The generated dataset is specified through setting a schema
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] data_schema Path to dataset schema file
/// \param[in] column_names Column names used to specify columns to load. If empty, all columns are read
/// (default = {})
/// \param[in] decode The option to decode the images in dataset (default = false)
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = nullptr)
/// \return Shared pointer to the current AlbumDataset
std::shared_ptr<AlbumDataset> Album(const std::string &dataset_dir, const std::string &data_schema,
const std::vector<std::string> &column_names = {}, bool decode = false,
const std::shared_ptr<SamplerObj> &sampler = nullptr);

/// \brief Function to create a CelebADataset
/// \notes The generated dataset has two columns ['image', 'attr'].
// The type of the image tensor is uint8. The attr tensor is uint32 and one hot type.
// The type of the image tensor is uint8. The attr tensor is uint32 and one hot type.
/// \param[in] dataset_dir Path to the root directory that contains the dataset.
/// \param[in] dataset_type One of 'all', 'train', 'valid' or 'test'.
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
/// will be used to randomly iterate the entire dataset
/// will be used to randomly iterate the entire dataset
/// \param[in] decode Decode the images after reading (default=false).
/// \param[in] extensions Set of file extensions to be included in the dataset (default={}).
/// \return Shared pointer to the current Dataset
@@ -97,7 +112,7 @@ std::shared_ptr<CelebADataset> CelebA(const std::string &dataset_dir, const std:
/// \notes The generated dataset has two columns ['image', 'label']
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
/// will be used to randomly iterate the entire dataset
/// will be used to randomly iterate the entire dataset
/// \return Shared pointer to the current Dataset
std::shared_ptr<Cifar10Dataset> Cifar10(const std::string &dataset_dir,
const std::shared_ptr<SamplerObj> &sampler = nullptr);
@@ -106,7 +121,7 @@ std::shared_ptr<Cifar10Dataset> Cifar10(const std::string &dataset_dir,
/// \notes The generated dataset has three columns ['image', 'coarse_label', 'fine_label']
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
/// will be used to randomly iterate the entire dataset
/// will be used to randomly iterate the entire dataset
/// \return Shared pointer to the current Dataset
std::shared_ptr<Cifar100Dataset> Cifar100(const std::string &dataset_dir,
const std::shared_ptr<SamplerObj> &sampler = nullptr);
@@ -114,19 +129,19 @@ std::shared_ptr<Cifar100Dataset> Cifar100(const std::string &dataset_dir,
/// \brief Function to create a CLUEDataset
/// \notes The generated dataset has a variable number of columns depending on the task and usage
/// \param[in] dataset_files List of files to be read to search for a pattern of files. The list
/// will be sorted in a lexicographical order.
/// will be sorted in a lexicographical order.
/// \param[in] task The kind of task, one of "AFQMC", "TNEWS", "IFLYTEK", "CMNLI", "WSC" and "CSL" (default="AFQMC").
/// \param[in] usage Be used to "train", "test" or "eval" data (default="train").
/// \param[in] num_samples The number of samples to be included in the dataset.
/// (Default = 0 means all samples.)
/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode::kGlobal)
/// Can be any of:
/// ShuffleMode::kFalse - No shuffling is performed.
/// ShuffleMode::kFiles - Shuffle files only.
/// ShuffleMode::kGlobal - Shuffle both the files and samples.
/// (Default = 0 means all samples.)
/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode.kGlobal)
/// Can be any of:
/// ShuffleMode.kFalse - No shuffling is performed.
/// ShuffleMode.kFiles - Shuffle files only.
/// ShuffleMode.kGlobal - Shuffle both the files and samples.
/// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1)
/// \param[in] shard_id The shard ID within num_shards. This argument should be
/// specified only when num_shards is also specified. (Default = 0)
/// specified only when num_shards is also specified. (Default = 0)
/// \return Shared pointer to the current CLUEDataset
std::shared_ptr<CLUEDataset> CLUE(const std::vector<std::string> &dataset_files, const std::string &task = "AFQMC",
const std::string &usage = "train", int64_t num_samples = 0,
@@ -135,19 +150,19 @@ std::shared_ptr<CLUEDataset> CLUE(const std::vector<std::string> &dataset_files,

/// \brief Function to create a CocoDataset
/// \notes The generated dataset has multi-columns :
/// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32],
/// ['iscrowd', dtype=uint32]].
/// - task='Stuff', column: [['image', dtype=uint8], ['segmentation',dtype=float32], ['iscrowd', dtype=uint32]].
/// - task='Keypoint', column: [['image', dtype=uint8], ['keypoints', dtype=float32],
/// ['num_keypoints', dtype=uint32]].
/// - task='Panoptic', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32],
/// ['iscrowd', dtype=uint32], ['area', dtype=uitn32]].
/// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32],
/// ['iscrowd', dtype=uint32]].
/// - task='Stuff', column: [['image', dtype=uint8], ['segmentation',dtype=float32], ['iscrowd', dtype=uint32]].
/// - task='Keypoint', column: [['image', dtype=uint8], ['keypoints', dtype=float32],
/// ['num_keypoints', dtype=uint32]].
/// - task='Panoptic', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32],
/// ['iscrowd', dtype=uint32], ['area', dtype=uint32]].
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] annotation_file Path to the annotation json
/// \param[in] task Set the task type of reading coco data, now support 'Detection'/'Stuff'/'Panoptic'/'Keypoint'
/// \param[in] decode Decode the images after reading
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
/// will be used to randomly iterate the entire dataset
/// will be used to randomly iterate the entire dataset
/// \return Shared pointer to the current Dataset
std::shared_ptr<CocoDataset> Coco(const std::string &dataset_dir, const std::string &annotation_file,
const std::string &task = "Detection", const bool &decode = false,
@@ -181,12 +196,12 @@ std::shared_ptr<CSVDataset> CSV(const std::vector<std::string> &dataset_files, c

/// \brief Function to create an ImageFolderDataset
/// \notes A source dataset that reads images from a tree of directories
/// All images within one folder have the same label
/// The generated dataset has two columns ['image', 'label']
/// All images within one folder have the same label
/// The generated dataset has two columns ['image', 'label']
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] decode A flag to decode in ImageFolder
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
/// A `RandomSampler` will be used to randomly iterate the entire dataset
/// A `RandomSampler` will be used to randomly iterate the entire dataset
/// \param[in] extensions File extensions to be read
/// \param[in] class_indexing a class name to label map
/// \return Shared pointer to the current ImageFolderDataset
@@ -200,9 +215,9 @@ std::shared_ptr<ImageFolderDataset> ImageFolder(const std::string &dataset_dir,
/// \param[in] dataset_file The dataset file to be read
/// \param[in] usage Need "train", "eval" or "inference" data (default="train")
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
/// A `RandomSampler` will be used to randomly iterate the entire dataset
/// A `RandomSampler` will be used to randomly iterate the entire dataset
/// \param[in] class_indexing A str-to-int mapping from label name to index (default={}, the folder
/// names will be sorted alphabetically and each class will be given a unique index starting from 0).
/// names will be sorted alphabetically and each class will be given a unique index starting from 0).
/// \param[in] decode Decode the images after reading (default=false).
/// \return Shared pointer to the current ManifestDataset
std::shared_ptr<ManifestDataset> Manifest(std::string dataset_file, std::string usage = "train",
@@ -214,7 +229,7 @@ std::shared_ptr<ManifestDataset> Manifest(std::string dataset_file, std::string
/// \notes The generated dataset has two columns ['image', 'label']
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
/// A `RandomSampler` will be used to randomly iterate the entire dataset
/// A `RandomSampler` will be used to randomly iterate the entire dataset
/// \return Shared pointer to the current MnistDataset
std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir,
const std::shared_ptr<SamplerObj> &sampler = nullptr);
@@ -245,17 +260,17 @@ std::shared_ptr<RandomDataset> RandomData(const int32_t &total_rows = 0, T schem
/// \brief Function to create a TextFileDataset
/// \notes The generated dataset has one column ['text']
/// \param[in] dataset_files List of files to be read to search for a pattern of files. The list
/// will be sorted in a lexicographical order.
/// will be sorted in a lexicographical order.
/// \param[in] num_samples The number of samples to be included in the dataset.
/// (Default = 0 means all samples.)
/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode::kGlobal)
/// Can be any of:
/// ShuffleMode::kFalse - No shuffling is performed.
/// ShuffleMode::kFiles - Shuffle files only.
/// ShuffleMode::kGlobal - Shuffle both the files and samples.
/// (Default = 0 means all samples.)
/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode.kGlobal)
/// Can be any of:
/// ShuffleMode.kFalse - No shuffling is performed.
/// ShuffleMode.kFiles - Shuffle files only.
/// ShuffleMode.kGlobal - Shuffle both the files and samples.
/// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1)
/// \param[in] shard_id The shard ID within num_shards. This argument should be
/// specified only when num_shards is also specified. (Default = 0)
/// specified only when num_shards is also specified. (Default = 0)
/// \return Shared pointer to the current TextFileDataset
std::shared_ptr<TextFileDataset> TextFile(const std::vector<std::string> &dataset_files, int64_t num_samples = 0,
ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1,
@@ -263,16 +278,16 @@ std::shared_ptr<TextFileDataset> TextFile(const std::vector<std::string> &datase

/// \brief Function to create a VOCDataset
/// \notes The generated dataset has multi-columns :
/// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['label', dtype=uint32],
/// ['difficult', dtype=uint32], ['truncate', dtype=uint32]].
/// - task='Segmentation', column: [['image', dtype=uint8], ['target',dtype=uint8]].
/// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['label', dtype=uint32],
/// ['difficult', dtype=uint32], ['truncate', dtype=uint32]].
/// - task='Segmentation', column: [['image', dtype=uint8], ['target',dtype=uint8]].
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] task Set the task type of reading voc data, now only support "Segmentation" or "Detection"
/// \param[in] mode Set the data list txt file to be read
/// \param[in] class_indexing A str-to-int mapping from label name to index
/// \param[in] decode Decode the images after reading
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
/// will be used to randomly iterate the entire dataset
/// will be used to randomly iterate the entire dataset
/// \return Shared pointer to the current Dataset
std::shared_ptr<VOCDataset> VOC(const std::string &dataset_dir, const std::string &task = "Segmentation",
const std::string &mode = "train",
@@ -335,9 +350,9 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
/// \notes Combines batch_size number of consecutive rows into batches
/// \param[in] batch_size Path to the root directory that contains the dataset
/// \param[in] drop_remainder Determines whether or not to drop the last possibly incomplete
/// batch. If true, and if there are less than batch_size rows
/// available to make the last batch, then those rows will
/// be dropped and not propagated to the next node
/// batch. If true, and if there are less than batch_size rows
/// available to make the last batch, then those rows will
/// be dropped and not propagated to the next node
/// \return Shared pointer to the current BatchDataset
std::shared_ptr<BatchDataset> Batch(int32_t batch_size, bool drop_remainder = false);

@@ -368,16 +383,16 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
/// \brief Function to create a MapDataset
/// \notes Applies each operation in operations to this dataset
/// \param[in] operations Vector of operations to be applied on the dataset. Operations are
/// applied in the order they appear in this list
/// applied in the order they appear in this list
/// \param[in] input_columns Vector of the names of the columns that will be passed to the first
/// operation as input. The size of this list must match the number of
/// input columns expected by the first operator. The default input_columns
/// is the first column
/// operation as input. The size of this list must match the number of
/// input columns expected by the first operator. The default input_columns
/// is the first column
/// \param[in] output_columns Vector of names assigned to the columns outputted by the last operation
/// This parameter is mandatory if len(input_columns) != len(output_columns)
/// The size of this list must match the number of output columns of the
/// last operation. The default output_columns will have the same
/// name as the input columns, i.e., the columns will be replaced
/// This parameter is mandatory if len(input_columns) != len(output_columns)
/// The size of this list must match the number of output columns of the
/// last operation. The default output_columns will have the same
/// name as the input columns, i.e., the columns will be replaced
/// \param[in] project_columns A list of column names to project
/// \return Shared pointer to the current MapDataset
std::shared_ptr<MapDataset> Map(std::vector<std::shared_ptr<TensorOperation>> operations,
@@ -404,7 +419,7 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
/// \param[in] count Number of times the dataset should be repeated
/// \return Shared pointer to the current Dataset
/// \note Repeat will return shared pointer to `Dataset` instead of `RepeatDataset`
/// due to a limitation in the current implementation
/// due to a limitation in the current implementation
std::shared_ptr<Dataset> Repeat(int32_t count = -1);

/// \brief Function to create a Shuffle Dataset
@@ -506,6 +521,31 @@ class SchemaObj {
// DERIVED DATASET CLASSES FOR LEAF-NODE DATASETS
// (In alphabetical order)

/// \brief Leaf-node dataset class for the Album format; takes the same arguments as the Album() factory function
class AlbumDataset : public Dataset {
public:
/// \brief Constructor
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] data_schema Path to dataset schema file
/// \param[in] column_names Column names used to specify columns to load
/// \param[in] decode The option to decode the images in dataset
/// \param[in] sampler Object used to choose samples from the dataset
AlbumDataset(const std::string &dataset_dir, const std::string &data_schema,
const std::vector<std::string> &column_names, bool decode, const std::shared_ptr<SamplerObj> &sampler);

/// \brief Destructor
~AlbumDataset() = default;

/// \brief a base class override function to create a runtime dataset op object from this class
/// \return vector of shared pointers to the newly created DatasetOp(s)
std::vector<std::shared_ptr<DatasetOp>> Build() override;

/// \brief Parameters validation
/// \return bool true if all the params are valid
bool ValidateParams() override;

private:
std::string dataset_dir_;
std::string schema_path_;  // NOTE(review): presumably holds the data_schema constructor argument; confirm in datasets.cc
std::vector<std::string> column_names_;
bool decode_;
std::shared_ptr<SamplerObj> sampler_;
};

class CelebADataset : public Dataset {
public:
/// \brief Constructor


+ 2
- 0
tests/ut/cpp/dataset/CMakeLists.txt View File

@@ -5,6 +5,7 @@ SET(DE_UT_SRCS
common/cvop_common.cc
common/bboxop_common.cc
auto_contrast_op_test.cc
album_op_test.cc
batch_op_test.cc
bit_functions_test.cc
storage_container_test.cc
@@ -101,6 +102,7 @@ SET(DE_UT_SRCS
c_api_samplers_test.cc
c_api_transforms_test.cc
c_api_dataset_ops_test.cc
c_api_dataset_album_test.cc
c_api_dataset_cifar_test.cc
c_api_dataset_clue_test.cc
c_api_dataset_coco_test.cc


+ 208
- 0
tests/ut/cpp/dataset/album_op_test.cc View File

@@ -0,0 +1,208 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include "common/common.h"
#include "minddata/dataset/core/client.h"
#include "minddata/dataset/core/global_context.h"
#include "minddata/dataset/engine/datasetops/source/album_op.h"
#include "minddata/dataset/engine/datasetops/source/sampler/distributed_sampler.h"
#include "minddata/dataset/engine/datasetops/source/sampler/pk_sampler.h"
#include "minddata/dataset/engine/datasetops/source/sampler/random_sampler.h"
#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
#include "minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.h"
#include "minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h"
#include "minddata/dataset/util/path.h"
#include "minddata/dataset/util/status.h"
#include "gtest/gtest.h"
#include "utils/log_adapter.h"
#include "securec.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/transforms.h"

using namespace mindspore::dataset;
using mindspore::MsLogLevel::ERROR;
using mindspore::ExceptionType::NoExceptionType;
using mindspore::LogStream;

// Forward declarations of shared test helpers.
// NOTE(review): no definitions are visible in this part of the file; presumably provided by another
// test source linked into the same binary - confirm.
std::shared_ptr<BatchOp> Batch(int batch_size = 1, bool drop = false, int rows_per_buf = 2);

std::shared_ptr<RepeatOp> Repeat(int repeat_cnt);

std::shared_ptr<ExecutionTree> Build(std::vector<std::shared_ptr<DatasetOp>> ops);

std::shared_ptr<AlbumOp> Album(int64_t num_works, int64_t rows, int64_t conns, std::string path,
bool shuf = false, std::unique_ptr<Sampler> sampler = nullptr,
bool decode = false) {
std::shared_ptr<AlbumOp> so;
AlbumOp::Builder builder;
Status rc = builder.SetNumWorkers(num_works)
.SetAlbumDir(path)
.SetRowsPerBuffer(rows)
.SetOpConnectorSize(conns)
.SetExtensions({".json"})
.SetSampler(std::move(sampler))
.SetDecode(decode)
.Build(&so);
return so;
}

std::shared_ptr<AlbumOp> AlbumSchema(int64_t num_works, int64_t rows, int64_t conns, std::string path,
std::string schema_file, std::vector<std::string> column_names = {},
bool shuf = false, std::unique_ptr<Sampler> sampler = nullptr,
bool decode = false) {
std::shared_ptr<AlbumOp> so;
AlbumOp::Builder builder;
Status rc = builder.SetNumWorkers(num_works)
.SetSchemaFile(schema_file)
.SetColumnsToLoad(column_names)
.SetAlbumDir(path)
.SetRowsPerBuffer(rows)
.SetOpConnectorSize(conns)
.SetExtensions({".json"})
.SetSampler(std::move(sampler))
.SetDecode(decode)
.Build(&so);
return so;
}

// Fixture for the AlbumOp tests. It declares no members of its own; everything
// comes from UT::DatasetOpTesting.
class MindDataTestAlbum : public UT::DatasetOpTesting {};

// Reads the testAlbum images through an AlbumOp created with an explicit column
// list, repeats the dataset twice and expects 14 rows in total.
TEST_F(MindDataTestAlbum, TestSequentialAlbumWithSchema) {
  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
  std::vector<std::string> column_names = {"image", "label", "id"};
  auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file, column_names, false), Repeat(2)});
  tree->Prepare();
  Status rc = tree->Launch();
  if (rc.IsError()) {
    MS_LOG(ERROR) << "Return code error detected during tree launch: " << ".";
    EXPECT_TRUE(false);
  } else {
    DatasetIterator di(tree);
    TensorMap tensor_map;
    // Check the status of the first fetch itself; the launch status was already
    // known to be OK in this branch, so re-checking it verified nothing.
    rc = di.GetNextAsMap(&tensor_map);
    EXPECT_TRUE(rc.IsOk());
    uint64_t i = 0;
    int32_t label = 0;
    while (tensor_map.size() != 0) {
      // NOTE(review): datasetSchema.json declares "label" as a string column, so
      // this int32 read may not succeed; its status is ignored and `label` is
      // only used for the debug log below — confirm the intended type.
      tensor_map["label"]->GetItemAt<int32_t>(&label, {});
      // Log the label tensor's shape (previously the shared_ptr itself was streamed).
      MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape"
                    << tensor_map["label"]->shape() << "\n";
      i++;
      di.GetNextAsMap(&tensor_map);
    }
    MS_LOG(INFO) << "got rows" << i << "\n";
    // EXPECT_EQ reports the actual row count on failure.
    EXPECT_EQ(i, 14);
  }
}

// Same as the schema test but without an explicit column list (AlbumSchema's
// default empty list is used); repeats twice and expects 14 rows in total.
TEST_F(MindDataTestAlbum, TestSequentialAlbumWithSchemaNoOrder) {
  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
  auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file), Repeat(2)});
  tree->Prepare();
  Status rc = tree->Launch();
  if (rc.IsError()) {
    MS_LOG(ERROR) << "Return code error detected during tree launch: " << ".";
    EXPECT_TRUE(false);
  } else {
    DatasetIterator di(tree);
    TensorMap tensor_map;
    // Check the status of the first fetch itself; the launch status was already
    // known to be OK in this branch, so re-checking it verified nothing.
    rc = di.GetNextAsMap(&tensor_map);
    EXPECT_TRUE(rc.IsOk());
    uint64_t i = 0;
    int32_t label = 0;
    while (tensor_map.size() != 0) {
      // NOTE(review): datasetSchema.json declares "label" as a string column, so
      // this int32 read may not succeed; its status is ignored and `label` is
      // only used for the debug log below — confirm the intended type.
      tensor_map["label"]->GetItemAt<int32_t>(&label, {});
      // Log the label tensor's shape (previously the shared_ptr itself was streamed).
      MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape"
                    << tensor_map["label"]->shape() << "\n";
      i++;
      di.GetNextAsMap(&tensor_map);
    }
    MS_LOG(INFO) << "got rows" << i << "\n";
    // EXPECT_EQ reports the actual row count on failure.
    EXPECT_EQ(i, 14);
  }
}

// Uses floatSchema.json, reads the "_priority" column as a double for every row,
// repeats the dataset twice and expects 14 rows in total.
TEST_F(MindDataTestAlbum, TestSequentialAlbumWithSchemaFloat) {
  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  // add the priority column
  std::string schema_file = datasets_root_path_ + "/testAlbum/floatSchema.json";
  auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file), Repeat(2)});
  tree->Prepare();
  Status rc = tree->Launch();
  if (rc.IsError()) {
    MS_LOG(ERROR) << "Return code error detected during tree launch: " << ".";
    EXPECT_TRUE(false);
  } else {
    DatasetIterator di(tree);
    TensorMap tensor_map;
    // Check the status of the first fetch itself; the launch status was already
    // known to be OK in this branch, so re-checking it verified nothing.
    rc = di.GetNextAsMap(&tensor_map);
    EXPECT_TRUE(rc.IsOk());
    uint64_t i = 0;
    int32_t label = 0;
    double priority = 0;
    while (tensor_map.size() != 0) {
      // NOTE(review): the schema's "label" column may be a string type, in which
      // case this int32 read would not succeed; its status is ignored and the
      // value is only logged — confirm the intended type.
      tensor_map["label"]->GetItemAt<int32_t>(&label, {});
      tensor_map["_priority"]->GetItemAt<double>(&priority, {});
      // Log the label tensor's shape (previously the shared_ptr itself was streamed).
      MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape"
                    << tensor_map["label"]->shape() << "priority: " << priority << "\n";
      i++;
      di.GetNextAsMap(&tensor_map);
    }
    MS_LOG(INFO) << "got rows" << i << "\n";
    // EXPECT_EQ reports the actual row count on failure.
    EXPECT_EQ(i, 14);
  }
}

// Uses fullSchema.json, reads the "_priority" and "_embedding" columns for every
// row, repeats the dataset twice and expects 14 rows in total.
TEST_F(MindDataTestAlbum, TestSequentialAlbumWithFullSchema) {
  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  // schema used here must provide the _priority and _embedding columns read below
  std::string schema_file = datasets_root_path_ + "/testAlbum/fullSchema.json";
  auto tree = Build({AlbumSchema(16, 2, 32, folder_path, schema_file), Repeat(2)});
  tree->Prepare();
  Status rc = tree->Launch();
  if (rc.IsError()) {
    MS_LOG(ERROR) << "Return code error detected during tree launch: " << ".";
    EXPECT_TRUE(false);
  } else {
    DatasetIterator di(tree);
    TensorMap tensor_map;
    // Check the status of the first fetch itself; the launch status was already
    // known to be OK in this branch, so re-checking it verified nothing.
    rc = di.GetNextAsMap(&tensor_map);
    EXPECT_TRUE(rc.IsOk());
    uint64_t i = 0;
    int32_t label = 0;
    double priority = 0;
    while (tensor_map.size() != 0) {
      // NOTE(review): the schema's "label" column may be a string type, in which
      // case this int32 read would not succeed; its status is ignored and the
      // value is only logged — confirm the intended type.
      tensor_map["label"]->GetItemAt<int32_t>(&label, {});
      tensor_map["_priority"]->GetItemAt<double>(&priority, {});
      // Log the label tensor's shape (previously the shared_ptr itself was streamed).
      MS_LOG(DEBUG) << "row: " << i << "\t" << tensor_map["image"]->shape() << "label:" << label << "label shape"
                    << tensor_map["label"]->shape() << "priority: " << priority << " embedding : "
                    << tensor_map["_embedding"]->shape() << "\n";
      i++;
      di.GetNextAsMap(&tensor_map);
    }
    MS_LOG(INFO) << "got rows" << i << "\n";
    // EXPECT_EQ reports the actual row count on failure.
    EXPECT_EQ(i, 14);
  }
}


+ 136
- 0
tests/ut/cpp/dataset/c_api_dataset_album_test.cc View File

@@ -0,0 +1,136 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "common/common.h"
#include "minddata/dataset/include/datasets.h"

using namespace mindspore::dataset::api;
using mindspore::dataset::Tensor;

// Fixture for the C++ API pipeline tests. It declares no members of its own;
// everything comes from UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {};

// Creates an Album dataset with default options, iterates over it and expects
// 7 rows.
TEST_F(MindDataTestPipeline, TestAlbumBasic) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumBasic.";

  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
  std::vector<std::string> column_names = {"image", "label", "id"};
  // Create an Album Dataset
  std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names);
  // Fatal assertion: ds is dereferenced right below, so the test must stop if it is null.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal assertion: iter is dereferenced right below.
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  while (row.size() != 0) {
    i++;
    auto image = row["image"];
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    iter->GetNextRow(&row);
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}

// Creates an Album dataset with decode = true, checks that every image tensor's
// shape has Size() greater than 1, and expects 7 rows.
TEST_F(MindDataTestPipeline, TestAlbumDecode) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumDecode.";
  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
  std::vector<std::string> column_names = {"image", "label", "id"};
  // Create an Album Dataset
  std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true);
  // Fatal assertion: ds is dereferenced right below, so the test must stop if it is null.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal assertion: iter is dereferenced right below.
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  while (row.size() != 0) {
    i++;
    auto image = row["image"];
    auto shape = image->shape();
    MS_LOG(INFO) << "Tensor image shape size: " << shape.Size();
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    EXPECT_GT(shape.Size(), 1);  // Verify decode=true took effect
    iter->GetNextRow(&row);
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}

// Creates an Album dataset with SequentialSampler(0, 1) and expects exactly one
// row to be produced.
TEST_F(MindDataTestPipeline, TestAlbumNumSamplers) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumNumSamplers.";
  std::string folder_path = datasets_root_path_ + "/testAlbum/images";
  std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
  std::vector<std::string> column_names = {"image", "label", "id"};
  // Create an Album Dataset
  std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true, SequentialSampler(0, 1));
  // Fatal assertion: ds is dereferenced right below, so the test must stop if it is null.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal assertion: iter is dereferenced right below.
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  while (row.size() != 0) {
    i++;
    auto image = row["image"];
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    iter->GetNextRow(&row);
  }

  EXPECT_EQ(i, 1);

  // Manually terminate the pipeline
  iter->Stop();
}

// Negative case: Album() is given the folder "/testAlbum/ima" (presumably a
// directory that does not exist) and is expected to return a null dataset.
TEST_F(MindDataTestPipeline, TestAlbumError) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumError.";

  std::string bad_folder = datasets_root_path_ + "/testAlbum/ima";
  std::string schema_path = datasets_root_path_ + "/testAlbum/datasetSchema.json";
  std::vector<std::string> columns = {"image", "label", "id"};

  // Attempt to create the dataset; no iterator is created in this test.
  std::shared_ptr<Dataset> dataset = Album(bad_folder, schema_path, columns, true, SequentialSampler(0, 1));

  EXPECT_EQ(dataset, nullptr);
}

+ 2
- 0
tests/ut/cpp/runtest.sh View File

@@ -32,6 +32,8 @@ export GLOG_v=2

## prepare data for dataset & mindrecord
cp -fr $PROJECT_PATH/tests/ut/data ${PROJECT_PATH}/build/mindspore/tests/ut/cpp/
## prepare album dataset, uses absolute path so has to be generated
python ${PROJECT_PATH}/build/mindspore/tests/ut/cpp/data/dataset/testAlbum/gen_json.py

if [ $# -gt 0 ]; then
./ut_tests --gtest_filter=$1


+ 1
- 0
tests/ut/data/dataset/testAlbum/bin/sample.bin View File

@@ -0,0 +1 @@
just some random stuff

+ 16
- 0
tests/ut/data/dataset/testAlbum/datasetSchema.json View File

@@ -0,0 +1,16 @@
{
"columns": {
"image": {
"type": "uint8",
"rank": 1
},
"label" : {
"type": "string",
"rank": 1
},
"id" : {
"type": "int64",
"rank": 0
}
}
}

+ 1
- 1
tests/ut/data/dataset/testAlbum/floatSchema.json View File

@@ -5,7 +5,7 @@
"rank": 1
},
"label" : {
"type": "int32",
"type": "string",
"rank": 1
},
"id" : {


+ 1
- 1
tests/ut/data/dataset/testAlbum/fullSchema.json View File

@@ -5,7 +5,7 @@
"rank": 1
},
"label" : {
"type": "int32",
"type": "string",
"rank": 1
},
"id" : {


+ 11
- 11
tests/ut/data/dataset/testAlbum/gen_json.py View File

@@ -2,21 +2,21 @@ import json
import os

def dump_json_from_dict(structure, file_name):
with open(file_name + '.json', 'w') as file_path:
json.dump(structure, file_path)
with open(file_name + '.json', 'w') as fp:
json.dump(structure, fp)

if __name__ == '__main__':
# iterate over directory
DIRECTORY = "imagefolder"
i = 0
# iterate over DIRECTORY
DIRECTORY = os.path.dirname(os.path.realpath(__file__)) + "/original"
PARENT_DIR = os.path.dirname(DIRECTORY)
i = -1
for filename in os.listdir(DIRECTORY):
default_dict = {}
default_dict.update(dataset='')
default_dict.update(image=(os.path.join(DIRECTORY, filename)))
default_dict.update(label=[1, 2])
default_dict.update(image=os.path.abspath(os.path.join(DIRECTORY, filename)))
default_dict.update(label=['3', '2'])
default_dict.update(_priority=0.8)
default_dict.update(_embedding='sample.bin')
default_dict.update(_segmented_image=(os.path.join(DIRECTORY, filename)))
default_dict.update(_processed_image=(os.path.join(DIRECTORY, filename)))
default_dict.update(_embedding=os.path.abspath(os.path.join(PARENT_DIR, 'sample.bin')))
default_dict.update(_processed_image=os.path.abspath(os.path.join(DIRECTORY, filename)))
i = i + 1
dump_json_from_dict(default_dict, 'images/'+str(i))
dump_json_from_dict(default_dict, PARENT_DIR + '/images/'+str(i))

+ 1
- 0
tests/ut/data/dataset/testAlbum/images/0.json View File

@@ -0,0 +1 @@
{"dataset": "", "image": "original/apple_expect_decoded.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "sample.bin", "_processed_image": "original/apple_expect_decoded.jpg"}

+ 1
- 1
tests/ut/data/dataset/testAlbum/images/1.json View File

@@ -1 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_decoded.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_decoded.jpg", "_processed_image": "imagefolder/apple_expect_decoded.jpg"}
{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_resize_bilinear.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_resize_bilinear.jpg"}

+ 1
- 1
tests/ut/data/dataset/testAlbum/images/2.json View File

@@ -1 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_resize_bilinear.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_resize_bilinear.jpg", "_processed_image": "imagefolder/apple_expect_resize_bilinear.jpg"}
{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_changemode.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_changemode.jpg"}

+ 1
- 1
tests/ut/data/dataset/testAlbum/images/3.json View File

@@ -1 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_changemode.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_changemode.jpg", "_processed_image": "imagefolder/apple_expect_changemode.jpg"}
{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_not_flip.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_not_flip.jpg"}

+ 1
- 1
tests/ut/data/dataset/testAlbum/images/4.json View File

@@ -1 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_not_flip.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_not_flip.jpg", "_processed_image": "imagefolder/apple_expect_not_flip.jpg"}
{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_flipped_horizontal.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_flipped_horizontal.jpg"}

+ 1
- 1
tests/ut/data/dataset/testAlbum/images/5.json View File

@@ -1 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_flipped_horizontal.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_flipped_horizontal.jpg", "_processed_image": "imagefolder/apple_expect_flipped_horizontal.jpg"}
{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_rescaled.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_rescaled.jpg"}

+ 1
- 1
tests/ut/data/dataset/testAlbum/images/6.json View File

@@ -1 +1 @@
{"dataset": "", "image": "imagefolder/apple_expect_rescaled.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_rescaled.jpg", "_processed_image": "imagefolder/apple_expect_rescaled.jpg"}
{"dataset": "", "image": "testAlbum//testAlbum/original/apple_expect_flipped_vertical.jpg", "label": ["3", "2"], "_priority": 0.8, "_embedding": "testAlbum//testAlbum/sample.bin", "_processed_image": "testAlbum//testAlbum/original/apple_expect_flipped_vertical.jpg"}

+ 0
- 1
tests/ut/data/dataset/testAlbum/images/7.json View File

@@ -1 +0,0 @@
{"dataset": "", "image": "imagefolder/apple_expect_flipped_vertical.jpg", "label": [1, 2], "_priority": 0.8, "_embedding": "sample.bin", "_segmented_image": "imagefolder/apple_expect_flipped_vertical.jpg", "_processed_image": "imagefolder/apple_expect_flipped_vertical.jpg"}

tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_changemode.jpg → tests/ut/data/dataset/testAlbum/original/apple_expect_changemode.jpg View File


tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_decoded.jpg → tests/ut/data/dataset/testAlbum/original/apple_expect_decoded.jpg View File


tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_flipped_horizontal.jpg → tests/ut/data/dataset/testAlbum/original/apple_expect_flipped_horizontal.jpg View File


tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_flipped_vertical.jpg → tests/ut/data/dataset/testAlbum/original/apple_expect_flipped_vertical.jpg View File


tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_not_flip.jpg → tests/ut/data/dataset/testAlbum/original/apple_expect_not_flip.jpg View File


tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_rescaled.jpg → tests/ut/data/dataset/testAlbum/original/apple_expect_rescaled.jpg View File


tests/ut/data/dataset/testAlbum/imagefolder/apple_expect_resize_bilinear.jpg → tests/ut/data/dataset/testAlbum/original/apple_expect_resize_bilinear.jpg View File


BIN
tests/ut/data/dataset/testAlbum/processed/apple_expect_changemode.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 432 kB

BIN
tests/ut/data/dataset/testAlbum/processed/apple_expect_decoded.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 432 kB

BIN
tests/ut/data/dataset/testAlbum/processed/apple_expect_flipped_horizontal.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 440 kB

BIN
tests/ut/data/dataset/testAlbum/processed/apple_expect_flipped_vertical.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 851 kB

BIN
tests/ut/data/dataset/testAlbum/processed/apple_expect_not_flip.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 432 kB

BIN
tests/ut/data/dataset/testAlbum/processed/apple_expect_rescaled.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 150 kB

BIN
tests/ut/data/dataset/testAlbum/processed/apple_expect_resize_bilinear.jpg View File

Before After
Width: 1134  |  Height: 637  |  Size: 115 kB

BIN
tests/ut/data/dataset/testAlbum/segmented/apple_expect_changemode.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 432 kB

BIN
tests/ut/data/dataset/testAlbum/segmented/apple_expect_decoded.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 432 kB

BIN
tests/ut/data/dataset/testAlbum/segmented/apple_expect_flipped_horizontal.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 440 kB

BIN
tests/ut/data/dataset/testAlbum/segmented/apple_expect_flipped_vertical.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 851 kB

BIN
tests/ut/data/dataset/testAlbum/segmented/apple_expect_not_flip.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 432 kB

BIN
tests/ut/data/dataset/testAlbum/segmented/apple_expect_rescaled.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 150 kB

BIN
tests/ut/data/dataset/testAlbum/segmented/apple_expect_resize_bilinear.jpg View File

Before After
Width: 1134  |  Height: 637  |  Size: 115 kB

Loading…
Cancel
Save