/** * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include "minddata/dataset/include/datasets.h" #include "minddata/dataset/include/transforms.h" #include "minddata/dataset/include/samplers.h" #include "minddata/dataset/engine/dataset_iterator.h" #include "minddata/dataset/engine/datasetops/source/image_folder_op.h" #include "minddata/dataset/engine/datasetops/source/mnist_op.h" #include "minddata/dataset/engine/datasetops/source/cifar_op.h" #include "minddata/dataset/engine/datasetops/batch_op.h" #include "minddata/dataset/engine/datasetops/map_op.h" #include "minddata/dataset/engine/datasetops/repeat_op.h" #include "minddata/dataset/engine/datasetops/shuffle_op.h" #include "minddata/dataset/engine/datasetops/project_op.h" #include "minddata/dataset/engine/datasetops/zip_op.h" #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" #include "minddata/dataset/engine/datasetops/source/sampler/random_sampler.h" #include "minddata/dataset/core/config_manager.h" #include "minddata/dataset/util/random.h" namespace mindspore { namespace dataset { namespace api { #define RETURN_NULL_IF_ERROR(_s) \ do { \ Status __rc = (_s); \ if (__rc.IsError()) { \ return nullptr; \ } \ } while (false) // Function to create the iterator, which will build and launch the execution tree. std::shared_ptr Dataset::CreateIterator() { std::shared_ptr iter; try { iter = std::make_shared(); Status rc = iter->BuildAndLaunchTree(shared_from_this()); if (rc.IsError()) { MS_LOG(ERROR) << rc; MS_LOG(ERROR) << "CreateIterator failed."; return nullptr; } return iter; } catch (const std::exception &err) { MS_LOG(ERROR) << "CreateIterator: Iterator exception caught: " << err.what(); return nullptr; } return iter; } // Constructor Dataset::Dataset() { // Fetch some default value from config manager std::shared_ptr cfg = GlobalContext::config_manager(); num_workers_ = cfg->num_parallel_workers(); rows_per_buffer_ = cfg->rows_per_buffer(); connector_que_size_ = cfg->op_connector_size(); } // Function to create a ImageFolderDataset. std::shared_ptr ImageFolder(std::string dataset_dir, bool decode, std::shared_ptr sampler, std::set extensions, std::map class_indexing) { // This arg is exist in ImageFolderOp, but not externalized (in Python API). The default value is false. bool recursive = false; // Create logical representation of ImageFolderDataset. auto ds = std::make_shared(dataset_dir, decode, sampler, recursive, extensions, class_indexing); // Call derived class validation method. return ds->ValidateParams() ? ds : nullptr; } // Function to create a MnistDataset. std::shared_ptr Mnist(std::string dataset_dir, std::shared_ptr sampler) { auto ds = std::make_shared(dataset_dir, sampler); // Call derived class validation method. return ds->ValidateParams() ? ds : nullptr; } // Function to create a Cifar10Dataset. std::shared_ptr Cifar10(const std::string &dataset_dir, int32_t num_samples, std::shared_ptr sampler) { auto ds = std::make_shared(dataset_dir, num_samples, sampler); // Call derived class validation method. return ds->ValidateParams() ? ds : nullptr; } // Function to create a Batch dataset std::shared_ptr Dataset::Batch(int32_t batch_size, bool drop_remainder) { // Default values std::vector cols_to_map = {}; std::map>> pad_map; bool pad = false; auto ds = std::make_shared(batch_size, drop_remainder, pad, cols_to_map, pad_map); if (!ds->ValidateParams()) { return nullptr; } ds->children.push_back(shared_from_this()); return ds; } // Function to create Repeat dataset. std::shared_ptr Dataset::Repeat(int32_t count) { // Workaround for repeat == 1, do not inject repeat. if (count == 1) { return shared_from_this(); } auto ds = std::make_shared(count); if (!ds->ValidateParams()) { return nullptr; } ds->children.push_back(shared_from_this()); return ds; } // Function to create a Map dataset. std::shared_ptr Dataset::Map(std::vector> operations, std::vector input_columns, std::vector output_columns, const std::vector &project_columns) { auto ds = std::make_shared(operations, input_columns, output_columns, project_columns); if (!ds->ValidateParams()) { return nullptr; } ds->children.push_back(shared_from_this()); return ds; } // Function to create a ShuffleOp std::shared_ptr Dataset::Shuffle(int32_t shuffle_size) { // Pass in reshuffle_each_epoch with true auto ds = std::make_shared(shuffle_size, true); if (!ds->ValidateParams()) { return nullptr; } ds->children.push_back(shared_from_this()); return ds; } // Function to create a ProjectDataset. std::shared_ptr Dataset::Project(const std::vector &columns) { auto ds = std::make_shared(columns); // Call derived class validation method. if (!ds->ValidateParams()) { return nullptr; } ds->children.push_back(shared_from_this()); return ds; } // Function to create a Zip dataset std::shared_ptr Dataset::Zip(const std::vector> &datasets) { // Default values auto ds = std::make_shared(); if (!ds->ValidateParams()) { return nullptr; } for (auto dataset : datasets) { ds->children.push_back(dataset); } return ds; } // Helper function to create default RandomSampler. std::shared_ptr CreateDefaultSampler() { const int32_t num_samples = 0; // 0 means to sample all ids. bool replacement = false; return std::make_shared(replacement, num_samples); } /* ####################################### Derived Dataset classes ################################# */ ImageFolderDataset::ImageFolderDataset(std::string dataset_dir, bool decode, std::shared_ptr sampler, bool recursive, std::set extensions, std::map class_indexing) : dataset_dir_(dataset_dir), decode_(decode), sampler_(sampler), recursive_(recursive), class_indexing_(class_indexing), exts_(extensions) {} bool ImageFolderDataset::ValidateParams() { if (dataset_dir_.empty()) { MS_LOG(ERROR) << "No dataset path is specified."; return false; } return true; } std::shared_ptr>> ImageFolderDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; // If user does not specify Sampler, create a default sampler, i.e., RandomSampler. if (sampler_ == nullptr) { sampler_ = CreateDefaultSampler(); } // Do internal Schema generation. // This arg is exist in ImageFolderOp, but not externalized (in Python API). std::unique_ptr schema = std::make_unique(); TensorShape scalar = TensorShape::CreateScalar(); RETURN_NULL_IF_ERROR( schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); RETURN_NULL_IF_ERROR( schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_INT32), TensorImpl::kFlexible, 0, &scalar))); node_ops.push_back(std::make_shared(num_workers_, rows_per_buffer_, dataset_dir_, connector_que_size_, recursive_, decode_, exts_, class_indexing_, std::move(schema), std::move(sampler_->Build()))); return std::make_shared>>(node_ops); } MnistDataset::MnistDataset(std::string dataset_dir, std::shared_ptr sampler) : dataset_dir_(dataset_dir), sampler_(sampler) {} bool MnistDataset::ValidateParams() { if (dataset_dir_.empty()) { MS_LOG(ERROR) << "No dataset path is specified."; return false; } return true; } std::shared_ptr>> MnistDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; // If user does not specify Sampler, create a default sampler, i.e., RandomSampler. if (sampler_ == nullptr) { sampler_ = CreateDefaultSampler(); } // Do internal Schema generation. auto schema = std::make_unique(); RETURN_NULL_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1))); TensorShape scalar = TensorShape::CreateScalar(); RETURN_NULL_IF_ERROR( schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar))); node_ops.push_back(std::make_shared(num_workers_, rows_per_buffer_, dataset_dir_, connector_que_size_, std::move(schema), std::move(sampler_->Build()))); return std::make_shared>>(node_ops); } BatchDataset::BatchDataset(int32_t batch_size, bool drop_remainder, bool pad, std::vector cols_to_map, std::map>> pad_map) : batch_size_(batch_size), drop_remainder_(drop_remainder), pad_(pad), cols_to_map_(cols_to_map), pad_map_(pad_map) {} std::shared_ptr>> BatchDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; #ifdef ENABLE_PYTHON py::function noop; node_ops.push_back(std::make_shared(batch_size_, drop_remainder_, pad_, connector_que_size_, num_workers_, cols_to_map_, noop, noop, pad_map_)); #else node_ops.push_back(std::make_shared(batch_size_, drop_remainder_, pad_, connector_que_size_, num_workers_, cols_to_map_, pad_map_)); #endif return std::make_shared>>(node_ops); } bool BatchDataset::ValidateParams() { if (batch_size_ <= 0) { return false; } return true; } RepeatDataset::RepeatDataset(uint32_t count) : repeat_count_(count) {} std::shared_ptr>> RepeatDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; node_ops.push_back(std::make_shared(repeat_count_)); return std::make_shared>>(node_ops); } bool RepeatDataset::ValidateParams() { if (repeat_count_ <= 0) { return false; } return true; } MapDataset::MapDataset(std::vector> operations, std::vector input_columns, std::vector output_columns, const std::vector &project_columns) : operations_(operations), input_columns_(input_columns), output_columns_(output_columns), project_columns_(project_columns) {} std::shared_ptr>> MapDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; // Currently default is true, and this is not exposed to user. bool perf_mode = true; std::vector> tensor_ops; // Build tensorOp from tensorOperation vector // This is to ensure each iterator hold its own copy of the tensorOp objects. (void)std::transform( operations_.begin(), operations_.end(), std::back_inserter(tensor_ops), [](std::shared_ptr operation) -> std::shared_ptr { return operation->Build(); }); // This parameter will be removed with next rebase std::vector col_orders; auto map_op = std::make_shared(input_columns_, output_columns_, tensor_ops, num_workers_, connector_que_size_, perf_mode); if (!project_columns_.empty()) { auto project_op = std::make_shared(project_columns_); node_ops.push_back(project_op); } node_ops.push_back(map_op); return std::make_shared>>(node_ops); } bool MapDataset::ValidateParams() { if (operations_.empty()) { return false; } return true; } // Constructor for ShuffleDataset ShuffleDataset::ShuffleDataset(int32_t shuffle_size, bool reset_every_epoch) : shuffle_size_(shuffle_size), shuffle_seed_(GetSeed()), reset_every_epoch_(reset_every_epoch) {} // Function to build the ShuffleOp std::shared_ptr>> ShuffleDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; node_ops.push_back(std::make_shared(shuffle_size_, shuffle_seed_, connector_que_size_, reset_every_epoch_, rows_per_buffer_)); return std::make_shared>>(node_ops); } // Function to validate the parameters for ShuffleDataset bool ShuffleDataset::ValidateParams() { if (shuffle_size_ <= 1) { MS_LOG(ERROR) << "ShuffleDataset: Invalid input, shuffle_size: " << shuffle_size_; return false; } return true; } // Constructor for Cifar10Dataset Cifar10Dataset::Cifar10Dataset(const std::string &dataset_dir, int32_t num_samples, std::shared_ptr sampler) : dataset_dir_(dataset_dir), num_samples_(num_samples), sampler_(sampler) {} bool Cifar10Dataset::ValidateParams() { if (dataset_dir_.empty()) { MS_LOG(ERROR) << "No dataset path is specified."; return false; } if (num_samples_ < 0) { MS_LOG(ERROR) << "Number of samples cannot be negative"; return false; } return true; } // Function to build CifarOp std::shared_ptr>> Cifar10Dataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; // If user does not specify Sampler, create a default sampler based on the shuffle variable. if (sampler_ == nullptr) { sampler_ = CreateDefaultSampler(); } // Do internal Schema generation. auto schema = std::make_unique(); RETURN_NULL_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1))); TensorShape scalar = TensorShape::CreateScalar(); RETURN_NULL_IF_ERROR( schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar))); node_ops.push_back(std::make_shared(CifarOp::CifarType::kCifar10, num_workers_, rows_per_buffer_, dataset_dir_, connector_que_size_, std::move(schema), std::move(sampler_->Build()))); return std::make_shared>>(node_ops); } // Function to build ProjectOp ProjectDataset::ProjectDataset(const std::vector &columns) : columns_(columns) {} bool ProjectDataset::ValidateParams() { if (columns_.empty()) { MS_LOG(ERROR) << "No columns are specified."; return false; } return true; } std::shared_ptr>> ProjectDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; node_ops.push_back(std::make_shared(columns_)); return std::make_shared>>(node_ops); } // Function to build ZipOp ZipDataset::ZipDataset() {} bool ZipDataset::ValidateParams() { return true; } std::shared_ptr>> ZipDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; node_ops.push_back(std::make_shared(rows_per_buffer_, connector_que_size_)); return std::make_shared>>(node_ops); } } // namespace api } // namespace dataset } // namespace mindspore