diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc index 2e748cae1c..2ab554305b 100644 --- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc +++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc @@ -86,6 +86,7 @@ #include "minddata/dataset/engine/ir/datasetops/source/csv_node.h" #include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h" #include "minddata/dataset/engine/ir/datasetops/source/mnist_node.h" +#include "minddata/dataset/engine/ir/datasetops/source/random_node.h" #include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h" // IR leaf nodes disabled for android @@ -140,26 +141,11 @@ bool Dataset::DeviceQueue(bool send_epoch_end) { return false; } - // Get a uuid for queue name - std::string queue_name = Services::GetUniqueID(); - - // TODO(CRC): - // Get device type from ms context - std::string device_type = "CPU"; - - // Get device ID from children - int32_t device_id = 0; - rc = TransferNode::get_distribution(shared_from_this(), &device_id); - if (rc.IsError()) { - MS_LOG(ERROR) << "Failed to get shard id. Error status: " << rc; - return false; - } - // Add TransferNode IR on top of dataset d - auto ds = std::make_shared(shared_from_this(), queue_name, device_id, device_type, send_epoch_end); + auto ds = std::make_shared(shared_from_this()->IRNode(), send_epoch_end); // Get ToDevice consumer - auto consumer = std::make_unique(device_type, send_epoch_end, -1); + auto consumer = std::make_unique(send_epoch_end, -1); ToDevice *consumer_ = consumer.get(); rc = consumer->Init(ds); if (rc.IsError()) { @@ -199,7 +185,7 @@ bool Dataset::Save(std::string dataset_path, int32_t num_files, std::string data return false; } SaveToDisk *consumer_ = consumer.get(); - rc = consumer->Init(ds); + rc = consumer->Init(ds->IRNode()); if (rc.IsError()) { MS_LOG(ERROR) << "CreateSaver failed." 
<< rc; return false; @@ -225,19 +211,10 @@ bool Dataset::Save(std::string dataset_path, int32_t num_files, std::string data #endif // Constructor -Dataset::Dataset() { - // Fetch some default value from config manager - std::shared_ptr cfg = GlobalContext::config_manager(); - num_workers_ = cfg->num_parallel_workers(); - rows_per_buffer_ = cfg->rows_per_buffer(); - connector_que_size_ = cfg->op_connector_size(); - worker_connector_size_ = cfg->worker_connector_size(); - tree_getters_ = std::make_shared(); -} +Dataset::Dataset() { tree_getters_ = std::make_shared(); } int64_t Dataset::GetDatasetSize() { int64_t dataset_size; - auto ds = shared_from_this(); Status rc; std::unique_ptr runtime_context = std::make_unique(); rc = runtime_context->Init(); @@ -246,7 +223,7 @@ int64_t Dataset::GetDatasetSize() { return -1; } if (!tree_getters_->isInitialized()) { - rc = tree_getters_->Init(ds); + rc = tree_getters_->Init(this->IRNode()); if (rc.IsError()) { MS_LOG(ERROR) << "GetDatasetSize: Initializing TreeGetters failed."; return -1; @@ -267,7 +244,7 @@ std::vector Dataset::GetOutputTypes() { return types; } if (!tree_getters_->isInitialized()) { - rc = tree_getters_->Init(shared_from_this()); + rc = tree_getters_->Init(this->IRNode()); if (rc.IsError()) { MS_LOG(ERROR) << "GetOutputTypes: Initializing TreeGetters failed."; types.clear(); @@ -294,7 +271,7 @@ std::vector Dataset::GetOutputShapes() { return shapes; } if (!tree_getters_->isInitialized()) { - rc = tree_getters_->Init(shared_from_this()); + rc = tree_getters_->Init(this->IRNode()); if (rc.IsError()) { MS_LOG(ERROR) << "GetOutputShapes: Initializing TreeGetters failed."; shapes.clear(); @@ -321,7 +298,7 @@ int64_t Dataset::GetNumClasses() { return -1; } if (!tree_getters_->isInitialized()) { - rc = tree_getters_->Init(ds); + rc = tree_getters_->Init(ds->IRNode()); if (rc.IsError()) { MS_LOG(ERROR) << "GetNumClasses: Initializing TreeGetters failed."; return -1; @@ -331,9 +308,6 @@ int64_t Dataset::GetNumClasses() { return rc.IsError() ? -1 : num_classes; } -// Constructor to initialize the cache -Dataset::Dataset(const std::shared_ptr &dataset_cache) : Dataset() { cache_ = dataset_cache; } - /// \brief Function to create a SchemaObj /// \param[in] schema_file Path of schema file /// \return Shared pointer to the current schema @@ -346,161 +320,155 @@ std::shared_ptr Schema(const std::string &schema_file) { // FUNCTIONS TO CREATE DATASETS FOR LEAF-NODE DATASETS // (In alphabetical order) -// Function to create a AlbumNode. -std::shared_ptr Album(const std::string &dataset_dir, const std::string &data_schema, - const std::vector &column_names, bool decode, - const std::shared_ptr &sampler, - const std::shared_ptr &cache) { - auto ds = std::make_shared(dataset_dir, data_schema, column_names, decode, sampler, cache); +// Function to create a AlbumDataset. +std::shared_ptr Album(const std::string &dataset_dir, const std::string &data_schema, + const std::vector &column_names, bool decode, + const std::shared_ptr &sampler, + const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, data_schema, column_names, decode, sampler, cache); return ds; } -// Function to create a CelebANode. -std::shared_ptr CelebA(const std::string &dataset_dir, const std::string &usage, - const std::shared_ptr &sampler, bool decode, - const std::set &extensions, - const std::shared_ptr &cache) { - auto ds = std::make_shared(dataset_dir, usage, sampler, decode, extensions, cache); +// Function to create a CelebADataset. 
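The getters above (GetDatasetSize, GetOutputTypes, GetOutputShapes, GetNumClasses) all follow the same lazy pattern: build the TreeGetters consumer from the IR root on the first query, reuse it afterwards, and report failure with -1. Below is a minimal, self-contained sketch of that pattern; FakeGetters and FakeNode are hypothetical stand-ins, not the MindSpore classes.

#include <cstdint>
#include <iostream>
#include <memory>

struct FakeNode {};

class FakeGetters {
 public:
  bool isInitialized() const { return initialized_; }
  bool Init(const std::shared_ptr<FakeNode> &) {  // returns false on error in the sketch
    initialized_ = true;
    return true;
  }
  bool GetDatasetSize(int64_t *size) const {  // placeholder answer
    *size = 60000;
    return true;
  }

 private:
  bool initialized_ = false;
};

int64_t QueryDatasetSize(FakeGetters *getters, const std::shared_ptr<FakeNode> &ir_root) {
  // Build the execution tree only once, on the first getter call.
  if (!getters->isInitialized() && !getters->Init(ir_root)) {
    std::cerr << "GetDatasetSize: Initializing TreeGetters failed.\n";
    return -1;  // the real code also signals failure with -1
  }
  int64_t size = -1;
  return getters->GetDatasetSize(&size) ? size : -1;
}

int main() {
  FakeGetters getters;
  auto root = std::make_shared<FakeNode>();
  std::cout << QueryDatasetSize(&getters, root) << "\n";  // builds the tree, prints 60000
  std::cout << QueryDatasetSize(&getters, root) << "\n";  // reuses the existing tree
}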
+std::shared_ptr CelebA(const std::string &dataset_dir, const std::string &usage, + const std::shared_ptr &sampler, bool decode, + const std::set &extensions, + const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, usage, sampler, decode, extensions, cache); return ds; } -// Function to create a Cifar10Node. -std::shared_ptr Cifar10(const std::string &dataset_dir, const std::string &usage, - const std::shared_ptr &sampler, - const std::shared_ptr &cache) { - auto ds = std::make_shared(dataset_dir, usage, sampler, cache); +// Function to create a Cifar10Dataset. +std::shared_ptr Cifar10(const std::string &dataset_dir, const std::string &usage, + const std::shared_ptr &sampler, + const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, usage, sampler, cache); return ds; } -// Function to create a Cifar100Node. -std::shared_ptr Cifar100(const std::string &dataset_dir, const std::string &usage, - const std::shared_ptr &sampler, - const std::shared_ptr &cache) { - auto ds = std::make_shared(dataset_dir, usage, sampler, cache); +// Function to create a Cifar100Dataset. +std::shared_ptr Cifar100(const std::string &dataset_dir, const std::string &usage, + const std::shared_ptr &sampler, + const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, usage, sampler, cache); return ds; } -// Function to create a CLUENode. -std::shared_ptr CLUE(const std::vector &clue_files, const std::string &task, - const std::string &usage, int64_t num_samples, ShuffleMode shuffle, int32_t num_shards, - int32_t shard_id, const std::shared_ptr &cache) { - auto ds = std::make_shared(clue_files, task, usage, num_samples, shuffle, num_shards, shard_id, cache); +// Function to create a CLUEDataset. +std::shared_ptr CLUE(const std::vector &clue_files, const std::string &task, + const std::string &usage, int64_t num_samples, ShuffleMode shuffle, + int32_t num_shards, int32_t shard_id, const std::shared_ptr &cache) { + auto ds = std::make_shared(clue_files, task, usage, num_samples, shuffle, num_shards, shard_id, cache); return ds; } -// Function to create a CocoNode. -std::shared_ptr Coco(const std::string &dataset_dir, const std::string &annotation_file, - const std::string &task, const bool &decode, const std::shared_ptr &sampler, - const std::shared_ptr &cache) { - auto ds = std::make_shared(dataset_dir, annotation_file, task, decode, sampler, cache); +// Function to create a CocoDataset. +std::shared_ptr Coco(const std::string &dataset_dir, const std::string &annotation_file, + const std::string &task, const bool &decode, + const std::shared_ptr &sampler, + const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, annotation_file, task, decode, sampler, cache); return ds; } -// Function to create a CSVNode. -std::shared_ptr CSV(const std::vector &dataset_files, char field_delim, - const std::vector> &column_defaults, - const std::vector &column_names, int64_t num_samples, ShuffleMode shuffle, - int32_t num_shards, int32_t shard_id, const std::shared_ptr &cache) { - auto ds = std::make_shared(dataset_files, field_delim, column_defaults, column_names, num_samples, shuffle, - num_shards, shard_id, cache); +// Function to create a CSVDataset. 
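These leaf factories forward num_samples, shuffle, num_shards, and shard_id straight into the IR node; the shard arguments are checked later by the ValidateDatasetShardParams helper declared in dataset_node.h. A hedged, self-contained sketch of what such a check typically looks like follows; ValidateShardParams and its messages are illustrative only, not the MindSpore implementation.

#include <cstdint>
#include <iostream>
#include <string>

bool ValidateShardParams(const std::string &dataset_name, int32_t num_shards, int32_t shard_id) {
  if (num_shards < 1) {
    std::cerr << dataset_name << ": num_shards must be at least 1, got " << num_shards << "\n";
    return false;
  }
  // shard_id must select one of the num_shards partitions.
  if (shard_id < 0 || shard_id >= num_shards) {
    std::cerr << dataset_name << ": shard_id must be in [0, " << num_shards << "), got " << shard_id << "\n";
    return false;
  }
  return true;
}

int main() {
  std::cout << std::boolalpha << ValidateShardParams("CLUEDataset", 4, 3) << " "
            << ValidateShardParams("CLUEDataset", 4, 4) << "\n";  // true false
}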
+std::shared_ptr CSV(const std::vector &dataset_files, char field_delim, + const std::vector> &column_defaults, + const std::vector &column_names, int64_t num_samples, ShuffleMode shuffle, + int32_t num_shards, int32_t shard_id, const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_files, field_delim, column_defaults, column_names, num_samples, + shuffle, num_shards, shard_id, cache); return ds; } -// Function to create a ImageFolderNode. -std::shared_ptr ImageFolder(const std::string &dataset_dir, bool decode, - const std::shared_ptr &sampler, - const std::set &extensions, - const std::map &class_indexing, - const std::shared_ptr &cache) { - // This arg exists in ImageFolderOp, but not externalized (in Python API). The default value is false. - bool recursive = false; - - // Create logical representation of ImageFolderNode. - auto ds = - std::make_shared(dataset_dir, decode, sampler, recursive, extensions, class_indexing, cache); +// Function to create a ImageFolderDataset. +std::shared_ptr ImageFolder(const std::string &dataset_dir, bool decode, + const std::shared_ptr &sampler, + const std::set &extensions, + const std::map &class_indexing, + const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, decode, sampler, extensions, class_indexing, cache); return ds; } #ifndef ENABLE_ANDROID -// Function to create a ManifestNode. -std::shared_ptr Manifest(const std::string &dataset_file, const std::string &usage, - const std::shared_ptr &sampler, - const std::map &class_indexing, bool decode, - const std::shared_ptr &cache) { - auto ds = std::make_shared(dataset_file, usage, sampler, class_indexing, decode, cache); +// Function to create a ManifestDataset. +std::shared_ptr Manifest(const std::string &dataset_file, const std::string &usage, + const std::shared_ptr &sampler, + const std::map &class_indexing, bool decode, + const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_file, usage, sampler, class_indexing, decode, cache); return ds; } -// Function to create a MindDataNode. -std::shared_ptr MindData(const std::string &dataset_file, const std::vector &columns_list, - const std::shared_ptr &sampler, nlohmann::json padded_sample, - int64_t num_padded) { - auto ds = std::make_shared(dataset_file, columns_list, sampler, padded_sample, num_padded); +// Function to create a MindDataDataset. +std::shared_ptr MindData(const std::string &dataset_file, const std::vector &columns_list, + const std::shared_ptr &sampler, nlohmann::json padded_sample, + int64_t num_padded) { + auto ds = std::make_shared(dataset_file, columns_list, sampler, padded_sample, num_padded); return ds; } -// Function to create a MindDataNode. -std::shared_ptr MindData(const std::vector &dataset_files, - const std::vector &columns_list, - const std::shared_ptr &sampler, nlohmann::json padded_sample, - int64_t num_padded) { - auto ds = std::make_shared(dataset_files, columns_list, sampler, padded_sample, num_padded); +// Function to create a MindDataDataset. +std::shared_ptr MindData(const std::vector &dataset_files, + const std::vector &columns_list, + const std::shared_ptr &sampler, nlohmann::json padded_sample, + int64_t num_padded) { + auto ds = std::make_shared(dataset_files, columns_list, sampler, padded_sample, num_padded); return ds; } #endif -// Function to create a MnistNode. 
-std::shared_ptr Mnist(const std::string &dataset_dir, const std::string &usage, - const std::shared_ptr &sampler, - const std::shared_ptr &cache) { - auto ds = std::make_shared(dataset_dir, usage, sampler, cache); +// Function to create a MnistDataset. +std::shared_ptr Mnist(const std::string &dataset_dir, const std::string &usage, + const std::shared_ptr &sampler, + const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, usage, sampler, cache); return ds; } // Function to overload "+" operator to concat two datasets -std::shared_ptr operator+(const std::shared_ptr &datasets1, - const std::shared_ptr &datasets2) { - std::shared_ptr ds = std::make_shared(std::vector({datasets2, datasets1})); - - return ds; +std::shared_ptr operator+(const std::shared_ptr &datasets1, + const std::shared_ptr &datasets2) { + return std::make_shared(std::vector({datasets2, datasets1})); } -// Function to create a TextFileNode. -std::shared_ptr TextFile(const std::vector &dataset_files, int64_t num_samples, - ShuffleMode shuffle, int32_t num_shards, int32_t shard_id, - const std::shared_ptr &cache) { - auto ds = std::make_shared(dataset_files, num_samples, shuffle, num_shards, shard_id, cache); +// Function to create a TextFileDataset. +std::shared_ptr TextFile(const std::vector &dataset_files, int64_t num_samples, + ShuffleMode shuffle, int32_t num_shards, int32_t shard_id, + const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_files, num_samples, shuffle, num_shards, shard_id, cache); return ds; } #ifndef ENABLE_ANDROID -// Function to create a VOCNode. -std::shared_ptr VOC(const std::string &dataset_dir, const std::string &task, const std::string &usage, - const std::map &class_indexing, bool decode, - const std::shared_ptr &sampler, const std::shared_ptr &cache) { - auto ds = std::make_shared(dataset_dir, task, usage, class_indexing, decode, sampler, cache); +// Function to create a VOCDataset. +std::shared_ptr VOC(const std::string &dataset_dir, const std::string &task, const std::string &usage, + const std::map &class_indexing, bool decode, + const std::shared_ptr &sampler, + const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, task, usage, class_indexing, decode, sampler, cache); return ds; } #endif // Function to create a ZipNode. 
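The operator+ overload above concatenates two datasets; note that the right-hand operand is placed first in the child vector. A self-contained sketch of the same idea, using hypothetical ToyDataset/ToyConcat types instead of the MindSpore classes:

#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct ToyDataset {
  explicit ToyDataset(std::string n) : name(std::move(n)) {}
  virtual ~ToyDataset() = default;
  std::string name;
};

struct ToyConcat : ToyDataset {
  explicit ToyConcat(std::vector<std::shared_ptr<ToyDataset>> inputs)
      : ToyDataset("concat"), children(std::move(inputs)) {}
  std::vector<std::shared_ptr<ToyDataset>> children;
};

// Mirrors the overload in the diff: the right-hand operand comes first in the child list.
std::shared_ptr<ToyConcat> operator+(const std::shared_ptr<ToyDataset> &lhs, const std::shared_ptr<ToyDataset> &rhs) {
  return std::make_shared<ToyConcat>(std::vector<std::shared_ptr<ToyDataset>>({rhs, lhs}));
}

int main() {
  auto a = std::make_shared<ToyDataset>("mnist");
  auto b = std::make_shared<ToyDataset>("cifar");
  auto c = a + b;
  std::cout << c->children[0]->name << ", " << c->children[1]->name << "\n";  // cifar, mnist
}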
-std::shared_ptr Zip(const std::vector> &datasets) { - auto ds = std::make_shared(datasets); - +std::shared_ptr Zip(const std::vector> &datasets) { + auto ds = std::make_shared(datasets); return ds; } @@ -508,170 +476,112 @@ std::shared_ptr Zip(const std::vector> &datase // (In alphabetical order) // Function to create a Batch dataset -std::shared_ptr Dataset::Batch(int32_t batch_size, bool drop_remainder) { +BatchDataset::BatchDataset(std::shared_ptr input, int32_t batch_size, bool drop_remainder) { // Default values std::vector cols_to_map = {}; std::map>> pad_map; bool pad = false; - auto ds = std::make_shared(shared_from_this(), batch_size, drop_remainder, pad, cols_to_map, pad_map); - - return ds; + auto ds = std::make_shared(input->IRNode(), batch_size, drop_remainder, pad, cols_to_map, pad_map); + ir_node_ = std::static_pointer_cast(ds); } #ifndef ENABLE_ANDROID // Function to create a BucketBatchByLength dataset -std::shared_ptr Dataset::BucketBatchByLength( - const std::vector &column_names, const std::vector &bucket_boundaries, - const std::vector &bucket_batch_sizes, std::function element_length_function, +BucketBatchByLengthDataset::BucketBatchByLengthDataset( + std::shared_ptr input, const std::vector &column_names, + const std::vector &bucket_boundaries, const std::vector &bucket_batch_sizes, + std::function element_length_function, const std::map>> &pad_info, bool pad_to_bucket_boundary, bool drop_remainder) { - auto ds = std::make_shared(shared_from_this(), column_names, bucket_boundaries, + auto ds = std::make_shared(input->IRNode(), column_names, bucket_boundaries, bucket_batch_sizes, element_length_function, pad_info, pad_to_bucket_boundary, drop_remainder); - return ds; -} - -// Function to create a SentencePieceVocab from dataset -std::shared_ptr Dataset::BuildSentencePieceVocab( - const std::vector &col_names, uint32_t vocab_size, float character_coverage, - SentencePieceModel model_type, const std::unordered_map ¶ms) { - auto vocab = std::make_shared(); - auto ds = std::make_shared(shared_from_this(), vocab, col_names, vocab_size, - character_coverage, model_type, params); - - // Run tree here to start building vocab - std::shared_ptr iter = ds->CreateIterator(); - if (iter == nullptr) { - MS_LOG(ERROR) << "Fail to run iterator in BuildSentencePieceVocab."; - return nullptr; - } - - // Finish building vocab by triggering GetNextRow - std::unordered_map> row; - if (!iter->GetNextRow(&row)) { - return nullptr; - } - - return vocab; + ir_node_ = std::static_pointer_cast(ds); } -// Function to create a Vocab from dataset -std::shared_ptr Dataset::BuildVocab(const std::vector &columns, - const std::pair &freq_range, int64_t top_k, - const std::vector &special_tokens, bool special_first) { - auto vocab = std::make_shared(); - auto ds = std::make_shared(shared_from_this(), vocab, columns, freq_range, top_k, special_tokens, - special_first); - - // Run tree here to starting building vocab - std::shared_ptr iter = ds->CreateIterator(); - if (iter == nullptr) { - MS_LOG(ERROR) << "Fail to run iterator in BuildVocab."; - return nullptr; - } - - // Finish building vocab by triggering GetNextRow - std::unordered_map> row; - if (!iter->GetNextRow(&row)) { - return nullptr; - } - - return vocab; -} #endif -// Function to create a Concat dataset -std::shared_ptr Dataset::Concat(const std::vector> &datasets) { - auto ds = std::make_shared(datasets); - ds->children.push_back(shared_from_this()); +ConcatDataset::ConcatDataset(const std::vector> &datasets) { + std::vector> all_datasets; 
+ (void)std::transform( + datasets.begin(), datasets.end(), std::back_inserter(all_datasets), + [](std::shared_ptr dataset) -> std::shared_ptr { return dataset->IRNode(); }); - return ds; + auto ds = std::make_shared(all_datasets); + + ir_node_ = std::static_pointer_cast(ds); } -// Function to create a Map dataset. -std::shared_ptr Dataset::Map(std::vector> operations, - std::vector input_columns, std::vector output_columns, - const std::vector &project_columns, - const std::shared_ptr &cache) { +MapDataset::MapDataset(std::shared_ptr input, std::vector> operations, + std::vector input_columns, std::vector output_columns, + const std::vector &project_columns, const std::shared_ptr &cache) { auto ds = - std::make_shared(shared_from_this(), operations, input_columns, output_columns, project_columns, cache); + std::make_shared(input->IRNode(), operations, input_columns, output_columns, project_columns, cache); - return ds; + ir_node_ = std::static_pointer_cast(ds); } -// Function to create a ProjectNode. -std::shared_ptr Dataset::Project(const std::vector &columns) { - auto ds = std::make_shared(shared_from_this(), columns); +ProjectDataset::ProjectDataset(std::shared_ptr input, const std::vector &columns) { + auto ds = std::make_shared(input->IRNode(), columns); - return ds; + ir_node_ = std::static_pointer_cast(ds); } -// Function to create a RenameNode. -std::shared_ptr Dataset::Rename(const std::vector &input_columns, - const std::vector &output_columns) { - auto ds = std::make_shared(shared_from_this(), input_columns, output_columns); +RenameDataset::RenameDataset(std::shared_ptr input, const std::vector &input_columns, + const std::vector &output_columns) { + auto ds = std::make_shared(input->IRNode(), input_columns, output_columns); - return ds; + ir_node_ = std::static_pointer_cast(ds); } -// Function to create Repeat dataset. -std::shared_ptr Dataset::Repeat(int32_t count) { +RepeatDataset::RepeatDataset(std::shared_ptr input, int32_t count) { // Workaround for repeat == 1, do not inject repeat. if (count == 1) { - return shared_from_this(); + ir_node_ = input->IRNode(); + return; } - auto ds = std::make_shared(shared_from_this(), count); + auto ds = std::make_shared(input->IRNode(), count); - return ds; + ir_node_ = std::static_pointer_cast(ds); } -// Function to create a ShuffleOp -std::shared_ptr Dataset::Shuffle(int32_t buffer_size) { +ShuffleDataset::ShuffleDataset(std::shared_ptr input, int32_t buffer_size) { // Pass in reshuffle_each_epoch with true - auto ds = std::make_shared(shared_from_this(), buffer_size, true); + auto ds = std::make_shared(input->IRNode(), buffer_size, true); - return ds; + ir_node_ = std::static_pointer_cast(ds); } -// Function to create a SkipNode. -std::shared_ptr Dataset::Skip(int32_t count) { - auto ds = std::make_shared(shared_from_this(), count); +SkipDataset::SkipDataset(std::shared_ptr input, int32_t count) { + auto ds = std::make_shared(input->IRNode(), count); - return ds; + ir_node_ = std::static_pointer_cast(ds); } -// Function to create a TakeNode. 
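RepeatDataset above (and TakeDataset just below, for count == -1) short-circuits the no-op case by adopting the input's IR node instead of inserting a new one. A minimal sketch of that pass-through, with ToyIRNode, ToyRepeatNode, and ToyDataset as hypothetical stand-ins:

#include <cassert>
#include <cstdint>
#include <memory>

struct ToyIRNode {
  virtual ~ToyIRNode() = default;
};

struct ToyRepeatNode : ToyIRNode {
  ToyRepeatNode(std::shared_ptr<ToyIRNode> child, int32_t count) : child(std::move(child)), count(count) {}
  std::shared_ptr<ToyIRNode> child;
  int32_t count;
};

struct ToyDataset {
  std::shared_ptr<ToyIRNode> ir_node;
};

ToyDataset Repeat(const ToyDataset &input, int32_t count) {
  // Workaround mirrored from the diff: repeating once changes nothing, so do not inject a RepeatNode.
  if (count == 1) return ToyDataset{input.ir_node};
  return ToyDataset{std::make_shared<ToyRepeatNode>(input.ir_node, count)};
}

int main() {
  ToyDataset src{std::make_shared<ToyIRNode>()};
  assert(Repeat(src, 1).ir_node == src.ir_node);  // identity: the same IR node is reused
  assert(Repeat(src, 3).ir_node != src.ir_node);  // real repeat: a new node is placed on top
  return 0;
}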
-std::shared_ptr Dataset::Take(int32_t count) { +TakeDataset::TakeDataset(std::shared_ptr input, int32_t count) { // If count is greater than the number of element in dataset or equal to -1, // all the element in dataset will be taken if (count == -1) { - return shared_from_this(); + ir_node_ = input->IRNode(); + return; } - auto ds = std::make_shared(shared_from_this(), count); + auto ds = std::make_shared(input->IRNode(), count); - return ds; + ir_node_ = std::static_pointer_cast(ds); } -// Function to create a Zip dataset -std::shared_ptr Dataset::Zip(const std::vector> &datasets) { - // Default values - auto ds = std::make_shared(datasets); - ds->children.push_back(shared_from_this()); +ZipDataset::ZipDataset(const std::vector> &datasets) { + std::vector> all_datasets; + (void)std::transform( + datasets.begin(), datasets.end(), std::back_inserter(all_datasets), + [](std::shared_ptr dataset) -> std::shared_ptr { return dataset->IRNode(); }); - return ds; -} + auto ds = std::make_shared(all_datasets); -Status Dataset::AddCacheOp(std::vector> *node_ops) { - if (cache_ != nullptr) { - RETURN_IF_NOT_OK(cache_->Build()); - std::shared_ptr cache_op; - RETURN_IF_NOT_OK(cache_->CreateCacheOp(num_workers_, &cache_op)); - node_ops->push_back(cache_op); - } - return Status::OK(); + ir_node_ = std::static_pointer_cast(ds); } int64_t Dataset::GetBatchSize() { @@ -685,7 +595,7 @@ int64_t Dataset::GetBatchSize() { return -1; } if (!tree_getters_->isInitialized()) { - rc = tree_getters_->Init(ds); + rc = tree_getters_->Init(ds->IRNode()); if (rc.IsError()) { MS_LOG(ERROR) << "GetBatchSize: Initializing TreeGetters failed."; return -1; @@ -706,7 +616,7 @@ int64_t Dataset::GetRepeatCount() { return -1; } if (!tree_getters_->isInitialized()) { - rc = tree_getters_->Init(ds); + rc = tree_getters_->Init(ds->IRNode()); if (rc.IsError()) { MS_LOG(ERROR) << "GetRepeatCount: Initializing TreeGetters failed."; return -1; @@ -715,7 +625,77 @@ int64_t Dataset::GetRepeatCount() { rc = tree_getters_->GetRepeatCount(&repeat_count); return rc.IsError() ? 0 : repeat_count; } +std::shared_ptr Dataset::SetNumWorkers(int32_t num_workers) { + if (ir_node_ == nullptr || ir_node_->SetNumWorkers(num_workers) == nullptr) { + return nullptr; + } + return shared_from_this(); +} +#ifndef ENABLE_ANDROID +std::shared_ptr Dataset::BuildSentencePieceVocab( + const std::vector &col_names, uint32_t vocab_size, float character_coverage, + SentencePieceModel model_type, const std::unordered_map ¶ms) { + auto vocab = std::make_shared(); + auto ds = std::make_shared(IRNode(), vocab, col_names, vocab_size, character_coverage, + model_type, params); + std::unique_ptr runtime_context = std::make_unique(); + Status rc = runtime_context->Init(); + if (rc.IsError()) { + MS_LOG(ERROR) << "Failed to init runtime context. Error status: " << rc; + return nullptr; + } + + auto consumer = std::make_unique(); + BuildVocabConsumer *bv_consumer = consumer.get(); + rc = consumer->Init(ds); + if (rc.IsError()) { + MS_LOG(ERROR) << "BuildVocab: Failed to init. Error status: " << rc; + return nullptr; + } + runtime_context->AssignConsumer(std::move(consumer)); + + // Run tree here to starting building vocab + rc = bv_consumer->Start(); + if (rc.IsError()) { + MS_LOG(ERROR) << "BuildVocab: Failed to start. 
Error status: " << rc; + return nullptr; + } + return vocab; +} + +std::shared_ptr Dataset::BuildVocab(const std::vector &columns, + const std::pair &freq_range, int64_t top_k, + const std::vector &special_tokens, bool special_first) { + auto vocab = std::make_shared(); + auto ds = + std::make_shared(IRNode(), vocab, columns, freq_range, top_k, special_tokens, special_first); + + std::unique_ptr runtime_context = std::make_unique(); + Status rc = runtime_context->Init(); + if (rc.IsError()) { + MS_LOG(ERROR) << "Failed to init runtime context. Error status: " << rc; + return nullptr; + } + + auto consumer = std::make_unique(); + BuildVocabConsumer *bv_consumer = consumer.get(); + rc = consumer->Init(ds); + if (rc.IsError()) { + MS_LOG(ERROR) << "BuildVocab: Failed to init. Error status: " << rc; + return nullptr; + } + runtime_context->AssignConsumer(std::move(consumer)); + + // Run tree here to starting building vocab + rc = bv_consumer->Start(); + if (rc.IsError()) { + MS_LOG(ERROR) << "BuildVocab: Failed to start. Error status: " << rc; + return nullptr; + } + return vocab; +} +#endif SchemaObj::SchemaObj(const std::string &schema_file) : schema_file_(schema_file), num_rows_(0), dataset_type_("") {} // SchemaObj init function @@ -1046,6 +1026,136 @@ std::shared_ptr CreateDatasetCache(session_id_type id, uint64_t me } #endif +AlbumDataset::AlbumDataset(const std::string &dataset_dir, const std::string &data_schema, + const std::vector &column_names, bool decode, + const std::shared_ptr &sampler, const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, data_schema, column_names, decode, sampler, cache); + ir_node_ = std::static_pointer_cast(ds); +} +CelebADataset::CelebADataset(const std::string &dataset_dir, const std::string &usage, + const std::shared_ptr &sampler, bool decode, + const std::set &extensions, const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, usage, sampler, decode, extensions, cache); + ir_node_ = std::static_pointer_cast(ds); +} +Cifar10Dataset::Cifar10Dataset(const std::string &dataset_dir, const std::string &usage, + const std::shared_ptr &sampler, const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, usage, sampler, cache); + ir_node_ = std::static_pointer_cast(ds); +} +Cifar100Dataset::Cifar100Dataset(const std::string &dataset_dir, const std::string &usage, + const std::shared_ptr &sampler, + const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, usage, sampler, cache); + ir_node_ = std::static_pointer_cast(ds); +} +CLUEDataset::CLUEDataset(const std::vector &dataset_files, const std::string &task, + const std::string &usage, int64_t num_samples, ShuffleMode shuffle, int32_t num_shards, + int32_t shard_id, const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_files, task, usage, num_samples, shuffle, num_shards, shard_id, cache); + ir_node_ = std::static_pointer_cast(ds); +} +CocoDataset::CocoDataset(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task, + const bool &decode, const std::shared_ptr &sampler, + const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, annotation_file, task, decode, sampler, cache); + ir_node_ = std::static_pointer_cast(ds); +} +CSVDataset::CSVDataset(const std::vector &dataset_files, char field_delim, + const std::vector> &column_defaults, + const std::vector &column_names, int64_t num_samples, ShuffleMode shuffle, + int32_t num_shards, int32_t shard_id, const std::shared_ptr &cache) { + 
auto ds = std::make_shared(dataset_files, field_delim, column_defaults, column_names, num_samples, shuffle, + num_shards, shard_id, cache); + ir_node_ = std::static_pointer_cast(ds); +} +ImageFolderDataset::ImageFolderDataset(const std::string &dataset_dir, bool decode, + const std::shared_ptr &sampler, + const std::set &extensions, + const std::map &class_indexing, + const std::shared_ptr &cache) { + // This arg exists in ImageFolderOp, but not externalized (in Python API). The default value is false. + bool recursive = false; + + // Create logical representation of ImageFolderDataset. + auto ds = + std::make_shared(dataset_dir, decode, sampler, recursive, extensions, class_indexing, cache); + ir_node_ = std::static_pointer_cast(ds); +} + +#ifndef ENABLE_ANDROID +ManifestDataset::ManifestDataset(const std::string &dataset_file, const std::string &usage, + const std::shared_ptr &sampler, + const std::map &class_indexing, bool decode, + const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_file, usage, sampler, class_indexing, decode, cache); + ir_node_ = std::static_pointer_cast(ds); +} +MindDataDataset::MindDataDataset(const std::string &dataset_file, const std::vector &columns_list, + const std::shared_ptr &sampler, nlohmann::json padded_sample, + int64_t num_padded) { + auto ds = std::make_shared(dataset_file, columns_list, sampler, padded_sample, num_padded); + ir_node_ = std::static_pointer_cast(ds); +} +MindDataDataset::MindDataDataset(const std::vector &dataset_files, + const std::vector &columns_list, + const std::shared_ptr &sampler, nlohmann::json padded_sample, + int64_t num_padded) { + auto ds = std::make_shared(dataset_files, columns_list, sampler, padded_sample, num_padded); + ir_node_ = std::static_pointer_cast(ds); +} +#endif +MnistDataset::MnistDataset(const std::string &dataset_dir, const std::string &usage, + const std::shared_ptr &sampler, const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, usage, sampler, cache); + ir_node_ = std::static_pointer_cast(ds); +} +TextFileDataset::TextFileDataset(const std::vector &dataset_files, int64_t num_samples, + ShuffleMode shuffle, int32_t num_shards, int32_t shard_id, + const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_files, num_samples, shuffle, num_shards, shard_id, cache); + ir_node_ = std::static_pointer_cast(ds); +} +#ifndef ENABLE_ANDROID +VOCDataset::VOCDataset(const std::string &dataset_dir, const std::string &task, const std::string &usage, + const std::map &class_indexing, bool decode, + const std::shared_ptr &sampler, const std::shared_ptr &cache) { + auto ds = std::make_shared(dataset_dir, task, usage, class_indexing, decode, sampler, cache); + ir_node_ = std::static_pointer_cast(ds); +} +#endif +RandomDataDataset::RandomDataDataset(const int32_t &total_rows, std::shared_ptr schema, + const std::vector &columns_list, + const std::shared_ptr &sampler, std::shared_ptr cache) { + auto ds = + std::make_shared(total_rows, std::move(schema), std::move(columns_list), std::move(sampler), cache); + ir_node_ = std::static_pointer_cast(ds); +} +RandomDataDataset::RandomDataDataset(const int32_t &total_rows, std::string schema_path, + const std::vector &columns_list, + const std::shared_ptr &sampler, std::shared_ptr cache) { + auto ds = std::make_shared(total_rows, std::move(schema_path), std::move(columns_list), + std::move(sampler), cache); + ir_node_ = std::static_pointer_cast(ds); +} +#ifndef ENABLE_ANDROID +TFRecordDataset::TFRecordDataset(const std::vector 
&dataset_files, std::string schema, + const std::vector &columns_list, int64_t num_samples, ShuffleMode shuffle, + int32_t num_shards, int32_t shard_id, bool shard_equal_rows, + std::shared_ptr cache) { + auto ds = std::make_shared(dataset_files, schema, columns_list, num_samples, shuffle, num_shards, + shard_id, shard_equal_rows, cache); + ir_node_ = std::static_pointer_cast(ds); +} +TFRecordDataset::TFRecordDataset(const std::vector &dataset_files, std::shared_ptr schema, + const std::vector &columns_list, int64_t num_samples, ShuffleMode shuffle, + int32_t num_shards, int32_t shard_id, bool shard_equal_rows, + std::shared_ptr cache) { + auto ds = std::make_shared(dataset_files, schema, columns_list, num_samples, shuffle, num_shards, + shard_id, shard_equal_rows, cache); + ir_node_ = std::static_pointer_cast(ds); +} +#endif std::shared_ptr SelectSampler(int64_t num_samples, bool shuffle, int32_t num_shards, int32_t shard_id) { if (shuffle) { if (num_shards > 1) { @@ -1062,7 +1172,6 @@ std::shared_ptr SelectSampler(int64_t num_samples, bool shuffle, int // If shuffle disabled, sharding disabled, use sequential sampler return SequentialSampler(0, num_samples); } - } // namespace api } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/api/iterator.cc b/mindspore/ccsrc/minddata/dataset/api/iterator.cc index 39082e626d..1e8e74ca1a 100644 --- a/mindspore/ccsrc/minddata/dataset/api/iterator.cc +++ b/mindspore/ccsrc/minddata/dataset/api/iterator.cc @@ -53,7 +53,7 @@ Status Iterator::BuildAndLaunchTree(std::shared_ptr ds) { RETURN_IF_NOT_OK(runtime_context->Init()); auto consumer = std::make_unique(); consumer_ = consumer.get(); - RETURN_IF_NOT_OK(consumer->Init(ds)); + RETURN_IF_NOT_OK(consumer->Init(ds->IRNode())); runtime_context->AssignConsumer(std::move(consumer)); return Status::OK(); } diff --git a/mindspore/ccsrc/minddata/dataset/engine/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/engine/CMakeLists.txt index 4aa54a53cd..c97bc098bb 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/CMakeLists.txt +++ b/mindspore/ccsrc/minddata/dataset/engine/CMakeLists.txt @@ -11,7 +11,7 @@ endif () file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) -add_library(engine OBJECT +set(SRC_FILES_LIST execution_tree.cc data_buffer.cc data_schema.cc @@ -20,10 +20,19 @@ add_library(engine OBJECT runtime_context.cc consumers/tree_consumer.cc ) +if (ENABLE_PYTHON) + set(SRC_FILES_LIST + ${SRC_FILES_LIST} + python_runtime_context.cc + consumers/python_tree_consumer.cc + ) +endif () + +add_library(engine OBJECT ${SRC_FILES_LIST}) if (ENABLE_PYTHON) - target_include_directories(engine PRIVATE ${pybind11_INCLUDE_DIRS}) -endif() + target_include_directories(engine PRIVATE ${pybind11_INCLUDE_DIRS}) +endif () add_dependencies(engine engine-datasetops engine-datasetops-source engine-opt engine-gnn engine-perf engine-cache-client engine-datasetops-mapop) diff --git a/mindspore/ccsrc/minddata/dataset/engine/consumers/python_tree_consumer.cc b/mindspore/ccsrc/minddata/dataset/engine/consumers/python_tree_consumer.cc new file mode 100644 index 0000000000..bca21d61cb --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/consumers/python_tree_consumer.cc @@ -0,0 +1,46 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "minddata/dataset/engine/consumers/python_tree_consumer.h" + +namespace mindspore::dataset { + +Status PythonIteratorConsumer::GetNextAsList(py::list *out) { + std::vector row; + { + py::gil_scoped_release gil_release; + RETURN_IF_NOT_OK(GetNextAsVector(&row)); + } + for (auto el : row) { + (*out).append(el); + } + return Status::OK(); +} +Status PythonIteratorConsumer::GetNextAsDict(py::dict *out) { + std::unordered_map row; + { + py::gil_scoped_release gil_release; + RETURN_IF_NOT_OK(GetNextAsMap(&row)); + } + for (auto el : row) { + (*out)[common::SafeCStr(el.first)] = el.second; + } + return Status::OK(); +} +} // namespace mindspore::dataset diff --git a/mindspore/ccsrc/minddata/dataset/engine/consumers/python_tree_consumer.h b/mindspore/ccsrc/minddata/dataset/engine/consumers/python_tree_consumer.h index 0359a43c23..19e98f570e 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/consumers/python_tree_consumer.h +++ b/mindspore/ccsrc/minddata/dataset/engine/consumers/python_tree_consumer.h @@ -26,24 +26,21 @@ namespace mindspore::dataset { /// Consumer that iterates over the dataset and returns the rows one by one as a python list or a dict -class PythonIterator : public IteratorConsumer { - /// Constructor + +class PythonIteratorConsumer : public IteratorConsumer { + public: + /// Constructor which will call the base class default constructor. /// \param num_epochs number of epochs. Default to -1 (infinite epochs). 
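GetNextAsList/GetNextAsDict above release the GIL while the C++ pipeline produces a row and re-acquire it before building Python objects. A small self-contained pybind11 sketch of that discipline, assuming a hypothetical FetchRow in place of the tree_adapter_ fetch:

#include <pybind11/embed.h>
#include <string>
#include <vector>

namespace py = pybind11;

std::vector<std::string> FetchRow() {  // pretend this blocks inside the C++ pipeline
  return {"col1-value", "col2-value"};
}

py::list NextRowAsList() {
  std::vector<std::string> row;
  {
    py::gil_scoped_release release;  // other Python threads may run while we wait for a row
    row = FetchRow();
  }  // GIL re-acquired here
  py::list out;
  for (const auto &el : row) out.append(el);  // safe: we hold the GIL again
  return out;
}

int main() {
  py::scoped_interpreter guard;  // start an embedded Python interpreter
  py::print(NextRowAsList());
}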
-  explicit PythonIterator(int32_t num_epochs = -1) : IteratorConsumer(num_epochs) {}
+  explicit PythonIteratorConsumer(int32_t num_epochs = -1) : IteratorConsumer(num_epochs) {}
+  /// Returns the next row in a vector format
+  /// \param[out] out std::vector of Tensors
+  /// \return Status error code
+  Status GetNextAsList(py::list *out);
-  /// Get the next row as a python dict
-  /// \param[out] output python dict
-  /// \return Status error code
-  Status GetNextAsMap(py::dict *output) {
-    return Status(StatusCode::kNotImplementedYet, __LINE__, __FILE__, "Method is not implemented yet.");
-  }
-  /// Get the next row as a python dict
-  /// \param[out] output python dict
-  /// \return Status error code
-  Status GetNextAsList(py::list *output) {
-    return Status(StatusCode::kNotImplementedYet, __LINE__, __FILE__, "Method is not implemented yet.");
-  }
+  /// Returns the next row as a map
+  /// \param[out] out std::map of string to Tensor
+  /// \return Status error code
+  Status GetNextAsDict(py::dict *out);
 };
-
 }  // namespace mindspore::dataset
 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_CONSUMERS_PYTHON_TREE_CONSUMER_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc b/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc
index c65a749103..5b62deefd2 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc
@@ -34,10 +34,11 @@ namespace mindspore::dataset {
 // TreeConsumer
 TreeConsumer::TreeConsumer() { tree_adapter_ = std::make_unique<TreeAdapter>(); }
-Status TreeConsumer::Init(std::shared_ptr<Dataset> d) { return tree_adapter_->BuildAndPrepare(std::move(d)); }
+Status TreeConsumer::Init(std::shared_ptr<DatasetNode> d) { return tree_adapter_->BuildAndPrepare(std::move(d)); }
+Status TreeConsumer::Terminate() { return tree_adapter_->AllTasks()->DoServiceStop(); }
 // IteratorConsumer
-Status IteratorConsumer::Init(std::shared_ptr<Dataset> d) {
+Status IteratorConsumer::Init(std::shared_ptr<DatasetNode> d) {
   return tree_adapter_->BuildAndPrepare(std::move(d), num_epochs_);
 }
@@ -73,7 +74,7 @@ Status IteratorConsumer::GetNextAsMap(std::unordered_map
 }
 // ToDevice
-Status ToDevice::Init(std::shared_ptr<Dataset> d) {
+Status ToDevice::Init(std::shared_ptr<DatasetNode> d) {
   return tree_adapter_->BuildAndPrepare(std::move(d), num_epochs_);
 }
@@ -384,7 +385,7 @@ TreeGetters::TreeGetters() : dataset_size_(-1), init_flag_(false), row_flag_(fal
   tree_adapter_ = std::make_unique<TreeAdapter>();
 }
-Status TreeGetters::Init(std::shared_ptr<Dataset> d) {
+Status TreeGetters::Init(std::shared_ptr<DatasetNode> d) {
   Status s = tree_adapter_->BuildAndPrepare(std::move(d));
   if (!s.IsError()) {
     init_flag_ = true;
@@ -463,4 +464,15 @@ Status TreeGetters::GetNumClasses(int64_t *num_classes) {
   RETURN_IF_NOT_OK(root->GetNumClasses(num_classes));
   return Status::OK();
 }
+Status BuildVocabConsumer::Init(std::shared_ptr<DatasetNode> d) {
+  return tree_adapter_->BuildAndPrepare(std::move(d), 1);
+}
+Status BuildVocabConsumer::Start() {
+  // Getting one row triggers building the vocab
+  TensorRow row;
+  RETURN_IF_NOT_OK(tree_adapter_->GetNext(&row));
+  // The returned row should be an EOE, which is an empty row
+  CHECK_FAIL_RETURN_UNEXPECTED(row.empty(), "The fetched row from BuildVocab should be an EOE.");
+  return Status::OK();
+}
 }  // namespace mindspore::dataset
diff --git a/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.h b/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.h
index 1dcd314f9f..7e947f3909 100644
---
a/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.h +++ b/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.h @@ -22,14 +22,16 @@ #include #include #include + #include "minddata/dataset/engine/tree_adapter.h" +#include "minddata/dataset/text/vocab.h" namespace mindspore::dataset { // Forward declare class TreeAdapter; namespace api { -class Dataset; +class DatasetNode; } /// A base class for tree consumers which would fetch rows from the tree pipeline @@ -40,7 +42,9 @@ class TreeConsumer { /// Initializes the consumer, this involves constructing and preparing the tree. /// \param d The dataset node that represent the root of the IR tree. /// \return Status error code. - virtual Status Init(std::shared_ptr d); + virtual Status Init(std::shared_ptr d); + + Status Terminate(); protected: /// The class owns the tree_adapter that handles execution tree operations. @@ -57,7 +61,7 @@ class IteratorConsumer : public TreeConsumer { /// \param num_epochs number of epochs. Default to -1 (infinite epochs). explicit IteratorConsumer(int32_t num_epochs = -1) : TreeConsumer(), num_epochs_(num_epochs) {} - Status Init(std::shared_ptr d) override; + Status Init(std::shared_ptr d) override; /// Returns the next row in a vector format /// \param[out] out std::vector of Tensors @@ -126,10 +130,10 @@ class SaveToDisk : public TreeConsumer { /// Consumer that iterates over the dataset and send it to a device class ToDevice : public TreeConsumer { public: - ToDevice(std::string device_type, bool send_epoch_end, int32_t num_epochs = -1) - : TreeConsumer(), device_type_(device_type), send_epoch_end_(send_epoch_end), num_epochs_(num_epochs) {} + explicit ToDevice(bool send_epoch_end, int32_t num_epochs = -1) + : TreeConsumer(), send_epoch_end_(send_epoch_end), num_epochs_(num_epochs) {} - Status Init(std::shared_ptr d) override; + Status Init(std::shared_ptr d) override; /// Send the data to device /// \return Status error code @@ -158,7 +162,7 @@ class ToDevice : public TreeConsumer { class TreeGetters : public TreeConsumer { public: TreeGetters(); - Status Init(std::shared_ptr d) override; + Status Init(std::shared_ptr d) override; Status GetDatasetSize(int64_t *size); Status GetOutputTypes(std::vector *types); Status GetOutputShapes(std::vector *shapes); @@ -176,5 +180,23 @@ class TreeGetters : public TreeConsumer { bool row_flag_; // indicate whether the first row has been stored in row_ }; +class BuildVocabConsumer : public TreeConsumer { + public: + /// BuildVocabConsumer Constructor which will call the base class default constructor. + BuildVocabConsumer() = default; + + Status Init(std::shared_ptr d) override; + + /// Save the given dataset to MindRecord format on disk. 
This is a blocking method (i.e., after returning, all rows + /// would be written to disk) + /// \return Status error code + Status Start(); + + protected: + /// Method to return the name of the consumer + /// \return string + std::string Name() override { return "BuildVocab"; } +}; + } // namespace mindspore::dataset #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_CONSUMERS_TREE_CONSUMER_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/CMakeLists.txt index 2c6f8e133d..a2a2d18595 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/CMakeLists.txt +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/CMakeLists.txt @@ -3,6 +3,7 @@ set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE add_subdirectory(source) set(DATASET_ENGINE_IR_DATASETOPS_SRC_FILES + dataset_node.cc batch_node.cc bucket_batch_by_length_node.cc build_sentence_piece_vocab_node.cc diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc index edcf95cde7..7b8fc14d5f 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc @@ -28,7 +28,7 @@ namespace mindspore { namespace dataset { namespace api { -BatchNode::BatchNode(std::shared_ptr child, int32_t batch_size, bool drop_remainder, bool pad, +BatchNode::BatchNode(std::shared_ptr child, int32_t batch_size, bool drop_remainder, bool pad, std::vector cols_to_map, std::map>> pad_map) : batch_size_(batch_size), diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.h index a2462f49d4..708ad6d969 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.h @@ -23,16 +23,16 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class BatchNode : public Dataset { +class BatchNode : public DatasetNode { public: /// \brief Constructor - BatchNode(std::shared_ptr child, int32_t batch_size, bool drop_remainder, bool pad, + BatchNode(std::shared_ptr child, int32_t batch_size, bool drop_remainder, bool pad, std::vector cols_to_map, std::map>> pad_map); diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.cc index 49bae0ac60..81be43eac1 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.cc @@ -29,7 +29,7 @@ namespace mindspore { namespace dataset { namespace api { BucketBatchByLengthNode::BucketBatchByLengthNode( - std::shared_ptr child, const std::vector &column_names, + std::shared_ptr child, const std::vector &column_names, const std::vector &bucket_boundaries, const std::vector &bucket_batch_sizes, std::function element_length_function, const std::map>> &pad_info, bool pad_to_bucket_boundary, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h index 
64f861721e..e9f395c363 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/bucket_batch_by_length_node.h @@ -23,15 +23,15 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class BucketBatchByLengthNode : public Dataset { +class BucketBatchByLengthNode : public DatasetNode { public: /// \brief Constructor - BucketBatchByLengthNode(std::shared_ptr child, const std::vector &column_names, + BucketBatchByLengthNode(std::shared_ptr child, const std::vector &column_names, const std::vector &bucket_boundaries, const std::vector &bucket_batch_sizes, std::function element_length_function = nullptr, const std::map>> &pad_info = {}, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.cc index 5704617f9a..1dda6410cd 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.cc @@ -28,7 +28,7 @@ namespace mindspore { namespace dataset { namespace api { -BuildSentenceVocabNode::BuildSentenceVocabNode(std::shared_ptr child, +BuildSentenceVocabNode::BuildSentenceVocabNode(std::shared_ptr child, std::shared_ptr vocab, const std::vector &col_names, uint32_t vocab_size, float character_coverage, SentencePieceModel model_type, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h index 868daaae5f..10eb7e99d0 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h @@ -29,10 +29,10 @@ namespace mindspore { namespace dataset { namespace api { -class BuildSentenceVocabNode : public Dataset { +class BuildSentenceVocabNode : public DatasetNode { public: /// \brief Constructor - BuildSentenceVocabNode(std::shared_ptr child, std::shared_ptr vocab, + BuildSentenceVocabNode(std::shared_ptr child, std::shared_ptr vocab, const std::vector &col_names, uint32_t vocab_size, float character_coverage, SentencePieceModel model_type, const std::unordered_map ¶ms); diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_vocab_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_vocab_node.cc index 6fdb5d7c3a..bad52db138 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_vocab_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_vocab_node.cc @@ -28,7 +28,7 @@ namespace mindspore { namespace dataset { namespace api { -BuildVocabNode::BuildVocabNode(std::shared_ptr child, std::shared_ptr vocab, +BuildVocabNode::BuildVocabNode(std::shared_ptr child, std::shared_ptr vocab, const std::vector &columns, const std::pair &freq_range, int64_t top_k, const std::vector &special_tokens, bool special_first) : vocab_(vocab), diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_vocab_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_vocab_node.h index a7a20c3897..9f0de2d133 100644 --- 
a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_vocab_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_vocab_node.h @@ -22,17 +22,17 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class BuildVocabNode : public Dataset { +class BuildVocabNode : public DatasetNode { public: /// \brief Constructor - BuildVocabNode(std::shared_ptr child, std::shared_ptr vocab, const std::vector &columns, - const std::pair &freq_range, int64_t top_k, + BuildVocabNode(std::shared_ptr child, std::shared_ptr vocab, + const std::vector &columns, const std::pair &freq_range, int64_t top_k, const std::vector &special_tokens, bool special_first); /// \brief Destructor diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/concat_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/concat_node.cc index a1a506261d..36ce94a732 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/concat_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/concat_node.cc @@ -27,18 +27,16 @@ namespace mindspore { namespace dataset { namespace api { // Function to build ConcatOp -ConcatNode::ConcatNode(const std::vector> &datasets) : datasets_(datasets) { - this->children = datasets_; -} +ConcatNode::ConcatNode(const std::vector> &datasets) { this->children = datasets; } Status ConcatNode::ValidateParams() { - if (datasets_.empty()) { + if (children.size() < 2) { std::string err_msg = "ConcatNode: concatenated datasets are not specified."; MS_LOG(ERROR) << err_msg; RETURN_STATUS_SYNTAX_ERROR(err_msg); } - if (find(datasets_.begin(), datasets_.end(), nullptr) != datasets_.end()) { + if (find(children.begin(), children.end(), nullptr) != children.end()) { std::string err_msg = "ConcatNode: concatenated datasets should not be null."; MS_LOG(ERROR) << err_msg; RETURN_STATUS_SYNTAX_ERROR(err_msg); diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/concat_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/concat_node.h index f0b9fcae94..61822b1283 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/concat_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/concat_node.h @@ -21,16 +21,16 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class ConcatNode : public Dataset { +class ConcatNode : public DatasetNode { public: /// \brief Constructor - explicit ConcatNode(const std::vector> &datasets); + explicit ConcatNode(const std::vector> &datasets); /// \brief Destructor ~ConcatNode() = default; @@ -42,9 +42,6 @@ class ConcatNode : public Dataset { /// \brief Parameters validation /// \return Status Status::OK() if all the parameters are valid Status ValidateParams() override; - - private: - std::vector> datasets_; }; } // namespace api diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc new file mode 100644 index 0000000000..895fc5fd03 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc @@ -0,0 +1,65 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with 
the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" + +#include + +namespace mindspore { +namespace dataset { +namespace api { + +Status DatasetNode::AddCacheOp(std::vector> *node_ops) { + if (cache_ != nullptr) { + RETURN_IF_NOT_OK(cache_->Build()); + std::shared_ptr cache_op; + RETURN_IF_NOT_OK(cache_->CreateCacheOp(num_workers_, &cache_op)); + node_ops->push_back(cache_op); + } + return Status::OK(); +} +// Constructor to initialize the cache +DatasetNode::DatasetNode(const std::shared_ptr &dataset_cache) : DatasetNode() { cache_ = dataset_cache; } + +std::shared_ptr DatasetNode::SetNumWorkers(int32_t num_workers) { +#if !defined(_WIN32) && !defined(_WIN64) +#ifndef ENABLE_ANDROID + int32_t cpu_count = sysconf(_SC_NPROCESSORS_CONF); + if (cpu_count < 0 || cpu_count > INT32_MAX) { + MS_LOG(ERROR) << "Error determining current CPU: " << cpu_count; + return nullptr; + } + if (num_workers < 1 || num_workers > cpu_count) { + MS_LOG(ERROR) << "num_workers exceeds the boundary between 1 and " << cpu_count; + return nullptr; + } +#endif +#endif + num_workers_ = num_workers; + return shared_from_this(); +} +DatasetNode::DatasetNode() { + // Fetch some default value from config manager + std::shared_ptr cfg = GlobalContext::config_manager(); + num_workers_ = cfg->num_parallel_workers(); + rows_per_buffer_ = cfg->rows_per_buffer(); + connector_que_size_ = cfg->op_connector_size(); + worker_connector_size_ = cfg->worker_connector_size(); +} + +} // namespace api +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h new file mode 100644 index 0000000000..89766d31a7 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h @@ -0,0 +1,126 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
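SetNumWorkers above rejects worker counts outside [1, number of configured CPUs] on POSIX builds. A compilable sketch of that check as a free function (ValidateNumWorkers is illustrative, not the MindSpore API):

#include <unistd.h>
#include <cstdint>
#include <iostream>

bool ValidateNumWorkers(int32_t num_workers) {
  long cpu_count = sysconf(_SC_NPROCESSORS_CONF);  // number of configured CPUs
  if (cpu_count < 0 || cpu_count > INT32_MAX) {
    std::cerr << "Error determining current CPU count: " << cpu_count << "\n";
    return false;
  }
  if (num_workers < 1 || num_workers > cpu_count) {
    std::cerr << "num_workers must be in the range [1, " << cpu_count << "]\n";
    return false;
  }
  return true;
}

int main() {
  std::cout << std::boolalpha << ValidateNumWorkers(0) << " " << ValidateNumWorkers(1) << "\n";  // false true
}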
+ */ + +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_DATASET_NODE_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_DATASET_NODE_H_ + +#include +#include +#include +#include +#include +#include + +#include "minddata/dataset/include/datasets.h" + +namespace mindspore { +namespace dataset { +namespace api { + +class Dataset; +class SamplerObj; + +#define RETURN_EMPTY_IF_ERROR(_s) \ + do { \ + Status __rc = (_s); \ + if (__rc.IsError()) { \ + MS_LOG(ERROR) << __rc; \ + return {}; \ + } \ + } while (false) + +Status AddShuffleOp(int64_t num_files, int64_t num_devices, int64_t num_rows, int64_t total_rows, + int32_t connector_que_size, int32_t rows_per_buffer, std::shared_ptr *shuffle_op); + +// Helper function to validate dataset files parameter +Status ValidateDatasetFilesParam(const std::string &dataset_name, const std::vector &dataset_files); + +// Helper function to validate dataset num_shards and shard_id parameters +Status ValidateDatasetShardParams(const std::string &dataset_name, int32_t num_shards, int32_t shard_id); + +// Helper function to validate dataset sampler parameter +Status ValidateDatasetSampler(const std::string &dataset_name, const std::shared_ptr &sampler); + +Status ValidateStringValue(const std::string &dataset_name, const std::string &str, + const std::unordered_set &valid_strings); + +// Helper function to validate dataset input/output column parameterCD - +Status ValidateDatasetColumnParam(const std::string &dataset_name, const std::string &column_param, + const std::vector &columns); + +// Helper function to validate dataset directory parameter +Status ValidateDatasetDirParam(const std::string &dataset_name, std::string dataset_dir); + +/// \brief Function to create a sampler for non-mappable dataset (to be used by cache op later). +/// \notes Non-mappable dataset does not directly support a sampler. It has provided sampling arguments (shuffle, +/// num_samples, num_shards, shard_id) and it DOES support sampling if somewhere above it in the pipeline contains +/// a cache. If there is no cache above it, then the sampler is not used. +/// \param[in] num_samples The number of samples to be included in the dataset. +/// \param[in] shuffle If true, the indices are shuffled. +/// \param[in] num_shards Number of shards to divide the dataset into. +/// \param[in] shard_id Shard ID of the current shard within num_shards. +/// \return Shared pointer to the current Sampler. 
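The SelectSampler doc comment above describes a small decision table, and the implementation at the end of datasets.cc follows it: shuffle plus shards yields a shuffled distributed sampler, shuffle alone a random sampler, shards alone a sequential distributed sampler, otherwise a sequential sampler. A hedged restatement of that table, with Make* stubs standing in for the real sampler factories:

#include <cstdint>
#include <iostream>
#include <memory>
#include <string>

struct SamplerSketch { std::string kind; };

std::shared_ptr<SamplerSketch> MakeDistributed(int32_t num_shards, int32_t shard_id, bool shuffle, int64_t n) {
  return std::make_shared<SamplerSketch>(SamplerSketch{shuffle ? "distributed-shuffled" : "distributed-sequential"});
}
std::shared_ptr<SamplerSketch> MakeRandom(int64_t n) { return std::make_shared<SamplerSketch>(SamplerSketch{"random"}); }
std::shared_ptr<SamplerSketch> MakeSequential(int64_t n) { return std::make_shared<SamplerSketch>(SamplerSketch{"sequential"}); }

std::shared_ptr<SamplerSketch> SelectSamplerSketch(int64_t num_samples, bool shuffle, int32_t num_shards, int32_t shard_id) {
  if (shuffle) {
    if (num_shards > 1) return MakeDistributed(num_shards, shard_id, /*shuffle=*/true, num_samples);
    return MakeRandom(num_samples);
  }
  if (num_shards > 1) return MakeDistributed(num_shards, shard_id, /*shuffle=*/false, num_samples);
  return MakeSequential(num_samples);
}

int main() {
  std::cout << SelectSamplerSketch(100, true, 4, 0)->kind << "\n";   // distributed-shuffled
  std::cout << SelectSamplerSketch(100, false, 1, 0)->kind << "\n";  // sequential
}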
+std::shared_ptr SelectSampler(int64_t num_samples, bool shuffle, int32_t num_shards, int32_t shard_id); + +class DatasetNode : public std::enable_shared_from_this { + public: + /// \brief Constructor + DatasetNode(); + + /// \brief Constructor that initializes the cache + /// \param dataset_cache DatasetCache + explicit DatasetNode(const std::shared_ptr &dataset_cache); + + /// \brief Destructor + ~DatasetNode() = default; + + /// \brief Pure virtual function to convert a DatasetNode class into a runtime dataset object + /// \return The list of shared pointers to the newly created DatasetOps + virtual std::vector> Build() = 0; + + /// \brief Pure virtual function for derived class to implement parameters validation + /// \return Status Status::OK() if all the parameters are valid + virtual Status ValidateParams() = 0; + + const std::vector> Children() const { return children; } + + /// \brief Pure virtual function for derived class to get the shard id of specific node + /// \return Status Status::OK() if get shard id successfully + virtual Status GetShardId(int32_t *shard_id) { + return Status(StatusCode::kNotImplementedYet, __LINE__, __FILE__, "Method is not implemented yet."); + } + + /// \brief Setter function for runtime number of workers + /// \param[in] num_workers The number of threads in this operator + /// \return Shared pointer to the original object + std::shared_ptr SetNumWorkers(int32_t num_workers); + + protected: + std::vector> children; + std::shared_ptr parent; + std::shared_ptr cache_; + Status AddCacheOp(std::vector> *node_ops); + + int32_t num_workers_; + int32_t rows_per_buffer_; + int32_t connector_que_size_; + int32_t worker_connector_size_; +}; + +} // namespace api +} // namespace dataset +} // namespace mindspore +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_DATASET_NODE_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc index cd27a48936..c7dc98a9e5 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc @@ -28,14 +28,14 @@ namespace mindspore { namespace dataset { namespace api { -MapNode::MapNode(std::shared_ptr child, std::vector> operations, +MapNode::MapNode(std::shared_ptr child, std::vector> operations, std::vector input_columns, std::vector output_columns, const std::vector &project_columns, std::shared_ptr cache) : operations_(operations), input_columns_(input_columns), output_columns_(output_columns), project_columns_(project_columns), - Dataset(std::move(cache)) { + DatasetNode(std::move(cache)) { this->children.push_back(child); } diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.h index 0eec6981b7..9ee5d1b8b8 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.h @@ -21,15 +21,15 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class MapNode : public Dataset { +class MapNode : public DatasetNode { public: /// \brief Constructor - MapNode(std::shared_ptr child, std::vector> operations, + MapNode(std::shared_ptr child, std::vector> operations, std::vector input_columns = {}, std::vector output_columns = {}, const 
std::vector &columns = {}, std::shared_ptr cache = nullptr); diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.cc index 10573671d7..9fa7234c58 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.cc @@ -28,7 +28,8 @@ namespace dataset { namespace api { // Function to build ProjectOp -ProjectNode::ProjectNode(std::shared_ptr child, const std::vector &columns) : columns_(columns) { +ProjectNode::ProjectNode(std::shared_ptr child, const std::vector &columns) + : columns_(columns) { this->children.push_back(child); } diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.h index 5faf88a986..7a6fb52869 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.h @@ -21,17 +21,17 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class ProjectNode : public Dataset { +class ProjectNode : public DatasetNode { public: /// \brief Constructor - explicit ProjectNode(std::shared_ptr child, const std::vector &columns); + explicit ProjectNode(std::shared_ptr child, const std::vector &columns); /// \brief Destructor ~ProjectNode() = default; diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.cc index 284959f3ad..4d29b8e030 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.cc @@ -27,7 +27,7 @@ namespace mindspore { namespace dataset { namespace api { // Function to build RenameOp -RenameNode::RenameNode(std::shared_ptr child, const std::vector &input_columns, +RenameNode::RenameNode(std::shared_ptr child, const std::vector &input_columns, const std::vector &output_columns) : input_columns_(input_columns), output_columns_(output_columns) { this->children.push_back(child); diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h index 11d8975056..379c74beae 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h @@ -21,17 +21,17 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class RenameNode : public Dataset { +class RenameNode : public DatasetNode { public: /// \brief Constructor - explicit RenameNode(std::shared_ptr child, const std::vector &input_columns, + explicit RenameNode(std::shared_ptr child, const std::vector &input_columns, const std::vector &output_columns); /// \brief Destructor diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.cc index 40043e6117..071d92c816 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.cc @@ -27,7 +27,7 @@ 
namespace mindspore { namespace dataset { namespace api { -RepeatNode::RepeatNode(std::shared_ptr child, int32_t count) : repeat_count_(count) { +RepeatNode::RepeatNode(std::shared_ptr child, int32_t count) : repeat_count_(count) { this->children.push_back(child); } diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h index d893da637a..3385b33db0 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h @@ -23,17 +23,17 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class RepeatNode : public Dataset { +class RepeatNode : public DatasetNode { public: /// \brief Constructor - explicit RepeatNode(std::shared_ptr child, int32_t count); + explicit RepeatNode(std::shared_ptr child, int32_t count); /// \brief Destructor ~RepeatNode() = default; diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.cc index 6bca72c785..a82f3367fe 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.cc @@ -28,7 +28,7 @@ namespace dataset { namespace api { // Constructor for ShuffleNode -ShuffleNode::ShuffleNode(std::shared_ptr child, int32_t shuffle_size, bool reset_every_epoch) +ShuffleNode::ShuffleNode(std::shared_ptr child, int32_t shuffle_size, bool reset_every_epoch) : shuffle_size_(shuffle_size), shuffle_seed_(GetSeed()), reset_every_epoch_(reset_every_epoch) { this->children.push_back(child); } diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h index 07a1503f24..0274cf8b69 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h @@ -23,16 +23,16 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class ShuffleNode : public Dataset { +class ShuffleNode : public DatasetNode { public: - ShuffleNode(std::shared_ptr child, int32_t shuffle_size, bool reset_every_epoch); + ShuffleNode(std::shared_ptr child, int32_t shuffle_size, bool reset_every_epoch); ~ShuffleNode() = default; diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.cc index 5fa20dae1f..c2e5618106 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.cc @@ -28,7 +28,7 @@ namespace dataset { namespace api { // Constructor for SkipNode -SkipNode::SkipNode(std::shared_ptr child, int32_t count) : skip_count_(count) { +SkipNode::SkipNode(std::shared_ptr child, int32_t count) : skip_count_(count) { this->children.push_back(child); } diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.h index 81c6d26311..438eb54f99 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.h +++ 
b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.h @@ -21,16 +21,16 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class SkipNode : public Dataset { +class SkipNode : public DatasetNode { public: /// \brief Constructor - explicit SkipNode(std::shared_ptr child, int32_t count); + explicit SkipNode(std::shared_ptr child, int32_t count); /// \brief Destructor ~SkipNode() = default; diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.cc index 5bcab68789..1e9cdd9c4d 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.cc @@ -32,7 +32,7 @@ namespace api { AlbumNode::AlbumNode(const std::string &dataset_dir, const std::string &data_schema, const std::vector &column_names, bool decode, const std::shared_ptr &sampler, const std::shared_ptr &cache) - : Dataset(std::move(cache)), + : DatasetNode(std::move(cache)), dataset_dir_(dataset_dir), schema_path_(data_schema), column_names_(column_names), diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.h index 6ef0d159fb..fb50353df2 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.h @@ -21,13 +21,13 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class AlbumNode : public Dataset { +class AlbumNode : public DatasetNode { public: /// \brief Constructor AlbumNode(const std::string &dataset_dir, const std::string &data_schema, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.cc index d5e447acec..a9eaa3442d 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.cc @@ -31,7 +31,7 @@ namespace api { CelebANode::CelebANode(const std::string &dataset_dir, const std::string &usage, const std::shared_ptr &sampler, const bool &decode, const std::set &extensions, const std::shared_ptr &cache) - : Dataset(std::move(cache)), + : DatasetNode(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler), diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.h index 3829302cff..0e90a72b2c 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.h @@ -23,13 +23,13 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class CelebANode : public Dataset { +class CelebANode : public DatasetNode { public: /// \brief Constructor CelebANode(const std::string &dataset_dir, const std::string &usage, const std::shared_ptr &sampler, diff --git 
a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.cc index 835967005d..104c00ee3c 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.cc @@ -31,7 +31,7 @@ namespace api { // Constructor for Cifar100Node Cifar100Node::Cifar100Node(const std::string &dataset_dir, const std::string &usage, std::shared_ptr sampler, std::shared_ptr cache) - : Dataset(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {} + : DatasetNode(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {} Status Cifar100Node::ValidateParams() { RETURN_IF_NOT_OK(ValidateDatasetDirParam("Cifar100Node", dataset_dir_)); diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.h index bbde01ba20..79dd35486b 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.h @@ -21,13 +21,13 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class Cifar100Node : public Dataset { +class Cifar100Node : public DatasetNode { public: /// \brief Constructor Cifar100Node(const std::string &dataset_dir, const std::string &usage, std::shared_ptr sampler, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.cc index 578e2d9bfb..3d19d8fd79 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.cc @@ -31,7 +31,7 @@ namespace api { // Constructor for Cifar10Node Cifar10Node::Cifar10Node(const std::string &dataset_dir, const std::string &usage, std::shared_ptr sampler, std::shared_ptr cache) - : Dataset(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {} + : DatasetNode(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {} Status Cifar10Node::ValidateParams() { RETURN_IF_NOT_OK(ValidateDatasetDirParam("Cifar10Node", dataset_dir_)); diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.h index ff851c420f..3037caefc0 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.h @@ -21,13 +21,13 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class Cifar10Node : public Dataset { +class Cifar10Node : public DatasetNode { public: /// \brief Constructor Cifar10Node(const std::string &dataset_dir, const std::string &usage, std::shared_ptr sampler, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.cc index 538fa63817..17cd4769e4 100644 --- 
a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.cc @@ -33,7 +33,7 @@ namespace api { // Constructor for CLUENode CLUENode::CLUENode(const std::vector clue_files, std::string task, std::string usage, int64_t num_samples, ShuffleMode shuffle, int32_t num_shards, int32_t shard_id, std::shared_ptr cache) - : Dataset(std::move(cache)), + : DatasetNode(std::move(cache)), dataset_files_(clue_files), task_(task), usage_(usage), diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.h index 8e2eb8aff8..eba34dfab3 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.h @@ -21,14 +21,14 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { /// \class CLUENode /// \brief A Dataset derived class to represent CLUE dataset -class CLUENode : public Dataset { +class CLUENode : public DatasetNode { public: /// \brief Constructor CLUENode(const std::vector dataset_files, std::string task, std::string usage, int64_t num_samples, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.cc index 0b447d9960..3e0729f146 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.cc @@ -30,7 +30,7 @@ namespace api { // Constructor for CocoNode CocoNode::CocoNode(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task, const bool &decode, const std::shared_ptr &sampler, std::shared_ptr cache) - : Dataset(std::move(cache)), + : DatasetNode(std::move(cache)), dataset_dir_(dataset_dir), annotation_file_(annotation_file), task_(task), diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.h index 50b621ef01..2593534509 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.h @@ -21,12 +21,12 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class CocoNode : public Dataset { +class CocoNode : public DatasetNode { public: /// \brief Constructor CocoNode(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc index 20f3e3701a..1909b1cfe6 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc @@ -33,7 +33,7 @@ CSVNode::CSVNode(const std::vector &csv_files, char field_delim, const std::vector> &column_defaults, const std::vector &column_names, int64_t num_samples, ShuffleMode shuffle, int32_t num_shards, int32_t shard_id, std::shared_ptr cache) - : Dataset(std::move(cache)), + : 
DatasetNode(std::move(cache)), dataset_files_(csv_files), field_delim_(field_delim), column_defaults_(column_defaults), diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.h index ca673c0ee5..9828d5d03f 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.h @@ -21,7 +21,7 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { @@ -47,7 +47,7 @@ class CsvRecord : public CsvBase { T value; }; -class CSVNode : public Dataset { +class CSVNode : public DatasetNode { public: /// \brief Constructor CSVNode(const std::vector &dataset_files, char field_delim, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/generator_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/generator_node.h index ae52020289..6237346f11 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/generator_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/generator_node.h @@ -21,7 +21,7 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" #include "minddata/dataset/util/status.h" namespace mindspore { @@ -31,7 +31,7 @@ namespace api { /// \class GeneratorNode /// \brief A Dataset derived class to represent GeneratorNode dataset -class GeneratorNode : public Dataset { +class GeneratorNode : public DatasetNode { public: /// \brief Constructor GeneratorNode(py::function generator_function, const std::vector &column_names, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.cc index 338308a0c8..5c4159435d 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.cc @@ -40,7 +40,7 @@ ImageFolderNode::ImageFolderNode(std::string dataset_dir, bool decode, std::shar recursive_(recursive), class_indexing_(class_indexing), exts_(extensions), - Dataset(std::move(cache)) {} + DatasetNode(std::move(cache)) {} Status ImageFolderNode::ValidateParams() { RETURN_IF_NOT_OK(ValidateDatasetDirParam("ImageFolderNode", dataset_dir_)); diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.h index fe4e4cd13d..7922eda8bd 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.h @@ -24,7 +24,7 @@ #include #include "mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache.h" -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { @@ -33,7 +33,7 @@ namespace api { /// \class ImageFolderNode /// \brief A Dataset derived class to represent ImageFolder dataset -class ImageFolderNode : public Dataset { +class ImageFolderNode : public DatasetNode { public: /// \brief Constructor ImageFolderNode(std::string dataset_dir, bool decode, std::shared_ptr sampler, bool recursive, diff --git 
a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.cc index 41141b0ce5..cbb01a9cf2 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.cc @@ -32,7 +32,7 @@ ManifestNode::ManifestNode(const std::string &dataset_file, const std::string &u const std::shared_ptr &sampler, const std::map &class_indexing, bool decode, std::shared_ptr cache) - : Dataset(std::move(cache)), + : DatasetNode(std::move(cache)), dataset_file_(dataset_file), usage_(usage), decode_(decode), diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.h index 99d7ef435b..b8da1555d5 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.h @@ -22,12 +22,12 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class ManifestNode : public Dataset { +class ManifestNode : public DatasetNode { public: /// \brief Constructor ManifestNode(const std::string &dataset_file, const std::string &usage, const std::shared_ptr &sampler, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.h index 1663f14762..ea10456bcf 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/minddata_node.h @@ -22,12 +22,12 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class MindDataNode : public Dataset { +class MindDataNode : public DatasetNode { public: /// \brief Constructor MindDataNode(const std::vector &dataset_files, const std::vector &columns_list, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc index 6d6e1fdee8..5feee17998 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc @@ -30,7 +30,7 @@ namespace api { MnistNode::MnistNode(std::string dataset_dir, std::string usage, std::shared_ptr sampler, std::shared_ptr cache) - : Dataset(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {} + : DatasetNode(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {} Status MnistNode::ValidateParams() { RETURN_IF_NOT_OK(ValidateDatasetDirParam("MnistNode", dataset_dir_)); diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h index 713ba94fdf..663e2ede97 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h @@ -21,13 +21,13 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include 
"minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class MnistNode : public Dataset { +class MnistNode : public DatasetNode { public: /// \brief Constructor MnistNode(std::string dataset_dir, std::string usage, std::shared_ptr sampler, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.h index 09e980a14c..79d995438a 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.h @@ -22,13 +22,13 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class RandomNode : public Dataset { +class RandomNode : public DatasetNode { public: // Some constants to provide limits to random generation. static constexpr int32_t kMaxNumColumns = 4; @@ -38,7 +38,7 @@ class RandomNode : public Dataset { /// \brief Constructor RandomNode(const int32_t &total_rows, std::shared_ptr schema, const std::vector &columns_list, const std::shared_ptr &sampler, std::shared_ptr cache) - : Dataset(std::move(cache)), + : DatasetNode(std::move(cache)), total_rows_(total_rows), schema_path_(""), schema_(std::move(schema)), @@ -48,7 +48,7 @@ class RandomNode : public Dataset { /// \brief Constructor RandomNode(const int32_t &total_rows, std::string schema_path, const std::vector &columns_list, const std::shared_ptr &sampler, std::shared_ptr cache) - : Dataset(std::move(cache)), + : DatasetNode(std::move(cache)), total_rows_(total_rows), schema_path_(schema_path), columns_list_(columns_list), diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.cc index 8af0388d4a..2d4841f9e7 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.cc @@ -31,7 +31,7 @@ namespace api { // Constructor for TextFileNode TextFileNode::TextFileNode(std::vector dataset_files, int32_t num_samples, ShuffleMode shuffle, int32_t num_shards, int32_t shard_id, std::shared_ptr cache) - : Dataset(std::move(cache)), + : DatasetNode(std::move(cache)), dataset_files_(dataset_files), num_samples_(num_samples), shuffle_(shuffle), diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.h index e5762f8b37..9011aa1603 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.h @@ -21,14 +21,14 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { /// \class TextFileNode /// \brief A Dataset derived class to represent TextFile dataset -class TextFileNode : public Dataset { +class TextFileNode : public DatasetNode { public: /// \brief Constructor TextFileNode(std::vector dataset_files, int32_t num_samples, ShuffleMode shuffle, int32_t num_shards, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc 
b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc index 486c481e19..7492ec1131 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc @@ -55,6 +55,53 @@ bool ValidateFirstRowCrc(const std::string &filename) { // Validator for TFRecordNode Status TFRecordNode::ValidateParams() { + if (dataset_files_.empty()) { + std::string err_msg = "TFRecordNode: dataset_files is not specified."; + MS_LOG(ERROR) << err_msg; + return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg); + } + + for (const auto &f : dataset_files_) { + Path dataset_file(f); + if (!dataset_file.Exists()) { + std::string err_msg = "TFRecordNode: dataset file: [" + f + "] is invalid or does not exist."; + MS_LOG(ERROR) << err_msg; + + return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg); + } + } + + if (num_samples_ < 0) { + std::string err_msg = "TFRecordNode: Invalid number of samples: " + std::to_string(num_samples_); + MS_LOG(ERROR) << err_msg; + + return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg); + } + + if (num_shards_ <= 0) { + std::string err_msg = "TFRecordNode: Invalid num_shards: " + std::to_string(num_shards_); + MS_LOG(ERROR) << err_msg; + + return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg); + } + + if (shard_id_ < 0 || shard_id_ >= num_shards_) { + std::string err_msg = "TFRecordNode: Invalid input, shard_id: " + std::to_string(shard_id_) + + ", num_shards: " + std::to_string(num_shards_); + MS_LOG(ERROR) << err_msg; + + return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg); + } + + if (cache_ == nullptr && !shard_equal_rows_ && dataset_files_.size() < num_shards_) { + // This check only makes sense in a non-cache path. 
We should make sure there is at least one file per + // shard in file-based sharding + std::string err_msg = + "TFRecordNode: Invalid number of dataset files, should at least be " + std::to_string(num_shards_); + MS_LOG(ERROR) << err_msg; + return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, err_msg); + } + std::vector invalid_files(dataset_files_.size()); auto it = std::copy_if(dataset_files_.begin(), dataset_files_.end(), invalid_files.begin(), [](const std::string &filename) { return !ValidateFirstRowCrc(filename); }); diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h index ebc493d09a..08e4d094c4 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h @@ -22,21 +22,21 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { /// \class TFRecordNode /// \brief A Dataset derived class to represent TFRecord dataset -class TFRecordNode : public Dataset { +class TFRecordNode : public DatasetNode { public: /// \brief Constructor /// \note Parameter 'schema' is the path to the schema file TFRecordNode(const std::vector &dataset_files, std::string schema, const std::vector &columns_list, int64_t num_samples, ShuffleMode shuffle, int32_t num_shards, int32_t shard_id, bool shard_equal_rows, std::shared_ptr cache) - : Dataset(std::move(cache)), + : DatasetNode(std::move(cache)), dataset_files_(dataset_files), schema_path_(schema), columns_list_(columns_list), @@ -51,7 +51,7 @@ class TFRecordNode : public Dataset { TFRecordNode(const std::vector &dataset_files, std::shared_ptr schema, const std::vector &columns_list, int64_t num_samples, ShuffleMode shuffle, int32_t num_shards, int32_t shard_id, bool shard_equal_rows, std::shared_ptr cache) - : Dataset(std::move(cache)), + : DatasetNode(std::move(cache)), dataset_files_(dataset_files), schema_obj_(schema), columns_list_(columns_list), diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.cc index f263ceb096..68ade8aa07 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.cc @@ -32,7 +32,7 @@ namespace api { VOCNode::VOCNode(const std::string &dataset_dir, const std::string &task, const std::string &usage, const std::map &class_indexing, bool decode, std::shared_ptr sampler, std::shared_ptr cache) - : Dataset(std::move(cache)), + : DatasetNode(std::move(cache)), dataset_dir_(dataset_dir), task_(task), usage_(usage), diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.h index a61b758fd6..ed3656397c 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.h @@ -22,12 +22,12 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class VOCNode : public Dataset { +class VOCNode : public DatasetNode { public: /// \brief 
Constructor VOCNode(const std::string &dataset_dir, const std::string &task, const std::string &usage, diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/sync_wait_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/sync_wait_node.cc index c4c83903dd..8642cdb79f 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/sync_wait_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/sync_wait_node.cc @@ -27,7 +27,7 @@ namespace mindspore { namespace dataset { namespace api { // Constructor for SyncWaitNode -SyncWaitNode::SyncWaitNode(std::shared_ptr child, const std::string &condition_name, int32_t num_batch, +SyncWaitNode::SyncWaitNode(std::shared_ptr child, const std::string &condition_name, int32_t num_batch, py::function callback) : condition_name_(condition_name), num_batch_(num_batch), callback_(callback) { this->children.push_back(child); diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/sync_wait_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/sync_wait_node.h index 3cd033b49c..b108e257af 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/sync_wait_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/sync_wait_node.h @@ -21,7 +21,7 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { @@ -30,10 +30,10 @@ namespace api { /// \class SyncWaitNode /// \brief A Dataset derived class to represent SyncWaitNode dataset -class SyncWaitNode : public Dataset { +class SyncWaitNode : public DatasetNode { public: /// \brief Constructor - explicit SyncWaitNode(std::shared_ptr child, const std::string &condition_name, int32_t num_batch, + explicit SyncWaitNode(std::shared_ptr child, const std::string &condition_name, int32_t num_batch, py::function callback); /// \brief Destructor diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.cc index b11bd7d345..9a3fed7b87 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.cc @@ -27,7 +27,7 @@ namespace mindspore { namespace dataset { namespace api { // Constructor for TakeNode -TakeNode::TakeNode(std::shared_ptr child, int32_t count) : take_count_(count) { +TakeNode::TakeNode(std::shared_ptr child, int32_t count) : take_count_(count) { this->children.push_back(child); } diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.h index 6706878be5..dfc7199384 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.h @@ -21,17 +21,17 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class TakeNode : public Dataset { +class TakeNode : public DatasetNode { public: /// \brief Constructor - explicit TakeNode(std::shared_ptr child, int32_t count); + explicit TakeNode(std::shared_ptr child, int32_t count); /// \brief Destructor ~TakeNode() = default; diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.cc index 
787b48df99..d566d30d3a 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.cc @@ -28,14 +28,8 @@ namespace dataset { namespace api { // Constructor for TransferNode -TransferNode::TransferNode(std::shared_ptr child, const std::string &queue_name, int32_t device_id, - const std::string &device_type, bool send_epoch_end) - : queue_name_(queue_name), - device_id_(device_id), - device_type_(device_type), - prefetch_size_(16), - send_epoch_end_(send_epoch_end), - total_batch_(0) { +TransferNode::TransferNode(std::shared_ptr child, bool send_epoch_end) + : prefetch_size_(16), send_epoch_end_(send_epoch_end), total_batch_(0) { this->children.push_back(child); } @@ -48,6 +42,15 @@ Status TransferNode::ValidateParams() { // Function to build TransferNode std::vector> TransferNode::Build() { + // Get a uuid for queue name + queue_name_ = Services::GetUniqueID(); + // TODO(CRC): + // Get device type from ms context + device_type_ = "CPU"; + // Get device ID from children + device_id_ = 0; + RETURN_EMPTY_IF_ERROR(TransferNode::get_distribution(shared_from_this(), &device_id_)); + // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; @@ -67,13 +70,13 @@ std::vector> TransferNode::Build() { } // Function to get the device_id -Status TransferNode::get_distribution(std::shared_ptr ds, int32_t *device_id) { +Status TransferNode::get_distribution(std::shared_ptr ds, int32_t *device_id) { // Get device id according to the type of dataset Status rc = ds->GetShardId(device_id); if (rc != Status::OK()) { // Get device id from the child node - if (ds->children.size()) { - ds = ds->children[0]; + if (ds->Children().size()) { + ds = ds->Children()[0]; return TransferNode::get_distribution(ds, device_id); } else { std::string err_msg = "Unknown dataset type."; diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.h index 000287155b..34f00800e5 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.h @@ -21,18 +21,17 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class TransferNode : public Dataset { +class TransferNode : public DatasetNode { public: /// \brief Constructor - TransferNode(std::shared_ptr child, const std::string &queue_name, int32_t device_id, - const std::string &device_type, bool send_epoch_end); + TransferNode(std::shared_ptr child, bool send_epoch_end); /// \brief Destructor ~TransferNode() = default; @@ -45,7 +44,7 @@ class TransferNode : public Dataset { /// \return Status Status::OK() if all the parameters are valid Status ValidateParams() override; - static Status get_distribution(std::shared_ptr ds, int32_t *device_id); + static Status get_distribution(std::shared_ptr ds, int32_t *device_id); private: std::string queue_name_; diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/zip_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/zip_node.cc index b34f385530..2099cd1035 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/zip_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/zip_node.cc @@ -27,7 +27,7 @@ namespace mindspore { 
namespace dataset { namespace api { -ZipNode::ZipNode(const std::vector> &datasets) : datasets_(datasets) { +ZipNode::ZipNode(const std::vector> &datasets) : datasets_(datasets) { for (auto dataset : datasets_) { this->children.push_back(dataset); } diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/zip_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/zip_node.h index 0073f66e92..f7046842a9 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/zip_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/zip_node.h @@ -21,16 +21,16 @@ #include #include -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class ZipNode : public Dataset { +class ZipNode : public DatasetNode { public: /// \brief Constructor - explicit ZipNode(const std::vector> &datasets); + explicit ZipNode(const std::vector> &datasets); /// \brief Destructor ~ZipNode() = default; @@ -44,7 +44,7 @@ class ZipNode : public Dataset { Status ValidateParams() override; private: - std::vector> datasets_; + std::vector> datasets_; }; } // namespace api diff --git a/mindspore/ccsrc/minddata/dataset/engine/python_runtime_context.cc b/mindspore/ccsrc/minddata/dataset/engine/python_runtime_context.cc new file mode 100644 index 0000000000..ae9ab4c275 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/python_runtime_context.cc @@ -0,0 +1,27 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "minddata/dataset/engine/python_runtime_context.h" +#include "pybind11/pybind11.h" + +namespace mindspore::dataset { + +Status PythonRuntimeContext::Terminate() { + // Release GIL before joining all threads + py::gil_scoped_release gil_release; + return tree_consumer_->Terminate(); +} +} // namespace mindspore::dataset diff --git a/mindspore/ccsrc/minddata/dataset/engine/python_runtime_context.h b/mindspore/ccsrc/minddata/dataset/engine/python_runtime_context.h new file mode 100644 index 0000000000..63353b7efd --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/engine/python_runtime_context.h @@ -0,0 +1,48 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PYTHON_RUNTIME_CONTEXT_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PYTHON_RUNTIME_CONTEXT_H_ + +#include +#include +#include "minddata/dataset/core/client.h" +#include "minddata/dataset/engine/consumers/tree_consumer.h" +#include "minddata/dataset/engine/consumers/python_tree_consumer.h" +#include "minddata/dataset/engine/runtime_context.h" + +namespace mindspore::dataset { +class RuntimeContext; + +/// Class that represents a single runtime instance which can consume data from a data pipeline +class PythonRuntimeContext : public RuntimeContext { + public: + /// Method to terminate the runtime; this will not release the resources + /// \return Status error code + Status Terminate() override; + + ~PythonRuntimeContext() { + Terminate(); + { + py::gil_scoped_acquire gil_acquire; + tree_consumer_.reset(); + } + } + + PythonIteratorConsumer *GetPythonConsumer() { return dynamic_cast(tree_consumer_.get()); } +}; + +} // namespace mindspore::dataset +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PYTHON_RUNTIME_CONTEXT_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/runtime_context.cc b/mindspore/ccsrc/minddata/dataset/engine/runtime_context.cc index e82d6a1bd9..1d15495c7b 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/runtime_context.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/runtime_context.cc @@ -19,7 +19,7 @@ #include namespace mindspore::dataset { -void RuntimeContext::AssignConsumer(std::unique_ptr tree_consumer) { +void RuntimeContext::AssignConsumer(std::shared_ptr tree_consumer) { tree_consumer_ = std::move(tree_consumer); } } // namespace mindspore::dataset diff --git a/mindspore/ccsrc/minddata/dataset/engine/runtime_context.h b/mindspore/ccsrc/minddata/dataset/engine/runtime_context.h index 855acc3fc0..a2f3e17b47 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/runtime_context.h +++ b/mindspore/ccsrc/minddata/dataset/engine/runtime_context.h @@ -40,14 +40,16 @@ class RuntimeContext { /// Set the tree consumer /// \param tree_consumer to be assigned - void AssignConsumer(std::unique_ptr tree_consumer); + void AssignConsumer(std::shared_ptr tree_consumer); /// Get the tree consumer /// \return Raw pointer to the tree consumer. TreeConsumer *GetConsumer() { return tree_consumer_.get(); } - private: - std::unique_ptr tree_consumer_; + ~RuntimeContext() { Terminate(); } + + protected: + std::shared_ptr tree_consumer_; }; } // namespace mindspore::dataset diff --git a/mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc b/mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc index c891dc1f25..07d01ceec1 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc @@ -22,7 +22,7 @@ namespace mindspore { namespace dataset { -Status TreeAdapter::BuildAndPrepare(std::shared_ptr root_ir, int32_t num_epoch) { +Status TreeAdapter::BuildAndPrepare(std::shared_ptr root_ir, int32_t num_epoch) { // Check whether this function has been called before.
If so, return failure CHECK_FAIL_RETURN_UNEXPECTED(tree_ == nullptr, "ExecutionTree is already built."); RETURN_UNEXPECTED_IF_NULL(root_ir); @@ -65,7 +65,7 @@ Status TreeAdapter::GetNext(TensorRow *row) { return Status::OK(); } -Status TreeAdapter::DFSBuildTree(std::shared_ptr ir, std::shared_ptr *op) { +Status TreeAdapter::DFSBuildTree(std::shared_ptr ir, std::shared_ptr *op) { // validate the op can be built first before building the DatasetOp RETURN_IF_NOT_OK(ir->ValidateParams()); std::vector> ops = ir->Build(); @@ -80,7 +80,7 @@ Status TreeAdapter::DFSBuildTree(std::shared_ptr ir, std::shared_p } // Build the children of ir, once they return, add the return value to *op - for (std::shared_ptr child_ir : ir->children) { + for (const auto &child_ir : ir->Children()) { std::shared_ptr child_op; RETURN_IF_NOT_OK(DFSBuildTree(child_ir, &child_op)); RETURN_IF_NOT_OK(ops.back()->AddChild(child_op)); // append children to the last of ops diff --git a/mindspore/ccsrc/minddata/dataset/engine/tree_adapter.h b/mindspore/ccsrc/minddata/dataset/engine/tree_adapter.h index d13c47c413..a1f8201cee 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/tree_adapter.h +++ b/mindspore/ccsrc/minddata/dataset/engine/tree_adapter.h @@ -24,12 +24,12 @@ #include #include "minddata/dataset/engine/execution_tree.h" -#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" namespace mindspore { namespace dataset { namespace api { -class Dataset; +class DatasetNode; } class TreeAdapter { public: @@ -40,7 +40,7 @@ class TreeAdapter { // This will construct an ExeTree from a Dataset root and Prepare() the ExeTree // This function is only meant to be called once and needs to be called before GetNext // ExeTree will be launched when the first GetNext is called - Status BuildAndPrepare(std::shared_ptr root, int32_t num_epoch = -1); + Status BuildAndPrepare(std::shared_ptr root, int32_t num_epoch = -1); // This is the main method TreeConsumer uses to interact with TreeAdapter // 1. GetNext will Launch() the ExeTree on its first call by iterator (tree is already prepared) @@ -62,7 +62,7 @@ class TreeAdapter { private: // This RECURSIVE function converts IR nodes into DatasetOp in ExecutionTree. IR could build a vector of ops. In // such case, the first node is returned. Op is added as child when the current function returns. 
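The recursion described in the comment above can be condensed into the following sketch. It abbreviates the real body (tree registration and the exact error messages are omitted) but shows how one IR node may expand into several DatasetOps, with the first op handed back to the caller and the children attached under the last one.

// Condensed sketch of the IR-to-ExecutionTree conversion; not the verbatim implementation.
Status TreeAdapter::DFSBuildTree(std::shared_ptr<DatasetNode> ir, std::shared_ptr<DatasetOp> *op) {
  RETURN_IF_NOT_OK(ir->ValidateParams());                      // refuse to build an invalid IR node
  std::vector<std::shared_ptr<DatasetOp>> ops = ir->Build();   // one IR node may produce several ops
  CHECK_FAIL_RETURN_UNEXPECTED(!ops.empty(), "Build() returned no DatasetOp.");
  *op = ops.front();                                           // the parent links to the first op
  for (size_t i = 1; i < ops.size(); i++) {                    // chain the ops this node produced
    RETURN_IF_NOT_OK(ops[i - 1]->AddChild(ops[i]));
  }
  for (const auto &child_ir : ir->Children()) {                // recurse into the IR children
    std::shared_ptr<DatasetOp> child_op;
    RETURN_IF_NOT_OK(DFSBuildTree(child_ir, &child_op));
    RETURN_IF_NOT_OK(ops.back()->AddChild(child_op));          // children hang off the last op
  }
  return Status::OK();
}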
- Status DFSBuildTree(std::shared_ptr ir, std::shared_ptr *op); + Status DFSBuildTree(std::shared_ptr ir, std::shared_ptr *op); std::unique_ptr cur_db_; std::unordered_map column_name_map_; diff --git a/mindspore/ccsrc/minddata/dataset/include/datasets.h b/mindspore/ccsrc/minddata/dataset/include/datasets.h index 2e56ded0de..1cb9426bb0 100644 --- a/mindspore/ccsrc/minddata/dataset/include/datasets.h +++ b/mindspore/ccsrc/minddata/dataset/include/datasets.h @@ -26,10 +26,12 @@ #include #include #include -#include "mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache.h" +#include "minddata/dataset/engine/ir/cache/dataset_cache.h" + #include "minddata/dataset/core/constants.h" #include "minddata/dataset/engine/consumers/tree_consumer.h" #include "minddata/dataset/engine/data_schema.h" +#include "minddata/dataset/engine/ir/datasetops/dataset_node.h" #include "minddata/dataset/include/iterator.h" #include "minddata/dataset/include/samplers.h" #include "minddata/dataset/include/tensor.h" @@ -45,7 +47,6 @@ namespace mindspore { namespace dataset { -// Forward declare class DatasetOp; class DataSchema; class Tensor; @@ -57,6 +58,8 @@ class Vocab; #endif namespace api { +// Forward declare +class DatasetNode; class Dataset; class Iterator; @@ -64,630 +67,109 @@ class TensorOperation; class SchemaObj; class SamplerObj; // Datasets classes (in alphabetical order) -class AlbumNode; -class CelebANode; -class Cifar10Node; -class Cifar100Node; -class CLUENode; -class CocoNode; -class CSVNode; class CsvBase; -class ImageFolderNode; -class BatchNode; -#ifndef ENABLE_ANDROID -class ManifestNode; -class MindDataNode; -#endif -class MnistNode; -class RandomNode; -class TextFileNode; -#ifndef ENABLE_ANDROID -class TFRecordNode; -class VOCNode; -#endif +class BatchDataset; // Dataset Op classes (in alphabetical order) #ifndef ENABLE_ANDROID -class BucketBatchByLengthNode; +class BucketBatchByLengthDataset; #endif -class ConcatNode; -class MapNode; -class ProjectNode; -class RenameNode; -class RepeatNode; -class ShuffleNode; -class SkipNode; -class TakeNode; -class TransferNode; -class ZipNode; - -#define RETURN_EMPTY_IF_ERROR(_s) \ - do { \ - Status __rc = (_s); \ - if (__rc.IsError()) { \ - MS_LOG(ERROR) << __rc; \ - return {}; \ - } \ - } while (false) - -Status AddShuffleOp(int64_t num_files, int64_t num_devices, int64_t num_rows, int64_t total_rows, - int32_t connector_que_size, int32_t rows_per_buffer, std::shared_ptr *shuffle_op); - -// Helper function to validate dataset files parameter -Status ValidateDatasetFilesParam(const std::string &dataset_name, const std::vector &dataset_files); - -// Helper function to validate dataset num_shards and shard_id parameters -Status ValidateDatasetShardParams(const std::string &dataset_name, int32_t num_shards, int32_t shard_id); - -// Helper function to validate dataset sampler parameter -Status ValidateDatasetSampler(const std::string &dataset_name, const std::shared_ptr &sampler); - -Status ValidateStringValue(const std::string &dataset_name, const std::string &str, - const std::unordered_set &valid_strings); - -// Helper function to validate dataset input/output column parameterCD - -Status ValidateDatasetColumnParam(const std::string &dataset_name, const std::string &column_param, - const std::vector &columns); - -// Helper function to validate dataset directory parameter -Status ValidateDatasetDirParam(const std::string &dataset_name, std::string dataset_dir); - -/// \brief Function to create a SchemaObj -/// \param[in] schema_file Path of schema file 
-/// \return Shared pointer to the current schema -std::shared_ptr Schema(const std::string &schema_file = ""); +class ConcatDataset; +class MapDataset; +class ProjectDataset; +class RenameDataset; +class RepeatDataset; +class ShuffleDataset; +class SkipDataset; +class TakeDataset; +class TransferDataset; +class ZipDataset; -/// \brief Function to create an AlbumNode -/// \notes The generated dataset is specified through setting a schema -/// \param[in] dataset_dir Path to the root directory that contains the dataset -/// \param[in] data_schema Path to dataset schema file -/// \param[in] column_names Column names used to specify columns to load, if empty, will read all columns. -/// (default = {}) -/// \param[in] decode the option to decode the images in dataset (default = false) -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, -/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) -/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). -/// The cache feature is under development and is not recommended. -/// \return Shared pointer to the current Dataset -std::shared_ptr Album(const std::string &dataset_dir, const std::string &data_schema, - const std::vector &column_names = {}, bool decode = false, - const std::shared_ptr &sampler = RandomSampler(), - const std::shared_ptr &cache = nullptr); +/// \class Dataset datasets.h +/// \brief A base class to represent a dataset in the data pipeline. +class Dataset : public std::enable_shared_from_this { + public: + // need friend class so they can access the children_ field + friend class Iterator; + friend class TransferNode; + friend class mindspore::dataset::TreeAdapter; -/// \brief Function to create a CelebANode -/// \notes The generated dataset has two columns ['image', 'attr']. -/// The type of the image tensor is uint8. The attr tensor is uint32 and one hot type. -/// \param[in] dataset_dir Path to the root directory that contains the dataset. -/// \param[in] usage One of "all", "train", "valid" or "test" (default = "all"). -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, -/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) -/// \param[in] decode Decode the images after reading (default=false). -/// \param[in] extensions Set of file extensions to be included in the dataset (default={}). -/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). -/// The cache feature is under development and is not recommended. -/// \return Shared pointer to the current Dataset -std::shared_ptr CelebA(const std::string &dataset_dir, const std::string &usage = "all", - const std::shared_ptr &sampler = RandomSampler(), bool decode = false, - const std::set &extensions = {}, - const std::shared_ptr &cache = nullptr); + /// \brief Constructor + Dataset(); -/// \brief Function to create a Cifar10 Dataset -/// \notes The generated dataset has two columns ["image", "label"] -/// \param[in] dataset_dir Path to the root directory that contains the dataset -/// \param[in] usage of CIFAR10, can be "train", "test" or "all" (default = "all"). -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, -/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) -/// \param[in] cache Tensor cache to use. 
(default=nullptr which means no cache is used). -/// The cache feature is under development and is not recommended. -/// \return Shared pointer to the current Dataset -std::shared_ptr Cifar10(const std::string &dataset_dir, const std::string &usage = "all", - const std::shared_ptr &sampler = RandomSampler(), - const std::shared_ptr &cache = nullptr); + /// \brief Destructor + ~Dataset() = default; -/// \brief Function to create a Cifar100 Dataset -/// \notes The generated dataset has three columns ["image", "coarse_label", "fine_label"] -/// \param[in] dataset_dir Path to the root directory that contains the dataset -/// \param[in] usage of CIFAR100, can be "train", "test" or "all" (default = "all"). -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, -/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) -/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). -/// The cache feature is under development and is not recommended. -/// \return Shared pointer to the current Dataset -std::shared_ptr Cifar100(const std::string &dataset_dir, const std::string &usage = "all", - const std::shared_ptr &sampler = RandomSampler(), - const std::shared_ptr &cache = nullptr); + /// \brief Gets the dataset size + /// \return dataset size. If failed, return -1 + int64_t GetDatasetSize(); -/// \brief Function to create a CLUENode -/// \notes The generated dataset has a variable number of columns depending on the task and usage -/// \param[in] dataset_files List of files to be read to search for a pattern of files. The list -/// will be sorted in a lexicographical order. -/// \param[in] task The kind of task, one of "AFQMC", "TNEWS", "IFLYTEK", "CMNLI", "WSC" and "CSL" (default="AFQMC"). -/// \param[in] usage Be used to "train", "test" or "eval" data (default="train"). -/// \param[in] num_samples The number of samples to be included in the dataset. -/// (Default = 0 means all samples.) -/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode.kGlobal) -/// Can be any of: -/// ShuffleMode::kFalse - No shuffling is performed. -/// ShuffleMode::kFiles - Shuffle files only. -/// ShuffleMode::kGlobal - Shuffle both the files and samples. -/// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1) -/// \param[in] shard_id The shard ID within num_shards. This argument should be -/// specified only when num_shards is also specified. (Default = 0) -/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). -/// The cache feature is under development and is not recommended. -/// \return Shared pointer to the current CLUENode -std::shared_ptr CLUE(const std::vector &dataset_files, const std::string &task = "AFQMC", - const std::string &usage = "train", int64_t num_samples = 0, - ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, int32_t shard_id = 0, - const std::shared_ptr &cache = nullptr); + /// \brief Gets the output type + /// \return a vector of DataType. If failed, return an empty vector + std::vector GetOutputTypes(); -/// \brief Function to create a CocoNode -/// \notes The generated dataset has multi-columns : -/// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32], -/// ['iscrowd', dtype=uint32]]. -/// - task='Stuff', column: [['image', dtype=uint8], ['segmentation',dtype=float32], ['iscrowd', dtype=uint32]]. 
-/// - task='Keypoint', column: [['image', dtype=uint8], ['keypoints', dtype=float32], -/// ['num_keypoints', dtype=uint32]]. -/// - task='Panoptic', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32], -/// ['iscrowd', dtype=uint32], ['area', dtype=uitn32]]. -/// \param[in] dataset_dir Path to the root directory that contains the dataset -/// \param[in] annotation_file Path to the annotation json -/// \param[in] task Set the task type of reading coco data, now support 'Detection'/'Stuff'/'Panoptic'/'Keypoint' -/// \param[in] decode Decode the images after reading -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, -/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) -/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). -/// The cache feature is under development and is not recommended. -/// \return Shared pointer to the current Dataset -std::shared_ptr Coco(const std::string &dataset_dir, const std::string &annotation_file, - const std::string &task = "Detection", const bool &decode = false, - const std::shared_ptr &sampler = RandomSampler(), - const std::shared_ptr &cache = nullptr); + /// \brief Gets the output shape + /// \return a vector of TensorShape. If failed, return an empty vector + std::vector GetOutputShapes(); -/// \brief Function to create a CSVNode -/// \notes The generated dataset has a variable number of columns -/// \param[in] dataset_files List of files to be read to search for a pattern of files. The list -/// will be sorted in a lexicographical order. -/// \param[in] field_delim A char that indicates the delimiter to separate fields (default=','). -/// \param[in] column_defaults List of default values for the CSV field (default={}). Each item in the list is -/// either a valid type (float, int, or string). If this is not provided, treats all columns as string type. -/// \param[in] column_names List of column names of the dataset (default={}). If this is not provided, infers the -/// column_names from the first row of CSV file. -/// \param[in] num_samples The number of samples to be included in the dataset. -/// (Default = 0 means all samples.) -/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode::kGlobal) -/// Can be any of: -/// ShuffleMode::kFalse - No shuffling is performed. -/// ShuffleMode::kFiles - Shuffle files only. -/// ShuffleMode::kGlobal - Shuffle both the files and samples. -/// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1) -/// \param[in] shard_id The shard ID within num_shards. This argument should be -/// specified only when num_shards is also specified. (Default = 0) -/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). -/// The cache feature is under development and is not recommended.
-/// \return Shared pointer to the current Dataset -std::shared_ptr CSV(const std::vector &dataset_files, char field_delim = ',', - const std::vector> &column_defaults = {}, - const std::vector &column_names = {}, int64_t num_samples = 0, - ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, int32_t shard_id = 0, - const std::shared_ptr &cache = nullptr); + /// \brief Gets the batch size + /// \return int64_t + int64_t GetBatchSize(); -/// \brief Function to create an ImageFolderNode -/// \notes A source dataset that reads images from a tree of directories -/// All images within one folder have the same label -/// The generated dataset has two columns ["image", "label"] -/// \param[in] dataset_dir Path to the root directory that contains the dataset -/// \param[in] decode A flag to decode in ImageFolder -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, -/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) -/// \param[in] extensions File extensions to be read -/// \param[in] class_indexing a class name to label map -/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). -/// The cache feature is under development and is not recommended. -/// \return Shared pointer to the current ImageFolderNode -std::shared_ptr ImageFolder(const std::string &dataset_dir, bool decode = false, - const std::shared_ptr &sampler = RandomSampler(), - const std::set &extensions = {}, - const std::map &class_indexing = {}, - const std::shared_ptr &cache = nullptr); + /// \brief Gets the repeat count + /// \return int64_t + int64_t GetRepeatCount(); -#ifndef ENABLE_ANDROID -/// \brief Function to create a ManifestNode -/// \notes The generated dataset has two columns ["image", "label"] -/// \param[in] dataset_file The dataset file to be read -/// \param[in] usage Need "train", "eval" or "inference" data (default="train") -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, -/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) -/// \param[in] class_indexing A str-to-int mapping from label name to index (default={}, the folder -/// names will be sorted alphabetically and each class will be given a unique index starting from 0). -/// \param[in] decode Decode the images after reading (default=false). -/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). -/// The cache feature is under development and is not recommended. -/// \return Shared pointer to the current ManifestNode -std::shared_ptr Manifest(const std::string &dataset_file, const std::string &usage = "train", - const std::shared_ptr &sampler = RandomSampler(), - const std::map &class_indexing = {}, bool decode = false, - const std::shared_ptr &cache = nullptr); -#endif + /// \brief Gets the number of classes + /// \return number of classes. If failed, return -1 + int64_t GetNumClasses(); -#ifndef ENABLE_ANDROID -/// \brief Function to create a MindDataNode -/// \param[in] dataset_file File name of one component of a mindrecord source. Other files with identical source -/// in the same path will be found and loaded automatically. -/// \param[in] columns_list List of columns to be read (default={}) -/// \param[in] sampler Object used to choose samples from the dataset.
If sampler is not given, -/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()), -/// supported sampler list: SubsetRandomSampler, PkSampler, RandomSampler, SequentialSampler, DistributedSampler. -/// \param[in] padded_sample Samples will be appended to dataset, where keys are the same as column_list. -/// \param[in] num_padded Number of padding samples. Dataset size plus num_padded should be divisible by num_shards. -/// \return Shared pointer to the current MindDataNode -std::shared_ptr MindData(const std::string &dataset_file, - const std::vector &columns_list = {}, - const std::shared_ptr &sampler = RandomSampler(), - nlohmann::json padded_sample = nullptr, int64_t num_padded = 0); + /// \brief Setter function for runtime number of workers + /// \param[in] num_workers The number of threads in this operator + /// \return Shared pointer to the original object + std::shared_ptr SetNumWorkers(int32_t num_workers); -/// \brief Function to create a MindDataNode -/// \param[in] dataset_files List of dataset files to be read directly. -/// \param[in] columns_list List of columns to be read (default={}) -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, -/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()), -/// supported sampler list: SubsetRandomSampler, PkSampler, RandomSampler, SequentialSampler, DistributedSampler. -/// \param[in] padded_sample Samples will be appended to dataset, where keys are the same as column_list. -/// \param[in] num_padded Number of padding samples. Dataset size plus num_padded should be divisible by num_shards. -/// \return Shared pointer to the current MindDataNode -std::shared_ptr MindData(const std::vector &dataset_files, - const std::vector &columns_list = {}, - const std::shared_ptr &sampler = RandomSampler(), - nlohmann::json padded_sample = nullptr, int64_t num_padded = 0); -#endif + /// \brief Function to create an Iterator over the Dataset pipeline + /// \param[in] columns List of columns to be used to specify the order of columns + /// \return Shared pointer to the Iterator + std::shared_ptr CreateIterator(std::vector columns = {}); -/// \brief Function to create a MnistNode -/// \notes The generated dataset has two columns ["image", "label"] -/// \param[in] dataset_dir Path to the root directory that contains the dataset -/// \param[in] usage of MNIST, can be "train", "test" or "all" (default = "all"). -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, -/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) -/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). -/// The cache feature is under development and is not recommended. -/// \return Shared pointer to the current MnistNode -std::shared_ptr Mnist(const std::string &dataset_dir, const std::string &usage = "all", - const std::shared_ptr &sampler = RandomSampler(), - const std::shared_ptr &cache = nullptr); + /// \brief Function to transfer data through a device. + /// \notes If device is Ascend, features of data will be transferred one by one. The limitation + /// of data transmission per time is 256M. + /// \param[in] send_epoch_end Whether to send end of sequence to device or not (default=True). + /// \return Returns true if no error encountered else false. 
+ bool DeviceQueue(bool send_epoch_end = true); -/// \brief Function to create a ConcatNode -/// \notes Reload "+" operator to concat two datasets -/// \param[in] datasets1 Shared pointer to the first dataset to be concatenated -/// \param[in] datasets2 Shared pointer to the second dataset to be concatenated -/// \return Shared pointer to the current ConcatNode -std::shared_ptr operator+(const std::shared_ptr &datasets1, - const std::shared_ptr &datasets2); +#ifndef ENABLE_ANDROID + /// \brief Function to create a Saver to save the dynamic data processed by the dataset pipeline + /// \note Usage restrictions: + /// 1. Supported dataset formats: 'mindrecord' only + /// 2. To save the samples in order, set dataset's shuffle to false and num_files to 1. + /// 3. Before calling the function, do not use batch operator, repeat operator or data augmentation operators + /// with random attribute in map operator. + /// 4. Mindrecord does not support bool, uint64, multi-dimensional uint8(drop dimension) nor + /// multi-dimensional string. + /// \param[in] dataset_path Path to dataset file + /// \param[in] num_files Number of dataset files (default=1) + /// \param[in] dataset_type Dataset format (default="mindrecord") + /// \return Returns true if no error encountered else false + bool Save(std::string dataset_path, int32_t num_files = 1, std::string dataset_type = "mindrecord"); +#endif -/// \brief Function to create a RandomNode -/// \param[in] total_rows Number of rows for the dataset to generate (default=0, number of rows is random) -/// \param[in] schema SchemaObj to set column type, data type and data shape -/// \param[in] columns_list List of columns to be read (default={}, read all columns) -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, -/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) -/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). -/// The cache feature is under development and is not recommended.
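The Save() restrictions listed above are easiest to see in a short usage sketch. This is an illustrative example rather than part of the patch; the dataset directory and output path are placeholders, and it assumes datasets.h is reachable on the include path.

#include <memory>
#include "minddata/dataset/include/datasets.h"

namespace api = mindspore::dataset::api;

bool SaveCifar10AsMindRecord() {
  // Build a plain source pipeline: no Batch/Repeat and no random Map ops,
  // so the rows can be written out directly (restriction 3 above).
  std::shared_ptr<api::Dataset> ds = api::Cifar10("/path/to/cifar-10-batches-bin", "all");
  // One output file in the default "mindrecord" format (restrictions 1 and 2).
  return ds->Save("/path/to/cifar10.mindrecord", 1, "mindrecord");
}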
-/// \return Shared pointer to the current Dataset -template > -std::shared_ptr RandomData(const int32_t &total_rows = 0, const T &schema = nullptr, - const std::vector &columns_list = {}, - const std::shared_ptr &sampler = RandomSampler(), - const std::shared_ptr &cache = nullptr) { - if (total_rows < 0) { - MS_LOG(ERROR) << "RandomNode: total_rows must be greater than or equal 0, now get " << total_rows; - return nullptr; - } - if (sampler == nullptr) { - MS_LOG(ERROR) << "RandomNode: Sampler is not constructed correctly, sampler: nullptr"; - return nullptr; - } - if (!columns_list.empty()) { - for (uint32_t i = 0; i < columns_list.size(); ++i) { - if (columns_list[i].empty()) { - MS_LOG(ERROR) << "RandomNode:columns_list" - << "[" << i << "] should not be empty"; - return nullptr; - } - } - std::set columns_set(columns_list.begin(), columns_list.end()); - if (columns_set.size() != columns_list.size()) { - MS_LOG(ERROR) << "RandomNode:columns_list: Every column name should not be same with others"; - return nullptr; - } - } - std::shared_ptr ds; - if constexpr (std::is_same::value || std::is_same>::value) { - std::shared_ptr schema_obj = schema; - ds = std::make_shared(total_rows, std::move(schema_obj), std::move(columns_list), std::move(sampler), - cache); - } else { - ds = - std::make_shared(total_rows, std::move(schema), std::move(columns_list), std::move(sampler), cache); + /// \brief Function to create a BatchDataset + /// \notes Combines batch_size number of consecutive rows into batches + /// \param[in] batch_size The number of rows each batch is created with + /// \param[in] drop_remainder Determines whether or not to drop the last possibly incomplete + /// batch. If true, and if there are less than batch_size rows + /// available to make the last batch, then those rows will + /// be dropped and not propagated to the next node + /// \return Shared pointer to the current BatchDataset + std::shared_ptr Batch(int32_t batch_size, bool drop_remainder = false) { + return std::make_shared(shared_from_this(), batch_size, drop_remainder); } - return ds; -} - -/// \brief Function to create a TextFileNode -/// \notes The generated dataset has one column ['text'] -/// \param[in] dataset_files List of files to be read to search for a pattern of files. The list -/// will be sorted in a lexicographical order. -/// \param[in] num_samples The number of samples to be included in the dataset. -/// (Default = 0 means all samples.) -/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode.kGlobal) -/// Can be any of: -/// ShuffleMode.kFalse - No shuffling is performed. -/// ShuffleMode.kFiles - Shuffle files only. -/// ShuffleMode.kGlobal - Shuffle both the files and samples. -/// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1) -/// \param[in] shard_id The shard ID within num_shards. This argument should be -/// specified only when num_shards is also specified. (Default = 0) -/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). -/// The cache feature is under development and is not recommended. 
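As a rough sketch of how the inline Batch() wrapper above is meant to be used. The dataset path is a placeholder, and the row map plus GetNextRow/Stop calls are assumed from the existing Iterator header pulled in by datasets.h; treat them as assumptions, not as the definitive consumer API.

#include <memory>
#include <string>
#include <unordered_map>
#include "minddata/dataset/include/datasets.h"

namespace api = mindspore::dataset::api;

void IterateInBatches() {
  auto ds = api::Mnist("/path/to/mnist", "train");
  // Combine 32 consecutive rows per batch and drop the last incomplete batch.
  std::shared_ptr<api::BatchDataset> batched = ds->Batch(32, true);

  auto iter = batched->CreateIterator();
  std::unordered_map<std::string, std::shared_ptr<mindspore::dataset::Tensor>> row;
  iter->GetNextRow(&row);
  while (!row.empty()) {
    // row["image"] and row["label"] now hold batched tensors.
    iter->GetNextRow(&row);
  }
  iter->Stop();
}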
-/// \return Shared pointer to the current TextFileNode -std::shared_ptr TextFile(const std::vector &dataset_files, int64_t num_samples = 0, - ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, - int32_t shard_id = 0, const std::shared_ptr &cache = nullptr); - -#ifndef ENABLE_ANDROID -/// \brief Function to create a TFRecordNode -/// \param[in] dataset_files List of files to be read to search for a pattern of files. The list -/// will be sorted in a lexicographical order. -/// \param[in] schema SchemaObj or string to schema path. (Default = nullptr, which means that the -/// meta data from the TFData file is considered the schema.) -/// \param[in] columns_list List of columns to be read. (Default = {}, read all columns) -/// \param[in] num_samples The number of samples to be included in the dataset. -/// (Default = 0 means all samples.) -/// If num_samples is 0 and numRows(parsed from schema) does not exist, read the full dataset; -/// If num_samples is 0 and numRows(parsed from schema) is greater than 0, read numRows rows; -/// If both num_samples and numRows(parsed from schema) are greater than 0, read num_samples rows. -/// \param[in] shuffle The mode for shuffling data every epoch. (Default = ShuffleMode::kGlobal) -/// Can be any of: -/// ShuffleMode::kFalse - No shuffling is performed. -/// ShuffleMode::kFiles - Shuffle files only. -/// ShuffleMode::kGlobal - Shuffle both the files and samples. -/// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1) -/// \param[in] shard_id The shard ID within num_shards. This argument should be specified only -/// when num_shards is also specified. (Default = 0) -/// \param[in] shard_equal_rows Get equal rows for all shards. (Default = False, number of rows of -/// each shard may be not equal) -/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). -/// The cache feature is under development and is not recommended. -/// \return Shared pointer to the current TFRecordNode -template > -std::shared_ptr TFRecord(const std::vector &dataset_files, const T &schema = nullptr, - const std::vector &columns_list = {}, int64_t num_samples = 0, - ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, - int32_t shard_id = 0, bool shard_equal_rows = false, - const std::shared_ptr &cache = nullptr) { - if (dataset_files.empty()) { - MS_LOG(ERROR) << "TFRecordNode: dataset_files is not specified."; - return nullptr; - } - - for (auto f : dataset_files) { - Path dataset_file(f); - if (!dataset_file.Exists()) { - MS_LOG(ERROR) << "TFRecordNode: dataset file: [" << f << "] is invalid or does not exist."; - return nullptr; - } - } - - if (num_samples < 0) { - MS_LOG(ERROR) << "TFRecordNode: Invalid number of samples: " << num_samples; - return nullptr; - } - - if (num_shards <= 0) { - MS_LOG(ERROR) << "TFRecordNode: Invalid num_shards: " << num_shards; - return nullptr; - } - - if (shard_id < 0 || shard_id >= num_shards) { - MS_LOG(ERROR) << "TFRecordNode: Invalid input, shard_id: " << shard_id << ", num_shards: " << num_shards; - return nullptr; - } - - if (cache == nullptr && !shard_equal_rows && dataset_files.size() < num_shards) { - // This check only makes sense in a non-cache path. 
We should make sure there is at least one file per - // shard in file-based sharding - MS_LOG(ERROR) << "TFRecordNode: Invalid number of dataset files, should at least be " << std::to_string(num_shards); - return nullptr; - } - - std::shared_ptr ds = nullptr; - if constexpr (std::is_same::value || std::is_same>::value) { - std::shared_ptr schema_obj = schema; - ds = std::make_shared(dataset_files, schema_obj, columns_list, num_samples, shuffle, num_shards, - shard_id, shard_equal_rows, cache); - } else { - std::string schema_path = schema; - if (!schema_path.empty()) { - Path schema_file(schema_path); - if (!schema_file.Exists()) { - MS_LOG(ERROR) << "TFRecordNode: schema path [" << schema_path << "] is invalid or does not exist."; - return nullptr; - } - } - ds = std::make_shared(dataset_files, schema_path, columns_list, num_samples, shuffle, num_shards, - shard_id, shard_equal_rows, cache); - } - return ds; -} - -/// \brief Function to create a VOCNode -/// \notes The generated dataset has multi-columns : -/// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['label', dtype=uint32], -/// ['difficult', dtype=uint32], ['truncate', dtype=uint32]]. -/// - task='Segmentation', column: [['image', dtype=uint8], ['target',dtype=uint8]]. -/// \param[in] dataset_dir Path to the root directory that contains the dataset -/// \param[in] task Set the task type of reading voc data, now only support "Segmentation" or "Detection" -/// \param[in] usage The type of data list text file to be read (default = "train"). -/// \param[in] class_indexing A str-to-int mapping from label name to index, only valid in "Detection" task -/// \param[in] decode Decode the images after reading -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, -/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) -/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). -/// The cache feature is under development and is not recommended. -/// \return Shared pointer to the current Dataset -std::shared_ptr VOC(const std::string &dataset_dir, const std::string &task = "Segmentation", - const std::string &usage = "train", - const std::map &class_indexing = {}, bool decode = false, - const std::shared_ptr &sampler = RandomSampler(), - const std::shared_ptr &cache = nullptr); - -/// \brief Function the create a cache to be attached to a dataset -/// \param id A user assigned session id for the current pipeline -/// \param mem_sz Size of the memory set aside for the row caching. 0 for unlimited -/// \param spill Spill to disk if out of memory -/// \param hostname optional host name -/// \param port optional port -/// \param num_connections optional number of connections -/// \param prefetch_sz optional prefetch size -/// \return Shared pointer to DatasetCache. If error, nullptr is returned. -std::shared_ptr CreateDatasetCache(session_id_type id, uint64_t mem_sz, bool spill, - std::optional hostname = std::nullopt, - std::optional port = std::nullopt, - std::optional num_connections = std::nullopt, - std::optional prefetch_sz = std::nullopt); -#endif - -/// \brief Function to create a sampler for non-mappable dataset (to be used by cache op later). -/// \notes Non-mappable dataset does not directly support a sampler. It has provided sampling arguments (shuffle, -/// num_samples, num_shards, shard_id) and it DOES support sampling if somewhere above it in the pipeline contains -/// a cache. 
If there is no cache above it, then the sampler is not used. -/// \param[in] num_samples The number of samples to be included in the dataset. -/// \param[in] shuffle If true, the indices are shuffled. -/// \param[in] num_shards Number of shards to divide the dataset into. -/// \param[in] shard_id Shard ID of the current shard within num_shards. -/// \return Shared pointer to the current Sampler. -std::shared_ptr SelectSampler(int64_t num_samples, bool shuffle, int32_t num_shards, int32_t shard_id); - -/// \brief Function to create a ZipNode -/// \notes Applies zip to the dataset -/// \param[in] datasets List of shared pointers to the datasets that we want to zip -/// \return Shared pointer to the current Dataset -std::shared_ptr Zip(const std::vector> &datasets); - -/// \class Dataset datasets.h -/// \brief A base class to represent a dataset in the data pipeline. -class Dataset : public std::enable_shared_from_this { - public: - // need friend class so they can access the children_ field - friend class Iterator; - friend class TransferNode; - friend class mindspore::dataset::TreeAdapter; - - /// \brief Constructor - Dataset(); - - /// \brief Constructor that initializes the cache - /// \param dataset_cache DatasetCache - explicit Dataset(const std::shared_ptr &dataset_cache); - - /// \brief Destructor - ~Dataset() = default; - - /// \brief Pure virtual function to convert a Dataset class into a runtime dataset object - /// \return The list of shared pointers to the newly created DatasetOps - virtual std::vector> Build() = 0; - - /// \brief Pure virtual function for derived class to implement parameters validation - /// \return Status Status::OK() if all the parameters are valid - virtual Status ValidateParams() = 0; - - /// \brief Pure virtual function for derived class to get the shard id of specific node - /// \return Status Status::OK() if get shard id successfully - virtual Status GetShardId(int32_t *shard_id) { - return Status(StatusCode::kNotImplementedYet, __LINE__, __FILE__, "Method is not implemented yet."); - } - - /// \brief Gets the dataset size - /// \return dataset size. If failed, return -1 - int64_t GetDatasetSize(); - - /// \brief Gets the output type - /// \return a vector of DataType. If failed, return an empty vector - std::vector GetOutputTypes(); - - /// \brief Gets the output shape - /// \return a vector of TensorShape. If failed, return am empty vector - std::vector GetOutputShapes(); - - /// \brief Gets the batch size - /// \return int64_t - int64_t GetBatchSize(); - - /// \brief Gets the the repeat count - /// \return int64_t - int64_t GetRepeatCount(); - - /// \brief Gets the number of classes - /// \return number of classes. 
If failed, return -1 - int64_t GetNumClasses(); - - /// \brief Setter function for runtime number of workers - /// \param[in] num_workers The number of threads in this operator - /// \return Shared pointer to the original object - std::shared_ptr SetNumWorkers(int32_t num_workers) { -#if !defined(_WIN32) && !defined(_WIN64) -#ifndef ENABLE_ANDROID - int32_t cpu_count = sysconf(_SC_NPROCESSORS_CONF); - if (cpu_count < 0 || cpu_count > INT32_MAX) { - MS_LOG(ERROR) << "Error determining current CPU: " << cpu_count; - return nullptr; - } - if (num_workers < 1 || num_workers > cpu_count) { - MS_LOG(ERROR) << "num_workers exceeds the boundary between 1 and " << cpu_count; - return nullptr; - } -#endif -#endif - num_workers_ = num_workers; - return shared_from_this(); - } - - /// \brief Function to create an Iterator over the Dataset pipeline - /// \param[in] columns List of columns to be used to specify the order of columns - /// \return Shared pointer to the Iterator - std::shared_ptr CreateIterator(std::vector columns = {}); - - /// \brief Function to transfer data through a device. - /// \notes If device is Ascend, features of data will be transferred one by one. The limitation - /// of data transmission per time is 256M. - /// \param[in] send_epoch_end Whether to send end of sequence to device or not (default=True). - /// \return Returns true if no error encountered else false. - bool DeviceQueue(bool send_epoch_end = true); - -#ifndef ENABLE_ANDROID - /// \brief Function to create a Saver to save the dynamic data processed by the dataset pipeline - /// \note Usage restrictions: - /// 1. Supported dataset formats: 'mindrecord' only - /// 2. To save the samples in order, set dataset's shuffle to false and num_files to 1. - /// 3. Before calling the function, do not use batch operator, repeat operator or data augmentation operators - /// with random attribute in map operator. - /// 4. Mindrecord does not support bool, uint64, multi-dimensional uint8(drop dimension) nor - /// multi-dimensional string. - /// \param[in] file_name Path to dataset file - /// \param[in] num_files Number of dataset files (default=1) - /// \param[in] file_type Dataset format (default="mindrecord") - /// \return Returns true if no error encountered else false - bool Save(std::string dataset_path, int32_t num_files = 1, std::string dataset_type = "mindrecord"); -#endif - - /// \brief Function to create a BatchNode - /// \notes Combines batch_size number of consecutive rows into batches - /// \param[in] batch_size The number of rows each batch is created with - /// \param[in] drop_remainder Determines whether or not to drop the last possibly incomplete - /// batch. If true, and if there are less than batch_size rows - /// available to make the last batch, then those rows will - /// be dropped and not propagated to the next node - /// \return Shared pointer to the current BatchNode - std::shared_ptr Batch(int32_t batch_size, bool drop_remainder = false); #ifndef ENABLE_ANDROID - /// \brief Function to create a BucketBatchByLengthNode + /// \brief Function to create a BucketBatchByLengthDataset /// \notes Bucket elements according to their lengths. Each bucket will be padded and batched when /// they are full. /// \param[in] column_names Columns passed to element_length_function @@ -712,13 +194,17 @@ class Dataset : public std::enable_shared_from_this { /// an error will occur (default=false). /// \param[in] drop_remainder If true, will drop the last batch for each bucket if it is not a full batch /// (default=false). 
- /// \return Shared pointer to the current BucketBatchByLengthNode - std::shared_ptr BucketBatchByLength( + /// \return Shared pointer to the current BucketBatchByLengthDataset + std::shared_ptr BucketBatchByLength( const std::vector &column_names, const std::vector &bucket_boundaries, const std::vector &bucket_batch_sizes, std::function element_length_function = nullptr, const std::map>> &pad_info = {}, - bool pad_to_bucket_boundary = false, bool drop_remainder = false); + bool pad_to_bucket_boundary = false, bool drop_remainder = false) { + return std::make_shared(shared_from_this(), column_names, bucket_boundaries, + bucket_batch_sizes, element_length_function, pad_info, + pad_to_bucket_boundary, drop_remainder); + } /// \brief Function to create a SentencePieceVocab from source dataset /// \notes Build a SentencePieceVocab from a dataset. @@ -753,13 +239,17 @@ class Dataset : public std::enable_shared_from_this { bool special_first = true); #endif - /// \brief Function to create a ConcatNode + /// \brief Function to create a ConcatDataset /// \notes Concat the datasets in the input /// \param[in] datasets List of shared pointers to the dataset that should be concatenated together - /// \return Shared pointer to the current ConcatNode - std::shared_ptr Concat(const std::vector> &datasets); + /// \return Shared pointer to the current ConcatDataset + std::shared_ptr Concat(const std::vector> &datasets) { + std::vector> all_datasets = datasets; + all_datasets.push_back(shared_from_this()); + return std::make_shared(all_datasets); + } - /// \brief Function to create a MapNode + /// \brief Function to create a MapDataset /// \notes Applies each operation in operations to this dataset /// \param[in] operations Vector of operations to be applied on the dataset. Operations are /// applied in the order they appear in this list @@ -775,71 +265,81 @@ class Dataset : public std::enable_shared_from_this { /// \param[in] project_columns A list of column names to project /// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). /// The cache feature is under development and is not recommended. 
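A hedged sketch of the two inline wrappers defined above, Concat() and BucketBatchByLength(); the file paths, column name, boundaries, and batch sizes are illustrative only.

#include <memory>
#include <vector>
#include "minddata/dataset/include/datasets.h"

namespace api = mindspore::dataset::api;

void ConcatAndBucket() {
  auto part1 = api::TextFile({"/path/train_part1.txt"});
  auto part2 = api::TextFile({"/path/train_part2.txt"});

  // Concat() appends *this to the supplied list, so the result covers both files.
  std::shared_ptr<api::ConcatDataset> both = part1->Concat({part2});

  // Two boundaries define three buckets, so three batch sizes are supplied
  // (bucket_batch_sizes must have one more entry than bucket_boundaries).
  auto bucketed = both->BucketBatchByLength({"text"}, {4, 8}, {16, 16, 8});
  (void)bucketed;
}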
- /// \return Shared pointer to the current MapNode - std::shared_ptr Map(std::vector> operations, - std::vector input_columns = {}, - std::vector output_columns = {}, - const std::vector &project_columns = {}, - const std::shared_ptr &cache = nullptr); + /// \return Shared pointer to the current MapDataset + std::shared_ptr Map(std::vector> operations, + std::vector input_columns = {}, + std::vector output_columns = {}, + const std::vector &project_columns = {}, + const std::shared_ptr &cache = nullptr) { + return std::make_shared(shared_from_this(), operations, input_columns, output_columns, project_columns, + cache); + } /// \brief Function to create a Project Dataset /// \notes Applies project to the dataset /// \param[in] columns The name of columns to project /// \return Shared pointer to the current Dataset - std::shared_ptr Project(const std::vector &columns); + std::shared_ptr Project(const std::vector &columns) { + return std::make_shared(shared_from_this(), columns); + } /// \brief Function to create a Rename Dataset /// \notes Renames the columns in the input dataset /// \param[in] input_columns List of the input columns to rename /// \param[in] output_columns List of the output columns /// \return Shared pointer to the current Dataset - std::shared_ptr Rename(const std::vector &input_columns, - const std::vector &output_columns); + std::shared_ptr Rename(const std::vector &input_columns, + const std::vector &output_columns) { + return std::make_shared(shared_from_this(), input_columns, output_columns); + } - /// \brief Function to create a RepeatNode + /// \brief Function to create a RepeatDataset /// \notes Repeats this dataset count times. Repeat indefinitely if count is -1 /// \param[in] count Number of times the dataset should be repeated /// \return Shared pointer to the current Dataset - /// \note Repeat will return shared pointer to `Dataset` instead of `RepeatNode` + /// \note Repeat will return shared pointer to `Dataset` instead of `RepeatDataset` /// due to a limitation in the current implementation - std::shared_ptr Repeat(int32_t count = -1); + std::shared_ptr Repeat(int32_t count = -1) { + return std::make_shared(shared_from_this(), count); + } /// \brief Function to create a Shuffle Dataset /// \notes Randomly shuffles the rows of this dataset /// \param[in] buffer_size The size of the buffer (must be larger than 1) for shuffling - /// \return Shared pointer to the current ShuffleNode - std::shared_ptr Shuffle(int32_t buffer_size); + /// \return Shared pointer to the current ShuffleDataset + std::shared_ptr Shuffle(int32_t buffer_size) { + return std::make_shared(shared_from_this(), buffer_size); + } - /// \brief Function to create a SkipNode + /// \brief Function to create a SkipDataset /// \notes Skips count elements in this dataset. /// \param[in] count Number of elements the dataset to be skipped. - /// \return Shared pointer to the current SkipNode - std::shared_ptr Skip(int32_t count); + /// \return Shared pointer to the current SkipDataset + std::shared_ptr Skip(int32_t count) { return std::make_shared(shared_from_this(), count); } - /// \brief Function to create a TakeNode + /// \brief Function to create a TakeDataset /// \notes Takes count elements in this dataset. /// \param[in] count Number of elements the dataset to be taken. 
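The remaining wrappers follow the same pattern and can be chained; a small illustrative pipeline (the directory and column names are placeholders) might look like this:

#include <memory>
#include "minddata/dataset/include/datasets.h"

namespace api = mindspore::dataset::api;

void BuildPipeline() {
  std::shared_ptr<api::Dataset> ds = api::ImageFolder("/path/to/images", true);
  ds = ds->Shuffle(100);                 // ShuffleDataset: 100-row shuffle buffer
  ds = ds->Rename({"image"}, {"data"});  // RenameDataset: "image" becomes "data"
  ds = ds->Project({"data"});            // ProjectDataset: keep only "data"
  ds = ds->Skip(10)->Take(500);          // SkipDataset, then TakeDataset
  ds = ds->Repeat(2);                    // RepeatDataset: two passes over the rows
}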
/// \return Shared pointer to the current Dataset - std::shared_ptr Take(int32_t count = -1); + std::shared_ptr Take(int32_t count = -1) { + return std::make_shared(shared_from_this(), count); + } /// \brief Function to create a Zip Dataset /// \notes Applies zip to the dataset /// \param[in] datasets A list of shared pointers to the datasets that we want to zip /// \return Shared pointer to the current Dataset - std::shared_ptr Zip(const std::vector> &datasets); + std::shared_ptr Zip(const std::vector> &datasets) { + std::vector> all_datasets = datasets; + all_datasets.push_back(shared_from_this()); + return std::make_shared(all_datasets); + } + + std::shared_ptr IRNode() { return ir_node_; } protected: - std::vector> children; - std::shared_ptr parent; std::shared_ptr tree_getters_; - - int32_t num_workers_; - int32_t rows_per_buffer_; - int32_t connector_que_size_; - int32_t worker_connector_size_; - - std::shared_ptr cache_; - Status AddCacheOp(std::vector> *node_ops); + std::shared_ptr ir_node_; }; class SchemaObj { @@ -902,8 +402,569 @@ class SchemaObj { nlohmann::json columns_; }; -/* ####################################### Derived Dataset classes ################################# */ +class BatchDataset : public Dataset { + public: + BatchDataset(std::shared_ptr input, int32_t batch_size, bool drop_remainder = false); +}; + +#ifndef ENABLE_ANDROID +class BucketBatchByLengthDataset : public Dataset { + public: + BucketBatchByLengthDataset( + std::shared_ptr input, const std::vector &column_names, + const std::vector &bucket_boundaries, const std::vector &bucket_batch_sizes, + std::function element_length_function = nullptr, + const std::map>> &pad_info = {}, + bool pad_to_bucket_boundary = false, bool drop_remainder = false); +}; + +#endif + +class ConcatDataset : public Dataset { + public: + explicit ConcatDataset(const std::vector> &input); +}; + +class MapDataset : public Dataset { + public: + MapDataset(std::shared_ptr input, std::vector> operations, + std::vector input_columns, std::vector output_columns, + const std::vector &project_columns, const std::shared_ptr &cache); +}; + +class ProjectDataset : public Dataset { + public: + ProjectDataset(std::shared_ptr input, const std::vector &columns); +}; + +class RenameDataset : public Dataset { + public: + RenameDataset(std::shared_ptr input, const std::vector &input_columns, + const std::vector &output_columns); +}; + +class RepeatDataset : public Dataset { + public: + RepeatDataset(std::shared_ptr input, int32_t count); +}; + +class ShuffleDataset : public Dataset { + public: + ShuffleDataset(std::shared_ptr input, int32_t buffer_size); +}; + +class SkipDataset : public Dataset { + public: + SkipDataset(std::shared_ptr input, int32_t count); +}; +class TakeDataset : public Dataset { + public: + TakeDataset(std::shared_ptr input, int32_t count); +}; + +class ZipDataset : public Dataset { + public: + explicit ZipDataset(const std::vector> &inputs); +}; + +/// \brief Function to create a SchemaObj +/// \param[in] schema_file Path of schema file +/// \return Shared pointer to the current schema +std::shared_ptr Schema(const std::string &schema_file = ""); + +class AlbumDataset : public Dataset { + public: + AlbumDataset(const std::string &dataset_dir, const std::string &data_schema, + const std::vector &column_names = {}, bool decode = false, + const std::shared_ptr &sampler = RandomSampler(), + const std::shared_ptr &cache = nullptr); +}; + +/// \brief Function to create an AlbumDataset +/// \notes The generated dataset is specified 
through setting a schema +/// \param[in] dataset_dir Path to the root directory that contains the dataset +/// \param[in] data_schema Path to dataset schema file +/// \param[in] column_names Column names used to specify columns to load, if empty, will read all columns. +/// (default = {}) +/// \param[in] decode the option to decode the images in dataset (default = false) +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) +/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). +/// The cache feature is under development and is not recommended. +/// \return Shared pointer to the current Dataset +std::shared_ptr Album(const std::string &dataset_dir, const std::string &data_schema, + const std::vector &column_names = {}, bool decode = false, + const std::shared_ptr &sampler = RandomSampler(), + const std::shared_ptr &cache = nullptr); + +class CelebADataset : public Dataset { + public: + explicit CelebADataset(const std::string &dataset_dir, const std::string &usage = "all", + const std::shared_ptr &sampler = RandomSampler(), bool decode = false, + const std::set &extensions = {}, + const std::shared_ptr &cache = nullptr); +}; + +/// \brief Function to create a CelebADataset +/// \notes The generated dataset has two columns ['image', 'attr']. +/// The type of the image tensor is uint8. The attr tensor is uint32 and one hot type. +/// \param[in] dataset_dir Path to the root directory that contains the dataset. +/// \param[in] usage One of "all", "train", "valid" or "test" (default = "all"). +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) +/// \param[in] decode Decode the images after reading (default=false). +/// \param[in] extensions Set of file extensions to be included in the dataset (default={}). +/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). +/// The cache feature is under development and is not recommended. +/// \return Shared pointer to the current Dataset +std::shared_ptr CelebA(const std::string &dataset_dir, const std::string &usage = "all", + const std::shared_ptr &sampler = RandomSampler(), bool decode = false, + const std::set &extensions = {}, + const std::shared_ptr &cache = nullptr); + +class Cifar10Dataset : public Dataset { + public: + explicit Cifar10Dataset(const std::string &dataset_dir, const std::string &usage = "all", + const std::shared_ptr &sampler = RandomSampler(), + const std::shared_ptr &cache = nullptr); +}; + +/// \brief Function to create a Cifar10 Dataset +/// \notes The generated dataset has two columns ["image", "label"] +/// \param[in] dataset_dir Path to the root directory that contains the dataset +/// \param[in] usage of CIFAR10, can be "train", "test" or "all" (default = "all"). +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) +/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). +/// The cache feature is under development and is not recommended. 
+/// \return Shared pointer to the current Dataset +std::shared_ptr Cifar10(const std::string &dataset_dir, const std::string &usage = "all", + const std::shared_ptr &sampler = RandomSampler(), + const std::shared_ptr &cache = nullptr); + +class Cifar100Dataset : public Dataset { + public: + explicit Cifar100Dataset(const std::string &dataset_dir, const std::string &usage = "all", + const std::shared_ptr &sampler = RandomSampler(), + const std::shared_ptr &cache = nullptr); +}; + +/// \brief Function to create a Cifar100 Dataset +/// \notes The generated dataset has three columns ["image", "coarse_label", "fine_label"] +/// \param[in] dataset_dir Path to the root directory that contains the dataset +/// \param[in] usage of CIFAR100, can be "train", "test" or "all" (default = "all"). +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) +/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). +/// The cache feature is under development and is not recommended. +/// \return Shared pointer to the current Dataset +std::shared_ptr Cifar100(const std::string &dataset_dir, const std::string &usage = "all", + const std::shared_ptr &sampler = RandomSampler(), + const std::shared_ptr &cache = nullptr); + +class CLUEDataset : public Dataset { + public: + explicit CLUEDataset(const std::vector &dataset_files, const std::string &task = "AFQMC", + const std::string &usage = "train", int64_t num_samples = 0, + ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, int32_t shard_id = 0, + const std::shared_ptr &cache = nullptr); +}; + +/// \brief Function to create a CLUEDataset +/// \notes The generated dataset has a variable number of columns depending on the task and usage +/// \param[in] dataset_files List of files to be read to search for a pattern of files. The list +/// will be sorted in a lexicographical order. +/// \param[in] task The kind of task, one of "AFQMC", "TNEWS", "IFLYTEK", "CMNLI", "WSC" and "CSL" (default="AFQMC"). +/// \param[in] usage Be used to "train", "test" or "eval" data (default="train"). +/// \param[in] num_samples The number of samples to be included in the dataset. +/// (Default = 0 means all samples.) +/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode.kGlobal) +/// Can be any of: +/// ShuffleMode::kFalse - No shuffling is performed. +/// ShuffleMode::kFiles - Shuffle files only. +/// ShuffleMode::kGlobal - Shuffle both the files and samples. +/// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1) +/// \param[in] shard_id The shard ID within num_shards. This argument should be +/// specified only when num_shards is also specified. (Default = 0) +/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). +/// The cache feature is under development and is not recommended. 
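For the leaf-node factories above, a minimal end-to-end sketch using Cifar10 and the Dataset getters; the directory is a placeholder, and the getters report -1 or an empty vector on failure as documented earlier.

#include <iostream>
#include <memory>
#include "minddata/dataset/include/datasets.h"

namespace api = mindspore::dataset::api;

void InspectCifar10() {
  std::shared_ptr<api::Cifar10Dataset> ds = api::Cifar10("/path/to/cifar-10-batches-bin", "train");
  std::cout << "rows: " << ds->GetDatasetSize() << std::endl;             // -1 on failure
  std::cout << "classes: " << ds->GetNumClasses() << std::endl;           // -1 on failure
  std::cout << "columns: " << ds->GetOutputTypes().size() << std::endl;   // 0 on failure
}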
+/// \return Shared pointer to the current CLUEDataset +std::shared_ptr CLUE(const std::vector &dataset_files, const std::string &task = "AFQMC", + const std::string &usage = "train", int64_t num_samples = 0, + ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, + int32_t shard_id = 0, const std::shared_ptr &cache = nullptr); + +class CocoDataset : public Dataset { + public: + CocoDataset(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task = "Detection", + const bool &decode = false, const std::shared_ptr &sampler = RandomSampler(), + const std::shared_ptr &cache = nullptr); +}; + +/// \brief Function to create a CocoDataset +/// \notes The generated dataset has multi-columns : +/// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32], +/// ['iscrowd', dtype=uint32]]. +/// - task='Stuff', column: [['image', dtype=uint8], ['segmentation',dtype=float32], ['iscrowd', dtype=uint32]]. +/// - task='Keypoint', column: [['image', dtype=uint8], ['keypoints', dtype=float32], +/// ['num_keypoints', dtype=uint32]]. +/// - task='Panoptic', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['category_id', dtype=uint32], +/// ['iscrowd', dtype=uint32], ['area', dtype=uitn32]]. +/// \param[in] dataset_dir Path to the root directory that contains the dataset +/// \param[in] annotation_file Path to the annotation json +/// \param[in] task Set the task type of reading coco data, now support 'Detection'/'Stuff'/'Panoptic'/'Keypoint' +/// \param[in] decode Decode the images after reading +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) +/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). +/// The cache feature is under development and is not recommended. +/// \return Shared pointer to the current Dataset +std::shared_ptr Coco(const std::string &dataset_dir, const std::string &annotation_file, + const std::string &task = "Detection", const bool &decode = false, + const std::shared_ptr &sampler = RandomSampler(), + const std::shared_ptr &cache = nullptr); + +class CSVDataset : public Dataset { + public: + explicit CSVDataset(const std::vector &dataset_files, char field_delim = ',', + const std::vector> &column_defaults = {}, + const std::vector &column_names = {}, int64_t num_samples = 0, + ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, int32_t shard_id = 0, + const std::shared_ptr &cache = nullptr); +}; + +/// \brief Function to create a CSVDataset +/// \notes The generated dataset has a variable number of columns +/// \param[in] dataset_files List of files to be read to search for a pattern of files. The list +/// will be sorted in a lexicographical order. +/// \param[in] field_delim A char that indicates the delimiter to separate fields (default=','). +/// \param[in] column_defaults List of default values for the CSV field (default={}). Each item in the list is +/// either a valid type (float, int, or string). If this is not provided, treats all columns as string type. +/// \param[in] column_names List of column names of the dataset (default={}). If this is not provided, infers the +/// column_names from the first row of CSV file. +/// \param[in] num_samples The number of samples to be included in the dataset. +/// (Default = 0 means all samples.) 
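A small sketch of the CSV() factory described here, under the assumption of a headerless placeholder file: column_defaults is left empty so every field is read as a string, and column names are supplied explicitly.

#include <memory>
#include "minddata/dataset/include/datasets.h"

namespace api = mindspore::dataset::api;

void ReadCsv() {
  // Comma-delimited file, no per-column defaults, explicit column names.
  std::shared_ptr<api::CSVDataset> ds =
      api::CSV({"/path/to/train.csv"}, ',', {}, {"col1", "col2", "col3"});
  (void)ds;
}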
+/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode::kGlobal) +/// Can be any of: +/// ShuffleMode::kFalse - No shuffling is performed. +/// ShuffleMode::kFiles - Shuffle files only. +/// ShuffleMode::kGlobal - Shuffle both the files and samples. +/// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1) +/// \param[in] shard_id The shard ID within num_shards. This argument should be +/// specified only when num_shards is also specified. (Default = 0) +/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). +/// The cache feature is under development and is not recommended. +/// \return Shared pointer to the current Dataset +std::shared_ptr CSV(const std::vector &dataset_files, char field_delim = ',', + const std::vector> &column_defaults = {}, + const std::vector &column_names = {}, int64_t num_samples = 0, + ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, + int32_t shard_id = 0, const std::shared_ptr &cache = nullptr); + +class ImageFolderDataset : public Dataset { + public: + explicit ImageFolderDataset(const std::string &dataset_dir, bool decode = false, + const std::shared_ptr &sampler = RandomSampler(), + const std::set &extensions = {}, + const std::map &class_indexing = {}, + const std::shared_ptr &cache = nullptr); +}; + +/// \brief Function to create an ImageFolderDataset +/// \notes A source dataset that reads images from a tree of directories +/// All images within one folder have the same label +/// The generated dataset has two columns ["image", "label"] +/// \param[in] dataset_dir Path to the root directory that contains the dataset +/// \param[in] decode A flag to decode in ImageFolder +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) +/// \param[in] extensions File extensions to be read +/// \param[in] class_indexing a class name to label map +/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). +/// The cache feature is under development and is not recommended. +/// \return Shared pointer to the current ImageFolderDataset +std::shared_ptr ImageFolder(const std::string &dataset_dir, bool decode = false, + const std::shared_ptr &sampler = RandomSampler(), + const std::set &extensions = {}, + const std::map &class_indexing = {}, + const std::shared_ptr &cache = nullptr); + +#ifndef ENABLE_ANDROID +class ManifestDataset : public Dataset { + public: + explicit ManifestDataset(const std::string &dataset_file, const std::string &usage = "train", + const std::shared_ptr &sampler = RandomSampler(), + const std::map &class_indexing = {}, bool decode = false, + const std::shared_ptr &cache = nullptr); +}; + +/// \brief Function to create a ManifestDataset +/// \notes The generated dataset has two columns ["image", "label"] +/// \param[in] dataset_file The dataset file to be read +/// \param[in] usage Need "train", "eval" or "inference" data (default="train") +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) +/// \param[in] class_indexing A str-to-int mapping from label name to index (default={}, the folder +/// names will be sorted alphabetically and each class will be given a unique index starting from 0). 
+/// \param[in] decode Decode the images after reading (default=false). +/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). +/// The cache feature is under development and is not recommended. +/// \return Shared pointer to the current ManifestDataset +std::shared_ptr Manifest(const std::string &dataset_file, const std::string &usage = "train", + const std::shared_ptr &sampler = RandomSampler(), + const std::map &class_indexing = {}, + bool decode = false, const std::shared_ptr &cache = nullptr); +#endif + +#ifndef ENABLE_ANDROID +class MindDataDataset : public Dataset { + public: + explicit MindDataDataset(const std::string &dataset_file, const std::vector &columns_list = {}, + const std::shared_ptr &sampler = RandomSampler(), + nlohmann::json padded_sample = nullptr, int64_t num_padded = 0); + explicit MindDataDataset(const std::vector &dataset_files, + const std::vector &columns_list = {}, + const std::shared_ptr &sampler = RandomSampler(), + nlohmann::json padded_sample = nullptr, int64_t num_padded = 0); +}; + +/// \brief Function to create a MindDataDataset +/// \param[in] dataset_file File name of one component of a mindrecord source. Other files with identical source +/// in the same path will be found and loaded automatically. +/// \param[in] columns_list List of columns to be read (default={}) +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()), +/// supported sampler list: SubsetRandomSampler, PkSampler, RandomSampler, SequentialSampler, DistributedSampler. +/// \param[in] padded_sample Samples will be appended to dataset, where keys are the same as column_list. +/// \param[in] num_padded Number of padding samples. Dataset size plus num_padded should be divisible by num_shards. +/// \return Shared pointer to the current MindDataDataset +std::shared_ptr MindData(const std::string &dataset_file, + const std::vector &columns_list = {}, + const std::shared_ptr &sampler = RandomSampler(), + nlohmann::json padded_sample = nullptr, int64_t num_padded = 0); + +/// \brief Function to create a MindDataDataset +/// \param[in] dataset_files List of dataset files to be read directly. +/// \param[in] columns_list List of columns to be read (default={}) +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()), +/// supported sampler list: SubsetRandomSampler, PkSampler, RandomSampler, SequentialSampler, DistributedSampler. +/// \param[in] padded_sample Samples will be appended to dataset, where keys are the same as column_list. +/// \param[in] num_padded Number of padding samples. Dataset size plus num_padded should be divisible by num_shards. 
+/// \return Shared pointer to the current MindDataDataset +std::shared_ptr<MindDataDataset> MindData(const std::vector<std::string> &dataset_files, + const std::vector<std::string> &columns_list = {}, + const std::shared_ptr<SamplerObj> &sampler = RandomSampler(), + nlohmann::json padded_sample = nullptr, int64_t num_padded = 0); +#endif + +class MnistDataset : public Dataset { + public: + explicit MnistDataset(const std::string &dataset_dir, const std::string &usage = "all", + const std::shared_ptr<SamplerObj> &sampler = RandomSampler(), + const std::shared_ptr<DatasetCache> &cache = nullptr); +}; + +/// \brief Function to create a MnistDataset +/// \notes The generated dataset has two columns ["image", "label"] +/// \param[in] dataset_dir Path to the root directory that contains the dataset +/// \param[in] usage Usage of MNIST, can be "train", "test" or "all" (default = "all"). +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) +/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). +/// The cache feature is under development and is not recommended. +/// \return Shared pointer to the current MnistDataset +std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir, const std::string &usage = "all", + const std::shared_ptr<SamplerObj> &sampler = RandomSampler(), + const std::shared_ptr<DatasetCache> &cache = nullptr); + +/// \brief Function to create a ConcatDataset +/// \notes Overload the "+" operator to concatenate two datasets +/// \param[in] datasets1 Shared pointer to the first dataset to be concatenated +/// \param[in] datasets2 Shared pointer to the second dataset to be concatenated +/// \return Shared pointer to the current ConcatDataset +std::shared_ptr<ConcatDataset> operator+(const std::shared_ptr<Dataset> &datasets1, + const std::shared_ptr<Dataset> &datasets2); + +class RandomDataDataset : public Dataset { + public: + RandomDataDataset(const int32_t &total_rows, std::shared_ptr<SchemaObj> schema, + const std::vector<std::string> &columns_list, const std::shared_ptr<SamplerObj> &sampler, + std::shared_ptr<DatasetCache> cache); + + RandomDataDataset(const int32_t &total_rows, std::string schema_path, const std::vector<std::string> &columns_list, + const std::shared_ptr<SamplerObj> &sampler, std::shared_ptr<DatasetCache> cache); +}; + +/// \brief Function to create a RandomDataset +/// \param[in] total_rows Number of rows for the dataset to generate (default=0, number of rows is random) +/// \param[in] schema SchemaObj to set column type, data type and data shape +/// \param[in] columns_list List of columns to be read (default={}, read all columns) +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) +/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). +/// The cache feature is under development and is not recommended.
+/// \return Shared pointer to the current Dataset +template <typename T = std::shared_ptr<SchemaObj>> +std::shared_ptr<RandomDataDataset> RandomData(const int32_t &total_rows = 0, const T &schema = nullptr, + const std::vector<std::string> &columns_list = {}, + const std::shared_ptr<SamplerObj> &sampler = RandomSampler(), + const std::shared_ptr<DatasetCache> &cache = nullptr) { + std::shared_ptr<RandomDataDataset> ds; + if constexpr (std::is_same<T, std::nullptr_t>::value || std::is_same<T, std::shared_ptr<SchemaObj>>::value) { + std::shared_ptr<SchemaObj> schema_obj = schema; + ds = std::make_shared<RandomDataDataset>(total_rows, std::move(schema_obj), std::move(columns_list), + std::move(sampler), cache); + } else { + ds = std::make_shared<RandomDataDataset>(total_rows, std::move(schema), std::move(columns_list), std::move(sampler), + cache); + } + return ds; +} + +class TextFileDataset : public Dataset { + public: + explicit TextFileDataset(const std::vector<std::string> &dataset_files, int64_t num_samples = 0, + ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, int32_t shard_id = 0, + const std::shared_ptr<DatasetCache> &cache = nullptr); +}; + +/// \brief Function to create a TextFileDataset +/// \notes The generated dataset has one column ['text'] +/// \param[in] dataset_files List of files to be read to search for a pattern of files. The list +/// will be sorted in a lexicographical order. +/// \param[in] num_samples The number of samples to be included in the dataset. +/// (Default = 0 means all samples.) +/// \param[in] shuffle The mode for shuffling data every epoch. (Default=ShuffleMode::kGlobal) +/// Can be any of: +/// ShuffleMode::kFalse - No shuffling is performed. +/// ShuffleMode::kFiles - Shuffle files only. +/// ShuffleMode::kGlobal - Shuffle both the files and samples. +/// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1) +/// \param[in] shard_id The shard ID within num_shards. This argument should be +/// specified only when num_shards is also specified. (Default = 0) +/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). +/// The cache feature is under development and is not recommended. +/// \return Shared pointer to the current TextFileDataset +std::shared_ptr<TextFileDataset> TextFile(const std::vector<std::string> &dataset_files, int64_t num_samples = 0, + ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, + int32_t shard_id = 0, const std::shared_ptr<DatasetCache> &cache = nullptr); + +#ifndef ENABLE_ANDROID +class TFRecordDataset : public Dataset { + public: + TFRecordDataset(const std::vector<std::string> &dataset_files, std::string schema, + const std::vector<std::string> &columns_list, int64_t num_samples, ShuffleMode shuffle, + int32_t num_shards, int32_t shard_id, bool shard_equal_rows, std::shared_ptr<DatasetCache> cache); + + /// \brief Constructor + /// \note Parameter 'schema' is shared pointer to Schema object + TFRecordDataset(const std::vector<std::string> &dataset_files, std::shared_ptr<SchemaObj> schema, + const std::vector<std::string> &columns_list, int64_t num_samples, ShuffleMode shuffle, + int32_t num_shards, int32_t shard_id, bool shard_equal_rows, std::shared_ptr<DatasetCache> cache); +}; + +/// \brief Function to create a TFRecordDataset +/// \param[in] dataset_files List of files to be read to search for a pattern of files. The list +/// will be sorted in a lexicographical order. +/// \param[in] schema SchemaObj or string path to a schema file. (Default = nullptr, which means that the +/// meta data from the TFData file is considered the schema.) +/// \param[in] columns_list List of columns to be read. (Default = {}, read all columns) +/// \param[in] num_samples The number of samples to be included in the dataset. +/// (Default = 0 means all samples.)
+/// If num_samples is 0 and numRows (parsed from schema) does not exist, read the full dataset; +/// If num_samples is 0 and numRows (parsed from schema) is greater than 0, read numRows rows; +/// If both num_samples and numRows (parsed from schema) are greater than 0, read num_samples rows. +/// \param[in] shuffle The mode for shuffling data every epoch. (Default = ShuffleMode::kGlobal) +/// Can be any of: +/// ShuffleMode::kFalse - No shuffling is performed. +/// ShuffleMode::kFiles - Shuffle files only. +/// ShuffleMode::kGlobal - Shuffle both the files and samples. +/// \param[in] num_shards Number of shards that the dataset should be divided into. (Default = 1) +/// \param[in] shard_id The shard ID within num_shards. This argument should be specified only +/// when num_shards is also specified. (Default = 0) +/// \param[in] shard_equal_rows Get equal rows for all shards. (Default = false, the number of rows in +/// each shard may not be equal) +/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). +/// The cache feature is under development and is not recommended. +/// \return Shared pointer to the current TFRecordDataset +template <typename T = std::shared_ptr<SchemaObj>> +std::shared_ptr<TFRecordDataset> TFRecord(const std::vector<std::string> &dataset_files, const T &schema = nullptr, + const std::vector<std::string> &columns_list = {}, int64_t num_samples = 0, + ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, + int32_t shard_id = 0, bool shard_equal_rows = false, + const std::shared_ptr<DatasetCache> &cache = nullptr) { + std::shared_ptr<TFRecordDataset> ds = nullptr; + if constexpr (std::is_same<T, std::nullptr_t>::value || std::is_same<T, std::shared_ptr<SchemaObj>>::value) { + std::shared_ptr<SchemaObj> schema_obj = schema; + ds = std::make_shared<TFRecordDataset>(dataset_files, schema_obj, columns_list, num_samples, shuffle, num_shards, + shard_id, shard_equal_rows, cache); + } else { + std::string schema_path = schema; + if (!schema_path.empty()) { + Path schema_file(schema_path); + if (!schema_file.Exists()) { + MS_LOG(ERROR) << "TFRecordDataset: schema path [" << schema_path << "] is invalid or does not exist."; + return nullptr; + } + } + ds = std::make_shared<TFRecordDataset>(dataset_files, schema_path, columns_list, num_samples, shuffle, num_shards, + shard_id, shard_equal_rows, cache); + } + return ds; +} + +class VOCDataset : public Dataset { + public: + explicit VOCDataset(const std::string &dataset_dir, const std::string &task = "Segmentation", + const std::string &usage = "train", const std::map<std::string, int32_t> &class_indexing = {}, + bool decode = false, const std::shared_ptr<SamplerObj> &sampler = RandomSampler(), + const std::shared_ptr<DatasetCache> &cache = nullptr); +}; + +/// \brief Function to create a VOCDataset +/// \notes The generated dataset has multiple columns: +/// - task='Detection', column: [['image', dtype=uint8], ['bbox', dtype=float32], ['label', dtype=uint32], +/// ['difficult', dtype=uint32], ['truncate', dtype=uint32]]. +/// - task='Segmentation', column: [['image', dtype=uint8], ['target', dtype=uint8]]. +/// \param[in] dataset_dir Path to the root directory that contains the dataset +/// \param[in] task Set the task type of reading VOC data; only "Segmentation" and "Detection" are supported +/// \param[in] usage The type of data list text file to be read (default = "train"). +/// \param[in] class_indexing A str-to-int mapping from label name to index, only valid in "Detection" task +/// \param[in] decode Decode the images after reading +/// \param[in] sampler Object used to choose samples from the dataset.
If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) +/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used). +/// The cache feature is under development and is not recommended. +/// \return Shared pointer to the current Dataset +std::shared_ptr<VOCDataset> VOC(const std::string &dataset_dir, const std::string &task = "Segmentation", + const std::string &usage = "train", + const std::map<std::string, int32_t> &class_indexing = {}, bool decode = false, + const std::shared_ptr<SamplerObj> &sampler = RandomSampler(), + const std::shared_ptr<DatasetCache> &cache = nullptr); + +/// \brief Function to create a cache to be attached to a dataset +/// \param id A user assigned session id for the current pipeline +/// \param mem_sz Size of the memory set aside for the row caching. 0 for unlimited +/// \param spill Spill to disk if out of memory +/// \param hostname Optional host name +/// \param port Optional port +/// \param num_connections Optional number of connections +/// \param prefetch_sz Optional prefetch size +/// \return Shared pointer to DatasetCache. If error, nullptr is returned. +std::shared_ptr<DatasetCache> CreateDatasetCache(session_id_type id, uint64_t mem_sz, bool spill, + std::optional<std::string> hostname = std::nullopt, + std::optional<int32_t> port = std::nullopt, + std::optional<int32_t> num_connections = std::nullopt, + std::optional<int32_t> prefetch_sz = std::nullopt); +#endif + +/// \brief Function to create a ZipDataset +/// \notes Applies zip to the input datasets +/// \param[in] datasets List of shared pointers to the datasets that we want to zip +/// \return Shared pointer to the current Dataset +std::shared_ptr<ZipDataset> Zip(const std::vector<std::shared_ptr<Dataset>> &datasets); } // namespace api } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/include/iterator.h b/mindspore/ccsrc/minddata/dataset/include/iterator.h index 4c82fad574..58781ecefb 100644 --- a/mindspore/ccsrc/minddata/dataset/include/iterator.h +++ b/mindspore/ccsrc/minddata/dataset/include/iterator.h @@ -49,7 +49,7 @@ class Iterator { Iterator() : consumer_(nullptr) {} /// \brief Destructor - ~Iterator() = default; + ~Iterator() { Stop(); } /// \brief Method for building and launching the pipeline. /// \param[in] ops - a vector of DatasetOp in the data pipeline.
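For orientation, the following is a minimal usage sketch of how the factory functions declared above compose into a pipeline. It is illustrative only and not part of the patch: the MNIST path is a placeholder, and the Shuffle/Batch/CreateIterator/GetNextRow members it calls come from the wider Dataset/Iterator API rather than from the hunks shown here.

  // Sketch: build a small MNIST pipeline from the leaf-node factory functions above.
  std::shared_ptr<Dataset> ds = Mnist("/path/to/mnist", "all", RandomSampler(false, 10));
  ds = ds->Shuffle(100)->Batch(2);  // chain transform nodes onto the leaf node
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  if (iter != nullptr) {
    std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
    iter->GetNextRow(&row);
    while (!row.empty()) {
      // consume row["image"] and row["label"] here
      iter->GetNextRow(&row);
    }
    // With the iterator.h change above, Stop() now also runs when iter is destroyed.
  }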
diff --git a/mindspore/lite/minddata/CMakeLists.txt b/mindspore/lite/minddata/CMakeLists.txt index e934dcd64d..4bd78ede99 100644 --- a/mindspore/lite/minddata/CMakeLists.txt +++ b/mindspore/lite/minddata/CMakeLists.txt @@ -82,30 +82,30 @@ AUX_SOURCE_DIRECTORY(${MINDDATA_DIR}/kernels/image/lite_cv MINDDATA_KERNELS_IMA if (BUILD_MINDDATA STREQUAL "full") include_directories("${CMAKE_SOURCE_DIR}/../ccsrc/minddata/dataset/kernels/image") - list(REMOVE_ITEM MINDDATA_API_SRC_FILES - "${MINDDATA_DIR}/api/text.cc" - ) + list(REMOVE_ITEM MINDDATA_API_SRC_FILES + "${MINDDATA_DIR}/api/text.cc" + ) - list(REMOVE_ITEM MINDDATA_CALLBACK_SRC_FILES - "${MINDDATA_DIR}/callback/py_ds_callback.cc" - ) + list(REMOVE_ITEM MINDDATA_CALLBACK_SRC_FILES + "${MINDDATA_DIR}/callback/py_ds_callback.cc" + ) list(REMOVE_ITEM MINDDATA_CORE_SRC_FILES - "${MINDDATA_DIR}/core/cv_tensor.cc" - ) + "${MINDDATA_DIR}/core/cv_tensor.cc" + ) list(REMOVE_ITEM MINDDATA_KERNELS_SRC_FILES "${MINDDATA_DIR}/kernels/py_func_op.cc") list(REMOVE_ITEM MINDDATA_ENGINE_DATASETOPS_SRC_FILES - "${MINDDATA_DIR}/engine/datasetops/build_sentence_piece_vocab_op.cc" - "${MINDDATA_DIR}/engine/datasetops/filter_op.cc" - "${MINDDATA_DIR}/engine/datasetops/barrier_op.cc" - "${MINDDATA_DIR}/engine/datasetops/bucket_batch_by_length_op.cc" - "${MINDDATA_DIR}/engine/datasetops/build_vocab_op.cc" - "${MINDDATA_DIR}/engine/datasetops/cache_merge_op.cc" - "${MINDDATA_DIR}/engine/datasetops/cache_base_op.cc" - "${MINDDATA_DIR}/engine/datasetops/cache_lookup_op.cc" - "${MINDDATA_DIR}/engine/datasetops/cache_op.cc" - ) + "${MINDDATA_DIR}/engine/datasetops/build_sentence_piece_vocab_op.cc" + "${MINDDATA_DIR}/engine/datasetops/filter_op.cc" + "${MINDDATA_DIR}/engine/datasetops/barrier_op.cc" + "${MINDDATA_DIR}/engine/datasetops/bucket_batch_by_length_op.cc" + "${MINDDATA_DIR}/engine/datasetops/build_vocab_op.cc" + "${MINDDATA_DIR}/engine/datasetops/cache_merge_op.cc" + "${MINDDATA_DIR}/engine/datasetops/cache_base_op.cc" + "${MINDDATA_DIR}/engine/datasetops/cache_lookup_op.cc" + "${MINDDATA_DIR}/engine/datasetops/cache_op.cc" + ) list(REMOVE_ITEM MINDDATA_ENGINE_DATASETOPS_SOURCE_SRC_FILES "${MINDDATA_DIR}/engine/datasetops/source/generator_op.cc" @@ -161,47 +161,55 @@ if (BUILD_MINDDATA STREQUAL "full") "${MINDDATA_DIR}/kernels/image/random_crop_and_resize_with_bbox_op.cc" "${MINDDATA_DIR}/kernels/image/random_crop_decode_resize_op.cc" "${MINDDATA_DIR}/kernels/image/random_crop_and_resize_op.cc" - "${MINDDATA_DIR}/kernels/image/random_crop_op.cc" - "${MINDDATA_DIR}/kernels/image/random_crop_with_bbox_op.cc" - "${MINDDATA_DIR}/kernels/image/random_horizontal_flip_op.cc" - "${MINDDATA_DIR}/kernels/image/random_horizontal_flip_with_bbox_op.cc" - "${MINDDATA_DIR}/kernels/image/random_posterize_op.cc" - "${MINDDATA_DIR}/kernels/image/random_resize_op.cc" - "${MINDDATA_DIR}/kernels/image/random_rotation_op.cc" - "${MINDDATA_DIR}/kernels/image/random_select_subpolicy_op.cc" - "${MINDDATA_DIR}/kernels/image/random_solarize_op.cc" - "${MINDDATA_DIR}/kernels/image/random_vertical_flip_op.cc" - "${MINDDATA_DIR}/kernels/image/random_vertical_flip_with_bbox_op.cc" - "${MINDDATA_DIR}/kernels/image/random_sharpness_op.cc" - "${MINDDATA_DIR}/kernels/image/rescale_op.cc" - "${MINDDATA_DIR}/kernels/image/rgba_to_bgr_op.cc" - "${MINDDATA_DIR}/kernels/image/rgba_to_rgb_op.cc" - "${MINDDATA_DIR}/kernels/image/sharpness_op.cc" - "${MINDDATA_DIR}/kernels/image/solarize_op.cc" - "${MINDDATA_DIR}/kernels/image/swap_red_blue_op.cc" - "${MINDDATA_DIR}/kernels/image/uniform_aug_op.cc" - 
"${MINDDATA_DIR}/kernels/image/resize_with_bbox_op.cc" - "${MINDDATA_DIR}/kernels/image/random_resize_with_bbox_op.cc" - "${MINDDATA_DIR}/kernels/image/random_color_op.cc" - ) + "${MINDDATA_DIR}/kernels/image/random_crop_op.cc" + "${MINDDATA_DIR}/kernels/image/random_crop_with_bbox_op.cc" + "${MINDDATA_DIR}/kernels/image/random_horizontal_flip_op.cc" + "${MINDDATA_DIR}/kernels/image/random_horizontal_flip_with_bbox_op.cc" + "${MINDDATA_DIR}/kernels/image/random_posterize_op.cc" + "${MINDDATA_DIR}/kernels/image/random_resize_op.cc" + "${MINDDATA_DIR}/kernels/image/random_rotation_op.cc" + "${MINDDATA_DIR}/kernels/image/random_select_subpolicy_op.cc" + "${MINDDATA_DIR}/kernels/image/random_solarize_op.cc" + "${MINDDATA_DIR}/kernels/image/random_vertical_flip_op.cc" + "${MINDDATA_DIR}/kernels/image/random_vertical_flip_with_bbox_op.cc" + "${MINDDATA_DIR}/kernels/image/random_sharpness_op.cc" + "${MINDDATA_DIR}/kernels/image/rescale_op.cc" + "${MINDDATA_DIR}/kernels/image/rgba_to_bgr_op.cc" + "${MINDDATA_DIR}/kernels/image/rgba_to_rgb_op.cc" + "${MINDDATA_DIR}/kernels/image/sharpness_op.cc" + "${MINDDATA_DIR}/kernels/image/solarize_op.cc" + "${MINDDATA_DIR}/kernels/image/swap_red_blue_op.cc" + "${MINDDATA_DIR}/kernels/image/uniform_aug_op.cc" + "${MINDDATA_DIR}/kernels/image/resize_with_bbox_op.cc" + "${MINDDATA_DIR}/kernels/image/random_resize_with_bbox_op.cc" + "${MINDDATA_DIR}/kernels/image/random_color_op.cc" + ) list(REMOVE_ITEM MINDDATA_ENGINE_IR_DATASETOPS_SRC_FILES - "${MINDDATA_DIR}/engine/ir/datasetops/bucket_batch_by_length_node.cc" - "${MINDDATA_DIR}/engine/ir/datasetops/build_sentence_piece_vocab_node.cc" - "${MINDDATA_DIR}/engine/ir/datasetops/build_vocab_node.cc" - "${MINDDATA_DIR}/engine/ir/datasetops/sync_wait_node.cc" - ) + "${MINDDATA_DIR}/engine/ir/datasetops/bucket_batch_by_length_node.cc" + "${MINDDATA_DIR}/engine/ir/datasetops/build_sentence_piece_vocab_node.cc" + "${MINDDATA_DIR}/engine/ir/datasetops/build_vocab_node.cc" + "${MINDDATA_DIR}/engine/ir/datasetops/sync_wait_node.cc" + ) + list(REMOVE_ITEM MINDDATA_ENGINE_CONSUMERS_SRC_FILES + "${MINDDATA_DIR}/engine/consumers/python_tree_consumer.cc" + ) + + list(REMOVE_ITEM MINDDATA_ENGINE_SRC_FILES + "${MINDDATA_DIR}/engine/python_runtime_context.cc" + ) + list(REMOVE_ITEM MINDDATA_KERNELS_DATA_SRC_FILES - "${MINDDATA_DIR}/kernels/data/unique_op.cc" - ) + "${MINDDATA_DIR}/kernels/data/unique_op.cc" + ) include_directories("${CMAKE_BINARY_DIR}/minddata/dataset/engine/cache") if (BUILD_MINDDATA_EXAMPLE AND (PLATFORM_ARM32 OR PLATFORM_ARM64)) - set(MINDDATA_EXAMPLE_SRC ${CMAKE_CURRENT_SOURCE_DIR}/example/jni-example.cc) - endif() + set(MINDDATA_EXAMPLE_SRC ${CMAKE_CURRENT_SOURCE_DIR}/example/jni-example.cc) + endif () add_library(minddata-lite SHARED - ${MINDDATA_API_SRC_FILES} + ${MINDDATA_API_SRC_FILES} ${MINDDATA_CALLBACK_SRC_FILES} ${MINDDATA_CORE_SRC_FILES} ${MINDDATA_ENGINE_SRC_FILES} diff --git a/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc b/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc index ddd147cfed..6b06259e85 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc @@ -1093,7 +1093,7 @@ TEST_F(MindDataTestPipeline, TestTakeDatasetDefault) { std::shared_ptr ds = ImageFolder(folder_path, true, RandomSampler(false, 7)); EXPECT_NE(ds, nullptr); - // Create a Take operation on ds, dafault count = -1 + // Create a Take operation on ds, default count = -1 ds = ds->Take(); EXPECT_NE(ds, nullptr); diff --git 
a/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc b/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc index 232fdbe25d..090c4f7352 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc @@ -429,7 +429,7 @@ TEST_F(MindDataTestPipeline, TestRandomDatasetWithNullSampler) { schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1}); std::shared_ptr ds = RandomData(50, schema, {}, nullptr); // Expect failure: sampler can not be nullptr - EXPECT_EQ(ds, nullptr); + EXPECT_EQ(ds->CreateIterator(), nullptr); } TEST_F(MindDataTestPipeline, TestRandomDatasetDuplicateColumnName) { @@ -441,5 +441,5 @@ TEST_F(MindDataTestPipeline, TestRandomDatasetDuplicateColumnName) { schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1}); std::shared_ptr ds = RandomData(50, schema, {"image", "image"}); // Expect failure: duplicate column names - EXPECT_EQ(ds, nullptr); + EXPECT_EQ(ds->CreateIterator(), nullptr); } diff --git a/tests/ut/cpp/dataset/c_api_dataset_tfrecord_test.cc b/tests/ut/cpp/dataset/c_api_dataset_tfrecord_test.cc index bed08f6ada..62eb1efd8c 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_tfrecord_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_tfrecord_test.cc @@ -443,34 +443,34 @@ TEST_F(MindDataTestPipeline, TestTFRecordDatasetExeception) { // This case expected to fail because the list of dir_path cannot be empty. std::shared_ptr ds1 = TFRecord({}); - EXPECT_EQ(ds1, nullptr); + EXPECT_EQ(ds1->CreateIterator(), nullptr); // This case expected to fail because the file in dir_path is not exist. std::string file_path = datasets_root_path_ + "/testTFTestAllTypes/test.data"; std::shared_ptr ds2 = TFRecord({file_path, "noexist.data"}); - EXPECT_EQ(ds2, nullptr); + EXPECT_EQ(ds2->CreateIterator(), nullptr); // This case expected to fail because the file of schema is not exist. std::shared_ptr ds4 = TFRecord({file_path, "notexist.json"}); - EXPECT_EQ(ds4, nullptr); + EXPECT_EQ(ds4->CreateIterator(), nullptr); // This case expected to fail because num_samples is negative. std::shared_ptr ds5 = TFRecord({file_path}, "", {}, -1); - EXPECT_EQ(ds5, nullptr); + EXPECT_EQ(ds5->CreateIterator(), nullptr); // This case expected to fail because num_shards is negative. std::shared_ptr ds6 = TFRecord({file_path}, "", {}, 10, ShuffleMode::kFalse, 0); - EXPECT_EQ(ds6, nullptr); + EXPECT_EQ(ds6->CreateIterator(), nullptr); // This case expected to fail because shard_id is out_of_bound. std::shared_ptr ds7 = TFRecord({file_path}, "", {}, 10, ShuffleMode::kFalse, 3, 3); - EXPECT_EQ(ds7, nullptr); + EXPECT_EQ(ds7->CreateIterator(), nullptr); // This case expected to fail because the provided number of files < num_shards in file-based sharding. 
std::string file_path1 = datasets_root_path_ + "/test_tf_file_3_images2/train-0000-of-0001.data"; std::string file_path2 = datasets_root_path_ + "/test_tf_file_3_images2/train-0000-of-0002.data"; std::shared_ptr ds8 = TFRecord({file_path1, file_path2}, "", {}, 0, ShuffleMode::kFalse, 3); - EXPECT_EQ(ds8, nullptr); + EXPECT_EQ(ds8->CreateIterator(), nullptr); } TEST_F(MindDataTestPipeline, TestTFRecordDatasetExeception2) { diff --git a/tests/ut/cpp/dataset/tree_adapter_test.cc b/tests/ut/cpp/dataset/tree_adapter_test.cc index f29dec7654..c0f336232b 100644 --- a/tests/ut/cpp/dataset/tree_adapter_test.cc +++ b/tests/ut/cpp/dataset/tree_adapter_test.cc @@ -56,7 +56,7 @@ TEST_F(MindDataTestTreeAdapter, TestSimpleTreeAdapter) { mindspore::dataset::TreeAdapter tree_adapter; - Status rc = tree_adapter.BuildAndPrepare(ds, 1); + Status rc = tree_adapter.BuildAndPrepare(ds->IRNode(), 1); EXPECT_TRUE(rc.IsOk()); @@ -91,7 +91,7 @@ TEST_F(MindDataTestTreeAdapter, TestTreeAdapterWithRepeat) { mindspore::dataset::TreeAdapter tree_adapter; - Status rc = tree_adapter.BuildAndPrepare(ds, 2); + Status rc = tree_adapter.BuildAndPrepare(ds->IRNode(), 2); EXPECT_TRUE(rc.IsOk()); const std::unordered_map map = tree_adapter.GetColumnNameMap(); @@ -128,7 +128,7 @@ TEST_F(MindDataTestTreeAdapter, TestProjectMapTreeAdapter) { mindspore::dataset::TreeAdapter tree_adapter; - Status rc = tree_adapter.BuildAndPrepare(ds, 2); + Status rc = tree_adapter.BuildAndPrepare(ds->IRNode(), 2); EXPECT_TRUE(rc.IsOk());
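A brief note on why the test expectations above change: with this patch the factory functions always return a Dataset wrapper around an IR node, so parameter validation is deferred until the IR tree is built. Invalid arguments therefore no longer yield a null dataset pointer; the failure surfaces when the pipeline is constructed, which is why the tests now probe CreateIterator() (or BuildAndPrepare(ds->IRNode(), ...)). A minimal sketch of that pattern, reusing the gtest macros and the deliberately bad file name from the tests above:

  // Sketch only: construction succeeds even for an invalid input file ...
  std::shared_ptr<Dataset> ds = TFRecord({"noexist.data"});
  EXPECT_NE(ds, nullptr);
  // ... and the failure is reported when the pipeline is built.
  EXPECT_EQ(ds->CreateIterator(), nullptr);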