From 4f98ecb433f779e6683781f65522be5c4a9e41e2 Mon Sep 17 00:00:00 2001 From: luoyang Date: Fri, 28 Aug 2020 10:51:41 +0800 Subject: [PATCH] Fix bugs in c-api: rename, concat, take, sampler, duplicate column & Change docstring of OneHot --- .../ccsrc/minddata/dataset/api/datasets.cc | 164 +++++++++++------- .../ccsrc/minddata/dataset/include/datasets.h | 66 +++---- mindspore/dataset/transforms/c_transforms.py | 2 +- .../cpp/dataset/c_api_dataset_cifar_test.cc | 30 ++++ .../ut/cpp/dataset/c_api_dataset_coco_test.cc | 11 ++ .../ut/cpp/dataset/c_api_dataset_csv_test.cc | 11 ++ .../dataset/c_api_dataset_manifest_test.cc | 13 +- .../ut/cpp/dataset/c_api_dataset_ops_test.cc | 74 +++++++- .../dataset/c_api_dataset_randomdata_test.cc | 26 ++- .../ut/cpp/dataset/c_api_dataset_voc_test.cc | 10 ++ tests/ut/cpp/dataset/c_api_datasets_test.cc | 57 +++++- 11 files changed, 350 insertions(+), 114 deletions(-) diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc index 8ace68d2a3..c231146168 100644 --- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc +++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc @@ -191,8 +191,8 @@ std::shared_ptr ImageFolder(const std::string &dataset_dir, } // Function to create a ManifestDataset. 
-std::shared_ptr Manifest(std::string dataset_file, std::string usage, - std::shared_ptr sampler, +std::shared_ptr Manifest(const std::string &dataset_file, const std::string &usage, + const std::shared_ptr &sampler, const std::map &class_indexing, bool decode) { auto ds = std::make_shared(dataset_file, usage, sampler, class_indexing, decode); @@ -211,7 +211,7 @@ std::shared_ptr Mnist(const std::string &dataset_dir, const std::s // Function to overload "+" operator to concat two datasets std::shared_ptr operator+(const std::shared_ptr &datasets1, const std::shared_ptr &datasets2) { - std::shared_ptr ds = std::make_shared(std::vector({datasets1, datasets2})); + std::shared_ptr ds = std::make_shared(std::vector({datasets2, datasets1})); // Call derived class validation method. return ds->ValidateParams() ? ds : nullptr; @@ -580,13 +580,6 @@ bool SchemaObj::from_json(nlohmann::json json_obj) { // OTHER FUNCTIONS -// Helper function to create default RandomSampler. -std::shared_ptr CreateDefaultSampler() { - const int32_t num_samples = 0; // 0 means to sample all ids. 
- bool replacement = false; - return std::make_shared(replacement, num_samples); -} - // Helper function to compute a default shuffle size Status ComputeShuffleSize(int64_t num_files, int64_t num_devices, int64_t num_rows, int64_t total_rows, int64_t *shuffle_size) { @@ -682,6 +675,36 @@ bool ValidateDatasetShardParams(const std::string &dataset_name, int32_t num_sha return true; } +// Helper function to validate dataset sampler parameter +bool ValidateDatasetSampler(const std::string &dataset_name, const std::shared_ptr &sampler) { + if (sampler == nullptr) { + MS_LOG(ERROR) << dataset_name << ": Sampler is not constructed correctly, sampler: nullptr"; + return false; + } + return true; +} + +// Helper function to validate dataset input/output column parameter +bool ValidateDatasetColumnParam(const std::string &dataset_name, const std::string &column_param, + const std::vector &columns) { + if (columns.empty()) { + MS_LOG(ERROR) << dataset_name << ":" << column_param << " should not be empty"; + return false; + } + for (uint32_t i = 0; i < columns.size(); ++i) { + if (columns[i].empty()) { + MS_LOG(ERROR) << dataset_name << ":" << column_param << "[" << i << "] should not be empty"; + return false; + } + } + std::set columns_set(columns.begin(), columns.end()); + if (columns_set.size() != columns.size()) { + MS_LOG(ERROR) << dataset_name << ":" << column_param << ": Every column name should not be same with others"; + return false; + } + return true; +} + /* ####################################### Derived Dataset classes ################################# */ // DERIVED DATASET CLASSES LEAF-NODE DATASETS @@ -701,6 +724,9 @@ bool CelebADataset::ValidateParams() { if (!ValidateDatasetDirParam("CelebADataset", dataset_dir_)) { return false; } + if (!ValidateDatasetSampler("CelebADataset", sampler_)) { + return false; + } std::set dataset_type_list = {"all", "train", "valid", "test"}; auto iter = dataset_type_list.find(dataset_type_); if (iter == 
dataset_type_list.end()) { @@ -715,11 +741,6 @@ std::vector> CelebADataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler based on the shuffle variable. - if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - std::unique_ptr schema = std::make_unique(); RETURN_EMPTY_IF_ERROR( schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); @@ -736,18 +757,15 @@ std::vector> CelebADataset::Build() { Cifar10Dataset::Cifar10Dataset(const std::string &dataset_dir, std::shared_ptr sampler) : dataset_dir_(dataset_dir), sampler_(sampler) {} -bool Cifar10Dataset::ValidateParams() { return ValidateDatasetDirParam("Cifar10Dataset", dataset_dir_); } +bool Cifar10Dataset::ValidateParams() { + return ValidateDatasetDirParam("Cifar10Dataset", dataset_dir_) && ValidateDatasetSampler("Cifar10Dataset", sampler_); +} // Function to build CifarOp for Cifar10 std::vector> Cifar10Dataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler based on the shuffle variable. - if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - // Do internal Schema generation. 
auto schema = std::make_unique(); RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1))); @@ -765,18 +783,16 @@ std::vector> Cifar10Dataset::Build() { Cifar100Dataset::Cifar100Dataset(const std::string &dataset_dir, std::shared_ptr sampler) : dataset_dir_(dataset_dir), sampler_(sampler) {} -bool Cifar100Dataset::ValidateParams() { return ValidateDatasetDirParam("Cifar100Dataset", dataset_dir_); } +bool Cifar100Dataset::ValidateParams() { + return ValidateDatasetDirParam("Cifar100Dataset", dataset_dir_) && + ValidateDatasetSampler("Cifar100Dataset", sampler_); +} // Function to build CifarOp for Cifar100 std::vector> Cifar100Dataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler based on the shuffle variable. - if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - // Do internal Schema generation. auto schema = std::make_unique(); RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1))); @@ -987,6 +1003,9 @@ bool CocoDataset::ValidateParams() { if (!ValidateDatasetDirParam("CocoDataset", dataset_dir_)) { return false; } + if (!ValidateDatasetSampler("CocoDataset", sampler_)) { + return false; + } Path annotation_file(annotation_file_); if (!annotation_file.Exists()) { MS_LOG(ERROR) << "annotation_file is invalid or not exist"; @@ -1006,11 +1025,6 @@ std::vector> CocoDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler based on the shuffle variable. 
- if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - CocoOp::TaskType task_type; if (task_ == "Detection") { task_type = CocoOp::TaskType::Detection; @@ -1100,6 +1114,12 @@ bool CSVDataset::ValidateParams() { return false; } + if (!column_names_.empty()) { + if (!ValidateDatasetColumnParam("CSVDataset", "column_names", column_names_)) { + return false; + } + } + return true; } @@ -1155,17 +1175,15 @@ ImageFolderDataset::ImageFolderDataset(std::string dataset_dir, bool decode, std class_indexing_(class_indexing), exts_(extensions) {} -bool ImageFolderDataset::ValidateParams() { return ValidateDatasetDirParam("ImageFolderDataset", dataset_dir_); } +bool ImageFolderDataset::ValidateParams() { + return ValidateDatasetDirParam("ImageFolderDataset", dataset_dir_) && + ValidateDatasetSampler("ImageFolderDataset", sampler_); +} std::vector> ImageFolderDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler, i.e., RandomSampler. - if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - // Do internal Schema generation. // This arg is exist in ImageFolderOp, but not externalized (in Python API). 
std::unique_ptr schema = std::make_unique(); @@ -1180,7 +1198,8 @@ std::vector> ImageFolderDataset::Build() { return node_ops; } -ManifestDataset::ManifestDataset(std::string dataset_file, std::string usage, std::shared_ptr sampler, +ManifestDataset::ManifestDataset(const std::string &dataset_file, const std::string &usage, + const std::shared_ptr &sampler, const std::map &class_indexing, bool decode) : dataset_file_(dataset_file), usage_(usage), decode_(decode), class_index_(class_indexing), sampler_(sampler) {} @@ -1190,6 +1209,9 @@ bool ManifestDataset::ValidateParams() { MS_LOG(ERROR) << "dataset file: [" << dataset_file_ << "] is invalid or not exist"; return false; } + if (!ValidateDatasetSampler("ManifestDataset", sampler_)) { + return false; + } std::vector usage_list = {"train", "eval", "inference"}; if (find(usage_list.begin(), usage_list.end(), usage_) == usage_list.end()) { @@ -1204,11 +1226,6 @@ std::vector> ManifestDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler based on the shuffle variable. - if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - // Do internal Schema generation. 
auto schema = std::make_unique(); RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1))); @@ -1228,17 +1245,14 @@ std::vector> ManifestDataset::Build() { MnistDataset::MnistDataset(std::string dataset_dir, std::shared_ptr sampler) : dataset_dir_(dataset_dir), sampler_(sampler) {} -bool MnistDataset::ValidateParams() { return ValidateDatasetDirParam("MnistDataset", dataset_dir_); } +bool MnistDataset::ValidateParams() { + return ValidateDatasetDirParam("MnistDataset", dataset_dir_) && ValidateDatasetSampler("MnistDataset", sampler_); +} std::vector> MnistDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler, i.e., RandomSampler. - if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - // Do internal Schema generation. auto schema = std::make_unique(); RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1))); @@ -1257,6 +1271,14 @@ bool RandomDataset::ValidateParams() { MS_LOG(ERROR) << "RandomDataset: total_rows must be greater than 0, now get " << total_rows_; return false; } + if (!ValidateDatasetSampler("RandomDataset", sampler_)) { + return false; + } + if (!columns_list_.empty()) { + if (!ValidateDatasetColumnParam("RandomDataset", "columns_list", columns_list_)) { + return false; + } + } return true; } @@ -1279,11 +1301,6 @@ std::vector> RandomDataset::Build() { total_rows_ = schema_obj->get_num_rows(); } - // If user does not specify Sampler, create a default sampler based on the shuffle variable. 
- if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - std::string schema_json_string, schema_file_path; if (schema_ != nullptr) { schema_->set_dataset_type("Random"); @@ -1392,6 +1409,9 @@ bool VOCDataset::ValidateParams() { MS_LOG(ERROR) << "Invalid dataset path or no dataset path is specified."; return false; } + if (!ValidateDatasetSampler("VOCDataset", sampler_)) { + return false; + } if (task_ == "Segmentation") { if (!class_index_.empty()) { MS_LOG(ERROR) << "class_indexing is invalid in Segmentation task."; @@ -1420,11 +1440,6 @@ std::vector> VOCDataset::Build() { // A vector containing shared pointer to the Dataset Ops that this object will create std::vector> node_ops; - // If user does not specify Sampler, create a default sampler based on the shuffle variable. - if (sampler_ == nullptr) { - sampler_ = CreateDefaultSampler(); - } - auto schema = std::make_unique(); VOCOp::TaskType task_type_; @@ -1539,6 +1554,10 @@ bool ConcatDataset::ValidateParams() { MS_LOG(ERROR) << "Concat: concatenated datasets are not specified."; return false; } + if (find(datasets_.begin(), datasets_.end(), nullptr) != datasets_.end()) { + MS_LOG(ERROR) << "Concat: concatenated dataset should not be null."; + return false; + } return true; } @@ -1586,6 +1605,21 @@ bool MapDataset::ValidateParams() { MS_LOG(ERROR) << "Map: No operation is specified."; return false; } + if (!input_columns_.empty()) { + if (!ValidateDatasetColumnParam("MapDataset", "input_columns", input_columns_)) { + return false; + } + } + if (!output_columns_.empty()) { + if (!ValidateDatasetColumnParam("MapDataset", "output_columns", output_columns_)) { + return false; + } + } + if (!project_columns_.empty()) { + if (!ValidateDatasetColumnParam("MapDataset", "project_columns", project_columns_)) { + return false; + } + } return true; } @@ -1615,12 +1649,12 @@ RenameDataset::RenameDataset(const std::vector &input_columns, : input_columns_(input_columns), output_columns_(output_columns) {} bool 
RenameDataset::ValidateParams() { - if (input_columns_.empty() || output_columns_.empty()) { - MS_LOG(ERROR) << "input and output columns must be specified"; + if (input_columns_.size() != output_columns_.size()) { + MS_LOG(ERROR) << "RenameDataset: input and output columns must be the same size"; return false; } - if (input_columns_.size() != output_columns_.size()) { - MS_LOG(ERROR) << "input and output columns must be the same size"; + if (!ValidateDatasetColumnParam("RenameDataset", "input_columns", input_columns_) || + !ValidateDatasetColumnParam("RenameDataset", "output_columns", output_columns_)) { return false; } return true; @@ -1713,7 +1747,7 @@ std::vector> TakeDataset::Build() { // Function to validate the parameters for TakeDataset bool TakeDataset::ValidateParams() { - if (take_count_ < 0 && take_count_ != -1) { + if (take_count_ <= 0 && take_count_ != -1) { MS_LOG(ERROR) << "Take: take_count should be either -1 or positive integer, take_count: " << take_count_; return false; } diff --git a/mindspore/ccsrc/minddata/dataset/include/datasets.h b/mindspore/ccsrc/minddata/dataset/include/datasets.h index ea3f65a5ed..a367e27d8d 100644 --- a/mindspore/ccsrc/minddata/dataset/include/datasets.h +++ b/mindspore/ccsrc/minddata/dataset/include/datasets.h @@ -84,32 +84,32 @@ std::shared_ptr Schema(const std::string &schema_file = ""); // The type of the image tensor is uint8. The attr tensor is uint32 and one hot type. /// \param[in] dataset_dir Path to the root directory that contains the dataset. /// \param[in] dataset_type One of 'all', 'train', 'valid' or 'test'. -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. 
If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \param[in] decode Decode the images after reading (default=false). /// \param[in] extensions Set of file extensions to be included in the dataset (default={}). /// \return Shared pointer to the current Dataset std::shared_ptr CelebA(const std::string &dataset_dir, const std::string &dataset_type = "all", - const std::shared_ptr &sampler = nullptr, bool decode = false, + const std::shared_ptr &sampler = RandomSampler(), bool decode = false, const std::set &extensions = {}); /// \brief Function to create a Cifar10 Dataset /// \notes The generated dataset has two columns ['image', 'label'] /// \param[in] dataset_dir Path to the root directory that contains the dataset -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \return Shared pointer to the current Dataset std::shared_ptr Cifar10(const std::string &dataset_dir, - const std::shared_ptr &sampler = nullptr); + const std::shared_ptr &sampler = RandomSampler()); /// \brief Function to create a Cifar100 Dataset /// \notes The generated dataset has three columns ['image', 'coarse_label', 'fine_label'] /// \param[in] dataset_dir Path to the root directory that contains the dataset -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. 
If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \return Shared pointer to the current Dataset std::shared_ptr Cifar100(const std::string &dataset_dir, - const std::shared_ptr &sampler = nullptr); + const std::shared_ptr &sampler = RandomSampler()); /// \brief Function to create a CLUEDataset /// \notes The generated dataset has a variable number of columns depending on the task and usage @@ -146,12 +146,12 @@ std::shared_ptr CLUE(const std::vector &dataset_files, /// \param[in] annotation_file Path to the annotation json /// \param[in] task Set the task type of reading coco data, now support 'Detection'/'Stuff'/'Panoptic'/'Keypoint' /// \param[in] decode Decode the images after reading -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \return Shared pointer to the current Dataset std::shared_ptr Coco(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task = "Detection", const bool &decode = false, - const std::shared_ptr &sampler = nullptr); + const std::shared_ptr &sampler = RandomSampler()); /// \brief Function to create a CSVDataset /// \notes The generated dataset has a variable number of columns @@ -185,13 +185,13 @@ std::shared_ptr CSV(const std::vector &dataset_files, c /// The generated dataset has two columns ['image', 'label'] /// \param[in] dataset_dir Path to the root directory that contains the dataset /// \param[in] decode A flag to decode in ImageFolder -/// \param[in] sampler Object used to choose samples from the dataset. 
If sampler is `nullptr`, -/// A `RandomSampler` will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \param[in] extensions File extensions to be read /// \param[in] class_indexing a class name to label map /// \return Shared pointer to the current ImageFolderDataset std::shared_ptr ImageFolder(const std::string &dataset_dir, bool decode = false, - const std::shared_ptr &sampler = nullptr, + const std::shared_ptr &sampler = RandomSampler(), const std::set &extensions = {}, const std::map &class_indexing = {}); @@ -199,25 +199,25 @@ std::shared_ptr ImageFolder(const std::string &dataset_dir, /// \notes The generated dataset has two columns ['image', 'label'] /// \param[in] dataset_file The dataset file to be read /// \param[in] usage Need "train", "eval" or "inference" data (default="train") -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, -/// A `RandomSampler` will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \param[in] class_indexing A str-to-int mapping from label name to index (default={}, the folder /// names will be sorted alphabetically and each class will be given a unique index starting from 0). /// \param[in] decode Decode the images after reading (default=false). 
/// \return Shared pointer to the current ManifestDataset -std::shared_ptr Manifest(std::string dataset_file, std::string usage = "train", - std::shared_ptr sampler = nullptr, +std::shared_ptr Manifest(const std::string &dataset_file, const std::string &usage = "train", + const std::shared_ptr &sampler = RandomSampler(), const std::map &class_indexing = {}, bool decode = false); /// \brief Function to create a MnistDataset /// \notes The generated dataset has two columns ['image', 'label'] /// \param[in] dataset_dir Path to the root directory that contains the dataset -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, -/// A `RandomSampler` will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \return Shared pointer to the current MnistDataset std::shared_ptr Mnist(const std::string &dataset_dir, - const std::shared_ptr &sampler = nullptr); + const std::shared_ptr &sampler = RandomSampler()); /// \brief Function to create a ConcatDataset /// \notes Reload "+" operator to concat two datasets @@ -231,14 +231,14 @@ std::shared_ptr operator+(const std::shared_ptr &dataset /// \param[in] total_rows Number of rows for the dataset to generate (default=0, number of rows is random) /// \param[in] schema SchemaObj to set column type, data type and data shape /// \param[in] columns_list List of columns to be read (default={}, read all columns) -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. 
If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \return Shared pointer to the current Dataset template > std::shared_ptr RandomData(const int32_t &total_rows = 0, T schema = nullptr, const std::vector &columns_list = {}, - std::shared_ptr sampler = nullptr) { - auto ds = std::make_shared(total_rows, schema, std::move(columns_list), std::move(sampler)); + const std::shared_ptr &sampler = RandomSampler()) { + auto ds = std::make_shared(total_rows, schema, columns_list, std::move(sampler)); return ds->ValidateParams() ? ds : nullptr; } @@ -271,13 +271,13 @@ std::shared_ptr TextFile(const std::vector &datase /// \param[in] mode Set the data list txt file to be readed /// \param[in] class_indexing A str-to-int mapping from label name to index /// \param[in] decode Decode the images after reading -/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` -/// will be used to randomly iterate the entire dataset +/// \param[in] sampler Object used to choose samples from the dataset. 
If sampler is not given, +/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// \return Shared pointer to the current Dataset std::shared_ptr VOC(const std::string &dataset_dir, const std::string &task = "Segmentation", const std::string &mode = "train", const std::map &class_indexing = {}, bool decode = false, - const std::shared_ptr &sampler = nullptr); + const std::shared_ptr &sampler = RandomSampler()); /// \brief Function to create a ZipDataset /// \notes Applies zip to the dataset @@ -716,7 +716,7 @@ class ImageFolderDataset : public Dataset { class ManifestDataset : public Dataset { public: /// \brief Constructor - ManifestDataset(std::string dataset_file, std::string usage, std::shared_ptr sampler, + ManifestDataset(const std::string &dataset_file, const std::string &usage, const std::shared_ptr &sampler, const std::map &class_indexing, bool decode); /// \brief Destructor @@ -768,7 +768,7 @@ class RandomDataset : public Dataset { /// \brief Constructor RandomDataset(const int32_t &total_rows, std::shared_ptr schema, - const std::vector &columns_list, std::shared_ptr sampler) + const std::vector &columns_list, const std::shared_ptr &sampler) : total_rows_(total_rows), schema_path_(""), schema_(std::move(schema)), @@ -776,8 +776,8 @@ class RandomDataset : public Dataset { sampler_(std::move(sampler)) {} /// \brief Constructor - RandomDataset(const int32_t &total_rows, std::string schema_path, std::vector columns_list, - std::shared_ptr sampler) + RandomDataset(const int32_t &total_rows, std::string schema_path, const std::vector &columns_list, + const std::shared_ptr &sampler) : total_rows_(total_rows), schema_path_(schema_path), columns_list_(columns_list), sampler_(std::move(sampler)) {} /// \brief Destructor diff --git a/mindspore/dataset/transforms/c_transforms.py b/mindspore/dataset/transforms/c_transforms.py index f4b7cde1fa..f3b20e66a6 100644 --- a/mindspore/dataset/transforms/c_transforms.py +++ 
b/mindspore/dataset/transforms/c_transforms.py @@ -32,7 +32,7 @@ class OneHot(cde.OneHotOp): Args: num_classes (int): Number of classes of the label - it should be bigger than or equal to label class number. + it should be bigger than largest label number in dataset. Raises: RuntimeError: feature size is bigger than num_classes. diff --git a/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc b/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc index 2e4125516d..1c473f6c81 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc @@ -107,3 +107,33 @@ TEST_F(MindDataTestPipeline, TestCifar10DatasetFail1) { std::shared_ptr ds = Cifar10("", RandomSampler(false, 10)); EXPECT_EQ(ds, nullptr); } + +TEST_F(MindDataTestPipeline, TestCifar10DatasetWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar10DatasetWithNullSampler."; + + // Create a Cifar10 Dataset + std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; + std::shared_ptr ds = Cifar10(folder_path, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} + +TEST_F(MindDataTestPipeline, TestCifar100DatasetWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar100DatasetWithNullSampler."; + + // Create a Cifar10 Dataset + std::string folder_path = datasets_root_path_ + "/testCifar100Data/"; + std::shared_ptr ds = Cifar100(folder_path, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} + +TEST_F(MindDataTestPipeline, TestCifar100DatasetWithWrongSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar100DatasetWithWrongSampler."; + + // Create a Cifar10 Dataset + std::string folder_path = datasets_root_path_ + "/testCifar100Data/"; + std::shared_ptr ds = Cifar100(folder_path, RandomSampler(false, -10)); + // Expect failure: sampler is not construnced correctly + EXPECT_EQ(ds, nullptr); +} diff --git 
a/tests/ut/cpp/dataset/c_api_dataset_coco_test.cc b/tests/ut/cpp/dataset/c_api_dataset_coco_test.cc index dcc21e253b..ec1c784b95 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_coco_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_coco_test.cc @@ -290,3 +290,14 @@ TEST_F(MindDataTestPipeline, TestCocoStuff) { // Manually terminate the pipeline iter->Stop(); } + +TEST_F(MindDataTestPipeline, TestCocoWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCocoWithNullSampler."; + // Create a Coco Dataset + std::string folder_path = datasets_root_path_ + "/testCOCO/train"; + std::string annotation_file = datasets_root_path_ + "/testCOCO/annotations/train.json"; + + std::shared_ptr ds = Coco(folder_path, annotation_file, "Detection", false, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} diff --git a/tests/ut/cpp/dataset/c_api_dataset_csv_test.cc b/tests/ut/cpp/dataset/c_api_dataset_csv_test.cc index f005cf301f..c900954d69 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_csv_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_csv_test.cc @@ -473,3 +473,14 @@ TEST_F(MindDataTestPipeline, TestCSVDatasetShuffleGlobal) { GlobalContext::config_manager()->set_seed(original_seed); GlobalContext::config_manager()->set_num_parallel_workers(original_num_parallel_workers); } + +TEST_F(MindDataTestPipeline, TestCSVDatasetDuplicateColumnName) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCSVDatasetDuplicateColumnName."; + + // Create a CSVDataset, with single CSV file + std::string train_file = datasets_root_path_ + "/testCSV/1.csv"; + std::vector column_names = {"col1", "col1", "col3", "col4"}; + std::shared_ptr ds = CSV({train_file}, ',', {}, column_names, -1, ShuffleMode::kFalse); + // Expect failure: duplicate column names + EXPECT_EQ(ds, nullptr); +} diff --git a/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc b/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc index 5e4c91c765..5911279d02 100644 --- 
a/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_manifest_test.cc @@ -59,7 +59,7 @@ TEST_F(MindDataTestPipeline, TestManifestDecode) { std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json"; // Create a Manifest Dataset - std::shared_ptr ds = Manifest(file_path, "train", nullptr, {}, true); + std::shared_ptr ds = Manifest(file_path, "train", RandomSampler(), {}, true); EXPECT_NE(ds, nullptr); // Create an iterator over the result of the above dataset @@ -130,7 +130,7 @@ TEST_F(MindDataTestPipeline, TestManifestClassIndex) { std::vector expected_label = {111, 222}; // Create a Manifest Dataset - std::shared_ptr ds = Manifest(file_path, "train", nullptr, map, true); + std::shared_ptr ds = Manifest(file_path, "train", RandomSampler(), map, true); EXPECT_NE(ds, nullptr); // Create an iterator over the result of the above dataset @@ -204,3 +204,12 @@ TEST_F(MindDataTestPipeline, TestManifestError) { std::shared_ptr ds1 = Manifest(file_path, "invalid_usage"); EXPECT_EQ(ds1, nullptr); } + +TEST_F(MindDataTestPipeline, TestManifestWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestWithNullSampler."; + std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json"; + // Create a Manifest Dataset + std::shared_ptr ds = Manifest(file_path, "train", nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} diff --git a/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc b/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc index e65cf8392d..818189e6e2 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_ops_test.cc @@ -311,6 +311,34 @@ TEST_F(MindDataTestPipeline, TestProjectMap) { iter->Stop(); } +TEST_F(MindDataTestPipeline, TestMapDuplicateColumn) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMapDuplicateColumn."; + + // Create an ImageFolder Dataset + std::string folder_path = datasets_root_path_ + 
"/testPK/data/"; + std::shared_ptr ds = ImageFolder(folder_path, true, RandomSampler(false, 10)); + EXPECT_NE(ds, nullptr); + + // Create objects for the tensor ops + std::shared_ptr random_vertical_flip_op = vision::RandomVerticalFlip(0.5); + EXPECT_NE(random_vertical_flip_op, nullptr); + + // Create a Map operation on ds + auto ds1 = ds->Map({random_vertical_flip_op}, {"image", "image"}, {}, {}); + // Expect failure: duplicate input column name + EXPECT_EQ(ds1, nullptr); + + // Create a Map operation on ds + auto ds2 = ds->Map({random_vertical_flip_op}, {}, {"label", "label"}, {}); + // Expect failure: duplicate output column name + EXPECT_EQ(ds2, nullptr); + + // Create a Map operation on ds + auto ds3 = ds->Map({random_vertical_flip_op}, {}, {}, {"image", "image"}); + // Expect failure: duplicate project column name + EXPECT_EQ(ds3, nullptr); +} + TEST_F(MindDataTestPipeline, TestProjectMapAutoInjection) { MS_LOG(INFO) << "Doing MindDataTestPipeline.TestProjectMapAutoInjection"; @@ -362,8 +390,8 @@ TEST_F(MindDataTestPipeline, TestProjectMapAutoInjection) { iter->Stop(); } -TEST_F(MindDataTestPipeline, TestRenameFail) { - MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRenameFail."; +TEST_F(MindDataTestPipeline, TestRenameFail1) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRenameFail1."; // We expect this test to fail because input and output in Rename are not the same size // Create an ImageFolder Dataset @@ -381,6 +409,38 @@ TEST_F(MindDataTestPipeline, TestRenameFail) { EXPECT_EQ(ds, nullptr); } +TEST_F(MindDataTestPipeline, TestRenameFail2) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRenameFail2."; + // We expect this test to fail because input or output column name is empty + + // Create an ImageFolder Dataset + std::string folder_path = datasets_root_path_ + "/testPK/data/"; + std::shared_ptr ds = ImageFolder(folder_path, true, RandomSampler(false, 10)); + EXPECT_NE(ds, nullptr); + + // Create a Rename operation on ds + ds = 
ds->Rename({"image", "label"}, {"col2", ""}); + EXPECT_EQ(ds, nullptr); +} + +TEST_F(MindDataTestPipeline, TestRenameFail3) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRenameFail3."; + // We expect this test to fail because duplicate column name + + // Create an ImageFolder Dataset + std::string folder_path = datasets_root_path_ + "/testPK/data/"; + std::shared_ptr ds = ImageFolder(folder_path, true, RandomSampler(false, 10)); + EXPECT_NE(ds, nullptr); + + // Create a Rename operation on ds + auto ds1 = ds->Rename({"image", "image"}, {"col1", "col2"}); + EXPECT_EQ(ds1, nullptr); + + // Create a Rename operation on ds + auto ds2 = ds->Rename({"image", "label"}, {"col1", "col1"}); + EXPECT_EQ(ds2, nullptr); +} + TEST_F(MindDataTestPipeline, TestRenameSuccess) { MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRenameSuccess."; @@ -688,9 +748,15 @@ TEST_F(MindDataTestPipeline, TestTakeDatasetError1) { // Create a Take operation on ds with invalid count input int32_t count = -5; - ds = ds->Take(count); + auto ds1 = ds->Take(count); // Expect nullptr for invalid input take_count - EXPECT_EQ(ds, nullptr); + EXPECT_EQ(ds1, nullptr); + + // Create a Take operation on ds with invalid count input + count = 0; + auto ds2 = ds->Take(count); + // Expect nullptr for invalid input take_count + EXPECT_EQ(ds2, nullptr); } TEST_F(MindDataTestPipeline, TestTakeDatasetNormal) { diff --git a/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc b/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc index 22e77a2ddc..0506a58134 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_randomdata_test.cc @@ -265,4 +265,28 @@ TEST_F(MindDataTestPipeline, TestRandomDatasetBasic4) { // Manually terminate the pipeline iter->Stop(); GlobalContext::config_manager()->set_seed(curr_seed); -} \ No newline at end of file +} + +TEST_F(MindDataTestPipeline, TestRandomDatasetWithNullSampler) { + MS_LOG(INFO) << "Doing 
MindDataTestPipeline-TestRandomDatasetWithNullSampler."; + + // Create a RandomDataset + std::shared_ptr schema = Schema(); + schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {2}); + schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1}); + std::shared_ptr ds = RandomData(50, schema, {}, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} + +TEST_F(MindDataTestPipeline, TestRandomDatasetDuplicateColumnName) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetDuplicateColumnName."; + + // Create a RandomDataset + std::shared_ptr schema = Schema(); + schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {2}); + schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1}); + std::shared_ptr ds = RandomData(50, schema, {"image", "image"}); + // Expect failure: duplicate column names + EXPECT_EQ(ds, nullptr); +} diff --git a/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc b/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc index 17fa23198a..ab81d3667d 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc @@ -194,3 +194,13 @@ TEST_F(MindDataTestPipeline, TestVOCSegmentationError1) { // Expect nullptr for segmentation task with class_index EXPECT_EQ(ds, nullptr); } + +TEST_F(MindDataTestPipeline, TestVOCWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVOCWithNullSampler."; + + // Create a VOC Dataset + std::string folder_path = datasets_root_path_ + "/testVOC2012_2"; + std::shared_ptr ds = VOC(folder_path, "Segmentation", "train", {}, false, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} diff --git a/tests/ut/cpp/dataset/c_api_datasets_test.cc b/tests/ut/cpp/dataset/c_api_datasets_test.cc index 1ae562a618..2952222d72 100644 --- a/tests/ut/cpp/dataset/c_api_datasets_test.cc +++ b/tests/ut/cpp/dataset/c_api_datasets_test.cc @@ -118,24 +118,44 @@ 
TEST_F(MindDataTestPipeline, TestCelebAException) { EXPECT_EQ(ds1, nullptr); } -TEST_F(MindDataTestPipeline, TestImageFolderFail1) { - MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFail1."; +TEST_F(MindDataTestPipeline, TestCelebADatasetWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCelebADatasetWithNullSampler."; - // Create an ImageFolder Dataset - std::shared_ptr ds = ImageFolder("", true, nullptr); + // Create a CelebA Dataset + std::string folder_path = datasets_root_path_ + "/testCelebAData/"; + std::shared_ptr ds = CelebA(folder_path, "all", nullptr, false, {}); + // Expect failure: sampler can not be nullptr EXPECT_EQ(ds, nullptr); } -TEST_F(MindDataTestPipeline, TestMnistFail1) { - MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFail1."; +TEST_F(MindDataTestPipeline, TestMnistFailWithWrongDatasetDir) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFailWithWrongDatasetDir."; // Create a Mnist Dataset std::shared_ptr ds = Mnist("", RandomSampler(false, 10)); EXPECT_EQ(ds, nullptr); } -TEST_F(MindDataTestPipeline, TestImageFolderFail2) { - MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFail2."; +TEST_F(MindDataTestPipeline, TestMnistFailWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFailWithNullSampler."; + + // Create a Mnist Dataset + std::string folder_path = datasets_root_path_ + "/testMnistData/"; + std::shared_ptr ds = Mnist(folder_path, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} + +TEST_F(MindDataTestPipeline, TestImageFolderWithWrongDatasetDir) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderWithWrongDatasetDir."; + + // Create an ImageFolder Dataset + std::shared_ptr ds = ImageFolder("", true, nullptr); + EXPECT_EQ(ds, nullptr); +} + +TEST_F(MindDataTestPipeline, TestImageFolderFailWithWrongExtension) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFailWithWrongExtension."; // Create an ImageFolder Dataset 
std::string folder_path = datasets_root_path_ + "/testPK/data/"; @@ -150,8 +170,29 @@ TEST_F(MindDataTestPipeline, TestImageFolderFail2) { // Iterate the dataset and get each row std::unordered_map> row; iter->GetNextRow(&row); + // Expect no data: can not find files with specified extension EXPECT_EQ(row.size(), 0); // Manually terminate the pipeline iter->Stop(); } + +TEST_F(MindDataTestPipeline, TestImageFolderFailWithNullSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFailWithNullSampler."; + + // Create an ImageFolder Dataset + std::string folder_path = datasets_root_path_ + "/testPK/data/"; + std::shared_ptr ds = ImageFolder(folder_path, true, nullptr); + // Expect failure: sampler can not be nullptr + EXPECT_EQ(ds, nullptr); +} + +TEST_F(MindDataTestPipeline, TestImageFolderFailWithWrongSampler) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFailWithWrongSampler."; + + // Create an ImageFolder Dataset + std::string folder_path = datasets_root_path_ + "/testCifar100Data/"; + std::shared_ptr ds = ImageFolder(folder_path, true, SequentialSampler(-2, 5)); + // Expect failure: sampler is not constructed correctly + EXPECT_EQ(ds, nullptr); +}