Merge pull request !6171 from luoyang/c-api-pyfunc
@@ -16,6 +16,7 @@
 #include <fstream>
 #include <unordered_set>
+#include <algorithm>
 #include "minddata/dataset/include/datasets.h"
 #include "minddata/dataset/include/samplers.h"
 #include "minddata/dataset/include/transforms.h"
@@ -729,7 +730,14 @@ bool ValidateDatasetSampler(const std::string &dataset_name, const std::shared_p
 }
 bool ValidateStringValue(const std::string &str, const std::unordered_set<std::string> &valid_strings) {
-  return valid_strings.find(str) != valid_strings.end();
+  if (valid_strings.find(str) == valid_strings.end()) {
+    std::string mode;
+    mode = std::accumulate(valid_strings.begin(), valid_strings.end(), mode,
+                           [](std::string a, std::string b) { return std::move(a) + " " + std::move(b); });
+    MS_LOG(ERROR) << str << " does not match any mode in [" + mode + " ]";
+    return false;
+  }
+  return true;
 }
 // Helper function to validate dataset input/output column parameter
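Review note: the reworked `ValidateStringValue` no longer returns a bare boolean; on a mismatch it folds every accepted value into one diagnostic via `std::accumulate` (which lives in `<numeric>`). A self-contained sketch of the same pattern, with `std::cerr` standing in for `MS_LOG(ERROR)`:

```cpp
#include <iostream>
#include <numeric>
#include <string>
#include <unordered_set>

// Minimal sketch of the validation pattern above; std::cerr stands in for MS_LOG(ERROR).
bool ValidateStringValue(const std::string &str, const std::unordered_set<std::string> &valid_strings) {
  if (valid_strings.find(str) == valid_strings.end()) {
    std::string mode;
    // Fold all accepted values into one space-separated string for the message.
    mode = std::accumulate(valid_strings.begin(), valid_strings.end(), mode,
                           [](std::string a, std::string b) { return std::move(a) + " " + std::move(b); });
    std::cerr << str << " does not match any mode in [" << mode << " ]" << std::endl;
    return false;
  }
  return true;
}

int main() {
  // Prints something like: valid does not match any mode in [ all test train ]
  // (unordered_set iteration order is unspecified).
  ValidateStringValue("valid", {"train", "test", "all"});
  return 0;
}
```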
@@ -841,8 +849,7 @@ Cifar10Dataset::Cifar10Dataset(const std::string &dataset_dir, const std::string
 bool Cifar10Dataset::ValidateParams() {
   return ValidateDatasetDirParam("Cifar10Dataset", dataset_dir_) &&
-         ValidateDatasetSampler("Cifar10Dataset", sampler_) &&
-         ValidateStringValue(usage_, {"train", "test", "all", ""});
+         ValidateDatasetSampler("Cifar10Dataset", sampler_) && ValidateStringValue(usage_, {"train", "test", "all"});
 }
 // Function to build CifarOp for Cifar10
@@ -870,8 +877,7 @@ Cifar100Dataset::Cifar100Dataset(const std::string &dataset_dir, const std::stri
 bool Cifar100Dataset::ValidateParams() {
   return ValidateDatasetDirParam("Cifar100Dataset", dataset_dir_) &&
-         ValidateDatasetSampler("Cifar100Dataset", sampler_) &&
-         ValidateStringValue(usage_, {"train", "test", "all", ""});
+         ValidateDatasetSampler("Cifar100Dataset", sampler_) && ValidateStringValue(usage_, {"train", "test", "all"});
 }
 // Function to build CifarOp for Cifar100
@@ -1359,7 +1365,7 @@ MnistDataset::MnistDataset(std::string dataset_dir, std::string usage, std::shar
     : dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {}
 bool MnistDataset::ValidateParams() {
-  return ValidateStringValue(usage_, {"train", "test", "all", ""}) &&
+  return ValidateStringValue(usage_, {"train", "test", "all"}) &&
          ValidateDatasetDirParam("MnistDataset", dataset_dir_) && ValidateDatasetSampler("MnistDataset", sampler_);
 }
@@ -31,8 +31,10 @@ SamplerObj::SamplerObj() {}
 /// Function to create a Distributed Sampler.
 std::shared_ptr<DistributedSamplerObj> DistributedSampler(int64_t num_shards, int64_t shard_id, bool shuffle,
-                                                          int64_t num_samples, uint32_t seed, bool even_dist) {
-  auto sampler = std::make_shared<DistributedSamplerObj>(num_shards, shard_id, shuffle, num_samples, seed, even_dist);
+                                                          int64_t num_samples, uint32_t seed, int64_t offset,
+                                                          bool even_dist) {
+  auto sampler =
+    std::make_shared<DistributedSamplerObj>(num_shards, shard_id, shuffle, num_samples, seed, offset, even_dist);
   // Input validation
   if (!sampler->ValidateParams()) {
     return nullptr;
@@ -95,12 +97,13 @@ std::shared_ptr<WeightedRandomSamplerObj> WeightedRandomSampler(std::vector<doub
 // DistributedSampler
 DistributedSamplerObj::DistributedSamplerObj(int64_t num_shards, int64_t shard_id, bool shuffle, int64_t num_samples,
-                                             uint32_t seed, bool even_dist)
+                                             uint32_t seed, int64_t offset, bool even_dist)
     : num_shards_(num_shards),
      shard_id_(shard_id),
       shuffle_(shuffle),
       num_samples_(num_samples),
       seed_(seed),
+      offset_(offset),
      even_dist_(even_dist) {}
 bool DistributedSamplerObj::ValidateParams() {
@@ -123,7 +126,7 @@ bool DistributedSamplerObj::ValidateParams() {
 }
 std::shared_ptr<Sampler> DistributedSamplerObj::Build() {
-  return std::make_shared<dataset::DistributedSampler>(num_samples_, num_shards_, shard_id_, shuffle_, seed_,
+  return std::make_shared<dataset::DistributedSampler>(num_samples_, num_shards_, shard_id_, shuffle_, seed_, offset_,
                                                        even_dist_);
 }
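The new `offset` parameter threads through all three layers: the `DistributedSampler` factory, the intermediate `DistributedSamplerObj`, and the leaf `dataset::DistributedSampler`. A hedged usage sketch of the extended factory; the dataset path is a placeholder and the surrounding function is illustrative, not part of this patch:

```cpp
#include <memory>
#include <string>
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/samplers.h"

// Sketch: build a sharded Cifar10 pipeline whose sampler starts at offset 1.
std::shared_ptr<Dataset> BuildShardedCifar10(const std::string &folder_path) {
  // New parameter order per this patch:
  // num_shards, shard_id, shuffle, num_samples, seed, offset, even_dist.
  std::shared_ptr<SamplerObj> sampler =
    DistributedSampler(/*num_shards=*/2, /*shard_id=*/0, /*shuffle=*/false,
                       /*num_samples=*/0, /*seed=*/1, /*offset=*/1, /*even_dist=*/true);
  // The factory returns nullptr when ValidateParams() rejects the arguments.
  if (sampler == nullptr) {
    return nullptr;
  }
  return Cifar10(folder_path, "train", sampler);
}
```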
@@ -42,15 +42,10 @@ bool LookupOperation::ValidateParams() {
     MS_LOG(ERROR) << "Lookup: vocab object type is incorrect or null.";
     return false;
   }
-  if (unknown_token_.empty()) {
-    MS_LOG(ERROR) << "Lookup: no unknown token is specified.";
+  default_id_ = vocab_->Lookup(unknown_token_);
+  if (default_id_ == Vocab::kNoTokenExists) {
+    MS_LOG(ERROR) << "Lookup: " << unknown_token_ << " doesn't exist in vocab.";
     return false;
-  } else {
-    default_id_ = vocab_->Lookup(unknown_token_);
-    if (default_id_ == Vocab::kNoTokenExists) {
-      MS_LOG(ERROR) << "Lookup: unknown_token: [" + unknown_token_ + "], does not exist in vocab.";
-      return false;
-    }
   }
   return true;
 }
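Semantics change worth flagging: an empty `unknown_token` is no longer rejected up front; every token, including `""`, is now validated by a vocab lookup. A sketch of both outcomes, mirroring the updated tests further down (not a verbatim excerpt):

```cpp
// "" registered as a special token, so it is now a legal unknown_token.
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector({"home", "world"}, {"<pad>", ""}, true, &vocab);
std::shared_ptr<TensorOperation> ok = text::Lookup(vocab, "", DataType("int32"));   // != nullptr

// With a vocab that lacks "", the same call now fails in ValidateParams()
// with "doesn't exist in vocab" instead of the old "no unknown token" error.
std::shared_ptr<Vocab> plain = std::make_shared<Vocab>();
Status s2 = Vocab::BuildFromVector({"home", "world"}, {}, true, &plain);
std::shared_ptr<TensorOperation> bad = text::Lookup(plain, "", DataType("int32"));  // == nullptr
```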
@@ -263,6 +263,7 @@ Status Tensor::CreateFromFile(const std::string &path, std::shared_ptr<Tensor> *
   fs.open(path, std::ios::binary | std::ios::in);
   CHECK_FAIL_RETURN_UNEXPECTED(!fs.fail(), "Fail to open file: " + path);
   int64_t num_bytes = fs.seekg(0, std::ios::end).tellg();
+  CHECK_FAIL_RETURN_UNEXPECTED(num_bytes <= kDeMaxDim, "Invalid file to allocate tensor memory, check path: " + path);
   CHECK_FAIL_RETURN_UNEXPECTED(fs.seekg(0, std::ios::beg).good(), "Fail to find size of file");
   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape{num_bytes}, DataType(DataType::DE_UINT8), out));
   int64_t written_bytes = fs.read(reinterpret_cast<char *>((*out)->GetMutableBuffer()), num_bytes).gcount();
@@ -158,7 +158,7 @@ void TensorShape::AddListToShape(const T &list) {
   }
   if (dim > kDeMaxDim) {
     std::stringstream ss;
-    ss << "Invalid shape data, dim (" << size << ") is larger than the maximum dim size(" << kDeMaxDim << ")!";
+    ss << "Invalid shape data, dim (" << dim << ") is larger than the maximum dim size(" << kDeMaxDim << ")!";
     MS_LOG(ERROR) << ss.str().c_str();
     known_ = false;
     raw_shape_.clear();
@@ -119,6 +119,10 @@ Status AlbumOp::PrescanEntry() {
   std::sort(image_rows_.begin(), image_rows_.end(), StrComp);
   num_rows_ = image_rows_.size();
+  if (num_rows_ == 0) {
+    RETURN_STATUS_UNEXPECTED(
+      "Invalid data, no valid data matching the dataset API AlbumDataset. Please check file path or dataset API.");
+  }
   return Status::OK();
 }
@@ -237,8 +237,7 @@ Status CelebAOp::ParseImageAttrInfo() {
   num_rows_ = image_labels_vec_.size();
   if (num_rows_ == 0) {
     RETURN_STATUS_UNEXPECTED(
-      "Invalid data, no valid data matching the dataset API CelebADataset. "
-      "Please check file path or dataset API validation first");
+      "Invalid data, no valid data matching the dataset API CelebADataset. Please check file path or dataset API.");
   }
   MS_LOG(DEBUG) << "Celeba dataset rows number is " << num_rows_ << ".";
   return Status::OK();
@@ -412,7 +412,7 @@ Status CifarOp::ParseCifarData() {
   if (num_rows_ == 0) {
     std::string api = cifar_type_ == kCifar10 ? "Cifar10Dataset" : "Cifar100Dataset";
     RETURN_STATUS_UNEXPECTED("Invalid data, no valid data matching the dataset API " + api +
-                             ". Please check file path or dataset API validation first.");
+                             ". Please check file path or dataset API.");
   }
   cifar_raw_data_block_->Reset();
   return Status::OK();
@@ -192,7 +192,7 @@ Status ClueOp::LoadFile(const std::string &file, const int64_t start_offset, con
     js = nlohmann::json::parse(line);
   } catch (const std::exception &err) {
     // Catch any exception and convert to Status return code
-    RETURN_STATUS_UNEXPECTED("Invalid file, failed to parse json file: " + line);
+    RETURN_STATUS_UNEXPECTED("Invalid file, failed to parse json file: " + file);
   }
   int cols_count = cols_to_keyword_.size();
   TensorRow tRow(cols_count, nullptr);
@@ -482,8 +482,7 @@ Status ClueOp::CalculateNumRowsPerShard() {
   }
   if (all_num_rows_ == 0) {
     RETURN_STATUS_UNEXPECTED(
-      "Invalid data, no valid data matching the dataset API CLUEDataset. Please check file path or dataset API "
-      "validation first.");
+      "Invalid data, no valid data matching the dataset API CLUEDataset. Please check file path or dataset API.");
   }
   num_rows_per_shard_ = static_cast<int64_t>(std::ceil(all_num_rows_ * 1.0 / num_devices_));
@@ -468,6 +468,10 @@ Status CocoOp::ParseAnnotationIds() {
     if (coordinate_map_.find(img) != coordinate_map_.end()) image_ids_.push_back(img);
   }
   num_rows_ = image_ids_.size();
+  if (num_rows_ == 0) {
+    RETURN_STATUS_UNEXPECTED(
+      "Invalid data, no valid data matching the dataset API CocoDataset. Please check file path or dataset API.");
+  }
   return Status::OK();
 }
@@ -783,8 +783,7 @@ Status CsvOp::CalculateNumRowsPerShard() {
   }
   if (all_num_rows_ == 0) {
     RETURN_STATUS_UNEXPECTED(
-      "Invalid data, no valid data matching the dataset API CsvDataset. Please check file path or CSV format "
-      "validation first.");
+      "Invalid data, no valid data matching the dataset API CsvDataset. Please check file path or CSV format.");
   }
   num_rows_per_shard_ = static_cast<int64_t>(std::ceil(all_num_rows_ * 1.0 / num_devices_));
@@ -117,8 +117,8 @@ Status ImageFolderOp::PrescanMasterEntry(const std::string &filedir) {
   num_rows_ = image_label_pairs_.size();
   if (num_rows_ == 0) {
     RETURN_STATUS_UNEXPECTED(
-      "Invalid data, no valid data matching the dataset API ImageFolderDataset. Please check file path or dataset "
-      "API validation first.");
+      "Invalid data, no valid data matching the dataset API ImageFolderDataset. "
+      "Please check file path or dataset API.");
   }
   // free memory of two queues used for pre-scan
   folder_name_queue_->Reset();
@@ -386,8 +386,7 @@ Status ManifestOp::CountDatasetInfo() {
   num_rows_ = static_cast<int64_t>(image_labelname_.size());
   if (num_rows_ == 0) {
     RETURN_STATUS_UNEXPECTED(
-      "Invalid data, no valid data matching the dataset API ManifestDataset.Please check file path or dataset API "
-      "validation first.");
+      "Invalid data, no valid data matching the dataset API ManifestDataset. Please check file path or dataset API.");
   }
   return Status::OK();
 }
@@ -369,8 +369,7 @@ Status MnistOp::ParseMnistData() {
   num_rows_ = image_label_pairs_.size();
   if (num_rows_ == 0) {
     RETURN_STATUS_UNEXPECTED(
-      "Invalid data, no valid data matching the dataset API MnistDataset.Please check file path or dataset API "
-      "validation first.");
+      "Invalid data, no valid data matching the dataset API MnistDataset. Please check file path or dataset API.");
   }
   return Status::OK();
 }
@@ -473,8 +473,7 @@ Status TextFileOp::CalculateNumRowsPerShard() {
   }
   if (all_num_rows_ == 0) {
     RETURN_STATUS_UNEXPECTED(
-      "Invalid data, no valid data matching the dataset API TextFileDataset.Please check file path or dataset API "
-      "validation first.");
+      "Invalid data, no valid data matching the dataset API TextFileDataset. Please check file path or dataset API.");
   }
   num_rows_per_shard_ = static_cast<int64_t>(std::ceil(all_num_rows_ * 1.0 / num_devices_));
@@ -229,8 +229,7 @@ Status TFReaderOp::CalculateNumRowsPerShard() {
   num_rows_per_shard_ = static_cast<int64_t>(std::ceil(num_rows_ * 1.0 / num_devices_));
   if (num_rows_per_shard_ == 0) {
     RETURN_STATUS_UNEXPECTED(
-      "Invalid data, no valid data matching the dataset API TFRecordDataset.Please check file path or dataset API "
-      "validation first.");
+      "Invalid data, no valid data matching the dataset API TFRecordDataset. Please check file path or dataset API.");
   }
   return Status::OK();
 }
@@ -315,6 +315,10 @@ Status VOCOp::ParseAnnotationIds() {
   }
   num_rows_ = image_ids_.size();
+  if (num_rows_ == 0) {
+    RETURN_STATUS_UNEXPECTED(
+      "Invalid data, no valid data matching the dataset API VOCDataset. Please check file path or dataset API.");
+  }
   return Status::OK();
 }
@@ -113,7 +113,7 @@ std::shared_ptr<AlbumDataset> Album(const std::string &dataset_dir, const std::s
 /// \notes The generated dataset has two columns ['image', 'attr'].
 ///     The type of the image tensor is uint8. The attr tensor is uint32 and one hot type.
 /// \param[in] dataset_dir Path to the root directory that contains the dataset.
-/// \param[in] usage One of "all", "train", "valid" or "test".
+/// \param[in] usage One of "all", "train", "valid" or "test" (default = "all").
 /// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
 ///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
 /// \param[in] decode Decode the images after reading (default=false).
@@ -126,21 +126,21 @@ std::shared_ptr<CelebADataset> CelebA(const std::string &dataset_dir, const std:
 /// \brief Function to create a Cifar10 Dataset
 /// \notes The generated dataset has two columns ["image", "label"]
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
-/// \param[in] usage of CIFAR10, can be "train", "test" or "all"
+/// \param[in] usage of CIFAR10, can be "train", "test" or "all" (default = "all").
 /// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
 ///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
 /// \return Shared pointer to the current Dataset
-std::shared_ptr<Cifar10Dataset> Cifar10(const std::string &dataset_dir, const std::string &usage = std::string(),
+std::shared_ptr<Cifar10Dataset> Cifar10(const std::string &dataset_dir, const std::string &usage = "all",
                                         const std::shared_ptr<SamplerObj> &sampler = RandomSampler());
 /// \brief Function to create a Cifar100 Dataset
 /// \notes The generated dataset has three columns ["image", "coarse_label", "fine_label"]
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
-/// \param[in] usage of CIFAR100, can be "train", "test" or "all"
+/// \param[in] usage of CIFAR100, can be "train", "test" or "all" (default = "all").
 /// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
 ///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
 /// \return Shared pointer to the current Dataset
-std::shared_ptr<Cifar100Dataset> Cifar100(const std::string &dataset_dir, const std::string &usage = std::string(),
+std::shared_ptr<Cifar100Dataset> Cifar100(const std::string &dataset_dir, const std::string &usage = "all",
                                           const std::shared_ptr<SamplerObj> &sampler = RandomSampler());
 /// \brief Function to create a CLUEDataset
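With the default flipped from `std::string()` to `"all"`, the empty-string sentinel disappears from the public API and from the accepted usage set. A hedged sketch of the resulting call sites (the path is a placeholder, not from this patch):

```cpp
std::shared_ptr<Dataset> ds1 = Cifar10("/path/to/cifar10/");                          // usage = "all"
std::shared_ptr<Dataset> ds2 = Cifar10("/path/to/cifar10/", "train");                 // train split only
std::shared_ptr<Dataset> ds3 = Cifar10("/path/to/cifar10/", "all", RandomSampler());  // explicit form
// The old sentinel now fails validation, since "" was dropped from {"train", "test", "all"}:
std::shared_ptr<Dataset> bad = Cifar10("/path/to/cifar10/", std::string());           // == nullptr
```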
@@ -247,11 +247,11 @@ std::shared_ptr<ManifestDataset> Manifest(const std::string &dataset_file, const
 /// \brief Function to create a MnistDataset
 /// \notes The generated dataset has two columns ["image", "label"]
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
-/// \param[in] usage of MNIST, can be "train", "test" or "all"
+/// \param[in] usage of MNIST, can be "train", "test" or "all" (default = "all").
 /// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
 ///     a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
 /// \return Shared pointer to the current MnistDataset
-std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir, const std::string &usage = std::string(),
+std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir, const std::string &usage = "all",
                                     const std::shared_ptr<SamplerObj> &sampler = RandomSampler());
 /// \brief Function to create a ConcatDataset
@@ -407,7 +407,7 @@ std::shared_ptr<TFRecordDataset> TFRecord(const std::vector<std::string> &datase
 ///    - task='Segmentation', column: [['image', dtype=uint8], ['target',dtype=uint8]].
 /// \param[in] dataset_dir Path to the root directory that contains the dataset
 /// \param[in] task Set the task type of reading voc data, now only support "Segmentation" or "Detection"
-/// \param[in] usage The type of data list text file to be read
+/// \param[in] usage The type of data list text file to be read (default = "train").
 /// \param[in] class_indexing A str-to-int mapping from label name to index, only valid in "Detection" task
 /// \param[in] decode Decode the images after reading
 /// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
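The VOC doc now records the `"train"` default for `usage`. A hedged sketch, assuming the parameter order given in the comment block above and a placeholder path:

```cpp
// usage defaults to "train" per the documentation above.
std::shared_ptr<Dataset> ds = VOC("/path/to/VOCdevkit/VOC2012/", "Detection");
// Equivalent explicit form:
std::shared_ptr<Dataset> ds2 = VOC("/path/to/VOCdevkit/VOC2012/", "Detection", "train");
```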
@@ -52,12 +52,13 @@ class WeightedRandomSamplerObj;
 /// \param[in] shuffle - If true, the indices are shuffled.
 /// \param[in] num_samples - The number of samples to draw (default to all elements).
 /// \param[in] seed - The seed in use when shuffle is true.
+/// \param[in] offset - The starting position where access to elements in the dataset begins.
 /// \param[in] even_dist - If true, each shard would return the same number of rows (default to true).
 ///     If false the total rows returned by all the shards would not have overlap.
 /// \return Shared pointer to the current Sampler.
 std::shared_ptr<DistributedSamplerObj> DistributedSampler(int64_t num_shards, int64_t shard_id, bool shuffle = true,
                                                           int64_t num_samples = 0, uint32_t seed = 1,
-                                                          bool even_dist = true);
+                                                          int64_t offset = -1, bool even_dist = true);
 /// Function to create a PK Sampler.
 /// \notes Samples K elements for each P class in the dataset.
@@ -103,7 +104,7 @@ std::shared_ptr<WeightedRandomSamplerObj> WeightedRandomSampler(std::vector<doub
 class DistributedSamplerObj : public SamplerObj {
  public:
   DistributedSamplerObj(int64_t num_shards, int64_t shard_id, bool shuffle, int64_t num_samples, uint32_t seed,
-                        bool even_dist);
+                        int64_t offset, bool even_dist);
   ~DistributedSamplerObj() = default;
@@ -117,6 +118,7 @@ class DistributedSamplerObj : public SamplerObj {
   bool shuffle_;
   int64_t num_samples_;
   uint32_t seed_;
+  int64_t offset_;
   bool even_dist_;
 };
@@ -87,54 +87,25 @@ Status Vocab::BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdTyp
 Status Vocab::BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
                               bool prepend_special, std::shared_ptr<Vocab> *vocab) {
-  // Validate parameters
-  std::string duplicate_word;
-  for (const WordType &word : words) {
-    if (std::count(words.begin(), words.end(), word) > 1) {
-      if (duplicate_word.find(word) == std::string::npos) {
-        duplicate_word = duplicate_word.empty() ? duplicate_word + word : duplicate_word + ", " + word;
-      }
-    }
-  }
-  if (!duplicate_word.empty()) {
-    MS_LOG(ERROR) << "words contains duplicate word: " << duplicate_word;
-    RETURN_STATUS_UNEXPECTED("words contains duplicate word: " + duplicate_word);
-  }
-  std::string duplicate_sp;
-  std::string existed_sp;
-  for (const WordType &sp : special_tokens) {
-    if (std::count(special_tokens.begin(), special_tokens.end(), sp) > 1) {
-      if (duplicate_sp.find(sp) == std::string::npos) {
-        duplicate_sp = duplicate_sp.empty() ? duplicate_sp + sp : duplicate_sp + ", " + sp;
-      }
-    }
-    if (std::count(words.begin(), words.end(), sp) >= 1) {
-      if (existed_sp.find(sp) == std::string::npos) {
-        existed_sp = existed_sp.empty() ? existed_sp + sp : existed_sp + ", " + sp;
-      }
-    }
-  }
-  if (!duplicate_sp.empty()) {
-    MS_LOG(ERROR) << "special_tokens contains duplicate word: " << duplicate_sp;
-    RETURN_STATUS_UNEXPECTED("special_tokens contains duplicate word: " + duplicate_sp);
-  }
-  if (!existed_sp.empty()) {
-    MS_LOG(ERROR) << "special_tokens and word_list contain duplicate word: " << existed_sp;
-    RETURN_STATUS_UNEXPECTED("special_tokens and word_list contain duplicate word: " + existed_sp);
-  }
   std::unordered_map<WordType, WordIdType> word2id;
   // if special is added in front, normal words id will start from number of special tokens
   WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
   for (auto word : words) {
+    if (word2id.find(word) != word2id.end()) {
+      MS_LOG(ERROR) << "word_list contains duplicate word: " + word + ".";
+      RETURN_STATUS_UNEXPECTED("word_list contains duplicate word: " + word + ".");
+    }
     word2id[word] = word_id++;
   }
   word_id = prepend_special ? 0 : word2id.size();
   for (auto special_token : special_tokens) {
+    if (word2id.find(special_token) != word2id.end()) {
+      MS_LOG(ERROR) << "special_tokens and word_list contain duplicate word: " + special_token + ".";
+      RETURN_STATUS_UNEXPECTED("special_tokens and word_list contain duplicate word: " + special_token + ".");
+    }
     word2id[special_token] = word_id++;
   }
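The O(n²) `std::count` pre-scan is gone: duplicates within `words` are now caught during the single insertion pass into `word2id`, and clashes between `words` and `special_tokens` surface in the second loop. A sketch of the failure mode, consistent with `TestVocabFromVectorFail3` below (not a verbatim excerpt):

```cpp
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
// "<unk>" appears in both lists, so the special-token pass reports the clash:
Status s = Vocab::BuildFromVector({"apple", "dog", "<unk>"}, {"<pad>", "<unk>"}, true, &vocab);
// s != Status::OK(); message: "special_tokens and word_list contain duplicate word: <unk>."
```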
@@ -183,8 +183,8 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorFail2) {
 TEST_F(MindDataTestVocab, TestVocabFromVectorFail3) {
   MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail3.";
   // Build vocab from a vector
-  std::vector<std::string> list = {"apple", "dog", "egg", "<unk>", "<pad>"};
-  std::vector<std::string> sp_tokens = {"<pad>", "<unk>"};
+  std::vector<std::string> list = {"apple", "dog", "egg", "<unk>", ""};
+  std::vector<std::string> sp_tokens = {"", "<unk>"};
   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
   // Expected failure: special tokens are already existed in word_list
@@ -28,7 +28,7 @@ TEST_F(MindDataTestPipeline, TestCifar10Dataset) {
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
-  std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create an iterator over the result of the above dataset
@@ -62,7 +62,7 @@ TEST_F(MindDataTestPipeline, TestCifar100Dataset) {
   // Create a Cifar100 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar100Data/";
-  std::shared_ptr<Dataset> ds = Cifar100(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Cifar100(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create an iterator over the result of the above dataset
@@ -96,7 +96,7 @@ TEST_F(MindDataTestPipeline, TestCifar100DatasetFail1) {
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar100DatasetFail1.";
   // Create a Cifar100 Dataset
-  std::shared_ptr<Dataset> ds = Cifar100("", std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Cifar100("", "all", RandomSampler(false, 10));
   EXPECT_EQ(ds, nullptr);
 }
@@ -104,7 +104,17 @@ TEST_F(MindDataTestPipeline, TestCifar10DatasetFail1) {
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar10DatasetFail1.";
   // Create a Cifar10 Dataset
-  std::shared_ptr<Dataset> ds = Cifar10("", std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Cifar10("", "all", RandomSampler(false, 10));
+  EXPECT_EQ(ds, nullptr);
+}
+TEST_F(MindDataTestPipeline, TestCifar10DatasetWithInvalidUsage) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar10DatasetWithInvalidUsage.";
+  // Create a Cifar10 Dataset
+  std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, "validation");
+  // Expect failure: validation is not a valid usage
   EXPECT_EQ(ds, nullptr);
 }
@@ -113,7 +123,7 @@ TEST_F(MindDataTestPipeline, TestCifar10DatasetWithNullSampler) {
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
-  std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), nullptr);
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", nullptr);
   // Expect failure: sampler can not be nullptr
   EXPECT_EQ(ds, nullptr);
 }
@@ -123,7 +133,7 @@ TEST_F(MindDataTestPipeline, TestCifar100DatasetWithNullSampler) {
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar100Data/";
-  std::shared_ptr<Dataset> ds = Cifar100(folder_path, std::string(), nullptr);
+  std::shared_ptr<Dataset> ds = Cifar100(folder_path, "all", nullptr);
   // Expect failure: sampler can not be nullptr
   EXPECT_EQ(ds, nullptr);
 }
@@ -133,7 +143,7 @@ TEST_F(MindDataTestPipeline, TestCifar100DatasetWithWrongSampler) {
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar100Data/";
-  std::shared_ptr<Dataset> ds = Cifar100(folder_path, std::string(), RandomSampler(false, -10));
+  std::shared_ptr<Dataset> ds = Cifar100(folder_path, "all", RandomSampler(false, -10));
   // Expect failure: sampler is not construnced correctly
   EXPECT_EQ(ds, nullptr);
 }
@@ -28,7 +28,7 @@ TEST_F(MindDataTestPipeline, TestIteratorEmptyColumn) {
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestIteratorEmptyColumn.";
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
-  std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 5));
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 5));
   EXPECT_NE(ds, nullptr);
   // Create a Rename operation on ds
@@ -64,7 +64,7 @@ TEST_F(MindDataTestPipeline, TestIteratorOneColumn) {
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestIteratorOneColumn.";
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 4));
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 4));
   EXPECT_NE(ds, nullptr);
   // Create a Batch operation on ds
@@ -103,7 +103,7 @@ TEST_F(MindDataTestPipeline, TestIteratorReOrder) {
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestIteratorReOrder.";
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
-  std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), SequentialSampler(false, 4));
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", SequentialSampler(false, 4));
   EXPECT_NE(ds, nullptr);
   // Create a Take operation on ds
@@ -186,7 +186,7 @@ TEST_F(MindDataTestPipeline, TestIteratorWrongColumn) {
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestIteratorOneColumn.";
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 4));
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 4));
   EXPECT_NE(ds, nullptr);
   // Pass wrong column name
@@ -40,7 +40,7 @@ TEST_F(MindDataTestPipeline, TestBatchAndRepeat) {
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a Repeat operation on ds
@@ -82,7 +82,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthSuccess1) {
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a BucketBatchByLength operation on ds
@@ -118,7 +118,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthSuccess2) {
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a BucketBatchByLength operation on ds
@@ -156,7 +156,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail1) {
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a BucketBatchByLength operation on ds
@@ -171,7 +171,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail2) {
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a BucketBatchByLength operation on ds
@@ -186,7 +186,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail3) {
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a BucketBatchByLength operation on ds
@@ -201,7 +201,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail4) {
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a BucketBatchByLength operation on ds
@@ -216,7 +216,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail5) {
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a BucketBatchByLength operation on ds
@@ -231,7 +231,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail6) {
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a BucketBatchByLength operation on ds
   ds = ds->BucketBatchByLength({"image"}, {1, 2}, {1, -2, 3});
@@ -245,7 +245,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail7) {
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a BucketBatchByLength operation on ds
@@ -312,7 +312,7 @@ TEST_F(MindDataTestPipeline, TestConcatSuccess) {
   // Create a Cifar10 Dataset
   // Column names: {"image", "label"}
   folder_path = datasets_root_path_ + "/testCifar10Data/";
-  std::shared_ptr<Dataset> ds2 = Cifar10(folder_path, std::string(), RandomSampler(false, 9));
+  std::shared_ptr<Dataset> ds2 = Cifar10(folder_path, "all", RandomSampler(false, 9));
   EXPECT_NE(ds2, nullptr);
   // Create a Project operation on ds
@@ -364,7 +364,7 @@ TEST_F(MindDataTestPipeline, TestConcatSuccess2) {
   // Create a Cifar10 Dataset
   // Column names: {"image", "label"}
   folder_path = datasets_root_path_ + "/testCifar10Data/";
-  std::shared_ptr<Dataset> ds2 = Cifar10(folder_path, std::string(), RandomSampler(false, 9));
+  std::shared_ptr<Dataset> ds2 = Cifar10(folder_path, "all", RandomSampler(false, 9));
   EXPECT_NE(ds2, nullptr);
   // Create a Project operation on ds
@@ -1012,7 +1012,7 @@ TEST_F(MindDataTestPipeline, TestTensorOpsAndMap) {
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 20));
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 20));
   EXPECT_NE(ds, nullptr);
   // Create a Repeat operation on ds
@@ -1126,7 +1126,7 @@ TEST_F(MindDataTestPipeline, TestZipSuccess) {
   EXPECT_NE(ds1, nullptr);
   folder_path = datasets_root_path_ + "/testCifar10Data/";
-  std::shared_ptr<Dataset> ds2 = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds2 = Cifar10(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds2, nullptr);
   // Create a Project operation on ds
@@ -80,6 +80,50 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
   }
 }
+TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpEmptyString.";
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+  // Create a vocab from vector
+  std::vector<std::string> list = {"home", "IS", "behind", "the", "world", "ahead", "!"};
+  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
+  Status s = Vocab::BuildFromVector(list, {"<pad>", ""}, true, &vocab);
+  EXPECT_EQ(s, Status::OK());
+  // Create Lookup operation on ds
+  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "", DataType("int32"));
+  EXPECT_NE(lookup, nullptr);
+  // Create Map operation on ds
+  ds = ds->Map({lookup}, {"text"});
+  EXPECT_NE(ds, nullptr);
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+  uint64_t i = 0;
+  std::vector<int32_t> expected = {2, 1, 4, 5, 6, 7};
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    MS_LOG(INFO) << ind->shape() << " " << *ind;
+    std::shared_ptr<Tensor> expected_item;
+    Tensor::CreateScalar(expected[i], &expected_item);
+    EXPECT_EQ(*ind, *expected_item);
+    iter->GetNextRow(&row);
+    i++;
+  }
+}
 TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) {
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail1.";
   // Create a TextFile Dataset
@@ -110,27 +154,6 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) {
   EXPECT_EQ(lookup, nullptr);
 }
-TEST_F(MindDataTestPipeline, TestVocabLookupOpWithEmptyUnknownToken) {
-  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpWithEmptyUnknownToken.";
-  // Create a TextFile dataset
-  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
-  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
-  EXPECT_NE(ds, nullptr);
-  // Create a vocab from map
-  std::unordered_map<std::string, int32_t> dict;
-  dict["Home"] = 3;
-  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
-  Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
-  EXPECT_EQ(s, Status::OK());
-  // Create Lookup operation on ds
-  // Expected failure: "" is not a word of vocab
-  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "", DataType("int32"));
-  EXPECT_EQ(lookup, nullptr);
-}
 TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDataset.";
@@ -133,7 +133,7 @@ TEST_F(MindDataTestPipeline, TestMnistFailWithWrongDatasetDir) {
   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFailWithWrongDatasetDir.";
   // Create a Mnist Dataset
-  std::shared_ptr<Dataset> ds = Mnist("", std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Mnist("", "all", RandomSampler(false, 10));
   EXPECT_EQ(ds, nullptr);
 }
@@ -142,7 +142,7 @@ TEST_F(MindDataTestPipeline, TestMnistFailWithNullSampler) {
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), nullptr);
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", nullptr);
   // Expect failure: sampler can not be nullptr
   EXPECT_EQ(ds, nullptr);
 }
@@ -30,7 +30,7 @@ TEST_F(MindDataTestPipeline, TestCutMixBatchSuccess1) {
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
   int number_of_classes = 10;
-  std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create objects for the tensor ops
@@ -98,7 +98,7 @@ TEST_F(MindDataTestPipeline, TestCutMixBatchSuccess2) {
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
   int number_of_classes = 10;
-  std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a Batch operation on ds
@@ -156,7 +156,7 @@ TEST_F(MindDataTestPipeline, TestCutMixBatchFail1) {
   // Must fail because alpha can't be negative
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
-  std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a Batch operation on ds
@@ -181,7 +181,7 @@ TEST_F(MindDataTestPipeline, TestCutMixBatchFail2) {
   // Must fail because prob can't be negative
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
-  std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a Batch operation on ds
@@ -206,7 +206,7 @@ TEST_F(MindDataTestPipeline, TestCutMixBatchFail3) {
   // Must fail because alpha can't be zero
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
-  std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a Batch operation on ds
@@ -376,7 +376,7 @@ TEST_F(MindDataTestPipeline, TestHwcToChw) {
 TEST_F(MindDataTestPipeline, TestMixUpBatchFail1) {
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
-  std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a Batch operation on ds
@@ -400,7 +400,7 @@ TEST_F(MindDataTestPipeline, TestMixUpBatchFail2) {
   // This should fail because alpha can't be zero
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
-  std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a Batch operation on ds
@@ -423,7 +423,7 @@ TEST_F(MindDataTestPipeline, TestMixUpBatchSuccess1) {
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
-  std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a Batch operation on ds
@@ -472,7 +472,7 @@ TEST_F(MindDataTestPipeline, TestMixUpBatchSuccess2) {
   // Create a Cifar10 Dataset
   std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
-  std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
+  std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
   EXPECT_NE(ds, nullptr);
   // Create a Batch operation on ds
@@ -1118,7 +1118,7 @@ TEST_F(MindDataTestPipeline, TestRandomRotation) {
 TEST_F(MindDataTestPipeline, TestUniformAugWithOps) {
   // Create a Mnist Dataset
   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-  std::shared_ptr<Dataset> ds = Mnist(folder_path, "", RandomSampler(false, 20));
+  std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 20));
   EXPECT_NE(ds, nullptr);
   // Create a Repeat operation on ds