Browse Source

!6171 [MD] loading dataset log / dataset validation / dataset description

Merge pull request !6171 from luoyang/c-api-pyfunc
tags/v1.0.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
ddac441dfd
27 changed files with 160 additions and 144 deletions
  1. +12
    -6
      mindspore/ccsrc/minddata/dataset/api/datasets.cc
  2. +7
    -4
      mindspore/ccsrc/minddata/dataset/api/samplers.cc
  3. +3
    -8
      mindspore/ccsrc/minddata/dataset/api/text.cc
  4. +1
    -0
      mindspore/ccsrc/minddata/dataset/core/tensor.cc
  5. +1
    -1
      mindspore/ccsrc/minddata/dataset/core/tensor_shape.cc
  6. +4
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc
  7. +1
    -2
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/celeba_op.cc
  8. +1
    -1
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cifar_op.cc
  9. +2
    -3
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/clue_op.cc
  10. +4
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/coco_op.cc
  11. +1
    -2
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/csv_op.cc
  12. +2
    -2
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc
  13. +1
    -2
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/manifest_op.cc
  14. +1
    -2
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mnist_op.cc
  15. +1
    -2
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/text_file_op.cc
  16. +1
    -2
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/tf_reader_op.cc
  17. +4
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/voc_op.cc
  18. +8
    -8
      mindspore/ccsrc/minddata/dataset/include/datasets.h
  19. +4
    -2
      mindspore/ccsrc/minddata/dataset/include/samplers.h
  20. +8
    -37
      mindspore/ccsrc/minddata/dataset/text/vocab.cc
  21. +2
    -2
      tests/ut/cpp/dataset/build_vocab_test.cc
  22. +17
    -7
      tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc
  23. +4
    -4
      tests/ut/cpp/dataset/c_api_dataset_iterator_test.cc
  24. +14
    -14
      tests/ut/cpp/dataset/c_api_dataset_ops_test.cc
  25. +44
    -21
      tests/ut/cpp/dataset/c_api_dataset_vocab.cc
  26. +2
    -2
      tests/ut/cpp/dataset/c_api_datasets_test.cc
  27. +10
    -10
      tests/ut/cpp/dataset/c_api_transforms_test.cc

+ 12
- 6
mindspore/ccsrc/minddata/dataset/api/datasets.cc View File

@@ -16,6 +16,7 @@


 #include <fstream>
 #include <unordered_set>
+#include <algorithm>
 #include "minddata/dataset/include/datasets.h"
 #include "minddata/dataset/include/samplers.h"
 #include "minddata/dataset/include/transforms.h"
@@ -729,7 +730,14 @@ bool ValidateDatasetSampler(const std::string &dataset_name, const std::shared_p
 }

 bool ValidateStringValue(const std::string &str, const std::unordered_set<std::string> &valid_strings) {
-  return valid_strings.find(str) != valid_strings.end();
+  if (valid_strings.find(str) == valid_strings.end()) {
+    std::string mode;
+    mode = std::accumulate(valid_strings.begin(), valid_strings.end(), mode,
+                           [](std::string a, std::string b) { return std::move(a) + " " + std::move(b); });
+    MS_LOG(ERROR) << str << " does not match any mode in [" + mode + " ]";
+    return false;
+  }
+  return true;
 }


// Helper function to validate dataset input/output column parameter // Helper function to validate dataset input/output column parameter
@@ -841,8 +849,7 @@ Cifar10Dataset::Cifar10Dataset(const std::string &dataset_dir, const std::string


 bool Cifar10Dataset::ValidateParams() {
   return ValidateDatasetDirParam("Cifar10Dataset", dataset_dir_) &&
-         ValidateDatasetSampler("Cifar10Dataset", sampler_) &&
-         ValidateStringValue(usage_, {"train", "test", "all", ""});
+         ValidateDatasetSampler("Cifar10Dataset", sampler_) && ValidateStringValue(usage_, {"train", "test", "all"});
 }


// Function to build CifarOp for Cifar10 // Function to build CifarOp for Cifar10
@@ -870,8 +877,7 @@ Cifar100Dataset::Cifar100Dataset(const std::string &dataset_dir, const std::stri


 bool Cifar100Dataset::ValidateParams() {
   return ValidateDatasetDirParam("Cifar100Dataset", dataset_dir_) &&
-         ValidateDatasetSampler("Cifar100Dataset", sampler_) &&
-         ValidateStringValue(usage_, {"train", "test", "all", ""});
+         ValidateDatasetSampler("Cifar100Dataset", sampler_) && ValidateStringValue(usage_, {"train", "test", "all"});
 }


// Function to build CifarOp for Cifar100 // Function to build CifarOp for Cifar100
@@ -1359,7 +1365,7 @@ MnistDataset::MnistDataset(std::string dataset_dir, std::string usage, std::shar
: dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {} : dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {}


 bool MnistDataset::ValidateParams() {
-  return ValidateStringValue(usage_, {"train", "test", "all", ""}) &&
+  return ValidateStringValue(usage_, {"train", "test", "all"}) &&
          ValidateDatasetDirParam("MnistDataset", dataset_dir_) && ValidateDatasetSampler("MnistDataset", sampler_);
 }




+ 7
- 4
mindspore/ccsrc/minddata/dataset/api/samplers.cc View File

@@ -31,8 +31,10 @@ SamplerObj::SamplerObj() {}


 /// Function to create a Distributed Sampler.
 std::shared_ptr<DistributedSamplerObj> DistributedSampler(int64_t num_shards, int64_t shard_id, bool shuffle,
-                                                          int64_t num_samples, uint32_t seed, bool even_dist) {
-  auto sampler = std::make_shared<DistributedSamplerObj>(num_shards, shard_id, shuffle, num_samples, seed, even_dist);
+                                                          int64_t num_samples, uint32_t seed, int64_t offset,
+                                                          bool even_dist) {
+  auto sampler =
+    std::make_shared<DistributedSamplerObj>(num_shards, shard_id, shuffle, num_samples, seed, offset, even_dist);
   // Input validation
   if (!sampler->ValidateParams()) {
     return nullptr;
@@ -95,12 +97,13 @@ std::shared_ptr<WeightedRandomSamplerObj> WeightedRandomSampler(std::vector<doub


 // DistributedSampler
 DistributedSamplerObj::DistributedSamplerObj(int64_t num_shards, int64_t shard_id, bool shuffle, int64_t num_samples,
-                                             uint32_t seed, bool even_dist)
+                                             uint32_t seed, int64_t offset, bool even_dist)
     : num_shards_(num_shards),
       shard_id_(shard_id),
       shuffle_(shuffle),
       num_samples_(num_samples),
       seed_(seed),
+      offset_(offset),
       even_dist_(even_dist) {}


bool DistributedSamplerObj::ValidateParams() { bool DistributedSamplerObj::ValidateParams() {
@@ -123,7 +126,7 @@ bool DistributedSamplerObj::ValidateParams() {
} }


 std::shared_ptr<Sampler> DistributedSamplerObj::Build() {
-  return std::make_shared<dataset::DistributedSampler>(num_samples_, num_shards_, shard_id_, shuffle_, seed_,
+  return std::make_shared<dataset::DistributedSampler>(num_samples_, num_shards_, shard_id_, shuffle_, seed_, offset_,
                                                        even_dist_);
 }




+ 3
- 8
mindspore/ccsrc/minddata/dataset/api/text.cc View File

@@ -42,15 +42,10 @@ bool LookupOperation::ValidateParams() {
     MS_LOG(ERROR) << "Lookup: vocab object type is incorrect or null.";
     return false;
   }
-  if (unknown_token_.empty()) {
-    MS_LOG(ERROR) << "Lookup: no unknown token is specified.";
+  default_id_ = vocab_->Lookup(unknown_token_);
+  if (default_id_ == Vocab::kNoTokenExists) {
+    MS_LOG(ERROR) << "Lookup: " << unknown_token_ << " doesn't exist in vocab.";
     return false;
-  } else {
-    default_id_ = vocab_->Lookup(unknown_token_);
-    if (default_id_ == Vocab::kNoTokenExists) {
-      MS_LOG(ERROR) << "Lookup: unknown_token: [" + unknown_token_ + "], does not exist in vocab.";
-      return false;
-    }
   }
   return true;
 }


+ 1
- 0
mindspore/ccsrc/minddata/dataset/core/tensor.cc View File

@@ -263,6 +263,7 @@ Status Tensor::CreateFromFile(const std::string &path, std::shared_ptr<Tensor> *
fs.open(path, std::ios::binary | std::ios::in); fs.open(path, std::ios::binary | std::ios::in);
CHECK_FAIL_RETURN_UNEXPECTED(!fs.fail(), "Fail to open file: " + path); CHECK_FAIL_RETURN_UNEXPECTED(!fs.fail(), "Fail to open file: " + path);
int64_t num_bytes = fs.seekg(0, std::ios::end).tellg(); int64_t num_bytes = fs.seekg(0, std::ios::end).tellg();
CHECK_FAIL_RETURN_UNEXPECTED(num_bytes <= kDeMaxDim, "Invalid file to allocate tensor memory, check path: " + path);
CHECK_FAIL_RETURN_UNEXPECTED(fs.seekg(0, std::ios::beg).good(), "Fail to find size of file"); CHECK_FAIL_RETURN_UNEXPECTED(fs.seekg(0, std::ios::beg).good(), "Fail to find size of file");
RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape{num_bytes}, DataType(DataType::DE_UINT8), out)); RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape{num_bytes}, DataType(DataType::DE_UINT8), out));
int64_t written_bytes = fs.read(reinterpret_cast<char *>((*out)->GetMutableBuffer()), num_bytes).gcount(); int64_t written_bytes = fs.read(reinterpret_cast<char *>((*out)->GetMutableBuffer()), num_bytes).gcount();


+ 1
- 1
mindspore/ccsrc/minddata/dataset/core/tensor_shape.cc View File

@@ -158,7 +158,7 @@ void TensorShape::AddListToShape(const T &list) {
   }
   if (dim > kDeMaxDim) {
     std::stringstream ss;
-    ss << "Invalid shape data, dim (" << size << ") is larger than the maximum dim size(" << kDeMaxDim << ")!";
+    ss << "Invalid shape data, dim (" << dim << ") is larger than the maximum dim size(" << kDeMaxDim << ")!";
     MS_LOG(ERROR) << ss.str().c_str();
     known_ = false;
     raw_shape_.clear();


+ 4
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc View File

@@ -119,6 +119,10 @@ Status AlbumOp::PrescanEntry() {


std::sort(image_rows_.begin(), image_rows_.end(), StrComp); std::sort(image_rows_.begin(), image_rows_.end(), StrComp);
num_rows_ = image_rows_.size(); num_rows_ = image_rows_.size();
if (num_rows_ == 0) {
RETURN_STATUS_UNEXPECTED(
"Invalid data, no valid data matching the dataset API AlbumDataset. Please check file path or dataset API.");
}
return Status::OK(); return Status::OK();
} }




+ 1
- 2
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/celeba_op.cc View File

@@ -237,8 +237,7 @@ Status CelebAOp::ParseImageAttrInfo() {
num_rows_ = image_labels_vec_.size(); num_rows_ = image_labels_vec_.size();
if (num_rows_ == 0) { if (num_rows_ == 0) {
RETURN_STATUS_UNEXPECTED( RETURN_STATUS_UNEXPECTED(
"Invalid data, no valid data matching the dataset API CelebADataset. "
"Please check file path or dataset API validation first");
"Invalid data, no valid data matching the dataset API CelebADataset. Please check file path or dataset API.");
} }
MS_LOG(DEBUG) << "Celeba dataset rows number is " << num_rows_ << "."; MS_LOG(DEBUG) << "Celeba dataset rows number is " << num_rows_ << ".";
return Status::OK(); return Status::OK();


+ 1
- 1
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cifar_op.cc View File

@@ -412,7 +412,7 @@ Status CifarOp::ParseCifarData() {
if (num_rows_ == 0) { if (num_rows_ == 0) {
std::string api = cifar_type_ == kCifar10 ? "Cifar10Dataset" : "Cifar100Dataset"; std::string api = cifar_type_ == kCifar10 ? "Cifar10Dataset" : "Cifar100Dataset";
RETURN_STATUS_UNEXPECTED("Invalid data, no valid data matching the dataset API " + api + RETURN_STATUS_UNEXPECTED("Invalid data, no valid data matching the dataset API " + api +
". Please check file path or dataset API validation first.");
". Please check file path or dataset API.");
} }
cifar_raw_data_block_->Reset(); cifar_raw_data_block_->Reset();
return Status::OK(); return Status::OK();


+ 2
- 3
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/clue_op.cc View File

@@ -192,7 +192,7 @@ Status ClueOp::LoadFile(const std::string &file, const int64_t start_offset, con
js = nlohmann::json::parse(line); js = nlohmann::json::parse(line);
} catch (const std::exception &err) { } catch (const std::exception &err) {
// Catch any exception and convert to Status return code // Catch any exception and convert to Status return code
RETURN_STATUS_UNEXPECTED("Invalid file, failed to parse json file: " + line);
RETURN_STATUS_UNEXPECTED("Invalid file, failed to parse json file: " + file);
} }
int cols_count = cols_to_keyword_.size(); int cols_count = cols_to_keyword_.size();
TensorRow tRow(cols_count, nullptr); TensorRow tRow(cols_count, nullptr);
@@ -482,8 +482,7 @@ Status ClueOp::CalculateNumRowsPerShard() {
} }
if (all_num_rows_ == 0) { if (all_num_rows_ == 0) {
RETURN_STATUS_UNEXPECTED( RETURN_STATUS_UNEXPECTED(
"Invalid data, no valid data matching the dataset API CLUEDataset. Please check file path or dataset API "
"validation first.");
"Invalid data, no valid data matching the dataset API CLUEDataset. Please check file path or dataset API.");
} }


num_rows_per_shard_ = static_cast<int64_t>(std::ceil(all_num_rows_ * 1.0 / num_devices_)); num_rows_per_shard_ = static_cast<int64_t>(std::ceil(all_num_rows_ * 1.0 / num_devices_));


+ 4
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/coco_op.cc View File

@@ -468,6 +468,10 @@ Status CocoOp::ParseAnnotationIds() {
if (coordinate_map_.find(img) != coordinate_map_.end()) image_ids_.push_back(img); if (coordinate_map_.find(img) != coordinate_map_.end()) image_ids_.push_back(img);
} }
num_rows_ = image_ids_.size(); num_rows_ = image_ids_.size();
if (num_rows_ == 0) {
RETURN_STATUS_UNEXPECTED(
"Invalid data, no valid data matching the dataset API CocoDataset. Please check file path or dataset API.");
}
return Status::OK(); return Status::OK();
} }




+ 1
- 2
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/csv_op.cc View File

@@ -783,8 +783,7 @@ Status CsvOp::CalculateNumRowsPerShard() {
} }
if (all_num_rows_ == 0) { if (all_num_rows_ == 0) {
RETURN_STATUS_UNEXPECTED( RETURN_STATUS_UNEXPECTED(
"Invalid data, no valid data matching the dataset API CsvDataset. Please check file path or CSV format "
"validation first.");
"Invalid data, no valid data matching the dataset API CsvDataset. Please check file path or CSV format.");
} }


num_rows_per_shard_ = static_cast<int64_t>(std::ceil(all_num_rows_ * 1.0 / num_devices_)); num_rows_per_shard_ = static_cast<int64_t>(std::ceil(all_num_rows_ * 1.0 / num_devices_));


+ 2
- 2
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc View File

@@ -117,8 +117,8 @@ Status ImageFolderOp::PrescanMasterEntry(const std::string &filedir) {
num_rows_ = image_label_pairs_.size(); num_rows_ = image_label_pairs_.size();
if (num_rows_ == 0) { if (num_rows_ == 0) {
RETURN_STATUS_UNEXPECTED( RETURN_STATUS_UNEXPECTED(
"Invalid data, no valid data matching the dataset API ImageFolderDataset. Please check file path or dataset "
"API validation first.");
"Invalid data, no valid data matching the dataset API ImageFolderDataset. "
"Please check file path or dataset API.");
} }
// free memory of two queues used for pre-scan // free memory of two queues used for pre-scan
folder_name_queue_->Reset(); folder_name_queue_->Reset();


+ 1
- 2
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/manifest_op.cc View File

@@ -386,8 +386,7 @@ Status ManifestOp::CountDatasetInfo() {
num_rows_ = static_cast<int64_t>(image_labelname_.size()); num_rows_ = static_cast<int64_t>(image_labelname_.size());
if (num_rows_ == 0) { if (num_rows_ == 0) {
RETURN_STATUS_UNEXPECTED( RETURN_STATUS_UNEXPECTED(
"Invalid data, no valid data matching the dataset API ManifestDataset.Please check file path or dataset API "
"validation first.");
"Invalid data, no valid data matching the dataset API ManifestDataset. Please check file path or dataset API.");
} }
return Status::OK(); return Status::OK();
} }


+ 1
- 2
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mnist_op.cc View File

@@ -369,8 +369,7 @@ Status MnistOp::ParseMnistData() {
num_rows_ = image_label_pairs_.size(); num_rows_ = image_label_pairs_.size();
if (num_rows_ == 0) { if (num_rows_ == 0) {
RETURN_STATUS_UNEXPECTED( RETURN_STATUS_UNEXPECTED(
"Invalid data, no valid data matching the dataset API MnistDataset.Please check file path or dataset API "
"validation first.");
"Invalid data, no valid data matching the dataset API MnistDataset. Please check file path or dataset API.");
} }
return Status::OK(); return Status::OK();
} }


+ 1
- 2
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/text_file_op.cc View File

@@ -473,8 +473,7 @@ Status TextFileOp::CalculateNumRowsPerShard() {
} }
if (all_num_rows_ == 0) { if (all_num_rows_ == 0) {
RETURN_STATUS_UNEXPECTED( RETURN_STATUS_UNEXPECTED(
"Invalid data, no valid data matching the dataset API TextFileDataset.Please check file path or dataset API "
"validation first.");
"Invalid data, no valid data matching the dataset API TextFileDataset. Please check file path or dataset API.");
} }


num_rows_per_shard_ = static_cast<int64_t>(std::ceil(all_num_rows_ * 1.0 / num_devices_)); num_rows_per_shard_ = static_cast<int64_t>(std::ceil(all_num_rows_ * 1.0 / num_devices_));


+ 1
- 2
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/tf_reader_op.cc View File

@@ -229,8 +229,7 @@ Status TFReaderOp::CalculateNumRowsPerShard() {
num_rows_per_shard_ = static_cast<int64_t>(std::ceil(num_rows_ * 1.0 / num_devices_)); num_rows_per_shard_ = static_cast<int64_t>(std::ceil(num_rows_ * 1.0 / num_devices_));
if (num_rows_per_shard_ == 0) { if (num_rows_per_shard_ == 0) {
RETURN_STATUS_UNEXPECTED( RETURN_STATUS_UNEXPECTED(
"Invalid data, no valid data matching the dataset API TFRecordDataset.Please check file path or dataset API "
"validation first.");
"Invalid data, no valid data matching the dataset API TFRecordDataset. Please check file path or dataset API.");
} }
return Status::OK(); return Status::OK();
} }


+ 4
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/voc_op.cc View File

@@ -315,6 +315,10 @@ Status VOCOp::ParseAnnotationIds() {
} }


num_rows_ = image_ids_.size(); num_rows_ = image_ids_.size();
if (num_rows_ == 0) {
RETURN_STATUS_UNEXPECTED(
"Invalid data, no valid data matching the dataset API VOCDataset. Please check file path or dataset API.");
}
return Status::OK(); return Status::OK();
} }




+ 8
- 8
mindspore/ccsrc/minddata/dataset/include/datasets.h View File

@@ -113,7 +113,7 @@ std::shared_ptr<AlbumDataset> Album(const std::string &dataset_dir, const std::s
/// \notes The generated dataset has two columns ['image', 'attr']. /// \notes The generated dataset has two columns ['image', 'attr'].
/// The type of the image tensor is uint8. The attr tensor is uint32 and one hot type. /// The type of the image tensor is uint8. The attr tensor is uint32 and one hot type.
/// \param[in] dataset_dir Path to the root directory that contains the dataset. /// \param[in] dataset_dir Path to the root directory that contains the dataset.
/// \param[in] usage One of "all", "train", "valid" or "test".
/// \param[in] usage One of "all", "train", "valid" or "test" (default = "all").
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, /// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
/// \param[in] decode Decode the images after reading (default=false). /// \param[in] decode Decode the images after reading (default=false).
@@ -126,21 +126,21 @@ std::shared_ptr<CelebADataset> CelebA(const std::string &dataset_dir, const std:
/// \brief Function to create a Cifar10 Dataset /// \brief Function to create a Cifar10 Dataset
/// \notes The generated dataset has two columns ["image", "label"] /// \notes The generated dataset has two columns ["image", "label"]
/// \param[in] dataset_dir Path to the root directory that contains the dataset /// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] usage of CIFAR10, can be "train", "test" or "all"
/// \param[in] usage of CIFAR10, can be "train", "test" or "all" (default = "all").
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, /// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
/// \return Shared pointer to the current Dataset /// \return Shared pointer to the current Dataset
std::shared_ptr<Cifar10Dataset> Cifar10(const std::string &dataset_dir, const std::string &usage = std::string(),
std::shared_ptr<Cifar10Dataset> Cifar10(const std::string &dataset_dir, const std::string &usage = "all",
const std::shared_ptr<SamplerObj> &sampler = RandomSampler()); const std::shared_ptr<SamplerObj> &sampler = RandomSampler());


/// \brief Function to create a Cifar100 Dataset /// \brief Function to create a Cifar100 Dataset
/// \notes The generated dataset has three columns ["image", "coarse_label", "fine_label"] /// \notes The generated dataset has three columns ["image", "coarse_label", "fine_label"]
/// \param[in] dataset_dir Path to the root directory that contains the dataset /// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] usage of CIFAR100, can be "train", "test" or "all"
/// \param[in] usage of CIFAR100, can be "train", "test" or "all" (default = "all").
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, /// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
/// \return Shared pointer to the current Dataset /// \return Shared pointer to the current Dataset
std::shared_ptr<Cifar100Dataset> Cifar100(const std::string &dataset_dir, const std::string &usage = std::string(),
std::shared_ptr<Cifar100Dataset> Cifar100(const std::string &dataset_dir, const std::string &usage = "all",
const std::shared_ptr<SamplerObj> &sampler = RandomSampler()); const std::shared_ptr<SamplerObj> &sampler = RandomSampler());


/// \brief Function to create a CLUEDataset /// \brief Function to create a CLUEDataset
@@ -247,11 +247,11 @@ std::shared_ptr<ManifestDataset> Manifest(const std::string &dataset_file, const
/// \brief Function to create a MnistDataset /// \brief Function to create a MnistDataset
/// \notes The generated dataset has two columns ["image", "label"] /// \notes The generated dataset has two columns ["image", "label"]
/// \param[in] dataset_dir Path to the root directory that contains the dataset /// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] usage of MNIST, can be "train", "test" or "all"
/// \param[in] usage of MNIST, can be "train", "test" or "all" (default = "all").
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, /// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()) /// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
/// \return Shared pointer to the current MnistDataset /// \return Shared pointer to the current MnistDataset
std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir, const std::string &usage = std::string(),
std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir, const std::string &usage = "all",
const std::shared_ptr<SamplerObj> &sampler = RandomSampler()); const std::shared_ptr<SamplerObj> &sampler = RandomSampler());


/// \brief Function to create a ConcatDataset /// \brief Function to create a ConcatDataset
@@ -407,7 +407,7 @@ std::shared_ptr<TFRecordDataset> TFRecord(const std::vector<std::string> &datase
/// - task='Segmentation', column: [['image', dtype=uint8], ['target',dtype=uint8]]. /// - task='Segmentation', column: [['image', dtype=uint8], ['target',dtype=uint8]].
/// \param[in] dataset_dir Path to the root directory that contains the dataset /// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] task Set the task type of reading voc data, now only support "Segmentation" or "Detection" /// \param[in] task Set the task type of reading voc data, now only support "Segmentation" or "Detection"
/// \param[in] usage The type of data list text file to be read
/// \param[in] usage The type of data list text file to be read (default = "train").
/// \param[in] class_indexing A str-to-int mapping from label name to index, only valid in "Detection" task /// \param[in] class_indexing A str-to-int mapping from label name to index, only valid in "Detection" task
/// \param[in] decode Decode the images after reading /// \param[in] decode Decode the images after reading
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given, /// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,


+ 4
- 2
mindspore/ccsrc/minddata/dataset/include/samplers.h View File

@@ -52,12 +52,13 @@ class WeightedRandomSamplerObj;
/// \param[in] shuffle - If true, the indices are shuffled. /// \param[in] shuffle - If true, the indices are shuffled.
/// \param[in] num_samples - The number of samples to draw (default to all elements). /// \param[in] num_samples - The number of samples to draw (default to all elements).
/// \param[in] seed - The seed in use when shuffle is true. /// \param[in] seed - The seed in use when shuffle is true.
/// \param[in] offset - The starting position where access to elements in the dataset begins.
/// \param[in] even_dist - If true, each shard would return the same number of rows (default to true). /// \param[in] even_dist - If true, each shard would return the same number of rows (default to true).
/// If false the total rows returned by all the shards would not have overlap. /// If false the total rows returned by all the shards would not have overlap.
/// \return Shared pointer to the current Sampler. /// \return Shared pointer to the current Sampler.
std::shared_ptr<DistributedSamplerObj> DistributedSampler(int64_t num_shards, int64_t shard_id, bool shuffle = true, std::shared_ptr<DistributedSamplerObj> DistributedSampler(int64_t num_shards, int64_t shard_id, bool shuffle = true,
int64_t num_samples = 0, uint32_t seed = 1, int64_t num_samples = 0, uint32_t seed = 1,
bool even_dist = true);
int64_t offset = -1, bool even_dist = true);


/// Function to create a PK Sampler. /// Function to create a PK Sampler.
/// \notes Samples K elements for each P class in the dataset. /// \notes Samples K elements for each P class in the dataset.
@@ -103,7 +104,7 @@ std::shared_ptr<WeightedRandomSamplerObj> WeightedRandomSampler(std::vector<doub
class DistributedSamplerObj : public SamplerObj { class DistributedSamplerObj : public SamplerObj {
public: public:
DistributedSamplerObj(int64_t num_shards, int64_t shard_id, bool shuffle, int64_t num_samples, uint32_t seed, DistributedSamplerObj(int64_t num_shards, int64_t shard_id, bool shuffle, int64_t num_samples, uint32_t seed,
bool even_dist);
int64_t offset, bool even_dist);


~DistributedSamplerObj() = default; ~DistributedSamplerObj() = default;


@@ -117,6 +118,7 @@ class DistributedSamplerObj : public SamplerObj {
bool shuffle_; bool shuffle_;
int64_t num_samples_; int64_t num_samples_;
uint32_t seed_; uint32_t seed_;
int64_t offset_;
bool even_dist_; bool even_dist_;
}; };




+ 8
- 37
mindspore/ccsrc/minddata/dataset/text/vocab.cc View File

@@ -87,54 +87,25 @@ Status Vocab::BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdTyp


 Status Vocab::BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
                               bool prepend_special, std::shared_ptr<Vocab> *vocab) {
-  // Validate parameters
-  std::string duplicate_word;
-  for (const WordType &word : words) {
-    if (std::count(words.begin(), words.end(), word) > 1) {
-      if (duplicate_word.find(word) == std::string::npos) {
-        duplicate_word = duplicate_word.empty() ? duplicate_word + word : duplicate_word + ", " + word;
-      }
-    }
-  }
-  if (!duplicate_word.empty()) {
-    MS_LOG(ERROR) << "words contains duplicate word: " << duplicate_word;
-    RETURN_STATUS_UNEXPECTED("words contains duplicate word: " + duplicate_word);
-  }
-
-  std::string duplicate_sp;
-  std::string existed_sp;
-  for (const WordType &sp : special_tokens) {
-    if (std::count(special_tokens.begin(), special_tokens.end(), sp) > 1) {
-      if (duplicate_sp.find(sp) == std::string::npos) {
-        duplicate_sp = duplicate_sp.empty() ? duplicate_sp + sp : duplicate_sp + ", " + sp;
-      }
-    }
-    if (std::count(words.begin(), words.end(), sp) >= 1) {
-      if (existed_sp.find(sp) == std::string::npos) {
-        existed_sp = existed_sp.empty() ? existed_sp + sp : existed_sp + ", " + sp;
-      }
-    }
-  }
-  if (!duplicate_sp.empty()) {
-    MS_LOG(ERROR) << "special_tokens contains duplicate word: " << duplicate_sp;
-    RETURN_STATUS_UNEXPECTED("special_tokens contains duplicate word: " + duplicate_sp);
-  }
-  if (!existed_sp.empty()) {
-    MS_LOG(ERROR) << "special_tokens and word_list contain duplicate word: " << existed_sp;
-    RETURN_STATUS_UNEXPECTED("special_tokens and word_list contain duplicate word: " + existed_sp);
-  }
-
   std::unordered_map<WordType, WordIdType> word2id;

   // if special is added in front, normal words id will start from number of special tokens
   WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
   for (auto word : words) {
+    if (word2id.find(word) != word2id.end()) {
+      MS_LOG(ERROR) << "word_list contains duplicate word: " + word + ".";
+      RETURN_STATUS_UNEXPECTED("word_list contains duplicate word: " + word + ".");
+    }
     word2id[word] = word_id++;
   }

   word_id = prepend_special ? 0 : word2id.size();

   for (auto special_token : special_tokens) {
+    if (word2id.find(special_token) != word2id.end()) {
+      MS_LOG(ERROR) << "special_tokens and word_list contain duplicate word: " + special_token + ".";
+      RETURN_STATUS_UNEXPECTED("special_tokens and word_list contain duplicate word: " + special_token + ".");
+    }
     word2id[special_token] = word_id++;
   }




+ 2
- 2
tests/ut/cpp/dataset/build_vocab_test.cc View File

@@ -183,8 +183,8 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorFail2) {
TEST_F(MindDataTestVocab, TestVocabFromVectorFail3) { TEST_F(MindDataTestVocab, TestVocabFromVectorFail3) {
MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail3."; MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail3.";
// Build vocab from a vector // Build vocab from a vector
std::vector<std::string> list = {"apple", "dog", "egg", "<unk>", "<pad>"};
std::vector<std::string> sp_tokens = {"<pad>", "<unk>"};
std::vector<std::string> list = {"apple", "dog", "egg", "<unk>", ""};
std::vector<std::string> sp_tokens = {"", "<unk>"};
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();


// Expected failure: special tokens are already existed in word_list // Expected failure: special tokens are already existed in word_list


+ 17
- 7
tests/ut/cpp/dataset/c_api_dataset_cifar_test.cc View File

@@ -28,7 +28,7 @@ TEST_F(MindDataTestPipeline, TestCifar10Dataset) {


// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create an iterator over the result of the above dataset // Create an iterator over the result of the above dataset
@@ -62,7 +62,7 @@ TEST_F(MindDataTestPipeline, TestCifar100Dataset) {


// Create a Cifar100 Dataset // Create a Cifar100 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar100Data/"; std::string folder_path = datasets_root_path_ + "/testCifar100Data/";
std::shared_ptr<Dataset> ds = Cifar100(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Cifar100(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create an iterator over the result of the above dataset // Create an iterator over the result of the above dataset
@@ -96,7 +96,7 @@ TEST_F(MindDataTestPipeline, TestCifar100DatasetFail1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar100DatasetFail1."; MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar100DatasetFail1.";


// Create a Cifar100 Dataset // Create a Cifar100 Dataset
std::shared_ptr<Dataset> ds = Cifar100("", std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Cifar100("", "all", RandomSampler(false, 10));
EXPECT_EQ(ds, nullptr); EXPECT_EQ(ds, nullptr);
} }


@@ -104,7 +104,17 @@ TEST_F(MindDataTestPipeline, TestCifar10DatasetFail1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar10DatasetFail1."; MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar10DatasetFail1.";


// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::shared_ptr<Dataset> ds = Cifar10("", std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Cifar10("", "all", RandomSampler(false, 10));
EXPECT_EQ(ds, nullptr);
}

TEST_F(MindDataTestPipeline, TestCifar10DatasetWithInvalidUsage) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar10DatasetWithNullSampler.";

// Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "validation");
// Expect failure: validation is not a valid usage
EXPECT_EQ(ds, nullptr); EXPECT_EQ(ds, nullptr);
} }


@@ -113,7 +123,7 @@ TEST_F(MindDataTestPipeline, TestCifar10DatasetWithNullSampler) {


// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), nullptr);
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", nullptr);
// Expect failure: sampler can not be nullptr // Expect failure: sampler can not be nullptr
EXPECT_EQ(ds, nullptr); EXPECT_EQ(ds, nullptr);
} }
@@ -123,7 +133,7 @@ TEST_F(MindDataTestPipeline, TestCifar100DatasetWithNullSampler) {


// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar100Data/"; std::string folder_path = datasets_root_path_ + "/testCifar100Data/";
std::shared_ptr<Dataset> ds = Cifar100(folder_path, std::string(), nullptr);
std::shared_ptr<Dataset> ds = Cifar100(folder_path, "all", nullptr);
// Expect failure: sampler can not be nullptr // Expect failure: sampler can not be nullptr
EXPECT_EQ(ds, nullptr); EXPECT_EQ(ds, nullptr);
} }
@@ -133,7 +143,7 @@ TEST_F(MindDataTestPipeline, TestCifar100DatasetWithWrongSampler) {


// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar100Data/"; std::string folder_path = datasets_root_path_ + "/testCifar100Data/";
std::shared_ptr<Dataset> ds = Cifar100(folder_path, std::string(), RandomSampler(false, -10));
std::shared_ptr<Dataset> ds = Cifar100(folder_path, "all", RandomSampler(false, -10));
// Expect failure: sampler is not construnced correctly // Expect failure: sampler is not construnced correctly
EXPECT_EQ(ds, nullptr); EXPECT_EQ(ds, nullptr);
} }

+ 4
- 4
tests/ut/cpp/dataset/c_api_dataset_iterator_test.cc View File

@@ -28,7 +28,7 @@ TEST_F(MindDataTestPipeline, TestIteratorEmptyColumn) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestIteratorEmptyColumn."; MS_LOG(INFO) << "Doing MindDataTestPipeline-TestIteratorEmptyColumn.";
// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 5));
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 5));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a Rename operation on ds // Create a Rename operation on ds
@@ -64,7 +64,7 @@ TEST_F(MindDataTestPipeline, TestIteratorOneColumn) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestIteratorOneColumn."; MS_LOG(INFO) << "Doing MindDataTestPipeline-TestIteratorOneColumn.";
// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 4));
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 4));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a Batch operation on ds // Create a Batch operation on ds
@@ -103,7 +103,7 @@ TEST_F(MindDataTestPipeline, TestIteratorReOrder) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestIteratorReOrder."; MS_LOG(INFO) << "Doing MindDataTestPipeline-TestIteratorReOrder.";
// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), SequentialSampler(false, 4));
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", SequentialSampler(false, 4));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a Take operation on ds // Create a Take operation on ds
@@ -186,7 +186,7 @@ TEST_F(MindDataTestPipeline, TestIteratorWrongColumn) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestIteratorOneColumn."; MS_LOG(INFO) << "Doing MindDataTestPipeline-TestIteratorOneColumn.";
// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 4));
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 4));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Pass wrong column name // Pass wrong column name


+ 14
- 14
tests/ut/cpp/dataset/c_api_dataset_ops_test.cc View File

@@ -40,7 +40,7 @@ TEST_F(MindDataTestPipeline, TestBatchAndRepeat) {


// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a Repeat operation on ds // Create a Repeat operation on ds
@@ -82,7 +82,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthSuccess1) {


// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a BucketBatchByLength operation on ds // Create a BucketBatchByLength operation on ds
@@ -118,7 +118,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthSuccess2) {


// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a BucketBatchByLength operation on ds // Create a BucketBatchByLength operation on ds
@@ -156,7 +156,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail1) {


// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a BucketBatchByLength operation on ds // Create a BucketBatchByLength operation on ds
@@ -171,7 +171,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail2) {


// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a BucketBatchByLength operation on ds // Create a BucketBatchByLength operation on ds
@@ -186,7 +186,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail3) {


// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a BucketBatchByLength operation on ds // Create a BucketBatchByLength operation on ds
@@ -201,7 +201,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail4) {


// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a BucketBatchByLength operation on ds // Create a BucketBatchByLength operation on ds
@@ -216,7 +216,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail5) {


// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a BucketBatchByLength operation on ds // Create a BucketBatchByLength operation on ds
@@ -231,7 +231,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail6) {


// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);
// Create a BucketBatchByLength operation on ds // Create a BucketBatchByLength operation on ds
ds = ds->BucketBatchByLength({"image"}, {1, 2}, {1, -2, 3}); ds = ds->BucketBatchByLength({"image"}, {1, 2}, {1, -2, 3});
@@ -245,7 +245,7 @@ TEST_F(MindDataTestPipeline, TestBucketBatchByLengthFail7) {


// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a BucketBatchByLength operation on ds // Create a BucketBatchByLength operation on ds
@@ -312,7 +312,7 @@ TEST_F(MindDataTestPipeline, TestConcatSuccess) {
// Create a Cifar10 Dataset // Create a Cifar10 Dataset
// Column names: {"image", "label"} // Column names: {"image", "label"}
folder_path = datasets_root_path_ + "/testCifar10Data/"; folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds2 = Cifar10(folder_path, std::string(), RandomSampler(false, 9));
std::shared_ptr<Dataset> ds2 = Cifar10(folder_path, "all", RandomSampler(false, 9));
EXPECT_NE(ds2, nullptr); EXPECT_NE(ds2, nullptr);


// Create a Project operation on ds // Create a Project operation on ds
@@ -364,7 +364,7 @@ TEST_F(MindDataTestPipeline, TestConcatSuccess2) {
// Create a Cifar10 Dataset // Create a Cifar10 Dataset
// Column names: {"image", "label"} // Column names: {"image", "label"}
folder_path = datasets_root_path_ + "/testCifar10Data/"; folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds2 = Cifar10(folder_path, std::string(), RandomSampler(false, 9));
std::shared_ptr<Dataset> ds2 = Cifar10(folder_path, "all", RandomSampler(false, 9));
EXPECT_NE(ds2, nullptr); EXPECT_NE(ds2, nullptr);


// Create a Project operation on ds // Create a Project operation on ds
@@ -1012,7 +1012,7 @@ TEST_F(MindDataTestPipeline, TestTensorOpsAndMap) {


// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), RandomSampler(false, 20));
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 20));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a Repeat operation on ds // Create a Repeat operation on ds
@@ -1126,7 +1126,7 @@ TEST_F(MindDataTestPipeline, TestZipSuccess) {
EXPECT_NE(ds1, nullptr); EXPECT_NE(ds1, nullptr);


folder_path = datasets_root_path_ + "/testCifar10Data/"; folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds2 = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds2 = Cifar10(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds2, nullptr); EXPECT_NE(ds2, nullptr);


// Create a Project operation on ds // Create a Project operation on ds


+ 44
- 21
tests/ut/cpp/dataset/c_api_dataset_vocab.cc View File

@@ -80,6 +80,50 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
} }
} }


TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpEmptyString.";

// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);

// Create a vocab from vector
std::vector<std::string> list = {"home", "IS", "behind", "the", "world", "ahead", "!"};
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromVector(list, {"<pad>", ""}, true, &vocab);
EXPECT_EQ(s, Status::OK());

// Create Lookup operation on ds
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "", DataType("int32"));
EXPECT_NE(lookup, nullptr);

// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);

// Create an iterator over the result of the above dataset
// This will trigger the creation of the Execution Tree and launch it.
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);

// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);

uint64_t i = 0;
std::vector<int32_t> expected = {2, 1, 4, 5, 6, 7};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind->shape() << " " << *ind;
std::shared_ptr<Tensor> expected_item;
Tensor::CreateScalar(expected[i], &expected_item);
EXPECT_EQ(*ind, *expected_item);
iter->GetNextRow(&row);
i++;
}
}

TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) { TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail1."; MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail1.";
// Create a TextFile Dataset // Create a TextFile Dataset
@@ -110,27 +154,6 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) {
EXPECT_EQ(lookup, nullptr); EXPECT_EQ(lookup, nullptr);
} }


TEST_F(MindDataTestPipeline, TestVocabLookupOpWithEmptyUnknownToken) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpWithEmptyUnknownToken.";

// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);

// Create a vocab from map
std::unordered_map<std::string, int32_t> dict;
dict["Home"] = 3;
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
EXPECT_EQ(s, Status::OK());

// Create Lookup operation on ds
// Expected failure: "" is not a word of vocab
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "", DataType("int32"));
EXPECT_EQ(lookup, nullptr);
}

TEST_F(MindDataTestPipeline, TestVocabFromDataset) { TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDataset."; MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDataset.";




+ 2
- 2
tests/ut/cpp/dataset/c_api_datasets_test.cc View File

@@ -133,7 +133,7 @@ TEST_F(MindDataTestPipeline, TestMnistFailWithWrongDatasetDir) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFailWithWrongDatasetDir."; MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFailWithWrongDatasetDir.";


// Create a Mnist Dataset // Create a Mnist Dataset
std::shared_ptr<Dataset> ds = Mnist("", std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Mnist("", "all", RandomSampler(false, 10));
EXPECT_EQ(ds, nullptr); EXPECT_EQ(ds, nullptr);
} }


@@ -142,7 +142,7 @@ TEST_F(MindDataTestPipeline, TestMnistFailWithNullSampler) {


// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, std::string(), nullptr);
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", nullptr);
// Expect failure: sampler can not be nullptr // Expect failure: sampler can not be nullptr
EXPECT_EQ(ds, nullptr); EXPECT_EQ(ds, nullptr);
} }


+ 10
- 10
tests/ut/cpp/dataset/c_api_transforms_test.cc View File

@@ -30,7 +30,7 @@ TEST_F(MindDataTestPipeline, TestCutMixBatchSuccess1) {
// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
int number_of_classes = 10; int number_of_classes = 10;
std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create objects for the tensor ops // Create objects for the tensor ops
@@ -98,7 +98,7 @@ TEST_F(MindDataTestPipeline, TestCutMixBatchSuccess2) {
// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
int number_of_classes = 10; int number_of_classes = 10;
std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a Batch operation on ds // Create a Batch operation on ds
@@ -156,7 +156,7 @@ TEST_F(MindDataTestPipeline, TestCutMixBatchFail1) {
// Must fail because alpha can't be negative // Must fail because alpha can't be negative
// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a Batch operation on ds // Create a Batch operation on ds
@@ -181,7 +181,7 @@ TEST_F(MindDataTestPipeline, TestCutMixBatchFail2) {
// Must fail because prob can't be negative // Must fail because prob can't be negative
// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a Batch operation on ds // Create a Batch operation on ds
@@ -206,7 +206,7 @@ TEST_F(MindDataTestPipeline, TestCutMixBatchFail3) {
// Must fail because alpha can't be zero // Must fail because alpha can't be zero
// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a Batch operation on ds // Create a Batch operation on ds
@@ -376,7 +376,7 @@ TEST_F(MindDataTestPipeline, TestHwcToChw) {
TEST_F(MindDataTestPipeline, TestMixUpBatchFail1) { TEST_F(MindDataTestPipeline, TestMixUpBatchFail1) {
// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a Batch operation on ds // Create a Batch operation on ds
@@ -400,7 +400,7 @@ TEST_F(MindDataTestPipeline, TestMixUpBatchFail2) {
// This should fail because alpha can't be zero // This should fail because alpha can't be zero
// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a Batch operation on ds // Create a Batch operation on ds
@@ -423,7 +423,7 @@ TEST_F(MindDataTestPipeline, TestMixUpBatchFail2) {
TEST_F(MindDataTestPipeline, TestMixUpBatchSuccess1) { TEST_F(MindDataTestPipeline, TestMixUpBatchSuccess1) {
// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a Batch operation on ds // Create a Batch operation on ds
@@ -472,7 +472,7 @@ TEST_F(MindDataTestPipeline, TestMixUpBatchSuccess1) {
TEST_F(MindDataTestPipeline, TestMixUpBatchSuccess2) { TEST_F(MindDataTestPipeline, TestMixUpBatchSuccess2) {
// Create a Cifar10 Dataset // Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/"; std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, std::string(), RandomSampler(false, 10));
std::shared_ptr<Dataset> ds = Cifar10(folder_path, "all", RandomSampler(false, 10));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a Batch operation on ds // Create a Batch operation on ds
@@ -1118,7 +1118,7 @@ TEST_F(MindDataTestPipeline, TestRandomRotation) {
TEST_F(MindDataTestPipeline, TestUniformAugWithOps) { TEST_F(MindDataTestPipeline, TestUniformAugWithOps) {
// Create a Mnist Dataset // Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/"; std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, "", RandomSampler(false, 20));
std::shared_ptr<Dataset> ds = Mnist(folder_path, "all", RandomSampler(false, 20));
EXPECT_NE(ds, nullptr); EXPECT_NE(ds, nullptr);


// Create a Repeat operation on ds // Create a Repeat operation on ds


Loading…
Cancel
Save