Merge pull request !4422 from xiefangqi/xfq_c++api_randomdatatags/v0.7.0-beta
| @@ -28,6 +28,7 @@ | |||
| #include "minddata/dataset/engine/datasetops/source/image_folder_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/manifest_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/mnist_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/random_data_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/text_file_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/voc_op.h" | |||
| // Dataset operator headers (in alphabetical order) | |||
| @@ -102,6 +103,15 @@ Dataset::Dataset() { | |||
| worker_connector_size_ = cfg->worker_connector_size(); | |||
| } | |||
| /// \brief Function to create a SchemaObj | |||
| /// \param[in] schema_file Path of schema file | |||
| /// \return Shared pointer to the current schema | |||
| std::shared_ptr<SchemaObj> Schema(const std::string &schema_file) { | |||
| auto schema = std::make_shared<SchemaObj>(schema_file); | |||
| return schema->init() ? schema : nullptr; | |||
| } | |||
| // FUNCTIONS TO CREATE DATASETS FOR LEAF-NODE DATASETS | |||
| // (In alphabetical order) | |||
| @@ -366,6 +376,163 @@ std::shared_ptr<ZipDataset> Dataset::Zip(const std::vector<std::shared_ptr<Datas | |||
| return ds->ValidateParams() ? ds : nullptr; | |||
| } | |||
| SchemaObj::SchemaObj(const std::string &schema_file) : schema_file_(schema_file), num_rows_(0), dataset_type_("") {} | |||
| // SchemaObj init function | |||
| bool SchemaObj::init() { | |||
| if (schema_file_ != "") { | |||
| Path schema_file(schema_file_); | |||
| if (!schema_file.Exists()) { | |||
| MS_LOG(ERROR) << "The file " << schema_file << " does not exist or permission denied!"; | |||
| return false; | |||
| } | |||
| nlohmann::json js; | |||
| try { | |||
| std::ifstream in(schema_file_); | |||
| in >> js; | |||
| } catch (const std::exception &err) { | |||
| MS_LOG(ERROR) << "Schema file failed to load"; | |||
| return false; | |||
| } | |||
| return from_json(js); | |||
| } | |||
| return true; | |||
| } | |||
| // Function to add a column to schema with a mstype de_type | |||
| bool SchemaObj::add_column(std::string name, TypeId de_type, std::vector<int32_t> shape) { | |||
| nlohmann::json new_column; | |||
| new_column["name"] = name; | |||
| // if de_type is mstype | |||
| DataType data_type = dataset::MSTypeToDEType(de_type); | |||
| new_column["type"] = data_type.ToString(); | |||
| if (shape.size() > 0) { | |||
| new_column["shape"] = shape; | |||
| new_column["rank"] = shape.size(); | |||
| } else { | |||
| new_column["rank"] = 1; | |||
| } | |||
| columns_.push_back(new_column); | |||
| return true; | |||
| } | |||
| // Function to add a column to schema with a string de_type | |||
| bool SchemaObj::add_column(std::string name, std::string de_type, std::vector<int32_t> shape) { | |||
| nlohmann::json new_column; | |||
| new_column["name"] = name; | |||
| DataType data_type(de_type); | |||
| new_column["type"] = data_type.ToString(); | |||
| if (shape.size() > 0) { | |||
| new_column["shape"] = shape; | |||
| new_column["rank"] = shape.size(); | |||
| } else { | |||
| new_column["rank"] = 1; | |||
| } | |||
| columns_.push_back(new_column); | |||
| return true; | |||
| } | |||
| std::string SchemaObj::to_json() { | |||
| nlohmann::json json_file; | |||
| json_file["columns"] = columns_; | |||
| if (dataset_type_ != "") { | |||
| json_file["datasetType"] = dataset_type_; | |||
| } | |||
| if (num_rows_ > 0) { | |||
| json_file["numRows"] = num_rows_; | |||
| } | |||
| return json_file.dump(2); | |||
| } | |||
| bool SchemaObj::parse_column(nlohmann::json columns) { | |||
| std::string name, de_type; | |||
| std::vector<int32_t> shape; | |||
| columns_.clear(); | |||
| if (columns.type() == nlohmann::json::value_t::array) { | |||
| // reference to python list | |||
| for (auto column : columns) { | |||
| auto key_name = column.find("name"); | |||
| if (key_name == column.end()) { | |||
| MS_LOG(ERROR) << "Column's name is missing"; | |||
| return false; | |||
| } | |||
| name = *key_name; | |||
| auto key_type = column.find("type"); | |||
| if (key_type == column.end()) { | |||
| MS_LOG(ERROR) << "Column's type is missing"; | |||
| return false; | |||
| } | |||
| de_type = *key_type; | |||
| shape.clear(); | |||
| auto key_shape = column.find("shape"); | |||
| if (key_shape != column.end()) { | |||
| shape.insert(shape.end(), (*key_shape).begin(), (*key_shape).end()); | |||
| } | |||
| if (!add_column(name, de_type, shape)) { | |||
| return false; | |||
| } | |||
| } | |||
| } else if (columns.type() == nlohmann::json::value_t::object) { | |||
| for (const auto &it_child : columns.items()) { | |||
| name = it_child.key(); | |||
| auto key_type = it_child.value().find("type"); | |||
| if (key_type == it_child.value().end()) { | |||
| MS_LOG(ERROR) << "Column's type is missing"; | |||
| return false; | |||
| } | |||
| de_type = *key_type; | |||
| shape.clear(); | |||
| auto key_shape = it_child.value().find("shape"); | |||
| if (key_shape != it_child.value().end()) { | |||
| shape.insert(shape.end(), (*key_shape).begin(), (*key_shape).end()); | |||
| } | |||
| if (!add_column(name, de_type, shape)) { | |||
| return false; | |||
| } | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << "columns must be dict or list, columns contain name, type, shape(optional)."; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| bool SchemaObj::from_json(nlohmann::json json_obj) { | |||
| for (const auto &it_child : json_obj.items()) { | |||
| if (it_child.key() == "datasetType") { | |||
| dataset_type_ = it_child.value(); | |||
| } else if (it_child.key() == "numRows") { | |||
| num_rows_ = it_child.value(); | |||
| } else if (it_child.key() == "columns") { | |||
| if (!parse_column(it_child.value())) { | |||
| MS_LOG(ERROR) << "parse columns failed"; | |||
| return false; | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << "Unknown field " << it_child.key(); | |||
| return false; | |||
| } | |||
| } | |||
| if (columns_.empty()) { | |||
| MS_LOG(ERROR) << "Columns are missing."; | |||
| return false; | |||
| } | |||
| if (num_rows_ <= 0) { | |||
| MS_LOG(ERROR) << "numRows must be greater than 0"; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| // OTHER FUNCTIONS | |||
| // Helper function to create default RandomSampler. | |||
| @@ -960,6 +1127,67 @@ std::vector<std::shared_ptr<DatasetOp>> MnistDataset::Build() { | |||
| return node_ops; | |||
| } | |||
| // ValideParams for RandomDataset | |||
| bool RandomDataset::ValidateParams() { | |||
| if (total_rows_ < 0) { | |||
| MS_LOG(ERROR) << "RandomDataset: total_rows must be greater than 0, now get " << total_rows_; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| int32_t RandomDataset::GenRandomInt(int32_t min, int32_t max) { | |||
| std::uniform_int_distribution<int32_t> uniDist(min, max); | |||
| return uniDist(rand_gen_); | |||
| } | |||
| // Build for RandomDataset | |||
| std::vector<std::shared_ptr<DatasetOp>> RandomDataset::Build() { | |||
| // A vector containing shared pointer to the Dataset Ops that this object will create | |||
| std::vector<std::shared_ptr<DatasetOp>> node_ops; | |||
| rand_gen_.seed(GetSeed()); // seed the random generator | |||
| // If total rows was not given, then randomly pick a number | |||
| std::shared_ptr<SchemaObj> schema_obj; | |||
| if (!schema_path_.empty()) schema_obj = std::make_shared<SchemaObj>(schema_path_); | |||
| if (schema_obj != nullptr && total_rows_ == 0) { | |||
| total_rows_ = schema_obj->get_num_rows(); | |||
| } | |||
| // If user does not specify Sampler, create a default sampler based on the shuffle variable. | |||
| if (sampler_ == nullptr) { | |||
| sampler_ = CreateDefaultSampler(); | |||
| } | |||
| std::string schema_json_string, schema_file_path; | |||
| if (schema_ != nullptr) { | |||
| schema_->set_dataset_type("Random"); | |||
| if (total_rows_ != 0) { | |||
| schema_->set_num_rows(total_rows_); | |||
| } | |||
| schema_json_string = schema_->to_json(); | |||
| } else { | |||
| schema_file_path = schema_path_; | |||
| } | |||
| std::unique_ptr<DataSchema> data_schema; | |||
| std::vector<std::string> columns_to_load; | |||
| if (!schema_file_path.empty() || !schema_json_string.empty()) { | |||
| data_schema = std::make_unique<DataSchema>(); | |||
| if (!schema_file_path.empty()) { | |||
| data_schema->LoadSchemaFile(schema_file_path, columns_to_load); | |||
| } else if (!schema_json_string.empty()) { | |||
| data_schema->LoadSchemaString(schema_json_string, columns_to_load); | |||
| } | |||
| } | |||
| std::shared_ptr<RandomDataOp> op; | |||
| op = std::make_shared<RandomDataOp>(num_workers_, connector_que_size_, rows_per_buffer_, total_rows_, | |||
| std::move(data_schema), std::move(sampler_->Build())); | |||
| node_ops.push_back(op); | |||
| return node_ops; | |||
| } | |||
| // Constructor for TextFileDataset | |||
| TextFileDataset::TextFileDataset(std::vector<std::string> dataset_files, int32_t num_samples, ShuffleMode shuffle, | |||
| int32_t num_shards, int32_t shard_id) | |||
| @@ -15,6 +15,7 @@ | |||
| */ | |||
| #include "minddata/dataset/include/de_tensor.h" | |||
| #include "minddata/dataset/include/type_id.h" | |||
| #include "minddata/dataset/core/constants.h" | |||
| #include "minddata/dataset/core/data_type.h" | |||
| #include "mindspore/core/ir/dtype/type_id.h" | |||
| @@ -23,68 +24,6 @@ | |||
| namespace mindspore { | |||
| namespace tensor { | |||
// Maps a mindspore TypeId onto the equivalent dataset-engine DataType.
// Any TypeId with no dataset counterpart maps to DE_UNKNOWN.
dataset::DataType MSTypeToDEType(TypeId data_type) {
  switch (data_type) {
    case kNumberTypeBool:
      return dataset::DataType(dataset::DataType::DE_BOOL);
    case kNumberTypeInt8:
      return dataset::DataType(dataset::DataType::DE_INT8);
    case kNumberTypeUInt8:
      return dataset::DataType(dataset::DataType::DE_UINT8);
    case kNumberTypeInt16:
      return dataset::DataType(dataset::DataType::DE_INT16);
    case kNumberTypeUInt16:
      return dataset::DataType(dataset::DataType::DE_UINT16);
    case kNumberTypeInt32:
      return dataset::DataType(dataset::DataType::DE_INT32);
    case kNumberTypeUInt32:
      return dataset::DataType(dataset::DataType::DE_UINT32);
    case kNumberTypeInt64:
      return dataset::DataType(dataset::DataType::DE_INT64);
    case kNumberTypeUInt64:
      return dataset::DataType(dataset::DataType::DE_UINT64);
    case kNumberTypeFloat16:
      return dataset::DataType(dataset::DataType::DE_FLOAT16);
    case kNumberTypeFloat32:
      return dataset::DataType(dataset::DataType::DE_FLOAT32);
    case kNumberTypeFloat64:
      return dataset::DataType(dataset::DataType::DE_FLOAT64);
    default:
      return dataset::DataType(dataset::DataType::DE_UNKNOWN);
  }
}
// Maps a dataset-engine DataType back onto the equivalent mindspore TypeId.
// Types with no mindspore counterpart (e.g. strings) map to kTypeUnknown.
TypeId DETypeToMSType(dataset::DataType data_type) {
  switch (data_type.value()) {
    case dataset::DataType::DE_BOOL:
      return mindspore::TypeId::kNumberTypeBool;
    case dataset::DataType::DE_INT8:
      return mindspore::TypeId::kNumberTypeInt8;
    case dataset::DataType::DE_UINT8:
      return mindspore::TypeId::kNumberTypeUInt8;
    case dataset::DataType::DE_INT16:
      return mindspore::TypeId::kNumberTypeInt16;
    case dataset::DataType::DE_UINT16:
      return mindspore::TypeId::kNumberTypeUInt16;
    case dataset::DataType::DE_INT32:
      return mindspore::TypeId::kNumberTypeInt32;
    case dataset::DataType::DE_UINT32:
      return mindspore::TypeId::kNumberTypeUInt32;
    case dataset::DataType::DE_INT64:
      return mindspore::TypeId::kNumberTypeInt64;
    case dataset::DataType::DE_UINT64:
      return mindspore::TypeId::kNumberTypeUInt64;
    case dataset::DataType::DE_FLOAT16:
      return mindspore::TypeId::kNumberTypeFloat16;
    case dataset::DataType::DE_FLOAT32:
      return mindspore::TypeId::kNumberTypeFloat32;
    case dataset::DataType::DE_FLOAT64:
      return mindspore::TypeId::kNumberTypeFloat64;
    default:
      return kTypeUnknown;
  }
}
// Factory for an empty DETensor of the given element type and shape.
// NOTE(review): returns a raw owning pointer allocated with `new`; the caller
// is responsible for deleting it — confirm this matches the MSTensor API contract.
MSTensor *DETensor::CreateTensor(TypeId data_type, const std::vector<int> &shape) {
  return new DETensor(data_type, shape);
}
| @@ -113,7 +52,7 @@ DETensor::DETensor(TypeId data_type, const std::vector<int> &shape) { | |||
| t_shape.reserve(shape.size()); | |||
| std::transform(shape.begin(), shape.end(), std::back_inserter(t_shape), | |||
| [](int s) -> dataset::dsize_t { return static_cast<dataset::dsize_t>(s); }); | |||
| dataset::Tensor::CreateEmpty(dataset::TensorShape(t_shape), MSTypeToDEType(data_type), &this->tensor_impl_); | |||
| dataset::Tensor::CreateEmpty(dataset::TensorShape(t_shape), dataset::MSTypeToDEType(data_type), &this->tensor_impl_); | |||
| } | |||
| DETensor::DETensor(std::shared_ptr<dataset::Tensor> tensor_ptr) { this->tensor_impl_ = std::move(tensor_ptr); } | |||
| @@ -133,14 +72,14 @@ std::shared_ptr<dataset::Tensor> DETensor::tensor() const { | |||
| TypeId DETensor::data_type() const { | |||
| MS_ASSERT(this->tensor_impl_ != nullptr); | |||
| return DETypeToMSType(this->tensor_impl_->type()); | |||
| return dataset::DETypeToMSType(this->tensor_impl_->type()); | |||
| } | |||
| TypeId DETensor::set_data_type(TypeId data_type) { | |||
| MS_ASSERT(this->tensor_impl_ != nullptr); | |||
| if (data_type != this->data_type()) { | |||
| std::shared_ptr<dataset::Tensor> temp; | |||
| dataset::Tensor::CreateFromMemory(this->tensor_impl_->shape(), MSTypeToDEType(data_type), | |||
| dataset::Tensor::CreateFromMemory(this->tensor_impl_->shape(), dataset::MSTypeToDEType(data_type), | |||
| this->tensor_impl_->GetBuffer(), &temp); | |||
| this->tensor_impl_ = temp; | |||
| } | |||
| @@ -50,13 +50,6 @@ Status RandomDataOp::Builder::Build(std::shared_ptr<RandomDataOp> *out_op) { | |||
| std::make_shared<RandomDataOp>(builder_num_workers_, builder_op_connector_size_, builder_rows_per_buffer_, | |||
| builder_total_rows_, std::move(builder_data_schema_), std::move(builder_sampler_)); | |||
| // If the user did not provide a schema, then we will ask the op to generate a pseudo-random | |||
| // schema. | |||
| // See details of generateSchema function to learn what type of schema it will create. | |||
| if ((*out_op)->data_schema_ == nullptr) { | |||
| RETURN_IF_NOT_OK((*out_op)->GenerateSchema()); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| @@ -85,6 +78,12 @@ RandomDataOp::RandomDataOp(int32_t num_workers, int32_t op_connector_size, int64 | |||
| if (total_rows_ == 0) { | |||
| total_rows_ = GenRandomInt(1, kMaxTotalRows); | |||
| } | |||
| // If the user did not provide a schema, then we will ask the op to generate a pseudo-random | |||
| // schema. | |||
| // See details of generateSchema function to learn what type of schema it will create. | |||
| if (data_schema_ == nullptr) { | |||
| GenerateSchema(); | |||
| } | |||
| // Everyone is already out from the sync area. | |||
| all_out_.Set(); | |||
| } | |||
| @@ -106,11 +105,7 @@ void RandomDataOp::Print(std::ostream &out, bool show_all) const { | |||
| } | |||
| // Helper function to produce a default/random schema if one didn't exist | |||
| Status RandomDataOp::GenerateSchema() { | |||
| if (data_schema_ != nullptr) { | |||
| return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Generating a schema but one already exists!"); | |||
| } | |||
| void RandomDataOp::GenerateSchema() { | |||
| // To randomly create a schema, we need to choose: | |||
| // a) how many columns | |||
| // b) the type of each column | |||
| @@ -144,8 +139,6 @@ Status RandomDataOp::GenerateSchema() { | |||
| data_schema_->AddColumn(*newCol); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| // Class functor operator () override. | |||
| @@ -213,9 +213,8 @@ class RandomDataOp : public ParallelOp { | |||
| /** | |||
| * Helper function to produce a default/random schema if one didn't exist | |||
| @return Status - The error code return | |||
| */ | |||
| Status GenerateSchema(); | |||
| */ | |||
| void GenerateSchema(); | |||
| /** | |||
| * Performs a synchronization between workers at the end of an epoch | |||
| @@ -25,9 +25,11 @@ | |||
| #include <utility> | |||
| #include <string> | |||
| #include "minddata/dataset/core/constants.h" | |||
| #include "minddata/dataset/engine/data_schema.h" | |||
| #include "minddata/dataset/include/tensor.h" | |||
| #include "minddata/dataset/include/iterator.h" | |||
| #include "minddata/dataset/include/samplers.h" | |||
| #include "minddata/dataset/include/type_id.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -41,6 +43,7 @@ class TensorShape; | |||
| namespace api { | |||
| class TensorOperation; | |||
| class SchemaObj; | |||
| class SamplerObj; | |||
| // Datasets classes (in alphabetical order) | |||
| class CelebADataset; | |||
| @@ -51,6 +54,7 @@ class CocoDataset; | |||
| class ImageFolderDataset; | |||
| class ManifestDataset; | |||
| class MnistDataset; | |||
| class RandomDataset; | |||
| class TextFileDataset; | |||
| class VOCDataset; | |||
| // Dataset Op classes (in alphabetical order) | |||
| @@ -65,6 +69,11 @@ class SkipDataset; | |||
| class TakeDataset; | |||
| class ZipDataset; | |||
| /// \brief Function to create a SchemaObj | |||
| /// \param[in] schema_file Path of schema file | |||
| /// \return Shared pointer to the current schema | |||
| std::shared_ptr<SchemaObj> Schema(const std::string &schema_file = ""); | |||
| /// \brief Function to create a CelebADataset | |||
| /// \notes The generated dataset has two columns ['image', 'attr']. | |||
| // The type of the image tensor is uint8. The attr tensor is uint32 and one hot type. | |||
| @@ -187,6 +196,21 @@ std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir, | |||
| std::shared_ptr<ConcatDataset> operator+(const std::shared_ptr<Dataset> &datasets1, | |||
| const std::shared_ptr<Dataset> &datasets2); | |||
| /// \brief Function to create a RandomDataset | |||
| /// \param[in] total_rows Number of rows for the dataset to generate (default=0, number of rows is random) | |||
| /// \param[in] schema SchemaObj to set column type, data type and data shape | |||
| /// \param[in] columns_list List of columns to be read (default=None, read all columns) | |||
| /// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler` | |||
| /// will be used to randomly iterate the entire dataset | |||
| /// \return Shared pointer to the current Dataset | |||
| template <typename T = std::shared_ptr<SchemaObj>> | |||
| std::shared_ptr<RandomDataset> RandomData(const int32_t &total_rows = 0, T schema = nullptr, | |||
| std::vector<std::string> columns_list = {}, | |||
| std::shared_ptr<SamplerObj> sampler = nullptr) { | |||
| auto ds = std::make_shared<RandomDataset>(total_rows, schema, std::move(columns_list), std::move(sampler)); | |||
| return ds->ValidateParams() ? ds : nullptr; | |||
| } | |||
| /// \brief Function to create a TextFileDataset | |||
| /// \notes The generated dataset has one column ['text'] | |||
| /// \param[in] dataset_files List of files to be read to search for a pattern of files. The list | |||
| @@ -355,6 +379,66 @@ class Dataset : public std::enable_shared_from_this<Dataset> { | |||
| int32_t worker_connector_size_; | |||
| }; | |||
/// \class SchemaObj
/// \brief In-memory representation of a dataset schema: an ordered set of columns
///   (name/type/shape) plus optional datasetType and numRows metadata. Can be
///   built programmatically via add_column() or loaded from a JSON file via init().
class SchemaObj {
 public:
  /// \brief Constructor
  /// \param[in] schema_file Optional path of a JSON schema file; "" means an empty schema.
  explicit SchemaObj(const std::string &schema_file = "");

  /// \brief Destructor
  ~SchemaObj() = default;

  /// \brief SchemaObj init function; loads and parses schema_file if one was given.
  /// \return bool true if schema init success
  bool init();

  /// \brief Add new column to the schema
  /// \param[in] name name of the column.
  /// \param[in] de_type data type of the column(TypeId).
  /// \param[in] shape shape of the column (empty shape records the column as rank 1).
  /// \return bool true if the column was added successfully
  bool add_column(std::string name, TypeId de_type, std::vector<int32_t> shape);

  /// \brief Add new column to the schema
  /// \param[in] name name of the column.
  /// \param[in] de_type data type of the column(std::string).
  /// \param[in] shape shape of the column (empty shape records the column as rank 1).
  /// \return bool true if the column was added successfully
  bool add_column(std::string name, std::string de_type, std::vector<int32_t> shape);

  /// \brief Get a JSON string of the schema (2-space indented)
  /// \return JSON string of the schema
  std::string to_json();

  /// \brief Get a JSON string of the schema (alias of to_json)
  std::string to_string() { return to_json(); }

  /// \brief set a new value to dataset_type
  inline void set_dataset_type(std::string dataset_type) { dataset_type_ = dataset_type; }

  /// \brief set a new value to num_rows
  inline void set_num_rows(int32_t num_rows) { num_rows_ = num_rows; }

  /// \brief get the current num_rows
  inline int32_t get_num_rows() { return num_rows_; }

 private:
  /// \brief Parse the columns and add it to columns
  /// \param[in] columns dataset attribution information, decoded from schema file.
  ///    support both nlohmann::json::value_t::array and nlohmann::json::value_t::object.
  /// \return bool true if all columns were parsed and added successfully
  bool parse_column(nlohmann::json columns);

  /// \brief Populate this schema from a parsed JSON object
  /// \param[in] json_obj object of json parsed.
  /// \return bool true if the json object was parsed successfully
  bool from_json(nlohmann::json json_obj);

  int32_t num_rows_;           // "numRows" field; 0 until set/loaded
  std::string dataset_type_;   // "datasetType" field; "" until set
  std::string schema_file_;    // source file path; "" for in-memory schemas
  nlohmann::json columns_;     // array of per-column json objects
};
| /* ####################################### Derived Dataset classes ################################# */ | |||
| // DERIVED DATASET CLASSES FOR LEAF-NODE DATASETS | |||
| @@ -562,6 +646,53 @@ class MnistDataset : public Dataset { | |||
| std::shared_ptr<SamplerObj> sampler_; | |||
| }; | |||
/// \class RandomDataset
/// \brief A Dataset derived class that generates random data, optionally shaped
///   by a SchemaObj or a schema file. total_rows == 0 means a random row count.
class RandomDataset : public Dataset {
 public:
  // Some constants to provide limits to random generation.
  static constexpr int32_t kMaxNumColumns = 4;
  static constexpr int32_t kMaxRank = 4;
  static constexpr int32_t kMaxDimValue = 32;

  /// \brief Constructor taking an in-memory schema object (may be nullptr for a random schema)
  RandomDataset(const int32_t &total_rows, std::shared_ptr<SchemaObj> schema, std::vector<std::string> columns_list,
                std::shared_ptr<SamplerObj> sampler)
      : total_rows_(total_rows),
        schema_path_(""),
        schema_(std::move(schema)),
        columns_list_(columns_list),
        sampler_(std::move(sampler)) {}

  /// \brief Constructor taking a schema file path (schema_ stays null in this case)
  RandomDataset(const int32_t &total_rows, std::string schema_path, std::vector<std::string> columns_list,
                std::shared_ptr<SamplerObj> sampler)
      : total_rows_(total_rows), schema_path_(schema_path), columns_list_(columns_list), sampler_(std::move(sampler)) {}

  /// \brief Destructor
  ~RandomDataset() = default;

  /// \brief a base class override function to create the required runtime dataset op objects for this class
  /// \return The list of shared pointers to the newly created DatasetOps
  std::vector<std::shared_ptr<DatasetOp>> Build() override;

  /// \brief Parameters validation (total_rows must be non-negative)
  /// \return bool true if all the params are valid
  bool ValidateParams() override;

 private:
  /// \brief A quick inline for producing a random number between (and including) min/max
  /// \param[in] min minimum number that can be generated.
  /// \param[in] max maximum number that can be generated.
  /// \return The generated random number
  int32_t GenRandomInt(int32_t min, int32_t max);

  int32_t total_rows_;                      // 0 = let the op choose a random row count
  std::string schema_path_;                 // "" unless the file-path constructor was used
  std::shared_ptr<SchemaObj> schema_;       // null unless the SchemaObj constructor was used
  // NOTE(review): columns_list_ is stored but not consumed by Build(); confirm it is intended.
  std::vector<std::string> columns_list_;
  std::shared_ptr<SamplerObj> sampler_;
  std::mt19937 rand_gen_;                   // seeded from GetSeed() in Build()
};
| /// \class TextFileDataset | |||
| /// \brief A Dataset derived class to represent TextFile dataset | |||
| class TextFileDataset : public Dataset { | |||
| @@ -0,0 +1,88 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_ | |||
| #include "minddata/dataset/core/data_type.h" | |||
| #include "mindspore/core/ir/dtype/type_id.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| inline dataset::DataType MSTypeToDEType(TypeId data_type) { | |||
| switch (data_type) { | |||
| case kNumberTypeBool: | |||
| return dataset::DataType(dataset::DataType::DE_BOOL); | |||
| case kNumberTypeInt8: | |||
| return dataset::DataType(dataset::DataType::DE_INT8); | |||
| case kNumberTypeUInt8: | |||
| return dataset::DataType(dataset::DataType::DE_UINT8); | |||
| case kNumberTypeInt16: | |||
| return dataset::DataType(dataset::DataType::DE_INT16); | |||
| case kNumberTypeUInt16: | |||
| return dataset::DataType(dataset::DataType::DE_UINT16); | |||
| case kNumberTypeInt32: | |||
| return dataset::DataType(dataset::DataType::DE_INT32); | |||
| case kNumberTypeUInt32: | |||
| return dataset::DataType(dataset::DataType::DE_UINT32); | |||
| case kNumberTypeInt64: | |||
| return dataset::DataType(dataset::DataType::DE_INT64); | |||
| case kNumberTypeUInt64: | |||
| return dataset::DataType(dataset::DataType::DE_UINT64); | |||
| case kNumberTypeFloat16: | |||
| return dataset::DataType(dataset::DataType::DE_FLOAT16); | |||
| case kNumberTypeFloat32: | |||
| return dataset::DataType(dataset::DataType::DE_FLOAT32); | |||
| case kNumberTypeFloat64: | |||
| return dataset::DataType(dataset::DataType::DE_FLOAT64); | |||
| default: | |||
| return dataset::DataType(dataset::DataType::DE_UNKNOWN); | |||
| } | |||
| } | |||
| inline TypeId DETypeToMSType(dataset::DataType data_type) { | |||
| switch (data_type.value()) { | |||
| case dataset::DataType::DE_BOOL: | |||
| return mindspore::TypeId::kNumberTypeBool; | |||
| case dataset::DataType::DE_INT8: | |||
| return mindspore::TypeId::kNumberTypeInt8; | |||
| case dataset::DataType::DE_UINT8: | |||
| return mindspore::TypeId::kNumberTypeUInt8; | |||
| case dataset::DataType::DE_INT16: | |||
| return mindspore::TypeId::kNumberTypeInt16; | |||
| case dataset::DataType::DE_UINT16: | |||
| return mindspore::TypeId::kNumberTypeUInt16; | |||
| case dataset::DataType::DE_INT32: | |||
| return mindspore::TypeId::kNumberTypeInt32; | |||
| case dataset::DataType::DE_UINT32: | |||
| return mindspore::TypeId::kNumberTypeUInt32; | |||
| case dataset::DataType::DE_INT64: | |||
| return mindspore::TypeId::kNumberTypeInt64; | |||
| case dataset::DataType::DE_UINT64: | |||
| return mindspore::TypeId::kNumberTypeUInt64; | |||
| case dataset::DataType::DE_FLOAT16: | |||
| return mindspore::TypeId::kNumberTypeFloat16; | |||
| case dataset::DataType::DE_FLOAT32: | |||
| return mindspore::TypeId::kNumberTypeFloat32; | |||
| case dataset::DataType::DE_FLOAT64: | |||
| return mindspore::TypeId::kNumberTypeFloat64; | |||
| default: | |||
| return kTypeUnknown; | |||
| } | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_ | |||
| @@ -104,6 +104,7 @@ SET(DE_UT_SRCS | |||
| c_api_dataset_clue_test.cc | |||
| c_api_dataset_coco_test.cc | |||
| c_api_dataset_filetext_test.cc | |||
| c_api_dataset_randomdata_test.cc | |||
| c_api_dataset_voc_test.cc | |||
| c_api_datasets_test.cc | |||
| c_api_dataset_iterator_test.cc | |||
| @@ -0,0 +1,271 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "common/common.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| #include "minddata/dataset/core/config_manager.h" | |||
| #include "minddata/dataset/core/global_context.h" | |||
| #include "mindspore/core/ir/dtype/type_id.h" | |||
| using namespace mindspore::dataset; | |||
| using namespace mindspore::dataset::api; | |||
| using mindspore::dataset::Tensor; | |||
| using mindspore::dataset::TensorShape; | |||
| using mindspore::dataset::DataType; | |||
// Shared gtest fixture for the C++ API pipeline tests; inherits dataset-op
// test scaffolding (e.g. datasets_root_path_) from UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};
// Pipeline under test: RandomData(50 rows, explicit 2-column schema) -> 4 workers -> Repeat(4).
TEST_F(MindDataTestPipeline, TestRandomDatasetBasic1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetBasic1.";

  // Create a RandomDataset with a fixed schema of two uint8 columns.
  std::shared_ptr<SchemaObj> schema = Schema();
  schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {2});
  schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1});
  std::shared_ptr<Dataset> ds = RandomData(50, schema);
  EXPECT_NE(ds, nullptr);

  ds = ds->SetNumWorkers(4);
  EXPECT_NE(ds, nullptr);

  // Create a Repeat operation on ds
  ds = ds->Repeat(4);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // Check if RandomDataOp read correct columns
  uint64_t i = 0;
  while (row.size() != 0) {
    auto image = row["image"];
    auto label = row["label"];
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    MS_LOG(INFO) << "Tensor label shape: " << label->shape();
    iter->GetNextRow(&row);
    i++;
  }

  // 50 rows x 4 repeats = 200 rows total.
  EXPECT_EQ(i, 200);

  // Manually terminate the pipeline
  iter->Stop();
}
// Pipeline under test: RandomData(10 rows, NO schema -> pseudo-random schema) -> 1 worker -> Repeat(2).
TEST_F(MindDataTestPipeline, TestRandomDatasetBasic2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetBasic2.";

  // Create a RandomDataset
  std::shared_ptr<Dataset> ds = RandomData(10);
  EXPECT_NE(ds, nullptr);

  ds = ds->SetNumWorkers(1);
  EXPECT_NE(ds, nullptr);

  // Create a Repeat operation on ds
  ds = ds->Repeat(2);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // Check if RandomDataOp read correct columns
  // NOTE(review): no schema was supplied, so column names are randomly generated;
  // row["image"]/row["label"] likely default-insert null tensors here — confirm
  // these lookups (and the shape() calls below) are intended for this case.
  uint64_t i = 0;
  while (row.size() != 0) {
    auto image = row["image"];
    auto label = row["label"];
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    MS_LOG(INFO) << "Tensor label shape: " << label->shape();
    iter->GetNextRow(&row);
    i++;
  }

  // 10 rows x 2 repeats = 20 rows total.
  EXPECT_EQ(i, 20);

  // Manually terminate the pipeline
  iter->Stop();
}
| TEST_F(MindDataTestPipeline, TestRandomDatasetBasic3) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetBasic3."; | |||
| // Create a RandomDataset | |||
| u_int32_t curr_seed = GlobalContext::config_manager()->seed(); | |||
| GlobalContext::config_manager()->set_seed(246); | |||
| std::string SCHEMA_FILE = datasets_root_path_ + "/testTFTestAllTypes/datasetSchema.json"; | |||
| std::shared_ptr<SchemaObj> schema = Schema(SCHEMA_FILE); | |||
| std::shared_ptr<Dataset> ds = RandomData(0, schema); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create a Repeat operation on ds | |||
| ds = ds->Repeat(2); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, std::shared_ptr<Tensor>> row; | |||
| iter->GetNextRow(&row); | |||
| // Check if RandomDataOp read correct columns | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| auto col_sint16 = row["col_sint16"]; | |||
| auto col_sint32 = row["col_sint32"]; | |||
| auto col_sint64 = row["col_sint64"]; | |||
| auto col_float = row["col_float"]; | |||
| auto col_1d = row["col_1d"]; | |||
| auto col_2d = row["col_2d"]; | |||
| auto col_3d = row["col_3d"]; | |||
| auto col_binary = row["col_binary"]; | |||
| // validate shape | |||
| ASSERT_EQ(col_sint16->shape(), TensorShape({1})); | |||
| ASSERT_EQ(col_sint32->shape(), TensorShape({1})); | |||
| ASSERT_EQ(col_sint64->shape(), TensorShape({1})); | |||
| ASSERT_EQ(col_float->shape(), TensorShape({1})); | |||
| ASSERT_EQ(col_1d->shape(), TensorShape({2})); | |||
| ASSERT_EQ(col_2d->shape(), TensorShape({2, 2})); | |||
| ASSERT_EQ(col_3d->shape(), TensorShape({2, 2, 2})); | |||
| ASSERT_EQ(col_binary->shape(), TensorShape({1})); | |||
| // validate Rank | |||
| ASSERT_EQ(col_sint16->Rank(), 1); | |||
| ASSERT_EQ(col_sint32->Rank(), 1); | |||
| ASSERT_EQ(col_sint64->Rank(), 1); | |||
| ASSERT_EQ(col_float->Rank(), 1); | |||
| ASSERT_EQ(col_1d->Rank(), 1); | |||
| ASSERT_EQ(col_2d->Rank(), 2); | |||
| ASSERT_EQ(col_3d->Rank(), 3); | |||
| ASSERT_EQ(col_binary->Rank(), 1); | |||
| // validate type | |||
| ASSERT_EQ(col_sint16->type(), DataType::DE_INT16); | |||
| ASSERT_EQ(col_sint32->type(), DataType::DE_INT32); | |||
| ASSERT_EQ(col_sint64->type(), DataType::DE_INT64); | |||
| ASSERT_EQ(col_float->type(), DataType::DE_FLOAT32); | |||
| ASSERT_EQ(col_1d->type(), DataType::DE_INT64); | |||
| ASSERT_EQ(col_2d->type(), DataType::DE_INT64); | |||
| ASSERT_EQ(col_3d->type(), DataType::DE_INT64); | |||
| ASSERT_EQ(col_binary->type(), DataType::DE_UINT8); | |||
| iter->GetNextRow(&row); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 984); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| GlobalContext::config_manager()->set_seed(curr_seed); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestRandomDatasetBasic4) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetBasic3."; | |||
| // Create a RandomDataset | |||
| u_int32_t curr_seed = GlobalContext::config_manager()->seed(); | |||
| GlobalContext::config_manager()->set_seed(246); | |||
| std::string SCHEMA_FILE = datasets_root_path_ + "/testTFTestAllTypes/datasetSchema.json"; | |||
| std::shared_ptr<Dataset> ds = RandomData(0, SCHEMA_FILE); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create a Repeat operation on ds | |||
| ds = ds->Repeat(2); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, std::shared_ptr<Tensor>> row; | |||
| iter->GetNextRow(&row); | |||
| // Check if RandomDataOp read correct columns | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| auto col_sint16 = row["col_sint16"]; | |||
| auto col_sint32 = row["col_sint32"]; | |||
| auto col_sint64 = row["col_sint64"]; | |||
| auto col_float = row["col_float"]; | |||
| auto col_1d = row["col_1d"]; | |||
| auto col_2d = row["col_2d"]; | |||
| auto col_3d = row["col_3d"]; | |||
| auto col_binary = row["col_binary"]; | |||
| // validate shape | |||
| ASSERT_EQ(col_sint16->shape(), TensorShape({1})); | |||
| ASSERT_EQ(col_sint32->shape(), TensorShape({1})); | |||
| ASSERT_EQ(col_sint64->shape(), TensorShape({1})); | |||
| ASSERT_EQ(col_float->shape(), TensorShape({1})); | |||
| ASSERT_EQ(col_1d->shape(), TensorShape({2})); | |||
| ASSERT_EQ(col_2d->shape(), TensorShape({2, 2})); | |||
| ASSERT_EQ(col_3d->shape(), TensorShape({2, 2, 2})); | |||
| ASSERT_EQ(col_binary->shape(), TensorShape({1})); | |||
| // validate Rank | |||
| ASSERT_EQ(col_sint16->Rank(), 1); | |||
| ASSERT_EQ(col_sint32->Rank(), 1); | |||
| ASSERT_EQ(col_sint64->Rank(), 1); | |||
| ASSERT_EQ(col_float->Rank(), 1); | |||
| ASSERT_EQ(col_1d->Rank(), 1); | |||
| ASSERT_EQ(col_2d->Rank(), 2); | |||
| ASSERT_EQ(col_3d->Rank(), 3); | |||
| ASSERT_EQ(col_binary->Rank(), 1); | |||
| // validate type | |||
| ASSERT_EQ(col_sint16->type(), DataType::DE_INT16); | |||
| ASSERT_EQ(col_sint32->type(), DataType::DE_INT32); | |||
| ASSERT_EQ(col_sint64->type(), DataType::DE_INT64); | |||
| ASSERT_EQ(col_float->type(), DataType::DE_FLOAT32); | |||
| ASSERT_EQ(col_1d->type(), DataType::DE_INT64); | |||
| ASSERT_EQ(col_2d->type(), DataType::DE_INT64); | |||
| ASSERT_EQ(col_3d->type(), DataType::DE_INT64); | |||
| ASSERT_EQ(col_binary->type(), DataType::DE_UINT8); | |||
| iter->GetNextRow(&row); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 984); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| GlobalContext::config_manager()->set_seed(curr_seed); | |||
| } | |||