| @@ -22,6 +22,7 @@ | |||
| #include "minddata/dataset/callback/py_ds_callback.h" | |||
| #include "minddata/dataset/core/constants.h" | |||
| #include "minddata/dataset/core/global_context.h" | |||
| #include "minddata/dataset/engine/serdes.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| #include "minddata/dataset/text/sentence_piece_vocab.h" | |||
| @@ -92,7 +93,13 @@ PYBIND_REGISTER(DatasetNode, 1, ([](const py::module *m) { | |||
| THROW_IF_ERROR(zip->ValidateParams()); | |||
| return zip; | |||
| }, | |||
| py::arg("datasets")); | |||
| py::arg("datasets")) | |||
| .def("to_json", [](std::shared_ptr<DatasetNode> self, const std::string &json_filepath) { | |||
| nlohmann::json args; | |||
| auto serdes = std::make_shared<Serdes>(); | |||
| THROW_IF_ERROR(serdes->SaveToJSON(self, json_filepath, &args)); | |||
| return args.dump(); | |||
| }); | |||
| })); | |||
| // PYBIND FOR LEAF NODES | |||
| @@ -258,6 +258,11 @@ std::shared_ptr<SamplerObj> PreBuiltSamplerObj::Copy() { | |||
| return sampler; | |||
| } | |||
| Status PreBuiltSamplerObj::to_json(nlohmann::json *out_json) { | |||
| RETURN_IF_NOT_OK(sp_->to_json(out_json)); | |||
| return Status::OK(); | |||
| } | |||
| #ifndef ENABLE_ANDROID | |||
| std::shared_ptr<mindrecord::ShardOperator> PKSamplerObj::BuildForMindDataset() { | |||
| // runtime mindrecord sampler object | |||
| @@ -229,6 +229,11 @@ std::shared_ptr<TensorOp> PreBuiltOperation::Build() { return op_; } | |||
| std::string PreBuiltOperation::Name() const { return op_ ? op_->Name() : kPreBuiltOperation; } | |||
| Status PreBuiltOperation::to_json(nlohmann::json *out_json) { | |||
| RETURN_IF_NOT_OK(op_->to_json(out_json)); | |||
| return Status::OK(); | |||
| } | |||
| // RandomApplyOperation | |||
| RandomApplyOperation::RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob) | |||
| : TensorOperation(true), transforms_(transforms), prob_(prob) {} | |||
| @@ -20,6 +20,7 @@ set(SRC_FILES_LIST | |||
| runtime_context.cc | |||
| python_runtime_context.cc | |||
| consumers/tree_consumer.cc | |||
| serdes.cc | |||
| ) | |||
| if (ENABLE_PYTHON) | |||
| set(SRC_FILES_LIST | |||
| @@ -190,5 +190,26 @@ void DistributedSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const | |||
| } | |||
| } | |||
| Status DistributedSamplerRT::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["sampler_name"] = "DistributedSampler"; | |||
| args["num_shards"] = num_devices_; | |||
| args["shard_id"] = device_id_; | |||
| args["shuffle"] = shuffle_; | |||
| args["num_samples"] = num_samples_; | |||
| args["offset"] = offset_; | |||
| if (this->HasChildSampler()) { | |||
| std::vector<nlohmann::json> children_args; | |||
| for (auto child : child_) { | |||
| nlohmann::json child_arg; | |||
| RETURN_IF_NOT_OK(child->to_json(&child_arg)); | |||
| children_args.push_back(child_arg); | |||
| } | |||
| args["child_sampler"] = children_args; | |||
| } | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -72,6 +72,11 @@ class DistributedSamplerRT : public SamplerRT { | |||
| void SamplerPrint(std::ostream &out, bool show_all) const override; | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| int64_t cnt_; // number of samples that have already been filled in to buffer | |||
| uint32_t seed_; | |||
| @@ -128,5 +128,24 @@ void PKSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const { | |||
| // Then add our own info if any | |||
| } | |||
| } | |||
| Status PKSamplerRT::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["sampler_name"] = "PKSampler"; | |||
| args["num_val"] = samples_per_class_; | |||
| args["shuffle"] = shuffle_; | |||
| args["num_samples"] = num_samples_; | |||
| if (this->HasChildSampler()) { | |||
| std::vector<nlohmann::json> children_args; | |||
| for (auto child : child_) { | |||
| nlohmann::json child_arg; | |||
| RETURN_IF_NOT_OK(child->to_json(&child_arg)); | |||
| children_args.push_back(child_arg); | |||
| } | |||
| args["child_sampler"] = children_args; | |||
| } | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -61,6 +61,11 @@ class PKSamplerRT : public SamplerRT { // NOT YET FINISHED | |||
| // @param show_all - bool to show detailed vs summary | |||
| void SamplerPrint(std::ostream &out, bool show_all) const override; | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| bool shuffle_; | |||
| uint32_t seed_; | |||
| @@ -127,5 +127,24 @@ void RandomSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const { | |||
| // Then add our own info if any | |||
| } | |||
| } | |||
| Status RandomSamplerRT::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["sampler_name"] = "RandomSampler"; | |||
| args["replacement"] = replacement_; | |||
| args["num_samples"] = num_samples_; | |||
| args["reshuffle_each_epoch"] = reshuffle_each_epoch_; | |||
| if (this->HasChildSampler()) { | |||
| std::vector<nlohmann::json> children_args; | |||
| for (auto child : child_) { | |||
| nlohmann::json child_arg; | |||
| RETURN_IF_NOT_OK(child->to_json(&child_arg)); | |||
| children_args.push_back(child_arg); | |||
| } | |||
| args["child_sampler"] = children_args; | |||
| } | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -52,6 +52,11 @@ class RandomSamplerRT : public SamplerRT { | |||
| void SamplerPrint(std::ostream &out, bool show_all) const override; | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| uint32_t seed_; | |||
| bool replacement_; | |||
| @@ -149,6 +149,11 @@ class SamplerRT { | |||
| // @return Status The status code returned | |||
| Status GetAssociatedChildId(int64_t *out_associated_id, int64_t id); | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); } | |||
| protected: | |||
| // Number of rows of data from the place this sampler is sampling from. If this sampler | |||
| // has a child sampler, num_rows_ is the number of ids the child sampler will | |||
| @@ -17,6 +17,7 @@ | |||
| #include <algorithm> | |||
| #include <memory> | |||
| #include <vector> | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -131,5 +132,23 @@ void SequentialSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const { | |||
| out << "\nStart index: " << start_index_; | |||
| } | |||
| } | |||
| Status SequentialSamplerRT::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["sampler_name"] = "SequentialSampler"; | |||
| args["start_index"] = start_index_; | |||
| args["num_samples"] = num_samples_; | |||
| if (this->HasChildSampler()) { | |||
| std::vector<nlohmann::json> children_args; | |||
| for (auto child : child_) { | |||
| nlohmann::json child_arg; | |||
| RETURN_IF_NOT_OK(child->to_json(&child_arg)); | |||
| children_args.push_back(child_arg); | |||
| } | |||
| args["child_sampler"] = children_args; | |||
| } | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -61,6 +61,11 @@ class SequentialSamplerRT : public SamplerRT { | |||
| // @param show_all - bool to show detailed vs summary | |||
| void SamplerPrint(std::ostream &out, bool show_all) const override; | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| int64_t current_id_; // The id sequencer. Each new id increments from this | |||
| int64_t start_index_; // The starting id. current_id_ begins from here. | |||
| @@ -131,5 +131,23 @@ void SubsetRandomSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const | |||
| // Then add our own info if any | |||
| } | |||
| } | |||
| Status SubsetRandomSamplerRT::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["sampler_name"] = "SubsetRandomSampler"; | |||
| args["indices"] = indices_; | |||
| args["num_samples"] = num_samples_; | |||
| if (this->HasChildSampler()) { | |||
| std::vector<nlohmann::json> children_args; | |||
| for (auto child : child_) { | |||
| nlohmann::json child_arg; | |||
| RETURN_IF_NOT_OK(child->to_json(&child_arg)); | |||
| children_args.push_back(child_arg); | |||
| } | |||
| args["child_sampler"] = children_args; | |||
| } | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -56,6 +56,11 @@ class SubsetRandomSamplerRT : public SamplerRT { | |||
| // @param show_all - bool to show detailed vs summary | |||
| void SamplerPrint(std::ostream &out, bool show_all) const override; | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| // A list of indices (already randomized in constructor). | |||
| std::vector<int64_t> indices_; | |||
| @@ -193,5 +193,24 @@ void WeightedRandomSamplerRT::SamplerPrint(std::ostream &out, bool show_all) con | |||
| // Then add our own info if any | |||
| } | |||
| } | |||
| Status WeightedRandomSamplerRT::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["sampler_name"] = "WeightedRandomSampler"; | |||
| args["weights"] = weights_; | |||
| args["num_samples"] = num_samples_; | |||
| args["replacement"] = replacement_; | |||
| if (this->HasChildSampler()) { | |||
| std::vector<nlohmann::json> children_args; | |||
| for (auto child : child_) { | |||
| nlohmann::json child_arg; | |||
| RETURN_IF_NOT_OK(child->to_json(&child_arg)); | |||
| children_args.push_back(child_arg); | |||
| } | |||
| args["child_sampler"] = children_args; | |||
| } | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -58,6 +58,11 @@ class WeightedRandomSamplerRT : public SamplerRT { | |||
| // @param show_all - bool to show detailed vs summary | |||
| void SamplerPrint(std::ostream &out, bool show_all) const override; | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| // A list of weights for each sample. | |||
| std::vector<double> weights_; | |||
| @@ -28,6 +28,7 @@ class DatasetCache { | |||
| virtual Status Build() = 0; | |||
| virtual Status ValidateParams() = 0; | |||
| virtual Status CreateCacheOp(int num_workers, std::shared_ptr<DatasetOp> *ds_op) = 0; | |||
| virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); } | |||
| }; | |||
| } // namespace mindspore::dataset | |||
| @@ -44,5 +44,18 @@ Status DatasetCacheImpl::CreateCacheOp(int32_t num_workers, std::shared_ptr<Data | |||
| return Status::OK(); | |||
| } | |||
| Status DatasetCacheImpl::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["session_id"] = session_id_; | |||
| args["cache_memory_size"] = cache_mem_sz_; | |||
| args["spill"] = spill_; | |||
| if (hostname_) args["hostname"] = hostname_.value(); | |||
| if (port_) args["port"] = port_.value(); | |||
| if (num_connections_) args["num_connections"] = num_connections_.value(); | |||
| if (prefetch_sz_) args["prefetch_size"] = prefetch_sz_.value(); | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -60,6 +60,8 @@ class DatasetCacheImpl : public DatasetCache { | |||
| ~DatasetCacheImpl() = default; | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| std::shared_ptr<CacheClient> cache_client_; | |||
| session_id_type session_id_; | |||
| @@ -152,5 +152,20 @@ Status BatchNode::AcceptAfter(IRNodePass *const p, bool *const modified) { | |||
| // Downcast shared pointer then call visitor | |||
| return p->VisitAfter(shared_from_base<BatchNode>(), modified); | |||
| } | |||
| Status BatchNode::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["num_parallel_workers"] = num_workers_; | |||
| args["batch_size"] = batch_size_; | |||
| args["drop_remainder"] = drop_remainder_; | |||
| #ifdef ENABLE_PYTHON | |||
| args["input_columns"] = in_col_names_; | |||
| args["output_columns"] = out_col_names_; | |||
| args["column_order"] = col_order_; | |||
| if (batch_map_func_ != nullptr) args["per_batch_map"] = "pyfunc"; | |||
| #endif | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -87,6 +87,24 @@ class BatchNode : public DatasetNode { | |||
| /// \return Status of the node visit | |||
| Status AcceptAfter(IRNodePass *const p, bool *const modified) override; | |||
| /// \brief Getter functions | |||
| int32_t BatchSize() const { return batch_size_; } | |||
| bool DropRemainder() const { return drop_remainder_; } | |||
| #ifdef ENABLE_PYTHON | |||
| bool Pad() const { return pad_; } | |||
| const std::vector<std::string> &InColNames() const { return in_col_names_; } | |||
| const std::vector<std::string> &OutColNames() const { return out_col_names_; } | |||
| const std::vector<std::string> &ColOrder() const { return col_order_; } | |||
| const py::function &BatchSizeFunc() const { return batch_size_func_; } | |||
| const py::function &BatchMapFunc() const { return batch_map_func_; } | |||
| const std::map<std::string, std::pair<TensorShape, std::shared_ptr<Tensor>>> &PadMap() const { return pad_map_; } | |||
| #endif | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| int32_t batch_size_; | |||
| bool drop_remainder_; | |||
| @@ -431,6 +431,13 @@ Status DatasetNode::ValidateParams() { | |||
| return Status::OK(); | |||
| } | |||
| Status DatasetNode::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["num_parallel_workers"] = num_workers_; | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| Status MappableSourceNode::Accept(IRNodePass *const p, bool *const modified) { | |||
| return p->Visit(shared_from_base<MappableSourceNode>(), modified); | |||
| } | |||
| @@ -271,6 +271,11 @@ class DatasetNode : public std::enable_shared_from_this<DatasetNode> { | |||
| virtual bool IsSizeDefined() { return true; } | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| virtual Status to_json(nlohmann::json *out_json); | |||
| protected: | |||
| std::vector<std::shared_ptr<DatasetNode>> children_; | |||
| DatasetNode *parent_; // used to record the only one parent of an IR node after parsing phase | |||
| @@ -16,6 +16,7 @@ | |||
| #include "minddata/dataset/engine/ir/datasetops/map_node.h" | |||
| #include <algorithm> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <utility> | |||
| @@ -122,5 +123,33 @@ void MapNode::setOperations(const std::vector<std::shared_ptr<TensorOperation>> | |||
| operations_ = operations; | |||
| } | |||
| std::vector<std::shared_ptr<TensorOperation>> MapNode::operations() { return operations_; } | |||
| Status MapNode::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["num_parallel_workers"] = num_workers_; | |||
| args["input_columns"] = input_columns_; | |||
| args["output_columns"] = output_columns_; | |||
| if (!project_columns_.empty()) args["column_order"] = project_columns_; | |||
| if (cache_ != nullptr) { | |||
| nlohmann::json cache_args; | |||
| RETURN_IF_NOT_OK(cache_->to_json(&cache_args)); | |||
| args["cache"] = cache_args; | |||
| } | |||
| std::vector<nlohmann::json> ops; | |||
| std::vector<int32_t> cbs; | |||
| nlohmann::json op_args; | |||
| for (auto op : operations_) { | |||
| RETURN_IF_NOT_OK(op->to_json(&op_args)); | |||
| op_args["tensor_op_name"] = op->Name(); | |||
| ops.push_back(op_args); | |||
| } | |||
| args["operations"] = ops; | |||
| std::transform(callbacks_.begin(), callbacks_.end(), std::back_inserter(cbs), | |||
| [](std::shared_ptr<DSCallback> cb) -> int32_t { return cb->step_size(); }); | |||
| args["callback"] = cbs; | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -58,10 +58,6 @@ class MapNode : public DatasetNode { | |||
| /// \return Status Status::OK() if all the parameters are valid | |||
| Status ValidateParams() override; | |||
| /// \brief Getter of tensor operations | |||
| /// \return Vector of operations the Map node will process | |||
| const auto &TensorOperations() const { return operations_; } | |||
| /// \brief Base-class override for accepting IRNodePass visitor | |||
| /// \param[in] p The node to visit | |||
| /// \param[out] modified Indicator if the node was modified | |||
| @@ -83,6 +79,20 @@ class MapNode : public DatasetNode { | |||
| /// \brief setter to set all tensor operations | |||
| void setOperations(const std::vector<std::shared_ptr<TensorOperation>> &operations); | |||
| /// \brief Getter functions | |||
| /// \brief Getter of tensor operations | |||
| /// \return Vector of operations the Map node will process | |||
| const auto &TensorOperations() const { return operations_; } | |||
| const std::vector<std::string> &InputColumns() const { return input_columns_; } | |||
| const std::vector<std::string> &OutputColumns() const { return output_columns_; } | |||
| const std::vector<std::string> &ProjectColumns() const { return project_columns_; } | |||
| const std::vector<std::shared_ptr<DSCallback>> &Callbacks() const { return callbacks_; } | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| std::vector<std::shared_ptr<TensorOperation>> operations_; | |||
| @@ -62,5 +62,12 @@ Status RenameNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops | |||
| return Status::OK(); | |||
| } | |||
| Status RenameNode::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["input_columns"] = input_columns_; | |||
| args["output_columns"] = output_columns_; | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -56,6 +56,15 @@ class RenameNode : public DatasetNode { | |||
| /// \return Status Status::OK() if all the parameters are valid | |||
| Status ValidateParams() override; | |||
| /// \brief Getter functions | |||
| const std::vector<std::string> &InputColumns() const { return input_columns_; } | |||
| const std::vector<std::string> &OutputColumns() const { return output_columns_; } | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| std::vector<std::string> input_columns_; | |||
| std::vector<std::string> output_columns_; | |||
| @@ -95,5 +95,12 @@ Status RepeatNode::AcceptAfter(IRNodePass *const p, bool *const modified) { | |||
| // Downcast shared pointer then call visitor | |||
| return p->VisitAfter(shared_from_base<RepeatNode>(), modified); | |||
| } | |||
| Status RepeatNode::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["count"] = repeat_count_; | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -115,6 +115,14 @@ class RepeatNode : public DatasetNode { | |||
| return Status::OK(); | |||
| } | |||
| /// \brief Getter functions | |||
| int32_t RepeatCount() const { return repeat_count_; } | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| protected: | |||
| std::shared_ptr<RepeatOp> op_; // keep its corresponding run-time op of EpochCtrlNode and RepeatNode | |||
| std::shared_ptr<RepeatNode> reset_ancestor_; // updated its immediate Repeat/EpochCtrl ancestor in GeneratorNodePass | |||
| @@ -61,5 +61,12 @@ Status ShuffleNode::ValidateParams() { | |||
| return Status::OK(); | |||
| } | |||
| Status ShuffleNode::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["buffer_size"] = shuffle_size_; | |||
| args["reshuffle_each_epoch"] = reset_every_epoch_; | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -53,6 +53,16 @@ class ShuffleNode : public DatasetNode { | |||
| Status ValidateParams() override; | |||
| /// \brief Getter functions | |||
| int32_t ShuffleSize() const { return shuffle_size_; } | |||
| uint32_t ShuffleSeed() const { return shuffle_seed_; } | |||
| bool ResetEveryEpoch() const { return reset_every_epoch_; } | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| int32_t shuffle_size_; | |||
| uint32_t shuffle_seed_; | |||
| @@ -100,5 +100,22 @@ Status ImageFolderNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> | |||
| return Status::OK(); | |||
| } | |||
| Status ImageFolderNode::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args, sampler_args; | |||
| RETURN_IF_NOT_OK(sampler_->to_json(&sampler_args)); | |||
| args["sampler"] = sampler_args; | |||
| args["num_parallel_workers"] = num_workers_; | |||
| args["dataset_dir"] = dataset_dir_; | |||
| args["decode"] = decode_; | |||
| args["extensions"] = exts_; | |||
| args["class_indexing"] = class_indexing_; | |||
| if (cache_ != nullptr) { | |||
| nlohmann::json cache_args; | |||
| RETURN_IF_NOT_OK(cache_->to_json(&cache_args)); | |||
| args["cache"] = cache_args; | |||
| } | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -75,6 +75,19 @@ class ImageFolderNode : public MappableSourceNode { | |||
| Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate, | |||
| int64_t *dataset_size) override; | |||
| /// \brief Getter functions | |||
| const std::string &DatasetDir() const { return dataset_dir_; } | |||
| bool Decode() const { return decode_; } | |||
| bool Recursive() const { return recursive_; } | |||
| const std::shared_ptr<SamplerObj> &Sampler() const { return sampler_; } | |||
| const std::map<std::string, int32_t> &ClassIndexing() const { return class_indexing_; } | |||
| const std::set<std::string> &Exts() const { return exts_; } | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| std::string dataset_dir_; | |||
| bool decode_; | |||
| @@ -87,5 +87,20 @@ Status MnistNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_ | |||
| return Status::OK(); | |||
| } | |||
| Status MnistNode::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args, sampler_args; | |||
| RETURN_IF_NOT_OK(sampler_->to_json(&sampler_args)); | |||
| args["sampler"] = sampler_args; | |||
| args["num_parallel_workers"] = num_workers_; | |||
| args["dataset_dir"] = dataset_dir_; | |||
| args["usage"] = usage_; | |||
| if (cache_ != nullptr) { | |||
| nlohmann::json cache_args; | |||
| RETURN_IF_NOT_OK(cache_->to_json(&cache_args)); | |||
| args["cache"] = cache_args; | |||
| } | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -69,6 +69,16 @@ class MnistNode : public MappableSourceNode { | |||
| Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate, | |||
| int64_t *dataset_size) override; | |||
| /// \brief Getter functions | |||
| const std::string &DatasetDir() const { return dataset_dir_; } | |||
| const std::string &Usage() const { return usage_; } | |||
| const std::shared_ptr<SamplerObj> &Sampler() const { return sampler_; } | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| std::string dataset_dir_; | |||
| std::string usage_; | |||
| @@ -199,5 +199,33 @@ Status TFRecordNode::GetShardFileList(std::vector<std::string> *shard_filenames) | |||
| return Status::OK(); | |||
| } | |||
| Status TFRecordNode::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["num_parallel_workers"] = num_workers_; | |||
| args["dataset_files"] = dataset_files_; | |||
| args["schema"] = schema_path_; | |||
| args["columns_list"] = columns_list_; | |||
| args["num_samples"] = num_samples_; | |||
| args["shuffle_global"] = (shuffle_ == ShuffleMode::kGlobal); | |||
| args["shuffle_files"] = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles); | |||
| args["shuffle"] = shuffle_; | |||
| args["num_shards"] = num_shards_; | |||
| args["shard_id"] = shard_id_; | |||
| args["shard_equal_rows"] = shard_equal_rows_; | |||
| if (cache_ != nullptr) { | |||
| nlohmann::json cache_args; | |||
| RETURN_IF_NOT_OK(cache_->to_json(&cache_args)); | |||
| args["cache"] = cache_args; | |||
| } | |||
| if (schema_obj_ != nullptr) { | |||
| schema_obj_->set_dataset_type("TF"); | |||
| schema_obj_->set_num_rows(num_samples_); | |||
| args["schema_json_string"] = schema_obj_->to_json(); | |||
| } else { | |||
| args["schema_file_path"] = schema_path_; | |||
| } | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -109,6 +109,21 @@ class TFRecordNode : public NonMappableSourceNode { | |||
| /// \return Status of the function | |||
| Status GetShardFileList(std::vector<std::string> *shard_filenames); | |||
| /// \brief Getter functions | |||
| const std::vector<std::string> &DatasetFiles() const { return dataset_files_; } | |||
| const std::string &SchemaPath() const { return schema_path_; } | |||
| const std::shared_ptr<SchemaObj> &GetSchemaObj() const { return schema_obj_; } | |||
| const std::vector<std::string> &ColumnsList() const { return columns_list_; } | |||
| int64_t NumSamples() const { return num_samples_; } | |||
| ShuffleMode Shuffle() const { return shuffle_; } | |||
| int32_t NumShards() const { return num_shards_; } | |||
| bool ShardEqualRows() const { return shard_equal_rows_; } | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| std::vector<std::string> dataset_files_; | |||
| std::string schema_path_; // schema_path_ path to schema file. It is set when type of schema parameter is string | |||
| @@ -0,0 +1,57 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "minddata/dataset/engine/serdes.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| Status Serdes::SaveToJSON(std::shared_ptr<DatasetNode> node, const std::string &filename, nlohmann::json *out_json) { | |||
| // Dump attributes of current node to json string | |||
| nlohmann::json args; | |||
| RETURN_IF_NOT_OK(node->to_json(&args)); | |||
| args["op_type"] = node->Name(); | |||
| // If the current node isn't leaf node, visit all its children and get all attributes | |||
| std::vector<nlohmann::json> children_pipeline; | |||
| if (!node->IsLeaf()) { | |||
| for (auto child : node->Children()) { | |||
| nlohmann::json child_args; | |||
| RETURN_IF_NOT_OK(SaveToJSON(child, "", &child_args)); | |||
| children_pipeline.push_back(child_args); | |||
| } | |||
| } | |||
| args["children"] = children_pipeline; | |||
| // Save json string into file if filename is given. | |||
| if (!filename.empty()) { | |||
| RETURN_IF_NOT_OK(SaveJSONToFile(args, filename)); | |||
| } | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| Status Serdes::SaveJSONToFile(nlohmann::json json_string, const std::string &file_name) { | |||
| try { | |||
| std::ofstream file(file_name); | |||
| file << json_string; | |||
| } catch (const std::exception &err) { | |||
| RETURN_STATUS_UNEXPECTED("Save json string into " + file_name + " failed!"); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,56 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_SERDES_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_SERDES_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <nlohmann/json.hpp> | |||
| #include "minddata/dataset/engine/ir/datasetops/dataset_node.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
/// \brief The Serdes class provides serialization of an IR dataset tree: it turns the tree into a JSON
///     string and, when a file name is supplied, also dumps that string into the file.
class Serdes {
 public:
  /// \brief Constructor; Serdes holds no state of its own.
  Serdes() {}

  /// \brief Default destructor.
  ~Serdes() = default;

  /// \brief Serialize the IR tree rooted at `node` into a JSON object and/or a JSON file.
  /// \param[in] node Root IR node of the (sub)tree to be serialized
  /// \param[in] filename The file name. If non-empty, the generated JSON string is also saved into this file
  /// \param[out] out_json The resulting JSON object
  /// \return Status The status code returned
  Status SaveToJSON(std::shared_ptr<DatasetNode> node, const std::string &filename, nlohmann::json *out_json);

 protected:
  /// \brief Helper function that writes a JSON object into a file.
  /// \param[in] json_string The JSON object to be written (passed by value)
  /// \param[in] file_name Path of the output file
  /// \return Status The status code returned
  Status SaveJSONToFile(nlohmann::json json_string, const std::string &file_name);
};
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_SERDES_H_ | |||
| @@ -64,6 +64,8 @@ class SamplerObj : public std::enable_shared_from_this<SamplerObj> { | |||
| /// \return the Status code returned | |||
| Status AddChild(std::shared_ptr<SamplerObj> child); | |||
| virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); } | |||
| #ifndef ENABLE_ANDROID | |||
| /// \brief Virtual function to convert a SamplerObj class into a runtime mindrecord sampler object, | |||
| /// only override by SubsetRandomSampler, PkSampler, RandomSampler, SequentialSampler, DistributedSampler | |||
| @@ -228,6 +230,8 @@ class PreBuiltSamplerObj : public SamplerObj { | |||
| Status ValidateParams() override; | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| std::shared_ptr<SamplerRT> sp_; | |||
| #ifndef ENABLE_ANDROID | |||
| @@ -20,6 +20,8 @@ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <nlohmann/json.hpp> | |||
| #include "minddata/dataset/include/constants.h" | |||
| #include "minddata/dataset/include/status.h" | |||
| @@ -63,6 +65,8 @@ class TensorOperation : public std::enable_shared_from_this<TensorOperation> { | |||
| /// \return true if this op is a random op (returns non-deterministic result e.g. RandomCrop) | |||
| bool IsRandomOp() const { return random_op_; } | |||
| virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); } | |||
| protected: | |||
| bool random_op_; | |||
| }; | |||
| @@ -206,6 +210,8 @@ class PreBuiltOperation : public TensorOperation { | |||
| std::string Name() const override; | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| std::shared_ptr<TensorOp> op_; | |||
| }; | |||
| @@ -37,5 +37,12 @@ Status OneHotOp::OutputShape(const std::vector<TensorShape> &inputs, std::vector | |||
| if (!outputs.empty()) return Status::OK(); | |||
| return Status(StatusCode::kUnexpectedError, "Input has a wrong shape"); | |||
| } | |||
| Status OneHotOp::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["num_classes"] = num_classes_; | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -37,6 +37,8 @@ class OneHotOp : public TensorOp { | |||
| std::string Name() const override { return kOneHotOp; } | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| int num_classes_; | |||
| }; | |||
| @@ -60,5 +60,12 @@ Status DecodeOp::OutputType(const std::vector<DataType> &inputs, std::vector<Dat | |||
| outputs[0] = DataType(DataType::DE_UINT8); | |||
| return Status::OK(); | |||
| } | |||
| Status DecodeOp::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["rgb"] = is_rgb_format_; | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -42,6 +42,8 @@ class DecodeOp : public TensorOp { | |||
| std::string Name() const override { return kDecodeOp; } | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| bool is_rgb_format_ = true; | |||
| }; | |||
| @@ -15,6 +15,7 @@ | |||
| */ | |||
| #include "minddata/dataset/kernels/image/random_crop_op.h" | |||
| #include <random> | |||
| #include <tuple> | |||
| #include "minddata/dataset/kernels/image/image_utils.h" | |||
| #include "minddata/dataset/util/random.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| @@ -137,5 +138,16 @@ Status RandomCropOp::OutputShape(const std::vector<TensorShape> &inputs, std::ve | |||
| if (!outputs.empty()) return Status::OK(); | |||
| return Status(StatusCode::kUnexpectedError, "Input has a wrong shape"); | |||
| } | |||
| Status RandomCropOp::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["size"] = std::vector<int32_t>{crop_height_, crop_width_}; | |||
| args["padding"] = std::vector<int32_t>{pad_top_, pad_bottom_, pad_left_, pad_right_}; | |||
| args["pad_if_needed"] = pad_if_needed_; | |||
| args["fill_value"] = std::tuple<uint8_t, uint8_t, uint8_t>{fill_r_, fill_g_, fill_b_}; | |||
| args["padding_mode"] = border_type_; | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -79,6 +79,8 @@ class RandomCropOp : public TensorOp { | |||
| std::string Name() const override { return kRandomCropOp; } | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| protected: | |||
| int32_t crop_height_ = 0; | |||
| int32_t crop_width_ = 0; | |||
| @@ -29,5 +29,13 @@ Status RescaleOp::OutputType(const std::vector<DataType> &inputs, std::vector<Da | |||
| outputs[0] = DataType(DataType::DE_FLOAT32); | |||
| return Status::OK(); | |||
| } | |||
| Status RescaleOp::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["rescale"] = rescale_; | |||
| args["shift"] = shift_; | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -41,6 +41,8 @@ class RescaleOp : public TensorOp { | |||
| std::string Name() const override { return kRescaleOp; } | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| float rescale_; | |||
| float shift_; | |||
| @@ -67,5 +67,13 @@ Status ResizeOp::OutputShape(const std::vector<TensorShape> &inputs, std::vector | |||
| if (!outputs.empty()) return Status::OK(); | |||
| return Status(StatusCode::kUnexpectedError, "Input has a wrong shape"); | |||
| } | |||
| Status ResizeOp::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["size"] = std::vector<int32_t>{size1_, size2_}; | |||
| args["interpolation"] = interpolation_; | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -61,6 +61,8 @@ class ResizeOp : public TensorOp { | |||
| std::string Name() const override { return kResizeOp; } | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| protected: | |||
| int32_t size1_; | |||
| int32_t size2_; | |||
| @@ -19,6 +19,7 @@ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "nlohmann/json.hpp" | |||
| #include "minddata/dataset/core/tensor.h" | |||
| #include "minddata/dataset/core/tensor_row.h" | |||
| @@ -199,6 +200,8 @@ class TensorOp { | |||
| virtual std::string Name() const = 0; | |||
| virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); } | |||
| protected: | |||
| bool is_deterministic_{true}; | |||
| }; | |||
| @@ -203,6 +203,18 @@ class Dataset: | |||
| args["num_parallel_workers"] = self.num_parallel_workers | |||
| return args | |||
| def to_json(self, filename=""): | |||
| """ | |||
| Serialize a pipeline into JSON string and dump into file if filename is provided. | |||
| Args: | |||
| filename (str): filename of json file to be saved as | |||
| Returns: | |||
| Str, JSON string of the pipeline. | |||
| """ | |||
| return json.loads(self.parse_tree().to_json(filename)) | |||
| @check_bucket_batch_by_length | |||
| def bucket_batch_by_length(self, column_names, bucket_boundaries, bucket_batch_sizes, | |||
| element_length_function=None, pad_info=None, | |||
| @@ -19,13 +19,13 @@ import json | |||
| import os | |||
| import sys | |||
| import mindspore.common.dtype as mstype | |||
| from mindspore import log as logger | |||
| from . import datasets as de | |||
| from ..vision.utils import Inter, Border | |||
| from ..core import config | |||
| def serialize(dataset, json_filepath=None): | |||
| def serialize(dataset, json_filepath=""): | |||
| """ | |||
| Serialize dataset pipeline into a json file. | |||
| @@ -51,11 +51,7 @@ def serialize(dataset, json_filepath=None): | |||
| >>> ds.engine.serialize(data, json_filepath="mnist_dataset_pipeline.json") # serialize it to json file | |||
| >>> serialized_data = ds.engine.serialize(data) # serialize it to Python dict | |||
| """ | |||
| serialized_pipeline = traverse(dataset) | |||
| if json_filepath: | |||
| with open(json_filepath, 'w') as json_file: | |||
| json.dump(serialized_pipeline, json_file, indent=2) | |||
| return serialized_pipeline | |||
| return dataset.to_json(json_filepath) | |||
| def deserialize(input_dict=None, json_filepath=None): | |||
| @@ -110,93 +106,6 @@ def expand_path(node_repr, key, val): | |||
| node_repr[key] = os.path.abspath(val) | |||
| def serialize_operations(node_repr, key, val): | |||
| """Serialize tensor op (Python object) to dictionary.""" | |||
| if isinstance(val, list): | |||
| node_repr[key] = [] | |||
| for op in val: | |||
| node_repr[key].append(op.__dict__) | |||
| # Extracting module and name information from a Python object | |||
| # Example: tensor_op_module is 'minddata.transforms.c_transforms' and tensor_op_name is 'Decode' | |||
| node_repr[key][-1]['tensor_op_name'] = type(op).__name__ | |||
| node_repr[key][-1]['tensor_op_module'] = type(op).__module__ | |||
| else: | |||
| node_repr[key] = val.__dict__ | |||
| node_repr[key]['tensor_op_name'] = type(val).__name__ | |||
| node_repr[key]['tensor_op_module'] = type(val).__module__ | |||
| def serialize_sampler(node_repr, val): | |||
| """Serialize sampler object to dictionary.""" | |||
| if val is None: | |||
| node_repr['sampler'] = None | |||
| else: | |||
| node_repr['sampler'] = val.__dict__ | |||
| node_repr['sampler']['sampler_module'] = type(val).__module__ | |||
| node_repr['sampler']['sampler_name'] = type(val).__name__ | |||
| def traverse(node): | |||
| """Pre-order traverse the pipeline and capture the information as we go.""" | |||
| # Node representation (node_repr) is a Python dictionary that capture and store the | |||
| # dataset pipeline information before dumping it to JSON or other format. | |||
| node_repr = dict() | |||
| node_repr['op_type'] = type(node).__name__ | |||
| node_repr['op_module'] = type(node).__module__ | |||
| # Start with an empty list of children, will be added later as we traverse this node. | |||
| node_repr["children"] = [] | |||
| # Retrieve the information about the current node. It should include arguments | |||
| # passed to the node during object construction. | |||
| node_args = node.get_args() | |||
| for k, v in node_args.items(): | |||
| # Store the information about this node into node_repr. | |||
| # Further serialize the object in the arguments if needed. | |||
| if k == 'operations': | |||
| serialize_operations(node_repr, k, v) | |||
| elif k == 'sampler': | |||
| serialize_sampler(node_repr, v) | |||
| elif k == 'padded_sample' and v: | |||
| v1 = {key: value for key, value in v.items() if not isinstance(value, bytes)} | |||
| node_repr[k] = json.dumps(v1, indent=2) | |||
| # return schema json str if its type is mindspore.dataset.Schema | |||
| elif k == 'schema' and isinstance(v, de.Schema): | |||
| node_repr[k] = v.to_json() | |||
| elif k in set(['schema', 'dataset_files', 'dataset_dir', 'schema_file_path']): | |||
| expand_path(node_repr, k, v) | |||
| elif k == "num_parallel_workers" and v is None: | |||
| node_repr[k] = config.get_num_parallel_workers() | |||
| else: | |||
| node_repr[k] = v | |||
| # If a sampler exists in this node, then the following 4 arguments must be set to None: | |||
| # num_samples, shard_id, num_shards, shuffle | |||
| # These arguments get moved into the sampler itself, so they are no longer needed to | |||
| # be set at the dataset level. | |||
| # TF Record is a special case because it uses both the dataset and sampler arguments | |||
| # which is not decided until later during tree preparation phase. | |||
| if node_repr['op_type'] != 'TFRecordDataset' and 'sampler' in node_args.keys(): | |||
| if 'num_samples' in node_repr.keys(): | |||
| node_repr['num_samples'] = None | |||
| if 'shuffle' in node_repr.keys(): | |||
| node_repr['shuffle'] = None | |||
| if 'num_shards' in node_repr.keys(): | |||
| node_repr['num_shards'] = None | |||
| if 'shard_id' in node_repr.keys(): | |||
| node_repr['shard_id'] = None | |||
| # Leaf node doesn't have input attribute. | |||
| if not node.children: | |||
| return node_repr | |||
| # Recursively traverse the child and assign it to the current node_repr['children']. | |||
| for child in node.children: | |||
| node_repr["children"].append(traverse(child)) | |||
| return node_repr | |||
| def show(dataset, indentation=2): | |||
| """ | |||
| Write the dataset pipeline graph onto logger.info. | |||
| @@ -206,7 +115,7 @@ def show(dataset, indentation=2): | |||
| indentation (int, optional): indentation used by the json print. Pass None to not indent. | |||
| """ | |||
| pipeline = traverse(dataset) | |||
| pipeline = dataset.to_json() | |||
| logger.info(json.dumps(pipeline, indent=indentation)) | |||
| @@ -219,7 +128,7 @@ def compare(pipeline1, pipeline2): | |||
| pipeline2 (Dataset): a dataset pipeline. | |||
| """ | |||
| return traverse(pipeline1) == traverse(pipeline2) | |||
| return pipeline1.to_json() == pipeline2.to_json() | |||
| def construct_pipeline(node): | |||
| @@ -244,140 +153,65 @@ def create_node(node): | |||
| """Parse the key, value in the node dictionary and instantiate the Python Dataset object""" | |||
| logger.info('creating node: %s', node['op_type']) | |||
| dataset_op = node['op_type'] | |||
| op_module = node['op_module'] | |||
| op_module = "mindspore.dataset" | |||
| # Get the Python class to be instantiated. | |||
| # Example: | |||
| # "op_type": "MapDataset", | |||
| # "op_module": "mindspore.dataset.datasets", | |||
| pyclass = getattr(sys.modules[op_module], dataset_op) | |||
| if node.get("children"): | |||
| pyclass = getattr(sys.modules[op_module], "Dataset") | |||
| else: | |||
| pyclass = getattr(sys.modules[op_module], dataset_op) | |||
| pyobj = None | |||
| # Find a matching Dataset class and call the constructor with the corresponding args. | |||
| # When a new Dataset class is introduced, another if clause and parsing code needs to be added. | |||
| if dataset_op == 'ImageFolderDataset': | |||
| sampler = construct_sampler(node.get('sampler')) | |||
| pyobj = pyclass(node['dataset_dir'], node.get('num_samples'), node.get('num_parallel_workers'), | |||
| num_samples = check_and_replace_input(node.get('num_samples'), 0, None) | |||
| pyobj = pyclass(node['dataset_dir'], num_samples, node.get('num_parallel_workers'), | |||
| node.get('shuffle'), sampler, node.get('extensions'), | |||
| node.get('class_indexing'), node.get('decode'), node.get('num_shards'), | |||
| node.get('shard_id')) | |||
| elif dataset_op == 'RangeDataset': | |||
| pyobj = pyclass(node['start'], node['stop'], node['step']) | |||
| elif dataset_op == 'ImageFolderDataset': | |||
| pyobj = pyclass(node['dataset_dir'], node['schema'], node.get('distribution'), | |||
| node.get('column_list'), node.get('num_parallel_workers'), | |||
| node.get('deterministic_output'), node.get('prefetch_size'), | |||
| node.get('labels_filename'), node.get('dataset_usage')) | |||
| node.get('shard_id'), node.get('cache')) | |||
| elif dataset_op == 'MnistDataset': | |||
| sampler = construct_sampler(node.get('sampler')) | |||
| pyobj = pyclass(node['dataset_dir'], node['usage'], node.get('num_samples'), node.get('num_parallel_workers'), | |||
| node.get('shuffle'), sampler, node.get('num_shards'), node.get('shard_id')) | |||
| elif dataset_op == 'MindDataset': | |||
| sampler = construct_sampler(node.get('sampler')) | |||
| pyobj = pyclass(node['dataset_file'], node.get('columns_list'), | |||
| node.get('num_parallel_workers'), node.get('seed'), node.get('num_shards'), | |||
| node.get('shard_id'), sampler) | |||
| num_samples = check_and_replace_input(node.get('num_samples'), 0, None) | |||
| pyobj = pyclass(node['dataset_dir'], node['usage'], num_samples, node.get('num_parallel_workers'), | |||
| node.get('shuffle'), sampler, node.get('num_shards'), node.get('shard_id'), node.get('cache')) | |||
| elif dataset_op == 'TFRecordDataset': | |||
| shuffle = node.get('shuffle') | |||
| shuffle = to_shuffle_mode(node.get('shuffle')) | |||
| if shuffle is not None and isinstance(shuffle, str): | |||
| shuffle = de.Shuffle(shuffle) | |||
| pyobj = pyclass(node['dataset_files'], node.get('schema'), node.get('column_list'), | |||
| node.get('num_samples'), node.get('num_parallel_workers'), | |||
| shuffle, node.get('num_shards'), node.get('shard_id')) | |||
| elif dataset_op == 'ManifestDataset': | |||
| sampler = construct_sampler(node.get('sampler')) | |||
| pyobj = pyclass(node['dataset_file'], node['usage'], node.get('num_samples'), | |||
| node.get('num_parallel_workers'), node.get('shuffle'), sampler, | |||
| node.get('class_indexing'), node.get('decode'), node.get('num_shards'), | |||
| node.get('shard_id')) | |||
| num_samples = check_and_replace_input(node.get('num_samples'), 0, None) | |||
| pyobj = pyclass(node['dataset_files'], node.get('schema'), node.get('columns_list'), | |||
| num_samples, node.get('num_parallel_workers'), | |||
| shuffle, node.get('num_shards'), node.get('shard_id'), node.get('cache')) | |||
| elif dataset_op == 'Cifar10Dataset': | |||
| sampler = construct_sampler(node.get('sampler')) | |||
| pyobj = pyclass(node['dataset_dir'], node['usage'], node.get('num_samples'), node.get('num_parallel_workers'), | |||
| node.get('shuffle'), sampler, node.get('num_shards'), node.get('shard_id')) | |||
| elif dataset_op == 'Cifar100Dataset': | |||
| sampler = construct_sampler(node.get('sampler')) | |||
| pyobj = pyclass(node['dataset_dir'], node['usage'], node.get('num_samples'), node.get('num_parallel_workers'), | |||
| node.get('shuffle'), sampler, node.get('num_shards'), node.get('shard_id')) | |||
| elif dataset_op == 'VOCDataset': | |||
| sampler = construct_sampler(node.get('sampler')) | |||
| pyobj = pyclass(node['dataset_dir'], node.get('task'), node.get('mode'), node.get('class_indexing'), | |||
| node.get('num_samples'), node.get('num_parallel_workers'), node.get('shuffle'), | |||
| node.get('decode'), sampler, node.get('num_shards'), node.get('shard_id')) | |||
| elif dataset_op == 'CocoDataset': | |||
| sampler = construct_sampler(node.get('sampler')) | |||
| pyobj = pyclass(node['dataset_dir'], node.get('annotation_file'), node.get('task'), node.get('num_samples'), | |||
| node.get('num_parallel_workers'), node.get('shuffle'), node.get('decode'), sampler, | |||
| node.get('num_shards'), node.get('shard_id')) | |||
| elif dataset_op == 'CelebADataset': | |||
| sampler = construct_sampler(node.get('sampler')) | |||
| pyobj = pyclass(node['dataset_dir'], node.get('num_parallel_workers'), node.get('shuffle'), | |||
| node.get('dataset_type'), sampler, node.get('decode'), node.get('extensions'), | |||
| node.get('num_samples'), sampler, node.get('num_shards'), node.get('shard_id')) | |||
| elif dataset_op == 'GeneratorDataset': | |||
| # Serializing py function can be done using marshal library | |||
| raise RuntimeError(dataset_op + " is not yet supported") | |||
| elif dataset_op == 'RepeatDataset': | |||
| elif dataset_op == 'Repeat': | |||
| pyobj = de.Dataset().repeat(node.get('count')) | |||
| elif dataset_op == 'SkipDataset': | |||
| pyobj = de.Dataset().skip(node.get('count')) | |||
| elif dataset_op == 'TakeDataset': | |||
| pyobj = de.Dataset().take(node.get('count')) | |||
| elif dataset_op == 'MapDataset': | |||
| elif dataset_op == 'Map': | |||
| tensor_ops = construct_tensor_ops(node.get('operations')) | |||
| pyobj = de.Dataset().map(tensor_ops, node.get('input_columns'), node.get('output_columns'), | |||
| node.get('column_order'), node.get('num_parallel_workers')) | |||
| node.get('column_order'), node.get('num_parallel_workers'), | |||
| True, node.get('cache'), node.get('callbacks')) | |||
| elif dataset_op == 'ShuffleDataset': | |||
| elif dataset_op == 'Shuffle': | |||
| pyobj = de.Dataset().shuffle(node.get('buffer_size')) | |||
| elif dataset_op == 'BatchDataset': | |||
| elif dataset_op == 'Batch': | |||
| pyobj = de.Dataset().batch(node['batch_size'], node.get('drop_remainder')) | |||
| elif dataset_op == 'CacheDataset': | |||
| # Member function cache() is not defined in class Dataset yet. | |||
| raise RuntimeError(dataset_op + " is not yet supported.") | |||
| elif dataset_op == 'FilterDataset': | |||
| # Member function filter() is not defined in class Dataset yet. | |||
| raise RuntimeError(dataset_op + " is not yet supported.") | |||
| elif dataset_op == 'TakeDataset': | |||
| # Member function take() is not defined in class Dataset yet. | |||
| raise RuntimeError(dataset_op + " is not yet supported.") | |||
| elif dataset_op == 'ZipDataset': | |||
| elif dataset_op == 'Zip': | |||
| # Create ZipDataset instance, giving dummy input dataset that will be overrided in the caller. | |||
| pyobj = de.ZipDataset((de.Dataset(), de.Dataset())) | |||
| elif dataset_op == 'ConcatDataset': | |||
| # Create ConcatDataset instance, giving dummy input dataset that will be overrided in the caller. | |||
| pyobj = de.ConcatDataset((de.Dataset(), de.Dataset())) | |||
| elif dataset_op == 'RenameDataset': | |||
| elif dataset_op == 'Rename': | |||
| pyobj = de.Dataset().rename(node['input_columns'], node['output_columns']) | |||
| elif dataset_op == 'ProjectDataset': | |||
| pyobj = de.Dataset().project(node['columns']) | |||
| elif dataset_op == 'TransferDataset': | |||
| pyobj = de.Dataset().to_device() | |||
| else: | |||
| raise RuntimeError(dataset_op + " is not yet supported by ds.engine.deserialize().") | |||
| @@ -388,23 +222,28 @@ def construct_sampler(in_sampler): | |||
| """Instantiate Sampler object based on the information from dictionary['sampler']""" | |||
| sampler = None | |||
| if in_sampler is not None: | |||
| if "num_samples" in in_sampler: | |||
| num_samples = check_and_replace_input(in_sampler['num_samples'], 0, None) | |||
| sampler_name = in_sampler['sampler_name'] | |||
| sampler_module = in_sampler['sampler_module'] | |||
| sampler_module = "mindspore.dataset" | |||
| sampler_class = getattr(sys.modules[sampler_module], sampler_name) | |||
| if sampler_name == 'DistributedSampler': | |||
| sampler = sampler_class(in_sampler['num_shards'], in_sampler['shard_id'], in_sampler.get('shuffle')) | |||
| elif sampler_name == 'PKSampler': | |||
| sampler = sampler_class(in_sampler['num_val'], in_sampler.get('num_class'), in_sampler('shuffle')) | |||
| elif sampler_name == 'RandomSampler': | |||
| sampler = sampler_class(in_sampler.get('replacement'), in_sampler.get('num_samples')) | |||
| sampler = sampler_class(in_sampler.get('replacement'), num_samples) | |||
| elif sampler_name == 'SequentialSampler': | |||
| sampler = sampler_class() | |||
| sampler = sampler_class(in_sampler.get('start_index'), num_samples) | |||
| elif sampler_name == 'SubsetRandomSampler': | |||
| sampler = sampler_class(in_sampler['indices']) | |||
| sampler = sampler_class(in_sampler['indices'], num_samples) | |||
| elif sampler_name == 'WeightedRandomSampler': | |||
| sampler = sampler_class(in_sampler['weights'], in_sampler['num_samples'], in_sampler.get('replacement')) | |||
| sampler = sampler_class(in_sampler['weights'], num_samples, in_sampler.get('replacement')) | |||
| else: | |||
| raise ValueError("Sampler type is unknown: {}.".format(sampler_name)) | |||
| if in_sampler.get("child_sampler"): | |||
| for child in in_sampler["child_sampler"]: | |||
| sampler.add_child(construct_sampler(child)) | |||
| return sampler | |||
| @@ -413,70 +252,85 @@ def construct_tensor_ops(operations): | |||
| """Instantiate tensor op object(s) based on the information from dictionary['operations']""" | |||
| result = [] | |||
| for op in operations: | |||
| op_module = op['tensor_op_module'] | |||
| op_name = op['tensor_op_name'] | |||
| op_class = getattr(sys.modules[op_module], op_name) | |||
| op_name = op['tensor_op_name'][:-2] # to remove op from the back of the name | |||
| op_module_vis = sys.modules["mindspore.dataset.vision.c_transforms"] | |||
| op_module_trans = sys.modules["mindspore.dataset.transforms.c_transforms"] | |||
| if hasattr(op_module_vis, op_name): | |||
| op_class = getattr(op_module_vis, op_name) | |||
| elif hasattr(op_module_trans, op_name): | |||
| op_class = getattr(op_module_trans, op_name) | |||
| else: | |||
| raise RuntimeError(op_name + " is not yet supported by deserialize().") | |||
| if op_name == 'Decode': | |||
| result.append(op_class(op.get('rgb'))) | |||
| elif op_name == 'Normalize': | |||
| result.append(op_class(op['mean'], op['std'])) | |||
| elif op_name == 'RandomCrop': | |||
| result.append(op_class(op['size'], op.get('padding'), op.get('pad_if_needed'), | |||
| op.get('fill_value'), Border(op.get('padding_mode')))) | |||
| elif op_name == 'RandomHorizontalFlip': | |||
| result.append(op_class(op.get('prob'))) | |||
| elif op_name == 'RandomVerticalFlip': | |||
| result.append(op_class(op.get('prob'))) | |||
| tuple(op.get('fill_value')), Border(to_border_mode(op.get('padding_mode'))))) | |||
| elif op_name == 'Resize': | |||
| result.append(op_class(op['size'], Inter(op.get('interpolation')))) | |||
| elif op_name == 'RandomResizedCrop': | |||
| result.append(op_class(op['size'], op.get('scale'), op.get('ratio'), | |||
| Inter(op.get('interpolation')), op.get('max_attempts'))) | |||
| elif op_name == 'CenterCrop': | |||
| result.append(op_class(op['size'])) | |||
| elif op_name == 'RandomColorAdjust': | |||
| result.append(op_class(op.get('brightness'), op.get('contrast'), op.get('saturation'), | |||
| op.get('hue'))) | |||
| elif op_name == 'RandomRotation': | |||
| result.append(op_class(op['degree'], op.get('resample'), op.get('expand'), | |||
| op.get('center'), op.get('fill_value'))) | |||
| result.append(op_class(op['size'], Inter(to_interpolation_mode(op.get('interpolation'))))) | |||
| elif op_name == 'Rescale': | |||
| result.append(op_class(op['rescale'], op['shift'])) | |||
| elif op_name == 'RandomResize': | |||
| result.append(op_class(op['size'])) | |||
| elif op_name == 'TypeCast': | |||
| result.append(op_class(op['data_type'])) | |||
| elif op_name == 'HWC2CHW': | |||
| result.append(op_class()) | |||
| elif op_name == 'CHW2HWC': | |||
| raise ValueError("Tensor op is not supported: {}.".format(op_name)) | |||
| elif op_name == 'OneHot': | |||
| result.append(op_class(op['num_classes'])) | |||
| elif op_name == 'RandomCropDecodeResize': | |||
| result.append(op_class(op['size'], op.get('scale'), op.get('ratio'), | |||
| Inter(op.get('interpolation')), op.get('max_attempts'))) | |||
| elif op_name == 'Pad': | |||
| result.append(op_class(op['padding'], op['fill_value'], Border(op['padding_mode']))) | |||
| else: | |||
| raise ValueError("Tensor op name is unknown: {}.".format(op_name)) | |||
| return result | |||
def to_shuffle_mode(shuffle):
    """Translate the integer shuffle level from a serialized pipeline into the
    string/bool form accepted by dataset constructors (False when not shuffling)."""
    levels = {2: "global", 1: "file"}
    return levels.get(shuffle, False)
def to_interpolation_mode(inter):
    """Translate the integer interpolation code from a serialized pipeline into
    the corresponding Inter enum member (raises KeyError for unknown codes)."""
    code_to_inter = {
        0: Inter.LINEAR,
        1: Inter.NEAREST,
        2: Inter.CUBIC,
        3: Inter.AREA,
    }
    return code_to_inter[inter]
def to_border_mode(border):
    """Translate the integer padding-mode code from a serialized pipeline into
    the corresponding Border enum member (raises KeyError for unknown codes)."""
    code_to_border = {
        0: Border.CONSTANT,
        1: Border.EDGE,
        2: Border.REFLECT,
        3: Border.SYMMETRIC,
    }
    return code_to_border[border]
def to_mstype(data_type):
    """Translate a dtype name string (e.g. "int32") from a serialized pipeline
    into the corresponding mindspore dtype (raises KeyError for unknown names)."""
    name_to_dtype = {
        "bool": mstype.bool_,
        "int8": mstype.int8,
        "int16": mstype.int16,
        "int32": mstype.int32,
        "int64": mstype.int64,
        "uint8": mstype.uint8,
        "uint16": mstype.uint16,
        "uint32": mstype.uint32,
        "uint64": mstype.uint64,
        "float16": mstype.float16,
        "float32": mstype.float32,
        "float64": mstype.float64,
        "string": mstype.string,
    }
    return name_to_dtype[data_type]
def check_and_replace_input(input_value, expect, replace):
    """Return `replace` when `input_value` equals `expect`; otherwise pass the
    value through unchanged (used to map serialized sentinels like 0 back to None)."""
    return replace if input_value == expect else input_value
| @@ -22,7 +22,7 @@ import os | |||
| import numpy as np | |||
| from test_minddataset_sampler import add_and_remove_cv_file, get_data, CV_DIR_NAME, CV_FILE_NAME | |||
| from util import config_get_set_num_parallel_workers | |||
| from util import config_get_set_num_parallel_workers, config_get_set_seed | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as c | |||
| @@ -46,6 +46,8 @@ def test_imagefolder(remove_json_files=True): | |||
| # Constructing DE pipeline | |||
| sampler = ds.WeightedRandomSampler(weights, 11) | |||
| child_sampler = ds.SequentialSampler() | |||
| sampler.add_child(child_sampler) | |||
| data1 = ds.ImageFolderDataset(data_dir, sampler=sampler) | |||
| data1 = data1.repeat(1) | |||
| data1 = data1.map(operations=[vision.Decode(True)], input_columns=["image"]) | |||
| @@ -99,6 +101,9 @@ def test_imagefolder(remove_json_files=True): | |||
| def test_mnist_dataset(remove_json_files=True): | |||
| """ | |||
| Test serdes on mnist dataset pipeline. | |||
| """ | |||
| data_dir = "../data/dataset/testMnistData" | |||
| ds.config.set_seed(1) | |||
| @@ -137,6 +142,9 @@ def test_mnist_dataset(remove_json_files=True): | |||
| def test_zip_dataset(remove_json_files=True): | |||
| """ | |||
| Test serdes on zip dataset pipeline. | |||
| """ | |||
| files = ["../data/dataset/testTFTestAllTypes/test.data"] | |||
| schema_file = "../data/dataset/testTFTestAllTypes/datasetSchema.json" | |||
| ds.config.set_seed(1) | |||
| @@ -178,9 +186,13 @@ def test_zip_dataset(remove_json_files=True): | |||
| def test_random_crop(): | |||
| """ | |||
| Test serdes on RandomCrop pipeline. | |||
| """ | |||
| logger.info("test_random_crop") | |||
| DATA_DIR = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"] | |||
| SCHEMA_DIR = "../data/dataset/test_tf_file_3_images/datasetSchema.json" | |||
| original_seed = config_get_set_seed(1) | |||
| original_num_parallel_workers = config_get_set_num_parallel_workers(1) | |||
| # First dataset | |||
| @@ -209,6 +221,7 @@ def test_random_crop(): | |||
| _ = item2["image"] | |||
| # Restore configuration num_parallel_workers | |||
| ds.config.set_seed(original_seed) | |||
| ds.config.set_num_parallel_workers(original_num_parallel_workers) | |||
| @@ -232,7 +245,7 @@ def delete_json_files(): | |||
| # Test save load minddataset | |||
| def test_minddataset(add_and_remove_cv_file): | |||
| def skip_test_minddataset(add_and_remove_cv_file): | |||
| """tutorial for cv minderdataset.""" | |||
| columns_list = ["data", "file_name", "label"] | |||
| num_readers = 4 | |||