[Part I] Push down json save logic to IR and add getter to each IR node

5 years ago · 4812fece80
--- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc
@@ -22,6 +22,7 @@
 #include "minddata/dataset/callback/py_ds_callback.h"
 #include "minddata/dataset/core/constants.h"
 #include "minddata/dataset/core/global_context.h"
 #include "minddata/dataset/engine/serdes.h"
 #include "minddata/dataset/include/datasets.h"
 #include "minddata/dataset/text/sentence_piece_vocab.h"

@@ -92,7 +93,13 @@ PYBIND_REGISTER(DatasetNode, 1, ([](const py::module *m) {
                        THROW_IF_ERROR(zip->ValidateParams());
                        return zip;
                      },
                      py::arg("datasets"));
                      py::arg("datasets"))
                    .def("to_json", [](std::shared_ptr<DatasetNode> self, const std::string &json_filepath) {
                      nlohmann::json args;
                      auto serdas = std::make_shared<Serdes>();
                      THROW_IF_ERROR(serdas->SaveToJSON(self, json_filepath, &args));
                      return args.dump();
                    });
                }));

 // PYBIND FOR LEAF NODES
--- a/mindspore/ccsrc/minddata/dataset/api/samplers.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/samplers.cc
@@ -258,6 +258,11 @@ std::shared_ptr<SamplerObj> PreBuiltSamplerObj::Copy() {
  return sampler;
 }

 Status PreBuiltSamplerObj::to_json(nlohmann::json *out_json) {
  RETURN_IF_NOT_OK(sp_->to_json(out_json));
  return Status::OK();
 }

 #ifndef ENABLE_ANDROID
 std::shared_ptr<mindrecord::ShardOperator> PKSamplerObj::BuildForMindDataset() {
  // runtime mindrecord sampler object
--- a/mindspore/ccsrc/minddata/dataset/api/transforms.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/transforms.cc
@@ -229,6 +229,11 @@ std::shared_ptr<TensorOp> PreBuiltOperation::Build() { return op_; }

 std::string PreBuiltOperation::Name() const { return op_ ? op_->Name() : kPreBuiltOperation; }

 Status PreBuiltOperation::to_json(nlohmann::json *out_json) {
  RETURN_IF_NOT_OK(op_->to_json(out_json));
  return Status::OK();
 }

 // RandomApplyOperation
 RandomApplyOperation::RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob)
    : TensorOperation(true), transforms_(transforms), prob_(prob) {}
--- a/mindspore/ccsrc/minddata/dataset/engine/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/engine/CMakeLists.txt
@@ -20,6 +20,7 @@ set(SRC_FILES_LIST
        runtime_context.cc
        python_runtime_context.cc
        consumers/tree_consumer.cc
        serdes.cc
        )
 if (ENABLE_PYTHON)
  set(SRC_FILES_LIST
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/distributed_sampler.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/distributed_sampler.cc
@@ -190,5 +190,26 @@ void DistributedSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const
  }
 }

 Status DistributedSamplerRT::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["sampler_name"] = "DistributedSampler";
  args["num_shards"] = num_devices_;
  args["shard_id"] = device_id_;
  args["shuffle"] = shuffle_;
  args["num_samples"] = num_samples_;
  args["offset"] = offset_;
  if (this->HasChildSampler()) {
    std::vector<nlohmann::json> children_args;
    for (auto child : child_) {
      nlohmann::json child_arg;
      RETURN_IF_NOT_OK(child->to_json(&child_arg));
      children_args.push_back(child_arg);
    }
    args["child_sampler"] = children_args;
  }
  *out_json = args;
  return Status::OK();
 }

 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/distributed_sampler.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/distributed_sampler.h
@@ -72,6 +72,11 @@ class DistributedSamplerRT : public SamplerRT {

  void SamplerPrint(std::ostream &out, bool show_all) const override;

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  Status to_json(nlohmann::json *out_json) override;

 private:
  int64_t cnt_;  // number of samples that have already been filled in to buffer
  uint32_t seed_;
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/pk_sampler.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/pk_sampler.cc
@@ -128,5 +128,24 @@ void PKSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const {
    // Then add our own info if any
  }
 }

 Status PKSamplerRT::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["sampler_name"] = "PKSampler";
  args["num_val"] = samples_per_class_;
  args["shuffle"] = shuffle_;
  args["num_samples"] = num_samples_;
  if (this->HasChildSampler()) {
    std::vector<nlohmann::json> children_args;
    for (auto child : child_) {
      nlohmann::json child_arg;
      RETURN_IF_NOT_OK(child->to_json(&child_arg));
      children_args.push_back(child_arg);
    }
    args["child_sampler"] = children_args;
  }
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/pk_sampler.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/pk_sampler.h
@@ -61,6 +61,11 @@ class PKSamplerRT : public SamplerRT {  // NOT YET FINISHED
  // @param show_all - bool to show detailed vs summary
  void SamplerPrint(std::ostream &out, bool show_all) const override;

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  Status to_json(nlohmann::json *out_json) override;

 private:
  bool shuffle_;
  uint32_t seed_;
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/random_sampler.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/random_sampler.cc
@@ -127,5 +127,24 @@ void RandomSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const {
    // Then add our own info if any
  }
 }

 Status RandomSamplerRT::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["sampler_name"] = "RandomSampler";
  args["replacement"] = replacement_;
  args["num_samples"] = num_samples_;
  args["reshuffle_each_epoch"] = reshuffle_each_epoch_;
  if (this->HasChildSampler()) {
    std::vector<nlohmann::json> children_args;
    for (auto child : child_) {
      nlohmann::json child_arg;
      RETURN_IF_NOT_OK(child->to_json(&child_arg));
      children_args.push_back(child_arg);
    }
    args["child_sampler"] = children_args;
  }
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/random_sampler.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/random_sampler.h
@@ -52,6 +52,11 @@ class RandomSamplerRT : public SamplerRT {

  void SamplerPrint(std::ostream &out, bool show_all) const override;

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  Status to_json(nlohmann::json *out_json) override;

 private:
  uint32_t seed_;
  bool replacement_;
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sampler.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sampler.h
@@ -149,6 +149,11 @@ class SamplerRT {
  // @return Status The status code returned
  Status GetAssociatedChildId(int64_t *out_associated_id, int64_t id);

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); }

 protected:
  // Number of rows of data from the place this sampler is sampling from. If this sampler
  // has a child sampler, num_rows_ is the number of ids the child sampler will
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.cc
@@ -17,6 +17,7 @@

 #include <algorithm>
 #include <memory>
 #include <vector>

 namespace mindspore {
 namespace dataset {
@@ -131,5 +132,23 @@ void SequentialSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const {
    out << "\nStart index: " << start_index_;
  }
 }

 Status SequentialSamplerRT::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["sampler_name"] = "SequentialSampler";
  args["start_index"] = start_index_;
  args["num_samples"] = num_samples_;
  if (this->HasChildSampler()) {
    std::vector<nlohmann::json> children_args;
    for (auto child : child_) {
      nlohmann::json child_arg;
      RETURN_IF_NOT_OK(child->to_json(&child_arg));
      children_args.push_back(child_arg);
    }
    args["child_sampler"] = children_args;
  }
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h
@@ -61,6 +61,11 @@ class SequentialSamplerRT : public SamplerRT {
  // @param show_all - bool to show detailed vs summary
  void SamplerPrint(std::ostream &out, bool show_all) const override;

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  Status to_json(nlohmann::json *out_json) override;

 private:
  int64_t current_id_;   // The id sequencer.  Each new id increments from this
  int64_t start_index_;  // The starting id.  current_id_ begins from here.
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.cc
@@ -131,5 +131,23 @@ void SubsetRandomSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const
    // Then add our own info if any
  }
 }

 Status SubsetRandomSamplerRT::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["sampler_name"] = "SubsetRandomSampler";
  args["indices"] = indices_;
  args["num_samples"] = num_samples_;
  if (this->HasChildSampler()) {
    std::vector<nlohmann::json> children_args;
    for (auto child : child_) {
      nlohmann::json child_arg;
      RETURN_IF_NOT_OK(child->to_json(&child_arg));
      children_args.push_back(child_arg);
    }
    args["child_sampler"] = children_args;
  }
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.h
@@ -56,6 +56,11 @@ class SubsetRandomSamplerRT : public SamplerRT {
  // @param show_all - bool to show detailed vs summary
  void SamplerPrint(std::ostream &out, bool show_all) const override;

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  Status to_json(nlohmann::json *out_json) override;

 private:
  // A list of indices (already randomized in constructor).
  std::vector<int64_t> indices_;
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc
@@ -193,5 +193,24 @@ void WeightedRandomSamplerRT::SamplerPrint(std::ostream &out, bool show_all) con
    // Then add our own info if any
  }
 }

 Status WeightedRandomSamplerRT::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["sampler_name"] = "WeightedRandomSampler";
  args["weights"] = weights_;
  args["num_samples"] = num_samples_;
  args["replacement"] = replacement_;
  if (this->HasChildSampler()) {
    std::vector<nlohmann::json> children_args;
    for (auto child : child_) {
      nlohmann::json child_arg;
      RETURN_IF_NOT_OK(child->to_json(&child_arg));
      children_args.push_back(child_arg);
    }
    args["child_sampler"] = children_args;
  }
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h
@@ -58,6 +58,11 @@ class WeightedRandomSamplerRT : public SamplerRT {
  // @param show_all - bool to show detailed vs summary
  void SamplerPrint(std::ostream &out, bool show_all) const override;

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  Status to_json(nlohmann::json *out_json) override;

 private:
  // A list of weights for each sample.
  std::vector<double> weights_;
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache.h
@@ -28,6 +28,7 @@ class DatasetCache {
  virtual Status Build() = 0;
  virtual Status ValidateParams() = 0;
  virtual Status CreateCacheOp(int num_workers, std::shared_ptr<DatasetOp> *ds_op) = 0;
  virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); }
 };
 }  // namespace mindspore::dataset

--- a/mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache_impl.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache_impl.cc
@@ -44,5 +44,18 @@ Status DatasetCacheImpl::CreateCacheOp(int32_t num_workers, std::shared_ptr<Data

  return Status::OK();
 }

 Status DatasetCacheImpl::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["session_id"] = session_id_;
  args["cache_memory_size"] = cache_mem_sz_;
  args["spill"] = spill_;
  if (hostname_) args["hostname"] = hostname_.value();
  if (port_) args["port"] = port_.value();
  if (num_connections_) args["num_connections"] = num_connections_.value();
  if (prefetch_sz_) args["prefetch_size"] = prefetch_sz_.value();
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache_impl.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache_impl.h
@@ -60,6 +60,8 @@ class DatasetCacheImpl : public DatasetCache {

  ~DatasetCacheImpl() = default;

  Status to_json(nlohmann::json *out_json) override;

 private:
  std::shared_ptr<CacheClient> cache_client_;
  session_id_type session_id_;
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc
@@ -152,5 +152,20 @@ Status BatchNode::AcceptAfter(IRNodePass *const p, bool *const modified) {
  // Downcast shared pointer then call visitor
  return p->VisitAfter(shared_from_base<BatchNode>(), modified);
 }

 Status BatchNode::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["num_parallel_workers"] = num_workers_;
  args["batch_size"] = batch_size_;
  args["drop_remainder"] = drop_remainder_;
 #ifdef ENABLE_PYTHON
  args["input_columns"] = in_col_names_;
  args["output_columns"] = out_col_names_;
  args["column_order"] = col_order_;
  if (batch_map_func_ != nullptr) args["per_batch_map"] = "pyfunc";
 #endif
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.h
@@ -87,6 +87,24 @@ class BatchNode : public DatasetNode {
  /// \return Status of the node visit
  Status AcceptAfter(IRNodePass *const p, bool *const modified) override;

  /// \brief Getter functions
  int32_t BatchSize() const { return batch_size_; }
  bool DropRemainder() const { return drop_remainder_; }
 #ifdef ENABLE_PYTHON
  bool Pad() const { return pad_; }
  const std::vector<std::string> &InColNames() const { return in_col_names_; }
  const std::vector<std::string> &OutColNames() const { return out_col_names_; }
  const std::vector<std::string> &ColOrder() const { return col_order_; }
  const py::function &BatchSizeFunc() const { return batch_size_func_; }
  const py::function &BatchMapFunc() const { return batch_map_func_; }
  const std::map<std::string, std::pair<TensorShape, std::shared_ptr<Tensor>>> &PadMap() const { return pad_map_; }
 #endif

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  Status to_json(nlohmann::json *out_json) override;

 private:
  int32_t batch_size_;
  bool drop_remainder_;
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc
@@ -431,6 +431,13 @@ Status DatasetNode::ValidateParams() {
  return Status::OK();
 }

 Status DatasetNode::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["num_parallel_workers"] = num_workers_;
  *out_json = args;
  return Status::OK();
 }

 Status MappableSourceNode::Accept(IRNodePass *const p, bool *const modified) {
  return p->Visit(shared_from_base<MappableSourceNode>(), modified);
 }
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h
@@ -271,6 +271,11 @@ class DatasetNode : public std::enable_shared_from_this<DatasetNode> {

  virtual bool IsSizeDefined() { return true; }

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  virtual Status to_json(nlohmann::json *out_json);

 protected:
  std::vector<std::shared_ptr<DatasetNode>> children_;
  DatasetNode *parent_;  // used to record the only one parent of an IR node after parsing phase
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc
@@ -16,6 +16,7 @@

 #include "minddata/dataset/engine/ir/datasetops/map_node.h"

 #include <algorithm>
 #include <memory>
 #include <string>
 #include <utility>
@@ -122,5 +123,33 @@ void MapNode::setOperations(const std::vector<std::shared_ptr<TensorOperation>>
  operations_ = operations;
 }
 std::vector<std::shared_ptr<TensorOperation>> MapNode::operations() { return operations_; }

 Status MapNode::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["num_parallel_workers"] = num_workers_;
  args["input_columns"] = input_columns_;
  args["output_columns"] = output_columns_;
  if (!project_columns_.empty()) args["column_order"] = project_columns_;
  if (cache_ != nullptr) {
    nlohmann::json cache_args;
    RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
    args["cache"] = cache_args;
  }

  std::vector<nlohmann::json> ops;
  std::vector<int32_t> cbs;
  nlohmann::json op_args;
  for (auto op : operations_) {
    RETURN_IF_NOT_OK(op->to_json(&op_args));
    op_args["tensor_op_name"] = op->Name();
    ops.push_back(op_args);
  }
  args["operations"] = ops;
  std::transform(callbacks_.begin(), callbacks_.end(), std::back_inserter(cbs),
                 [](std::shared_ptr<DSCallback> cb) -> int32_t { return cb->step_size(); });
  args["callback"] = cbs;
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.h
@@ -58,10 +58,6 @@ class MapNode : public DatasetNode {
  /// \return Status Status::OK() if all the parameters are valid
  Status ValidateParams() override;

  /// \brief Getter of tensor operations
  /// \return Vector of operations the Map node will process
  const auto &TensorOperations() const { return operations_; }

  /// \brief Base-class override for accepting IRNodePass visitor
  /// \param[in] p The node to visit
  /// \param[out] modified Indicator if the node was modified
@@ -83,6 +79,20 @@ class MapNode : public DatasetNode {
  /// \brief setter to set all tensor operations
  void setOperations(const std::vector<std::shared_ptr<TensorOperation>> &operations);

  /// \brief Getter functions
  /// \brief Getter of tensor operations
  /// \return Vector of operations the Map node will process
  const auto &TensorOperations() const { return operations_; }
  const std::vector<std::string> &InputColumns() const { return input_columns_; }
  const std::vector<std::string> &OutputColumns() const { return output_columns_; }
  const std::vector<std::string> &ProjectColumns() const { return project_columns_; }
  const std::vector<std::shared_ptr<DSCallback>> &Callbacks() const { return callbacks_; }

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  Status to_json(nlohmann::json *out_json) override;

 private:
  std::vector<std::shared_ptr<TensorOperation>> operations_;

--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.cc
@@ -62,5 +62,12 @@ Status RenameNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops
  return Status::OK();
 }

 Status RenameNode::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["input_columns"] = input_columns_;
  args["output_columns"] = output_columns_;
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h
@@ -56,6 +56,15 @@ class RenameNode : public DatasetNode {
  /// \return Status Status::OK() if all the parameters are valid
  Status ValidateParams() override;

  /// \brief Getter functions
  const std::vector<std::string> &InputColumns() const { return input_columns_; }
  const std::vector<std::string> &OutputColumns() const { return output_columns_; }

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  Status to_json(nlohmann::json *out_json) override;

 private:
  std::vector<std::string> input_columns_;
  std::vector<std::string> output_columns_;
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.cc
@@ -95,5 +95,12 @@ Status RepeatNode::AcceptAfter(IRNodePass *const p, bool *const modified) {
  // Downcast shared pointer then call visitor
  return p->VisitAfter(shared_from_base<RepeatNode>(), modified);
 }

 Status RepeatNode::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["count"] = repeat_count_;
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h
@@ -115,6 +115,14 @@ class RepeatNode : public DatasetNode {
    return Status::OK();
  }

  /// \brief Getter functions
  int32_t RepeatCount() const { return repeat_count_; }

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  Status to_json(nlohmann::json *out_json) override;

 protected:
  std::shared_ptr<RepeatOp> op_;                // keep its corresponding run-time op of EpochCtrlNode and RepeatNode
  std::shared_ptr<RepeatNode> reset_ancestor_;  // updated its immediate Repeat/EpochCtrl ancestor in GeneratorNodePass
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.cc
@@ -61,5 +61,12 @@ Status ShuffleNode::ValidateParams() {
  return Status::OK();
 }

 Status ShuffleNode::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["buffer_size"] = shuffle_size_;
  args["reshuffle_each_epoch"] = reset_every_epoch_;
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h
@@ -53,6 +53,16 @@ class ShuffleNode : public DatasetNode {

  Status ValidateParams() override;

  /// \brief Getter functions
  int32_t ShuffleSize() const { return shuffle_size_; }
  uint32_t ShuffleSeed() const { return shuffle_seed_; }
  bool ResetEveryEpoch() const { return reset_every_epoch_; }

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  Status to_json(nlohmann::json *out_json) override;

 private:
  int32_t shuffle_size_;
  uint32_t shuffle_seed_;
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.cc
@@ -100,5 +100,22 @@ Status ImageFolderNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter>
  return Status::OK();
 }

 Status ImageFolderNode::to_json(nlohmann::json *out_json) {
  nlohmann::json args, sampler_args;
  RETURN_IF_NOT_OK(sampler_->to_json(&sampler_args));
  args["sampler"] = sampler_args;
  args["num_parallel_workers"] = num_workers_;
  args["dataset_dir"] = dataset_dir_;
  args["decode"] = decode_;
  args["extensions"] = exts_;
  args["class_indexing"] = class_indexing_;
  if (cache_ != nullptr) {
    nlohmann::json cache_args;
    RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
    args["cache"] = cache_args;
  }
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.h
@@ -75,6 +75,19 @@ class ImageFolderNode : public MappableSourceNode {
  Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                        int64_t *dataset_size) override;

  /// \brief Getter functions
  const std::string &DatasetDir() const { return dataset_dir_; }
  bool Decode() const { return decode_; }
  bool Recursive() const { return recursive_; }
  const std::shared_ptr<SamplerObj> &Sampler() const { return sampler_; }
  const std::map<std::string, int32_t> &ClassIndexing() const { return class_indexing_; }
  const std::set<std::string> &Exts() const { return exts_; }

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  Status to_json(nlohmann::json *out_json) override;

 private:
  std::string dataset_dir_;
  bool decode_;
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc
@@ -87,5 +87,20 @@ Status MnistNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_
  return Status::OK();
 }

 Status MnistNode::to_json(nlohmann::json *out_json) {
  nlohmann::json args, sampler_args;
  RETURN_IF_NOT_OK(sampler_->to_json(&sampler_args));
  args["sampler"] = sampler_args;
  args["num_parallel_workers"] = num_workers_;
  args["dataset_dir"] = dataset_dir_;
  args["usage"] = usage_;
  if (cache_ != nullptr) {
    nlohmann::json cache_args;
    RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
    args["cache"] = cache_args;
  }
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h
@@ -69,6 +69,16 @@ class MnistNode : public MappableSourceNode {
  Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                        int64_t *dataset_size) override;

  /// \brief Getter functions
  const std::string &DatasetDir() const { return dataset_dir_; }
  const std::string &Usage() const { return usage_; }
  const std::shared_ptr<SamplerObj> &Sampler() const { return sampler_; }

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  Status to_json(nlohmann::json *out_json) override;

 private:
  std::string dataset_dir_;
  std::string usage_;
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc
@@ -199,5 +199,33 @@ Status TFRecordNode::GetShardFileList(std::vector<std::string> *shard_filenames)
  return Status::OK();
 }

 Status TFRecordNode::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["num_parallel_workers"] = num_workers_;
  args["dataset_files"] = dataset_files_;
  args["schema"] = schema_path_;
  args["columns_list"] = columns_list_;
  args["num_samples"] = num_samples_;
  args["shuffle_global"] = (shuffle_ == ShuffleMode::kGlobal);
  args["shuffle_files"] = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles);
  args["shuffle"] = shuffle_;
  args["num_shards"] = num_shards_;
  args["shard_id"] = shard_id_;
  args["shard_equal_rows"] = shard_equal_rows_;
  if (cache_ != nullptr) {
    nlohmann::json cache_args;
    RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
    args["cache"] = cache_args;
  }
  if (schema_obj_ != nullptr) {
    schema_obj_->set_dataset_type("TF");
    schema_obj_->set_num_rows(num_samples_);
    args["schema_json_string"] = schema_obj_->to_json();
  } else {
    args["schema_file_path"] = schema_path_;
  }
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h
@@ -109,6 +109,21 @@ class TFRecordNode : public NonMappableSourceNode {
  /// \return Status of the function
  Status GetShardFileList(std::vector<std::string> *shard_filenames);

  /// \brief Getter functions
  const std::vector<std::string> &DatasetFiles() const { return dataset_files_; }
  const std::string &SchemaPath() const { return schema_path_; }
  const std::shared_ptr<SchemaObj> &GetSchemaObj() const { return schema_obj_; }
  const std::vector<std::string> &ColumnsList() const { return columns_list_; }
  int64_t NumSamples() const { return num_samples_; }
  ShuffleMode Shuffle() const { return shuffle_; }
  int32_t NumShards() const { return num_shards_; }
  bool ShardEqualRows() const { return shard_equal_rows_; }

  /// \brief Get the arguments of node
  /// \param[out] out_json JSON string of all attributes
  /// \return Status of the function
  Status to_json(nlohmann::json *out_json) override;

 private:
  std::vector<std::string> dataset_files_;
  std::string schema_path_;  // schema_path_ path to schema file. It is set when type of schema parameter is string
--- a/mindspore/ccsrc/minddata/dataset/engine/serdes.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/serdes.cc
@@ -0,0 +1,57 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "minddata/dataset/engine/serdes.h"

 namespace mindspore {
 namespace dataset {

 Status Serdes::SaveToJSON(std::shared_ptr<DatasetNode> node, const std::string &filename, nlohmann::json *out_json) {
  // Dump attributes of current node to json string
  nlohmann::json args;
  RETURN_IF_NOT_OK(node->to_json(&args));
  args["op_type"] = node->Name();

  // If the current node isn't leaf node, visit all its children and get all attributes
  std::vector<nlohmann::json> children_pipeline;
  if (!node->IsLeaf()) {
    for (auto child : node->Children()) {
      nlohmann::json child_args;
      RETURN_IF_NOT_OK(SaveToJSON(child, "", &child_args));
      children_pipeline.push_back(child_args);
    }
  }
  args["children"] = children_pipeline;

  // Save json string into file if filename is given.
  if (!filename.empty()) {
    RETURN_IF_NOT_OK(SaveJSONToFile(args, filename));
  }

  *out_json = args;
  return Status::OK();
 }

 Status Serdes::SaveJSONToFile(nlohmann::json json_string, const std::string &file_name) {
  try {
    std::ofstream file(file_name);
    file << json_string;
  } catch (const std::exception &err) {
    RETURN_STATUS_UNEXPECTED("Save json string into " + file_name + " failed!");
  }
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/serdes.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/serdes.h
@@ -0,0 +1,56 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_SERDES_H_
 #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_SERDES_H_

 #include <memory>
 #include <string>
 #include <vector>
 #include <nlohmann/json.hpp>

 #include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
 #include "minddata/dataset/util/status.h"

 namespace mindspore {
 namespace dataset {
 /// \brief The Serdes class is used to serialize an IR tree into JSON string and dump into file if file name
 /// specified.
 class Serdes {
 public:
  /// \brief Constructor
  Serdes() {}

  /// \brief default destructor
  ~Serdes() = default;

  /// \brief function to serialize IR tree into JSON string and/or JSON file
  /// \param[in] node IR node to be transferred
  /// \param[in] filename The file name. If specified, save the generated JSON string into the file
  /// \param[out] out_json The result json string
  /// \return Status The status code returned
  Status SaveToJSON(std::shared_ptr<DatasetNode> node, const std::string &filename, nlohmann::json *out_json);

 protected:
  /// \brief Helper function to save JSON to a file
  /// \param[in] json_string The JSON string to be saved to the file
  /// \param[in] file_name The file name
  /// \return Status The status code returned
  Status SaveJSONToFile(nlohmann::json json_string, const std::string &file_name);
 };
 }  // namespace dataset
 }  // namespace mindspore

 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_SERDES_H_
--- a/mindspore/ccsrc/minddata/dataset/include/samplers.h
+++ b/mindspore/ccsrc/minddata/dataset/include/samplers.h
@@ -64,6 +64,8 @@ class SamplerObj : public std::enable_shared_from_this<SamplerObj> {
  /// \return the Status code returned
  Status AddChild(std::shared_ptr<SamplerObj> child);

  virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); }

 #ifndef ENABLE_ANDROID
  /// \brief Virtual function to convert a SamplerObj class into a runtime mindrecord sampler object,
  ///     only override by SubsetRandomSampler, PkSampler, RandomSampler, SequentialSampler, DistributedSampler
@@ -228,6 +230,8 @@ class PreBuiltSamplerObj : public SamplerObj {

  Status ValidateParams() override;

  Status to_json(nlohmann::json *out_json) override;

 private:
  std::shared_ptr<SamplerRT> sp_;
 #ifndef ENABLE_ANDROID
--- a/mindspore/ccsrc/minddata/dataset/include/transforms.h
+++ b/mindspore/ccsrc/minddata/dataset/include/transforms.h
@@ -20,6 +20,8 @@
 #include <memory>
 #include <string>
 #include <vector>
 #include <nlohmann/json.hpp>

 #include "minddata/dataset/include/constants.h"
 #include "minddata/dataset/include/status.h"

@@ -63,6 +65,8 @@ class TensorOperation : public std::enable_shared_from_this<TensorOperation> {
  /// \return true if this op is a random op (returns non-deterministic result e.g. RandomCrop)
  bool IsRandomOp() const { return random_op_; }

  virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); }

 protected:
  bool random_op_;
 };
@@ -206,6 +210,8 @@ class PreBuiltOperation : public TensorOperation {

  std::string Name() const override;

  Status to_json(nlohmann::json *out_json) override;

 private:
  std::shared_ptr<TensorOp> op_;
 };
--- a/mindspore/ccsrc/minddata/dataset/kernels/data/one_hot_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/data/one_hot_op.cc
@@ -37,5 +37,12 @@ Status OneHotOp::OutputShape(const std::vector<TensorShape> &inputs, std::vector
  if (!outputs.empty()) return Status::OK();
  return Status(StatusCode::kUnexpectedError, "Input has a wrong shape");
 }

 Status OneHotOp::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["num_classes"] = num_classes_;
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/kernels/data/one_hot_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/data/one_hot_op.h
@@ -37,6 +37,8 @@ class OneHotOp : public TensorOp {

  std::string Name() const override { return kOneHotOp; }

  Status to_json(nlohmann::json *out_json) override;

 private:
  int num_classes_;
 };
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/decode_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/decode_op.cc
@@ -60,5 +60,12 @@ Status DecodeOp::OutputType(const std::vector<DataType> &inputs, std::vector<Dat
  outputs[0] = DataType(DataType::DE_UINT8);
  return Status::OK();
 }

 Status DecodeOp::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["rgb"] = is_rgb_format_;
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/decode_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/decode_op.h
@@ -42,6 +42,8 @@ class DecodeOp : public TensorOp {

  std::string Name() const override { return kDecodeOp; }

  Status to_json(nlohmann::json *out_json) override;

 private:
  bool is_rgb_format_ = true;
 };
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/random_crop_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/random_crop_op.cc
@@ -15,6 +15,7 @@
 */
 #include "minddata/dataset/kernels/image/random_crop_op.h"
 #include <random>
 #include <tuple>
 #include "minddata/dataset/kernels/image/image_utils.h"
 #include "minddata/dataset/util/random.h"
 #include "minddata/dataset/util/status.h"
@@ -137,5 +138,16 @@ Status RandomCropOp::OutputShape(const std::vector<TensorShape> &inputs, std::ve
  if (!outputs.empty()) return Status::OK();
  return Status(StatusCode::kUnexpectedError, "Input has a wrong shape");
 }

 Status RandomCropOp::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["size"] = std::vector<int32_t>{crop_height_, crop_width_};
  args["padding"] = std::vector<int32_t>{pad_top_, pad_bottom_, pad_left_, pad_right_};
  args["pad_if_needed"] = pad_if_needed_;
  args["fill_value"] = std::tuple<uint8_t, uint8_t, uint8_t>{fill_r_, fill_g_, fill_b_};
  args["padding_mode"] = border_type_;
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/random_crop_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/random_crop_op.h
@@ -79,6 +79,8 @@ class RandomCropOp : public TensorOp {

  std::string Name() const override { return kRandomCropOp; }

  Status to_json(nlohmann::json *out_json) override;

 protected:
  int32_t crop_height_ = 0;
  int32_t crop_width_ = 0;
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/rescale_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/rescale_op.cc
@@ -29,5 +29,13 @@ Status RescaleOp::OutputType(const std::vector<DataType> &inputs, std::vector<Da
  outputs[0] = DataType(DataType::DE_FLOAT32);
  return Status::OK();
 }

 Status RescaleOp::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["rescale"] = rescale_;
  args["shift"] = shift_;
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/rescale_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/rescale_op.h
@@ -41,6 +41,8 @@ class RescaleOp : public TensorOp {

  std::string Name() const override { return kRescaleOp; }

  Status to_json(nlohmann::json *out_json) override;

 private:
  float rescale_;
  float shift_;
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/resize_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/resize_op.cc
@@ -67,5 +67,13 @@ Status ResizeOp::OutputShape(const std::vector<TensorShape> &inputs, std::vector
  if (!outputs.empty()) return Status::OK();
  return Status(StatusCode::kUnexpectedError, "Input has a wrong shape");
 }

 Status ResizeOp::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["size"] = std::vector<int32_t>{size1_, size2_};
  args["interpolation"] = interpolation_;
  *out_json = args;
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/resize_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/resize_op.h
@@ -61,6 +61,8 @@ class ResizeOp : public TensorOp {

  std::string Name() const override { return kResizeOp; }

  Status to_json(nlohmann::json *out_json) override;

 protected:
  int32_t size1_;
  int32_t size2_;
--- a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
@@ -19,6 +19,7 @@
 #include <memory>
 #include <string>
 #include <vector>
 #include "nlohmann/json.hpp"

 #include "minddata/dataset/core/tensor.h"
 #include "minddata/dataset/core/tensor_row.h"
@@ -199,6 +200,8 @@ class TensorOp {

  virtual std::string Name() const = 0;

  virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); }

 protected:
  bool is_deterministic_{true};
 };
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@@ -203,6 +203,18 @@ class Dataset:
        args["num_parallel_workers"] = self.num_parallel_workers
        return args

    def to_json(self, filename=""):
        """
        Serialize a pipeline into JSON string and dump into file if filename is provided.

        Args:
            filename (str): filename of json file to be saved as

        Returns:
            Str, JSON string of the pipeline.
        """
        return json.loads(self.parse_tree().to_json(filename))

    @check_bucket_batch_by_length
    def bucket_batch_by_length(self, column_names, bucket_boundaries, bucket_batch_sizes,
                               element_length_function=None, pad_info=None,
--- a/mindspore/dataset/engine/serializer_deserializer.py
+++ b/mindspore/dataset/engine/serializer_deserializer.py
@@ -19,13 +19,13 @@ import json
 import os
 import sys

 import mindspore.common.dtype as mstype
 from mindspore import log as logger
 from . import datasets as de
 from ..vision.utils import Inter, Border
 from ..core import config


 def serialize(dataset, json_filepath=None):
 def serialize(dataset, json_filepath=""):
    """
    Serialize dataset pipeline into a json file.

@@ -51,11 +51,7 @@ def serialize(dataset, json_filepath=None):
        >>> ds.engine.serialize(data, json_filepath="mnist_dataset_pipeline.json")  # serialize it to json file
        >>> serialized_data = ds.engine.serialize(data)  # serialize it to Python dict
    """
    serialized_pipeline = traverse(dataset)
    if json_filepath:
        with open(json_filepath, 'w') as json_file:
            json.dump(serialized_pipeline, json_file, indent=2)
    return serialized_pipeline
    return dataset.to_json(json_filepath)


 def deserialize(input_dict=None, json_filepath=None):
@@ -110,93 +106,6 @@ def expand_path(node_repr, key, val):
        node_repr[key] = os.path.abspath(val)


 def serialize_operations(node_repr, key, val):
    """Serialize tensor op (Python object) to dictionary."""
    if isinstance(val, list):
        node_repr[key] = []
        for op in val:
            node_repr[key].append(op.__dict__)
            # Extracting module and name information from a Python object
            # Example: tensor_op_module is 'minddata.transforms.c_transforms' and tensor_op_name is 'Decode'
            node_repr[key][-1]['tensor_op_name'] = type(op).__name__
            node_repr[key][-1]['tensor_op_module'] = type(op).__module__
    else:
        node_repr[key] = val.__dict__
        node_repr[key]['tensor_op_name'] = type(val).__name__
        node_repr[key]['tensor_op_module'] = type(val).__module__


 def serialize_sampler(node_repr, val):
    """Serialize sampler object to dictionary."""
    if val is None:
        node_repr['sampler'] = None
    else:
        node_repr['sampler'] = val.__dict__
        node_repr['sampler']['sampler_module'] = type(val).__module__
        node_repr['sampler']['sampler_name'] = type(val).__name__


 def traverse(node):
    """Pre-order traverse the pipeline and capture the information as we go."""
    # Node representation (node_repr) is a Python dictionary that capture and store the
    # dataset pipeline information before dumping it to JSON or other format.
    node_repr = dict()
    node_repr['op_type'] = type(node).__name__
    node_repr['op_module'] = type(node).__module__

    # Start with an empty list of children, will be added later as we traverse this node.
    node_repr["children"] = []

    # Retrieve the information about the current node. It should include arguments
    # passed to the node during object construction.
    node_args = node.get_args()
    for k, v in node_args.items():
        # Store the information about this node into node_repr.
        # Further serialize the object in the arguments if needed.
        if k == 'operations':
            serialize_operations(node_repr, k, v)
        elif k == 'sampler':
            serialize_sampler(node_repr, v)
        elif k == 'padded_sample' and v:
            v1 = {key: value for key, value in v.items() if not isinstance(value, bytes)}
            node_repr[k] = json.dumps(v1, indent=2)
        # return schema json str if its type is mindspore.dataset.Schema
        elif k == 'schema' and isinstance(v, de.Schema):
            node_repr[k] = v.to_json()
        elif k in set(['schema', 'dataset_files', 'dataset_dir', 'schema_file_path']):
            expand_path(node_repr, k, v)
        elif k == "num_parallel_workers" and v is None:
            node_repr[k] = config.get_num_parallel_workers()
        else:
            node_repr[k] = v

    # If a sampler exists in this node, then the following 4 arguments must be set to None:
    #    num_samples, shard_id, num_shards, shuffle
    # These arguments get moved into the sampler itself, so they are no longer needed to
    # be set at the dataset level.
    # TF Record is a special case because it uses both the dataset and sampler arguments
    # which is not decided until later during tree preparation phase.
    if node_repr['op_type'] != 'TFRecordDataset' and 'sampler' in node_args.keys():
        if 'num_samples' in node_repr.keys():
            node_repr['num_samples'] = None
        if 'shuffle' in node_repr.keys():
            node_repr['shuffle'] = None
        if 'num_shards' in node_repr.keys():
            node_repr['num_shards'] = None
        if 'shard_id' in node_repr.keys():
            node_repr['shard_id'] = None

    # Leaf node doesn't have input attribute.
    if not node.children:
        return node_repr

    # Recursively traverse the child and assign it to the current node_repr['children'].
    for child in node.children:
        node_repr["children"].append(traverse(child))

    return node_repr


 def show(dataset, indentation=2):
    """
    Write the dataset pipeline graph onto logger.info.
@@ -206,7 +115,7 @@ def show(dataset, indentation=2):
        indentation (int, optional): indentation used by the json print. Pass None to not indent.
    """

    pipeline = traverse(dataset)
    pipeline = dataset.to_json()
    logger.info(json.dumps(pipeline, indent=indentation))


@@ -219,7 +128,7 @@ def compare(pipeline1, pipeline2):
        pipeline2 (Dataset): a dataset pipeline.
    """

    return traverse(pipeline1) == traverse(pipeline2)
    return pipeline1.to_json() == pipeline2.to_json()


 def construct_pipeline(node):
@@ -244,140 +153,65 @@ def create_node(node):
    """Parse the key, value in the node dictionary and instantiate the Python Dataset object"""
    logger.info('creating node: %s', node['op_type'])
    dataset_op = node['op_type']
    op_module = node['op_module']
    op_module = "mindspore.dataset"

    # Get the Python class to be instantiated.
    # Example:
    #  "op_type": "MapDataset",
    #  "op_module": "mindspore.dataset.datasets",
    pyclass = getattr(sys.modules[op_module], dataset_op)
    if node.get("children"):
        pyclass = getattr(sys.modules[op_module], "Dataset")
    else:
        pyclass = getattr(sys.modules[op_module], dataset_op)

    pyobj = None
    # Find a matching Dataset class and call the constructor with the corresponding args.
    # When a new Dataset class is introduced, another if clause and parsing code needs to be added.
    if dataset_op == 'ImageFolderDataset':
        sampler = construct_sampler(node.get('sampler'))
        pyobj = pyclass(node['dataset_dir'], node.get('num_samples'), node.get('num_parallel_workers'),
        num_samples = check_and_replace_input(node.get('num_samples'), 0, None)
        pyobj = pyclass(node['dataset_dir'], num_samples, node.get('num_parallel_workers'),
                        node.get('shuffle'), sampler, node.get('extensions'),
                        node.get('class_indexing'), node.get('decode'), node.get('num_shards'),
                        node.get('shard_id'))

    elif dataset_op == 'RangeDataset':
        pyobj = pyclass(node['start'], node['stop'], node['step'])

    elif dataset_op == 'ImageFolderDataset':
        pyobj = pyclass(node['dataset_dir'], node['schema'], node.get('distribution'),
                        node.get('column_list'), node.get('num_parallel_workers'),
                        node.get('deterministic_output'), node.get('prefetch_size'),
                        node.get('labels_filename'), node.get('dataset_usage'))
                        node.get('shard_id'), node.get('cache'))

    elif dataset_op == 'MnistDataset':
        sampler = construct_sampler(node.get('sampler'))
        pyobj = pyclass(node['dataset_dir'], node['usage'], node.get('num_samples'), node.get('num_parallel_workers'),
                        node.get('shuffle'), sampler, node.get('num_shards'), node.get('shard_id'))

    elif dataset_op == 'MindDataset':
        sampler = construct_sampler(node.get('sampler'))
        pyobj = pyclass(node['dataset_file'], node.get('columns_list'),
                        node.get('num_parallel_workers'), node.get('seed'), node.get('num_shards'),
                        node.get('shard_id'), sampler)
        num_samples = check_and_replace_input(node.get('num_samples'), 0, None)
        pyobj = pyclass(node['dataset_dir'], node['usage'], num_samples, node.get('num_parallel_workers'),
                        node.get('shuffle'), sampler, node.get('num_shards'), node.get('shard_id'), node.get('cache'))

    elif dataset_op == 'TFRecordDataset':
        shuffle = node.get('shuffle')
        shuffle = to_shuffle_mode(node.get('shuffle'))
        if shuffle is not None and isinstance(shuffle, str):
            shuffle = de.Shuffle(shuffle)
        pyobj = pyclass(node['dataset_files'], node.get('schema'), node.get('column_list'),
                        node.get('num_samples'), node.get('num_parallel_workers'),
                        shuffle, node.get('num_shards'), node.get('shard_id'))

    elif dataset_op == 'ManifestDataset':
        sampler = construct_sampler(node.get('sampler'))
        pyobj = pyclass(node['dataset_file'], node['usage'], node.get('num_samples'),
                        node.get('num_parallel_workers'), node.get('shuffle'), sampler,
                        node.get('class_indexing'), node.get('decode'), node.get('num_shards'),
                        node.get('shard_id'))
        num_samples = check_and_replace_input(node.get('num_samples'), 0, None)
        pyobj = pyclass(node['dataset_files'], node.get('schema'), node.get('columns_list'),
                        num_samples, node.get('num_parallel_workers'),
                        shuffle, node.get('num_shards'), node.get('shard_id'), node.get('cache'))

    elif dataset_op == 'Cifar10Dataset':
        sampler = construct_sampler(node.get('sampler'))
        pyobj = pyclass(node['dataset_dir'], node['usage'], node.get('num_samples'), node.get('num_parallel_workers'),
                        node.get('shuffle'), sampler, node.get('num_shards'), node.get('shard_id'))

    elif dataset_op == 'Cifar100Dataset':
        sampler = construct_sampler(node.get('sampler'))
        pyobj = pyclass(node['dataset_dir'], node['usage'], node.get('num_samples'), node.get('num_parallel_workers'),
                        node.get('shuffle'), sampler, node.get('num_shards'), node.get('shard_id'))

    elif dataset_op == 'VOCDataset':
        sampler = construct_sampler(node.get('sampler'))
        pyobj = pyclass(node['dataset_dir'], node.get('task'), node.get('mode'), node.get('class_indexing'),
                        node.get('num_samples'), node.get('num_parallel_workers'), node.get('shuffle'),
                        node.get('decode'), sampler, node.get('num_shards'), node.get('shard_id'))

    elif dataset_op == 'CocoDataset':
        sampler = construct_sampler(node.get('sampler'))
        pyobj = pyclass(node['dataset_dir'], node.get('annotation_file'), node.get('task'), node.get('num_samples'),
                        node.get('num_parallel_workers'), node.get('shuffle'), node.get('decode'), sampler,
                        node.get('num_shards'), node.get('shard_id'))

    elif dataset_op == 'CelebADataset':
        sampler = construct_sampler(node.get('sampler'))
        pyobj = pyclass(node['dataset_dir'], node.get('num_parallel_workers'), node.get('shuffle'),
                        node.get('dataset_type'), sampler, node.get('decode'), node.get('extensions'),
                        node.get('num_samples'), sampler, node.get('num_shards'), node.get('shard_id'))

    elif dataset_op == 'GeneratorDataset':
        # Serializing py function can be done using marshal library
        raise RuntimeError(dataset_op + " is not yet supported")

    elif dataset_op == 'RepeatDataset':
    elif dataset_op == 'Repeat':
        pyobj = de.Dataset().repeat(node.get('count'))

    elif dataset_op == 'SkipDataset':
        pyobj = de.Dataset().skip(node.get('count'))

    elif dataset_op == 'TakeDataset':
        pyobj = de.Dataset().take(node.get('count'))

    elif dataset_op == 'MapDataset':
    elif dataset_op == 'Map':
        tensor_ops = construct_tensor_ops(node.get('operations'))
        pyobj = de.Dataset().map(tensor_ops, node.get('input_columns'), node.get('output_columns'),
                                 node.get('column_order'), node.get('num_parallel_workers'))
                                 node.get('column_order'), node.get('num_parallel_workers'),
                                 True, node.get('cache'), node.get('callbacks'))

    elif dataset_op == 'ShuffleDataset':
    elif dataset_op == 'Shuffle':
        pyobj = de.Dataset().shuffle(node.get('buffer_size'))

    elif dataset_op == 'BatchDataset':
    elif dataset_op == 'Batch':
        pyobj = de.Dataset().batch(node['batch_size'], node.get('drop_remainder'))

    elif dataset_op == 'CacheDataset':
        # Member function cache() is not defined in class Dataset yet.
        raise RuntimeError(dataset_op + " is not yet supported.")

    elif dataset_op == 'FilterDataset':
        # Member function filter() is not defined in class Dataset yet.
        raise RuntimeError(dataset_op + " is not yet supported.")

    elif dataset_op == 'TakeDataset':
        # Member function take() is not defined in class Dataset yet.
        raise RuntimeError(dataset_op + " is not yet supported.")

    elif dataset_op == 'ZipDataset':
    elif dataset_op == 'Zip':
        # Create ZipDataset instance, giving dummy input dataset that will be overrided in the caller.
        pyobj = de.ZipDataset((de.Dataset(), de.Dataset()))

    elif dataset_op == 'ConcatDataset':
        # Create ConcatDataset instance, giving dummy input dataset that will be overrided in the caller.
        pyobj = de.ConcatDataset((de.Dataset(), de.Dataset()))

    elif dataset_op == 'RenameDataset':
    elif dataset_op == 'Rename':
        pyobj = de.Dataset().rename(node['input_columns'], node['output_columns'])

    elif dataset_op == 'ProjectDataset':
        pyobj = de.Dataset().project(node['columns'])

    elif dataset_op == 'TransferDataset':
        pyobj = de.Dataset().to_device()

    else:
        raise RuntimeError(dataset_op + " is not yet supported by ds.engine.deserialize().")

@@ -388,23 +222,28 @@ def construct_sampler(in_sampler):
    """Instantiate Sampler object based on the information from dictionary['sampler']"""
    sampler = None
    if in_sampler is not None:
        if "num_samples" in in_sampler:
            num_samples = check_and_replace_input(in_sampler['num_samples'], 0, None)
        sampler_name = in_sampler['sampler_name']
        sampler_module = in_sampler['sampler_module']
        sampler_module = "mindspore.dataset"
        sampler_class = getattr(sys.modules[sampler_module], sampler_name)
        if sampler_name == 'DistributedSampler':
            sampler = sampler_class(in_sampler['num_shards'], in_sampler['shard_id'], in_sampler.get('shuffle'))
        elif sampler_name == 'PKSampler':
            sampler = sampler_class(in_sampler['num_val'], in_sampler.get('num_class'), in_sampler('shuffle'))
        elif sampler_name == 'RandomSampler':
            sampler = sampler_class(in_sampler.get('replacement'), in_sampler.get('num_samples'))
            sampler = sampler_class(in_sampler.get('replacement'), num_samples)
        elif sampler_name == 'SequentialSampler':
            sampler = sampler_class()
            sampler = sampler_class(in_sampler.get('start_index'), num_samples)
        elif sampler_name == 'SubsetRandomSampler':
            sampler = sampler_class(in_sampler['indices'])
            sampler = sampler_class(in_sampler['indices'], num_samples)
        elif sampler_name == 'WeightedRandomSampler':
            sampler = sampler_class(in_sampler['weights'], in_sampler['num_samples'], in_sampler.get('replacement'))
            sampler = sampler_class(in_sampler['weights'], num_samples, in_sampler.get('replacement'))
        else:
            raise ValueError("Sampler type is unknown: {}.".format(sampler_name))
    if in_sampler.get("child_sampler"):
        for child in in_sampler["child_sampler"]:
            sampler.add_child(construct_sampler(child))

    return sampler

@@ -413,70 +252,85 @@ def construct_tensor_ops(operations):
    """Instantiate tensor op object(s) based on the information from dictionary['operations']"""
    result = []
    for op in operations:
        op_module = op['tensor_op_module']
        op_name = op['tensor_op_name']
        op_class = getattr(sys.modules[op_module], op_name)
        op_name = op['tensor_op_name'][:-2]  # to remove op from the back of the name
        op_module_vis = sys.modules["mindspore.dataset.vision.c_transforms"]
        op_module_trans = sys.modules["mindspore.dataset.transforms.c_transforms"]

        if hasattr(op_module_vis, op_name):
            op_class = getattr(op_module_vis, op_name)
        elif hasattr(op_module_trans, op_name):
            op_class = getattr(op_module_trans, op_name)
        else:
            raise RuntimeError(op_name + " is not yet supported by deserialize().")

        if op_name == 'Decode':
            result.append(op_class(op.get('rgb')))

        elif op_name == 'Normalize':
            result.append(op_class(op['mean'], op['std']))

        elif op_name == 'RandomCrop':
            result.append(op_class(op['size'], op.get('padding'), op.get('pad_if_needed'),
                                   op.get('fill_value'), Border(op.get('padding_mode'))))

        elif op_name == 'RandomHorizontalFlip':
            result.append(op_class(op.get('prob')))

        elif op_name == 'RandomVerticalFlip':
            result.append(op_class(op.get('prob')))
                                   tuple(op.get('fill_value')), Border(to_border_mode(op.get('padding_mode')))))

        elif op_name == 'Resize':
            result.append(op_class(op['size'], Inter(op.get('interpolation'))))

        elif op_name == 'RandomResizedCrop':
            result.append(op_class(op['size'], op.get('scale'), op.get('ratio'),
                                   Inter(op.get('interpolation')), op.get('max_attempts')))

        elif op_name == 'CenterCrop':
            result.append(op_class(op['size']))

        elif op_name == 'RandomColorAdjust':
            result.append(op_class(op.get('brightness'), op.get('contrast'), op.get('saturation'),
                                   op.get('hue')))

        elif op_name == 'RandomRotation':
            result.append(op_class(op['degree'], op.get('resample'), op.get('expand'),
                                   op.get('center'), op.get('fill_value')))
            result.append(op_class(op['size'], Inter(to_interpolation_mode(op.get('interpolation')))))

        elif op_name == 'Rescale':
            result.append(op_class(op['rescale'], op['shift']))

        elif op_name == 'RandomResize':
            result.append(op_class(op['size']))

        elif op_name == 'TypeCast':
            result.append(op_class(op['data_type']))

        elif op_name == 'HWC2CHW':
            result.append(op_class())

        elif op_name == 'CHW2HWC':
            raise ValueError("Tensor op is not supported: {}.".format(op_name))

        elif op_name == 'OneHot':
            result.append(op_class(op['num_classes']))

        elif op_name == 'RandomCropDecodeResize':
            result.append(op_class(op['size'], op.get('scale'), op.get('ratio'),
                                   Inter(op.get('interpolation')), op.get('max_attempts')))

        elif op_name == 'Pad':
            result.append(op_class(op['padding'], op['fill_value'], Border(op['padding_mode'])))

        else:
            raise ValueError("Tensor op name is unknown: {}.".format(op_name))

    return result


 def to_shuffle_mode(shuffle):
    if shuffle == 2: return "global"
    if shuffle == 1: return "file"
    return False


 def to_interpolation_mode(inter):
    return {
        0: Inter.LINEAR,
        1: Inter.NEAREST,
        2: Inter.CUBIC,
        3: Inter.AREA
    }[inter]


 def to_border_mode(border):
    return {
        0: Border.CONSTANT,
        1: Border.EDGE,
        2: Border.REFLECT,
        3: Border.SYMMETRIC
    }[border]


 def to_mstype(data_type):
    return {
        "bool": mstype.bool_,
        "int8": mstype.int8,
        "int16": mstype.int16,
        "int32": mstype.int32,
        "int64": mstype.int64,
        "uint8": mstype.uint8,
        "uint16": mstype.uint16,
        "uint32": mstype.uint32,
        "uint64": mstype.uint64,
        "float16": mstype.float16,
        "float32": mstype.float32,
        "float64": mstype.float64,
        "string": mstype.string
    }[data_type]


 def check_and_replace_input(input_value, expect, replace):
    if input_value == expect:
        return replace
    return input_value
--- a/tests/ut/python/dataset/test_serdes_dataset.py
+++ b/tests/ut/python/dataset/test_serdes_dataset.py
@@ -22,7 +22,7 @@ import os

 import numpy as np
 from test_minddataset_sampler import add_and_remove_cv_file, get_data, CV_DIR_NAME, CV_FILE_NAME
 from util import config_get_set_num_parallel_workers
 from util import config_get_set_num_parallel_workers, config_get_set_seed

 import mindspore.dataset as ds
 import mindspore.dataset.transforms.c_transforms as c
@@ -46,6 +46,8 @@ def test_imagefolder(remove_json_files=True):

    # Constructing DE pipeline
    sampler = ds.WeightedRandomSampler(weights, 11)
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)
    data1 = ds.ImageFolderDataset(data_dir, sampler=sampler)
    data1 = data1.repeat(1)
    data1 = data1.map(operations=[vision.Decode(True)], input_columns=["image"])
@@ -99,6 +101,9 @@ def test_imagefolder(remove_json_files=True):


 def test_mnist_dataset(remove_json_files=True):
    """
    Test serdes on mnist dataset pipeline.
    """
    data_dir = "../data/dataset/testMnistData"
    ds.config.set_seed(1)

@@ -137,6 +142,9 @@ def test_mnist_dataset(remove_json_files=True):


 def test_zip_dataset(remove_json_files=True):
    """
    Test serdes on zip dataset pipeline.
    """
    files = ["../data/dataset/testTFTestAllTypes/test.data"]
    schema_file = "../data/dataset/testTFTestAllTypes/datasetSchema.json"
    ds.config.set_seed(1)
@@ -178,9 +186,13 @@ def test_zip_dataset(remove_json_files=True):


 def test_random_crop():
    """
    Test serdes on RandomCrop pipeline.
    """
    logger.info("test_random_crop")
    DATA_DIR = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"]
    SCHEMA_DIR = "../data/dataset/test_tf_file_3_images/datasetSchema.json"
    original_seed = config_get_set_seed(1)
    original_num_parallel_workers = config_get_set_num_parallel_workers(1)

    # First dataset
@@ -209,6 +221,7 @@ def test_random_crop():
        _ = item2["image"]

    # Restore configuration num_parallel_workers
    ds.config.set_seed(original_seed)
    ds.config.set_num_parallel_workers(original_num_parallel_workers)


@@ -232,7 +245,7 @@ def delete_json_files():


 # Test save load minddataset
 def test_minddataset(add_and_remove_cv_file):
 def skip_test_minddataset(add_and_remove_cv_file):
    """tutorial for cv minderdataset."""
    columns_list = ["data", "file_name", "label"]
    num_readers = 4