Browse Source

[Part I] Push down json save logic to IR and add getter to each IR node

tags/v1.2.0-rc1
TinaMengtingZhang 5 years ago
parent
commit
4812fece80
56 changed files with 723 additions and 252 deletions
  1. +8
    -1
      mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc
  2. +5
    -0
      mindspore/ccsrc/minddata/dataset/api/samplers.cc
  3. +5
    -0
      mindspore/ccsrc/minddata/dataset/api/transforms.cc
  4. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/CMakeLists.txt
  5. +21
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/distributed_sampler.cc
  6. +5
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/distributed_sampler.h
  7. +19
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/pk_sampler.cc
  8. +5
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/pk_sampler.h
  9. +19
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/random_sampler.cc
  10. +5
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/random_sampler.h
  11. +5
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sampler.h
  12. +19
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.cc
  13. +5
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h
  14. +18
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.cc
  15. +5
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.h
  16. +19
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc
  17. +5
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h
  18. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache.h
  19. +13
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache_impl.cc
  20. +2
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache_impl.h
  21. +15
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc
  22. +18
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.h
  23. +7
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc
  24. +5
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h
  25. +29
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc
  26. +14
    -4
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.h
  27. +7
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.cc
  28. +9
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h
  29. +7
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.cc
  30. +8
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h
  31. +7
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.cc
  32. +10
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h
  33. +17
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.cc
  34. +13
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.h
  35. +15
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc
  36. +10
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h
  37. +28
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc
  38. +15
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h
  39. +57
    -0
      mindspore/ccsrc/minddata/dataset/engine/serdes.cc
  40. +56
    -0
      mindspore/ccsrc/minddata/dataset/engine/serdes.h
  41. +4
    -0
      mindspore/ccsrc/minddata/dataset/include/samplers.h
  42. +6
    -0
      mindspore/ccsrc/minddata/dataset/include/transforms.h
  43. +7
    -0
      mindspore/ccsrc/minddata/dataset/kernels/data/one_hot_op.cc
  44. +2
    -0
      mindspore/ccsrc/minddata/dataset/kernels/data/one_hot_op.h
  45. +7
    -0
      mindspore/ccsrc/minddata/dataset/kernels/image/decode_op.cc
  46. +2
    -0
      mindspore/ccsrc/minddata/dataset/kernels/image/decode_op.h
  47. +12
    -0
      mindspore/ccsrc/minddata/dataset/kernels/image/random_crop_op.cc
  48. +2
    -0
      mindspore/ccsrc/minddata/dataset/kernels/image/random_crop_op.h
  49. +8
    -0
      mindspore/ccsrc/minddata/dataset/kernels/image/rescale_op.cc
  50. +2
    -0
      mindspore/ccsrc/minddata/dataset/kernels/image/rescale_op.h
  51. +8
    -0
      mindspore/ccsrc/minddata/dataset/kernels/image/resize_op.cc
  52. +2
    -0
      mindspore/ccsrc/minddata/dataset/kernels/image/resize_op.h
  53. +3
    -0
      mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
  54. +12
    -0
      mindspore/dataset/engine/datasets.py
  55. +99
    -245
      mindspore/dataset/engine/serializer_deserializer.py
  56. +15
    -2
      tests/ut/python/dataset/test_serdes_dataset.py

+ 8
- 1
mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc View File

@@ -22,6 +22,7 @@
#include "minddata/dataset/callback/py_ds_callback.h"
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/core/global_context.h"
#include "minddata/dataset/engine/serdes.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"

@@ -92,7 +93,13 @@ PYBIND_REGISTER(DatasetNode, 1, ([](const py::module *m) {
THROW_IF_ERROR(zip->ValidateParams());
return zip;
},
py::arg("datasets"));
py::arg("datasets"))
.def("to_json", [](std::shared_ptr<DatasetNode> self, const std::string &json_filepath) {
nlohmann::json args;
auto serdas = std::make_shared<Serdes>();
THROW_IF_ERROR(serdas->SaveToJSON(self, json_filepath, &args));
return args.dump();
});
}));

// PYBIND FOR LEAF NODES


+ 5
- 0
mindspore/ccsrc/minddata/dataset/api/samplers.cc View File

@@ -258,6 +258,11 @@ std::shared_ptr<SamplerObj> PreBuiltSamplerObj::Copy() {
return sampler;
}

Status PreBuiltSamplerObj::to_json(nlohmann::json *out_json) {
RETURN_IF_NOT_OK(sp_->to_json(out_json));
return Status::OK();
}

#ifndef ENABLE_ANDROID
std::shared_ptr<mindrecord::ShardOperator> PKSamplerObj::BuildForMindDataset() {
// runtime mindrecord sampler object


+ 5
- 0
mindspore/ccsrc/minddata/dataset/api/transforms.cc View File

@@ -229,6 +229,11 @@ std::shared_ptr<TensorOp> PreBuiltOperation::Build() { return op_; }

std::string PreBuiltOperation::Name() const { return op_ ? op_->Name() : kPreBuiltOperation; }

Status PreBuiltOperation::to_json(nlohmann::json *out_json) {
RETURN_IF_NOT_OK(op_->to_json(out_json));
return Status::OK();
}

// RandomApplyOperation
RandomApplyOperation::RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob)
: TensorOperation(true), transforms_(transforms), prob_(prob) {}


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/CMakeLists.txt View File

@@ -20,6 +20,7 @@ set(SRC_FILES_LIST
runtime_context.cc
python_runtime_context.cc
consumers/tree_consumer.cc
serdes.cc
)
if (ENABLE_PYTHON)
set(SRC_FILES_LIST


+ 21
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/distributed_sampler.cc View File

@@ -190,5 +190,26 @@ void DistributedSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const
}
}

Status DistributedSamplerRT::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["sampler_name"] = "DistributedSampler";
args["num_shards"] = num_devices_;
args["shard_id"] = device_id_;
args["shuffle"] = shuffle_;
args["num_samples"] = num_samples_;
args["offset"] = offset_;
if (this->HasChildSampler()) {
std::vector<nlohmann::json> children_args;
for (auto child : child_) {
nlohmann::json child_arg;
RETURN_IF_NOT_OK(child->to_json(&child_arg));
children_args.push_back(child_arg);
}
args["child_sampler"] = children_args;
}
*out_json = args;
return Status::OK();
}

} // namespace dataset
} // namespace mindspore

+ 5
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/distributed_sampler.h View File

@@ -72,6 +72,11 @@ class DistributedSamplerRT : public SamplerRT {

void SamplerPrint(std::ostream &out, bool show_all) const override;

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
Status to_json(nlohmann::json *out_json) override;

private:
int64_t cnt_; // number of samples that have already been filled in to buffer
uint32_t seed_;


+ 19
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/pk_sampler.cc View File

@@ -128,5 +128,24 @@ void PKSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const {
// Then add our own info if any
}
}

Status PKSamplerRT::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["sampler_name"] = "PKSampler";
args["num_val"] = samples_per_class_;
args["shuffle"] = shuffle_;
args["num_samples"] = num_samples_;
if (this->HasChildSampler()) {
std::vector<nlohmann::json> children_args;
for (auto child : child_) {
nlohmann::json child_arg;
RETURN_IF_NOT_OK(child->to_json(&child_arg));
children_args.push_back(child_arg);
}
args["child_sampler"] = children_args;
}
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 5
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/pk_sampler.h View File

@@ -61,6 +61,11 @@ class PKSamplerRT : public SamplerRT { // NOT YET FINISHED
// @param show_all - bool to show detailed vs summary
void SamplerPrint(std::ostream &out, bool show_all) const override;

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
Status to_json(nlohmann::json *out_json) override;

private:
bool shuffle_;
uint32_t seed_;


+ 19
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/random_sampler.cc View File

@@ -127,5 +127,24 @@ void RandomSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const {
// Then add our own info if any
}
}

Status RandomSamplerRT::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["sampler_name"] = "RandomSampler";
args["replacement"] = replacement_;
args["num_samples"] = num_samples_;
args["reshuffle_each_epoch"] = reshuffle_each_epoch_;
if (this->HasChildSampler()) {
std::vector<nlohmann::json> children_args;
for (auto child : child_) {
nlohmann::json child_arg;
RETURN_IF_NOT_OK(child->to_json(&child_arg));
children_args.push_back(child_arg);
}
args["child_sampler"] = children_args;
}
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 5
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/random_sampler.h View File

@@ -52,6 +52,11 @@ class RandomSamplerRT : public SamplerRT {

void SamplerPrint(std::ostream &out, bool show_all) const override;

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
Status to_json(nlohmann::json *out_json) override;

private:
uint32_t seed_;
bool replacement_;


+ 5
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sampler.h View File

@@ -149,6 +149,11 @@ class SamplerRT {
// @return Status The status code returned
Status GetAssociatedChildId(int64_t *out_associated_id, int64_t id);

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); }

protected:
// Number of rows of data from the place this sampler is sampling from. If this sampler
// has a child sampler, num_rows_ is the number of ids the child sampler will


+ 19
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.cc View File

@@ -17,6 +17,7 @@

#include <algorithm>
#include <memory>
#include <vector>

namespace mindspore {
namespace dataset {
@@ -131,5 +132,23 @@ void SequentialSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const {
out << "\nStart index: " << start_index_;
}
}

Status SequentialSamplerRT::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["sampler_name"] = "SequentialSampler";
args["start_index"] = start_index_;
args["num_samples"] = num_samples_;
if (this->HasChildSampler()) {
std::vector<nlohmann::json> children_args;
for (auto child : child_) {
nlohmann::json child_arg;
RETURN_IF_NOT_OK(child->to_json(&child_arg));
children_args.push_back(child_arg);
}
args["child_sampler"] = children_args;
}
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 5
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h View File

@@ -61,6 +61,11 @@ class SequentialSamplerRT : public SamplerRT {
// @param show_all - bool to show detailed vs summary
void SamplerPrint(std::ostream &out, bool show_all) const override;

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
Status to_json(nlohmann::json *out_json) override;

private:
int64_t current_id_; // The id sequencer. Each new id increments from this
int64_t start_index_; // The starting id. current_id_ begins from here.


+ 18
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.cc View File

@@ -131,5 +131,23 @@ void SubsetRandomSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const
// Then add our own info if any
}
}

Status SubsetRandomSamplerRT::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["sampler_name"] = "SubsetRandomSampler";
args["indices"] = indices_;
args["num_samples"] = num_samples_;
if (this->HasChildSampler()) {
std::vector<nlohmann::json> children_args;
for (auto child : child_) {
nlohmann::json child_arg;
RETURN_IF_NOT_OK(child->to_json(&child_arg));
children_args.push_back(child_arg);
}
args["child_sampler"] = children_args;
}
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 5
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.h View File

@@ -56,6 +56,11 @@ class SubsetRandomSamplerRT : public SamplerRT {
// @param show_all - bool to show detailed vs summary
void SamplerPrint(std::ostream &out, bool show_all) const override;

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
Status to_json(nlohmann::json *out_json) override;

private:
// A list of indices (already randomized in constructor).
std::vector<int64_t> indices_;


+ 19
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.cc View File

@@ -193,5 +193,24 @@ void WeightedRandomSamplerRT::SamplerPrint(std::ostream &out, bool show_all) con
// Then add our own info if any
}
}

Status WeightedRandomSamplerRT::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["sampler_name"] = "WeightedRandomSampler";
args["weights"] = weights_;
args["num_samples"] = num_samples_;
args["replacement"] = replacement_;
if (this->HasChildSampler()) {
std::vector<nlohmann::json> children_args;
for (auto child : child_) {
nlohmann::json child_arg;
RETURN_IF_NOT_OK(child->to_json(&child_arg));
children_args.push_back(child_arg);
}
args["child_sampler"] = children_args;
}
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 5
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h View File

@@ -58,6 +58,11 @@ class WeightedRandomSamplerRT : public SamplerRT {
// @param show_all - bool to show detailed vs summary
void SamplerPrint(std::ostream &out, bool show_all) const override;

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
Status to_json(nlohmann::json *out_json) override;

private:
// A list of weights for each sample.
std::vector<double> weights_;


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache.h View File

@@ -28,6 +28,7 @@ class DatasetCache {
virtual Status Build() = 0;
virtual Status ValidateParams() = 0;
virtual Status CreateCacheOp(int num_workers, std::shared_ptr<DatasetOp> *ds_op) = 0;
virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); }
};
} // namespace mindspore::dataset



+ 13
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache_impl.cc View File

@@ -44,5 +44,18 @@ Status DatasetCacheImpl::CreateCacheOp(int32_t num_workers, std::shared_ptr<Data

return Status::OK();
}

Status DatasetCacheImpl::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["session_id"] = session_id_;
args["cache_memory_size"] = cache_mem_sz_;
args["spill"] = spill_;
if (hostname_) args["hostname"] = hostname_.value();
if (port_) args["port"] = port_.value();
if (num_connections_) args["num_connections"] = num_connections_.value();
if (prefetch_sz_) args["prefetch_size"] = prefetch_sz_.value();
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 2
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache_impl.h View File

@@ -60,6 +60,8 @@ class DatasetCacheImpl : public DatasetCache {

~DatasetCacheImpl() = default;

Status to_json(nlohmann::json *out_json) override;

private:
std::shared_ptr<CacheClient> cache_client_;
session_id_type session_id_;


+ 15
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc View File

@@ -152,5 +152,20 @@ Status BatchNode::AcceptAfter(IRNodePass *const p, bool *const modified) {
// Downcast shared pointer then call visitor
return p->VisitAfter(shared_from_base<BatchNode>(), modified);
}

Status BatchNode::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["num_parallel_workers"] = num_workers_;
args["batch_size"] = batch_size_;
args["drop_remainder"] = drop_remainder_;
#ifdef ENABLE_PYTHON
args["input_columns"] = in_col_names_;
args["output_columns"] = out_col_names_;
args["column_order"] = col_order_;
if (batch_map_func_ != nullptr) args["per_batch_map"] = "pyfunc";
#endif
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 18
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.h View File

@@ -87,6 +87,24 @@ class BatchNode : public DatasetNode {
/// \return Status of the node visit
Status AcceptAfter(IRNodePass *const p, bool *const modified) override;

/// \brief Getter functions
int32_t BatchSize() const { return batch_size_; }
bool DropRemainder() const { return drop_remainder_; }
#ifdef ENABLE_PYTHON
bool Pad() const { return pad_; }
const std::vector<std::string> &InColNames() const { return in_col_names_; }
const std::vector<std::string> &OutColNames() const { return out_col_names_; }
const std::vector<std::string> &ColOrder() const { return col_order_; }
const py::function &BatchSizeFunc() const { return batch_size_func_; }
const py::function &BatchMapFunc() const { return batch_map_func_; }
const std::map<std::string, std::pair<TensorShape, std::shared_ptr<Tensor>>> &PadMap() const { return pad_map_; }
#endif

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
Status to_json(nlohmann::json *out_json) override;

private:
int32_t batch_size_;
bool drop_remainder_;


+ 7
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc View File

@@ -431,6 +431,13 @@ Status DatasetNode::ValidateParams() {
return Status::OK();
}

Status DatasetNode::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["num_parallel_workers"] = num_workers_;
*out_json = args;
return Status::OK();
}

Status MappableSourceNode::Accept(IRNodePass *const p, bool *const modified) {
return p->Visit(shared_from_base<MappableSourceNode>(), modified);
}


+ 5
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h View File

@@ -271,6 +271,11 @@ class DatasetNode : public std::enable_shared_from_this<DatasetNode> {

virtual bool IsSizeDefined() { return true; }

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
virtual Status to_json(nlohmann::json *out_json);

protected:
std::vector<std::shared_ptr<DatasetNode>> children_;
DatasetNode *parent_; // used to record the only one parent of an IR node after parsing phase


+ 29
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc View File

@@ -16,6 +16,7 @@

#include "minddata/dataset/engine/ir/datasetops/map_node.h"

#include <algorithm>
#include <memory>
#include <string>
#include <utility>
@@ -122,5 +123,33 @@ void MapNode::setOperations(const std::vector<std::shared_ptr<TensorOperation>>
operations_ = operations;
}
std::vector<std::shared_ptr<TensorOperation>> MapNode::operations() { return operations_; }

Status MapNode::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["num_parallel_workers"] = num_workers_;
args["input_columns"] = input_columns_;
args["output_columns"] = output_columns_;
if (!project_columns_.empty()) args["column_order"] = project_columns_;
if (cache_ != nullptr) {
nlohmann::json cache_args;
RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
args["cache"] = cache_args;
}

std::vector<nlohmann::json> ops;
std::vector<int32_t> cbs;
nlohmann::json op_args;
for (auto op : operations_) {
RETURN_IF_NOT_OK(op->to_json(&op_args));
op_args["tensor_op_name"] = op->Name();
ops.push_back(op_args);
}
args["operations"] = ops;
std::transform(callbacks_.begin(), callbacks_.end(), std::back_inserter(cbs),
[](std::shared_ptr<DSCallback> cb) -> int32_t { return cb->step_size(); });
args["callback"] = cbs;
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 14
- 4
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.h View File

@@ -58,10 +58,6 @@ class MapNode : public DatasetNode {
/// \return Status Status::OK() if all the parameters are valid
Status ValidateParams() override;

/// \brief Getter of tensor operations
/// \return Vector of operations the Map node will process
const auto &TensorOperations() const { return operations_; }

/// \brief Base-class override for accepting IRNodePass visitor
/// \param[in] p The node to visit
/// \param[out] modified Indicator if the node was modified
@@ -83,6 +79,20 @@ class MapNode : public DatasetNode {
/// \brief setter to set all tensor operations
void setOperations(const std::vector<std::shared_ptr<TensorOperation>> &operations);

/// \brief Getter functions
/// \brief Getter of tensor operations
/// \return Vector of operations the Map node will process
const auto &TensorOperations() const { return operations_; }
const std::vector<std::string> &InputColumns() const { return input_columns_; }
const std::vector<std::string> &OutputColumns() const { return output_columns_; }
const std::vector<std::string> &ProjectColumns() const { return project_columns_; }
const std::vector<std::shared_ptr<DSCallback>> &Callbacks() const { return callbacks_; }

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
Status to_json(nlohmann::json *out_json) override;

private:
std::vector<std::shared_ptr<TensorOperation>> operations_;



+ 7
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.cc View File

@@ -62,5 +62,12 @@ Status RenameNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops
return Status::OK();
}

Status RenameNode::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["input_columns"] = input_columns_;
args["output_columns"] = output_columns_;
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 9
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h View File

@@ -56,6 +56,15 @@ class RenameNode : public DatasetNode {
/// \return Status Status::OK() if all the parameters are valid
Status ValidateParams() override;

/// \brief Getter functions
const std::vector<std::string> &InputColumns() const { return input_columns_; }
const std::vector<std::string> &OutputColumns() const { return output_columns_; }

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
Status to_json(nlohmann::json *out_json) override;

private:
std::vector<std::string> input_columns_;
std::vector<std::string> output_columns_;


+ 7
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.cc View File

@@ -95,5 +95,12 @@ Status RepeatNode::AcceptAfter(IRNodePass *const p, bool *const modified) {
// Downcast shared pointer then call visitor
return p->VisitAfter(shared_from_base<RepeatNode>(), modified);
}

Status RepeatNode::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["count"] = repeat_count_;
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 8
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h View File

@@ -115,6 +115,14 @@ class RepeatNode : public DatasetNode {
return Status::OK();
}

/// \brief Getter functions
int32_t RepeatCount() const { return repeat_count_; }

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
Status to_json(nlohmann::json *out_json) override;

protected:
std::shared_ptr<RepeatOp> op_; // keep its corresponding run-time op of EpochCtrlNode and RepeatNode
std::shared_ptr<RepeatNode> reset_ancestor_; // updated its immediate Repeat/EpochCtrl ancestor in GeneratorNodePass


+ 7
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.cc View File

@@ -61,5 +61,12 @@ Status ShuffleNode::ValidateParams() {
return Status::OK();
}

Status ShuffleNode::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["buffer_size"] = shuffle_size_;
args["reshuffle_each_epoch"] = reset_every_epoch_;
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 10
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h View File

@@ -53,6 +53,16 @@ class ShuffleNode : public DatasetNode {

Status ValidateParams() override;

/// \brief Getter functions
int32_t ShuffleSize() const { return shuffle_size_; }
uint32_t ShuffleSeed() const { return shuffle_seed_; }
bool ResetEveryEpoch() const { return reset_every_epoch_; }

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
Status to_json(nlohmann::json *out_json) override;

private:
int32_t shuffle_size_;
uint32_t shuffle_seed_;


+ 17
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.cc View File

@@ -100,5 +100,22 @@ Status ImageFolderNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter>
return Status::OK();
}

Status ImageFolderNode::to_json(nlohmann::json *out_json) {
nlohmann::json args, sampler_args;
RETURN_IF_NOT_OK(sampler_->to_json(&sampler_args));
args["sampler"] = sampler_args;
args["num_parallel_workers"] = num_workers_;
args["dataset_dir"] = dataset_dir_;
args["decode"] = decode_;
args["extensions"] = exts_;
args["class_indexing"] = class_indexing_;
if (cache_ != nullptr) {
nlohmann::json cache_args;
RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
args["cache"] = cache_args;
}
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 13
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.h View File

@@ -75,6 +75,19 @@ class ImageFolderNode : public MappableSourceNode {
Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
int64_t *dataset_size) override;

/// \brief Getter functions
const std::string &DatasetDir() const { return dataset_dir_; }
bool Decode() const { return decode_; }
bool Recursive() const { return recursive_; }
const std::shared_ptr<SamplerObj> &Sampler() const { return sampler_; }
const std::map<std::string, int32_t> &ClassIndexing() const { return class_indexing_; }
const std::set<std::string> &Exts() const { return exts_; }

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
Status to_json(nlohmann::json *out_json) override;

private:
std::string dataset_dir_;
bool decode_;


+ 15
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc View File

@@ -87,5 +87,20 @@ Status MnistNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_
return Status::OK();
}

Status MnistNode::to_json(nlohmann::json *out_json) {
nlohmann::json args, sampler_args;
RETURN_IF_NOT_OK(sampler_->to_json(&sampler_args));
args["sampler"] = sampler_args;
args["num_parallel_workers"] = num_workers_;
args["dataset_dir"] = dataset_dir_;
args["usage"] = usage_;
if (cache_ != nullptr) {
nlohmann::json cache_args;
RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
args["cache"] = cache_args;
}
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 10
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h View File

@@ -69,6 +69,16 @@ class MnistNode : public MappableSourceNode {
Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
int64_t *dataset_size) override;

/// \brief Getter functions
const std::string &DatasetDir() const { return dataset_dir_; }
const std::string &Usage() const { return usage_; }
const std::shared_ptr<SamplerObj> &Sampler() const { return sampler_; }

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
Status to_json(nlohmann::json *out_json) override;

private:
std::string dataset_dir_;
std::string usage_;


+ 28
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc View File

@@ -199,5 +199,33 @@ Status TFRecordNode::GetShardFileList(std::vector<std::string> *shard_filenames)
return Status::OK();
}

Status TFRecordNode::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["num_parallel_workers"] = num_workers_;
args["dataset_files"] = dataset_files_;
args["schema"] = schema_path_;
args["columns_list"] = columns_list_;
args["num_samples"] = num_samples_;
args["shuffle_global"] = (shuffle_ == ShuffleMode::kGlobal);
args["shuffle_files"] = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles);
args["shuffle"] = shuffle_;
args["num_shards"] = num_shards_;
args["shard_id"] = shard_id_;
args["shard_equal_rows"] = shard_equal_rows_;
if (cache_ != nullptr) {
nlohmann::json cache_args;
RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
args["cache"] = cache_args;
}
if (schema_obj_ != nullptr) {
schema_obj_->set_dataset_type("TF");
schema_obj_->set_num_rows(num_samples_);
args["schema_json_string"] = schema_obj_->to_json();
} else {
args["schema_file_path"] = schema_path_;
}
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 15
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h View File

@@ -109,6 +109,21 @@ class TFRecordNode : public NonMappableSourceNode {
/// \return Status of the function
Status GetShardFileList(std::vector<std::string> *shard_filenames);

/// \brief Getter functions
const std::vector<std::string> &DatasetFiles() const { return dataset_files_; }
const std::string &SchemaPath() const { return schema_path_; }
const std::shared_ptr<SchemaObj> &GetSchemaObj() const { return schema_obj_; }
const std::vector<std::string> &ColumnsList() const { return columns_list_; }
int64_t NumSamples() const { return num_samples_; }
ShuffleMode Shuffle() const { return shuffle_; }
int32_t NumShards() const { return num_shards_; }
bool ShardEqualRows() const { return shard_equal_rows_; }

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes
/// \return Status of the function
Status to_json(nlohmann::json *out_json) override;

private:
std::vector<std::string> dataset_files_;
std::string schema_path_; // schema_path_ path to schema file. It is set when type of schema parameter is string


+ 57
- 0
mindspore/ccsrc/minddata/dataset/engine/serdes.cc View File

@@ -0,0 +1,57 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/engine/serdes.h"

namespace mindspore {
namespace dataset {

Status Serdes::SaveToJSON(std::shared_ptr<DatasetNode> node, const std::string &filename, nlohmann::json *out_json) {
// Dump attributes of current node to json string
nlohmann::json args;
RETURN_IF_NOT_OK(node->to_json(&args));
args["op_type"] = node->Name();

// If the current node isn't leaf node, visit all its children and get all attributes
std::vector<nlohmann::json> children_pipeline;
if (!node->IsLeaf()) {
for (auto child : node->Children()) {
nlohmann::json child_args;
RETURN_IF_NOT_OK(SaveToJSON(child, "", &child_args));
children_pipeline.push_back(child_args);
}
}
args["children"] = children_pipeline;

// Save json string into file if filename is given.
if (!filename.empty()) {
RETURN_IF_NOT_OK(SaveJSONToFile(args, filename));
}

*out_json = args;
return Status::OK();
}

Status Serdes::SaveJSONToFile(nlohmann::json json_string, const std::string &file_name) {
try {
std::ofstream file(file_name);
file << json_string;
} catch (const std::exception &err) {
RETURN_STATUS_UNEXPECTED("Save json string into " + file_name + " failed!");
}
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 56
- 0
mindspore/ccsrc/minddata/dataset/engine/serdes.h View File

@@ -0,0 +1,56 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_SERDES_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_SERDES_H_

#include <memory>
#include <string>
#include <vector>
#include <nlohmann/json.hpp>

#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
#include "minddata/dataset/util/status.h"

namespace mindspore {
namespace dataset {
/// \brief The Serdes class is used to serialize an IR tree into JSON string and dump into file if file name
/// specified.
class Serdes {
public:
/// \brief Constructor
Serdes() {}

/// \brief default destructor
~Serdes() = default;

/// \brief function to serialize IR tree into JSON string and/or JSON file
/// \param[in] node IR node to be transferred
/// \param[in] filename The file name. If specified, save the generated JSON string into the file
/// \param[out] out_json The result json string
/// \return Status The status code returned
Status SaveToJSON(std::shared_ptr<DatasetNode> node, const std::string &filename, nlohmann::json *out_json);

protected:
/// \brief Helper function to save JSON to a file
/// \param[in] json_string The JSON string to be saved to the file
/// \param[in] file_name The file name
/// \return Status The status code returned
Status SaveJSONToFile(nlohmann::json json_string, const std::string &file_name);
};
} // namespace dataset
} // namespace mindspore

#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_SERDES_H_

+ 4
- 0
mindspore/ccsrc/minddata/dataset/include/samplers.h View File

@@ -64,6 +64,8 @@ class SamplerObj : public std::enable_shared_from_this<SamplerObj> {
/// \return the Status code returned
Status AddChild(std::shared_ptr<SamplerObj> child);

virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); }

#ifndef ENABLE_ANDROID
/// \brief Virtual function to convert a SamplerObj class into a runtime mindrecord sampler object,
/// only override by SubsetRandomSampler, PkSampler, RandomSampler, SequentialSampler, DistributedSampler
@@ -228,6 +230,8 @@ class PreBuiltSamplerObj : public SamplerObj {

Status ValidateParams() override;

Status to_json(nlohmann::json *out_json) override;

private:
std::shared_ptr<SamplerRT> sp_;
#ifndef ENABLE_ANDROID


+ 6
- 0
mindspore/ccsrc/minddata/dataset/include/transforms.h View File

@@ -20,6 +20,8 @@
#include <memory>
#include <string>
#include <vector>
#include <nlohmann/json.hpp>

#include "minddata/dataset/include/constants.h"
#include "minddata/dataset/include/status.h"

@@ -63,6 +65,8 @@ class TensorOperation : public std::enable_shared_from_this<TensorOperation> {
/// \return true if this op is a random op (returns non-deterministic result e.g. RandomCrop)
bool IsRandomOp() const { return random_op_; }

virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); }

protected:
bool random_op_;
};
@@ -206,6 +210,8 @@ class PreBuiltOperation : public TensorOperation {

std::string Name() const override;

Status to_json(nlohmann::json *out_json) override;

private:
std::shared_ptr<TensorOp> op_;
};


+ 7
- 0
mindspore/ccsrc/minddata/dataset/kernels/data/one_hot_op.cc View File

@@ -37,5 +37,12 @@ Status OneHotOp::OutputShape(const std::vector<TensorShape> &inputs, std::vector
if (!outputs.empty()) return Status::OK();
return Status(StatusCode::kUnexpectedError, "Input has a wrong shape");
}

Status OneHotOp::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["num_classes"] = num_classes_;
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 2
- 0
mindspore/ccsrc/minddata/dataset/kernels/data/one_hot_op.h View File

@@ -37,6 +37,8 @@ class OneHotOp : public TensorOp {

std::string Name() const override { return kOneHotOp; }

Status to_json(nlohmann::json *out_json) override;

private:
int num_classes_;
};


+ 7
- 0
mindspore/ccsrc/minddata/dataset/kernels/image/decode_op.cc View File

@@ -60,5 +60,12 @@ Status DecodeOp::OutputType(const std::vector<DataType> &inputs, std::vector<Dat
outputs[0] = DataType(DataType::DE_UINT8);
return Status::OK();
}

Status DecodeOp::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["rgb"] = is_rgb_format_;
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 2
- 0
mindspore/ccsrc/minddata/dataset/kernels/image/decode_op.h View File

@@ -42,6 +42,8 @@ class DecodeOp : public TensorOp {

std::string Name() const override { return kDecodeOp; }

Status to_json(nlohmann::json *out_json) override;

private:
bool is_rgb_format_ = true;
};


+ 12
- 0
mindspore/ccsrc/minddata/dataset/kernels/image/random_crop_op.cc View File

@@ -15,6 +15,7 @@
*/
#include "minddata/dataset/kernels/image/random_crop_op.h"
#include <random>
#include <tuple>
#include "minddata/dataset/kernels/image/image_utils.h"
#include "minddata/dataset/util/random.h"
#include "minddata/dataset/util/status.h"
@@ -137,5 +138,16 @@ Status RandomCropOp::OutputShape(const std::vector<TensorShape> &inputs, std::ve
if (!outputs.empty()) return Status::OK();
return Status(StatusCode::kUnexpectedError, "Input has a wrong shape");
}

Status RandomCropOp::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["size"] = std::vector<int32_t>{crop_height_, crop_width_};
args["padding"] = std::vector<int32_t>{pad_top_, pad_bottom_, pad_left_, pad_right_};
args["pad_if_needed"] = pad_if_needed_;
args["fill_value"] = std::tuple<uint8_t, uint8_t, uint8_t>{fill_r_, fill_g_, fill_b_};
args["padding_mode"] = border_type_;
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 2
- 0
mindspore/ccsrc/minddata/dataset/kernels/image/random_crop_op.h View File

@@ -79,6 +79,8 @@ class RandomCropOp : public TensorOp {

std::string Name() const override { return kRandomCropOp; }

Status to_json(nlohmann::json *out_json) override;

protected:
int32_t crop_height_ = 0;
int32_t crop_width_ = 0;


+ 8
- 0
mindspore/ccsrc/minddata/dataset/kernels/image/rescale_op.cc View File

@@ -29,5 +29,13 @@ Status RescaleOp::OutputType(const std::vector<DataType> &inputs, std::vector<Da
outputs[0] = DataType(DataType::DE_FLOAT32);
return Status::OK();
}

Status RescaleOp::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["rescale"] = rescale_;
args["shift"] = shift_;
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 2
- 0
mindspore/ccsrc/minddata/dataset/kernels/image/rescale_op.h View File

@@ -41,6 +41,8 @@ class RescaleOp : public TensorOp {

std::string Name() const override { return kRescaleOp; }

Status to_json(nlohmann::json *out_json) override;

private:
float rescale_;
float shift_;


+ 8
- 0
mindspore/ccsrc/minddata/dataset/kernels/image/resize_op.cc View File

@@ -67,5 +67,13 @@ Status ResizeOp::OutputShape(const std::vector<TensorShape> &inputs, std::vector
if (!outputs.empty()) return Status::OK();
return Status(StatusCode::kUnexpectedError, "Input has a wrong shape");
}

Status ResizeOp::to_json(nlohmann::json *out_json) {
nlohmann::json args;
args["size"] = std::vector<int32_t>{size1_, size2_};
args["interpolation"] = interpolation_;
*out_json = args;
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 2
- 0
mindspore/ccsrc/minddata/dataset/kernels/image/resize_op.h View File

@@ -61,6 +61,8 @@ class ResizeOp : public TensorOp {

std::string Name() const override { return kResizeOp; }

Status to_json(nlohmann::json *out_json) override;

protected:
int32_t size1_;
int32_t size2_;


+ 3
- 0
mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h View File

@@ -19,6 +19,7 @@
#include <memory>
#include <string>
#include <vector>
#include "nlohmann/json.hpp"

#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/core/tensor_row.h"
@@ -199,6 +200,8 @@ class TensorOp {

virtual std::string Name() const = 0;

virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); }

protected:
bool is_deterministic_{true};
};


+ 12
- 0
mindspore/dataset/engine/datasets.py View File

@@ -203,6 +203,18 @@ class Dataset:
args["num_parallel_workers"] = self.num_parallel_workers
return args

def to_json(self, filename=""):
"""
Serialize a pipeline into JSON string and dump into file if filename is provided.

Args:
filename (str): filename of json file to be saved as

Returns:
Str, JSON string of the pipeline.
"""
return json.loads(self.parse_tree().to_json(filename))

@check_bucket_batch_by_length
def bucket_batch_by_length(self, column_names, bucket_boundaries, bucket_batch_sizes,
element_length_function=None, pad_info=None,


+ 99
- 245
mindspore/dataset/engine/serializer_deserializer.py View File

@@ -19,13 +19,13 @@ import json
import os
import sys

import mindspore.common.dtype as mstype
from mindspore import log as logger
from . import datasets as de
from ..vision.utils import Inter, Border
from ..core import config


def serialize(dataset, json_filepath=None):
def serialize(dataset, json_filepath=""):
"""
Serialize dataset pipeline into a json file.

@@ -51,11 +51,7 @@ def serialize(dataset, json_filepath=None):
>>> ds.engine.serialize(data, json_filepath="mnist_dataset_pipeline.json") # serialize it to json file
>>> serialized_data = ds.engine.serialize(data) # serialize it to Python dict
"""
serialized_pipeline = traverse(dataset)
if json_filepath:
with open(json_filepath, 'w') as json_file:
json.dump(serialized_pipeline, json_file, indent=2)
return serialized_pipeline
return dataset.to_json(json_filepath)


def deserialize(input_dict=None, json_filepath=None):
@@ -110,93 +106,6 @@ def expand_path(node_repr, key, val):
node_repr[key] = os.path.abspath(val)


def serialize_operations(node_repr, key, val):
"""Serialize tensor op (Python object) to dictionary."""
if isinstance(val, list):
node_repr[key] = []
for op in val:
node_repr[key].append(op.__dict__)
# Extracting module and name information from a Python object
# Example: tensor_op_module is 'minddata.transforms.c_transforms' and tensor_op_name is 'Decode'
node_repr[key][-1]['tensor_op_name'] = type(op).__name__
node_repr[key][-1]['tensor_op_module'] = type(op).__module__
else:
node_repr[key] = val.__dict__
node_repr[key]['tensor_op_name'] = type(val).__name__
node_repr[key]['tensor_op_module'] = type(val).__module__


def serialize_sampler(node_repr, val):
"""Serialize sampler object to dictionary."""
if val is None:
node_repr['sampler'] = None
else:
node_repr['sampler'] = val.__dict__
node_repr['sampler']['sampler_module'] = type(val).__module__
node_repr['sampler']['sampler_name'] = type(val).__name__


def traverse(node):
"""Pre-order traverse the pipeline and capture the information as we go."""
# Node representation (node_repr) is a Python dictionary that capture and store the
# dataset pipeline information before dumping it to JSON or other format.
node_repr = dict()
node_repr['op_type'] = type(node).__name__
node_repr['op_module'] = type(node).__module__

# Start with an empty list of children, will be added later as we traverse this node.
node_repr["children"] = []

# Retrieve the information about the current node. It should include arguments
# passed to the node during object construction.
node_args = node.get_args()
for k, v in node_args.items():
# Store the information about this node into node_repr.
# Further serialize the object in the arguments if needed.
if k == 'operations':
serialize_operations(node_repr, k, v)
elif k == 'sampler':
serialize_sampler(node_repr, v)
elif k == 'padded_sample' and v:
v1 = {key: value for key, value in v.items() if not isinstance(value, bytes)}
node_repr[k] = json.dumps(v1, indent=2)
# return schema json str if its type is mindspore.dataset.Schema
elif k == 'schema' and isinstance(v, de.Schema):
node_repr[k] = v.to_json()
elif k in set(['schema', 'dataset_files', 'dataset_dir', 'schema_file_path']):
expand_path(node_repr, k, v)
elif k == "num_parallel_workers" and v is None:
node_repr[k] = config.get_num_parallel_workers()
else:
node_repr[k] = v

# If a sampler exists in this node, then the following 4 arguments must be set to None:
# num_samples, shard_id, num_shards, shuffle
# These arguments get moved into the sampler itself, so they are no longer needed to
# be set at the dataset level.
# TF Record is a special case because it uses both the dataset and sampler arguments
# which is not decided until later during tree preparation phase.
if node_repr['op_type'] != 'TFRecordDataset' and 'sampler' in node_args.keys():
if 'num_samples' in node_repr.keys():
node_repr['num_samples'] = None
if 'shuffle' in node_repr.keys():
node_repr['shuffle'] = None
if 'num_shards' in node_repr.keys():
node_repr['num_shards'] = None
if 'shard_id' in node_repr.keys():
node_repr['shard_id'] = None

# Leaf node doesn't have input attribute.
if not node.children:
return node_repr

# Recursively traverse the child and assign it to the current node_repr['children'].
for child in node.children:
node_repr["children"].append(traverse(child))

return node_repr


def show(dataset, indentation=2):
"""
Write the dataset pipeline graph onto logger.info.
@@ -206,7 +115,7 @@ def show(dataset, indentation=2):
indentation (int, optional): indentation used by the json print. Pass None to not indent.
"""

pipeline = traverse(dataset)
pipeline = dataset.to_json()
logger.info(json.dumps(pipeline, indent=indentation))


@@ -219,7 +128,7 @@ def compare(pipeline1, pipeline2):
pipeline2 (Dataset): a dataset pipeline.
"""

return traverse(pipeline1) == traverse(pipeline2)
return pipeline1.to_json() == pipeline2.to_json()


def construct_pipeline(node):
@@ -244,140 +153,65 @@ def create_node(node):
"""Parse the key, value in the node dictionary and instantiate the Python Dataset object"""
logger.info('creating node: %s', node['op_type'])
dataset_op = node['op_type']
op_module = node['op_module']
op_module = "mindspore.dataset"

# Get the Python class to be instantiated.
# Example:
# "op_type": "MapDataset",
# "op_module": "mindspore.dataset.datasets",
pyclass = getattr(sys.modules[op_module], dataset_op)
if node.get("children"):
pyclass = getattr(sys.modules[op_module], "Dataset")
else:
pyclass = getattr(sys.modules[op_module], dataset_op)

pyobj = None
# Find a matching Dataset class and call the constructor with the corresponding args.
# When a new Dataset class is introduced, another if clause and parsing code needs to be added.
if dataset_op == 'ImageFolderDataset':
sampler = construct_sampler(node.get('sampler'))
pyobj = pyclass(node['dataset_dir'], node.get('num_samples'), node.get('num_parallel_workers'),
num_samples = check_and_replace_input(node.get('num_samples'), 0, None)
pyobj = pyclass(node['dataset_dir'], num_samples, node.get('num_parallel_workers'),
node.get('shuffle'), sampler, node.get('extensions'),
node.get('class_indexing'), node.get('decode'), node.get('num_shards'),
node.get('shard_id'))

elif dataset_op == 'RangeDataset':
pyobj = pyclass(node['start'], node['stop'], node['step'])

elif dataset_op == 'ImageFolderDataset':
pyobj = pyclass(node['dataset_dir'], node['schema'], node.get('distribution'),
node.get('column_list'), node.get('num_parallel_workers'),
node.get('deterministic_output'), node.get('prefetch_size'),
node.get('labels_filename'), node.get('dataset_usage'))
node.get('shard_id'), node.get('cache'))

elif dataset_op == 'MnistDataset':
sampler = construct_sampler(node.get('sampler'))
pyobj = pyclass(node['dataset_dir'], node['usage'], node.get('num_samples'), node.get('num_parallel_workers'),
node.get('shuffle'), sampler, node.get('num_shards'), node.get('shard_id'))

elif dataset_op == 'MindDataset':
sampler = construct_sampler(node.get('sampler'))
pyobj = pyclass(node['dataset_file'], node.get('columns_list'),
node.get('num_parallel_workers'), node.get('seed'), node.get('num_shards'),
node.get('shard_id'), sampler)
num_samples = check_and_replace_input(node.get('num_samples'), 0, None)
pyobj = pyclass(node['dataset_dir'], node['usage'], num_samples, node.get('num_parallel_workers'),
node.get('shuffle'), sampler, node.get('num_shards'), node.get('shard_id'), node.get('cache'))

elif dataset_op == 'TFRecordDataset':
shuffle = node.get('shuffle')
shuffle = to_shuffle_mode(node.get('shuffle'))
if shuffle is not None and isinstance(shuffle, str):
shuffle = de.Shuffle(shuffle)
pyobj = pyclass(node['dataset_files'], node.get('schema'), node.get('column_list'),
node.get('num_samples'), node.get('num_parallel_workers'),
shuffle, node.get('num_shards'), node.get('shard_id'))

elif dataset_op == 'ManifestDataset':
sampler = construct_sampler(node.get('sampler'))
pyobj = pyclass(node['dataset_file'], node['usage'], node.get('num_samples'),
node.get('num_parallel_workers'), node.get('shuffle'), sampler,
node.get('class_indexing'), node.get('decode'), node.get('num_shards'),
node.get('shard_id'))
num_samples = check_and_replace_input(node.get('num_samples'), 0, None)
pyobj = pyclass(node['dataset_files'], node.get('schema'), node.get('columns_list'),
num_samples, node.get('num_parallel_workers'),
shuffle, node.get('num_shards'), node.get('shard_id'), node.get('cache'))

elif dataset_op == 'Cifar10Dataset':
sampler = construct_sampler(node.get('sampler'))
pyobj = pyclass(node['dataset_dir'], node['usage'], node.get('num_samples'), node.get('num_parallel_workers'),
node.get('shuffle'), sampler, node.get('num_shards'), node.get('shard_id'))

elif dataset_op == 'Cifar100Dataset':
sampler = construct_sampler(node.get('sampler'))
pyobj = pyclass(node['dataset_dir'], node['usage'], node.get('num_samples'), node.get('num_parallel_workers'),
node.get('shuffle'), sampler, node.get('num_shards'), node.get('shard_id'))

elif dataset_op == 'VOCDataset':
sampler = construct_sampler(node.get('sampler'))
pyobj = pyclass(node['dataset_dir'], node.get('task'), node.get('mode'), node.get('class_indexing'),
node.get('num_samples'), node.get('num_parallel_workers'), node.get('shuffle'),
node.get('decode'), sampler, node.get('num_shards'), node.get('shard_id'))

elif dataset_op == 'CocoDataset':
sampler = construct_sampler(node.get('sampler'))
pyobj = pyclass(node['dataset_dir'], node.get('annotation_file'), node.get('task'), node.get('num_samples'),
node.get('num_parallel_workers'), node.get('shuffle'), node.get('decode'), sampler,
node.get('num_shards'), node.get('shard_id'))

elif dataset_op == 'CelebADataset':
sampler = construct_sampler(node.get('sampler'))
pyobj = pyclass(node['dataset_dir'], node.get('num_parallel_workers'), node.get('shuffle'),
node.get('dataset_type'), sampler, node.get('decode'), node.get('extensions'),
node.get('num_samples'), sampler, node.get('num_shards'), node.get('shard_id'))

elif dataset_op == 'GeneratorDataset':
# Serializing py function can be done using marshal library
raise RuntimeError(dataset_op + " is not yet supported")

elif dataset_op == 'RepeatDataset':
elif dataset_op == 'Repeat':
pyobj = de.Dataset().repeat(node.get('count'))

elif dataset_op == 'SkipDataset':
pyobj = de.Dataset().skip(node.get('count'))

elif dataset_op == 'TakeDataset':
pyobj = de.Dataset().take(node.get('count'))

elif dataset_op == 'MapDataset':
elif dataset_op == 'Map':
tensor_ops = construct_tensor_ops(node.get('operations'))
pyobj = de.Dataset().map(tensor_ops, node.get('input_columns'), node.get('output_columns'),
node.get('column_order'), node.get('num_parallel_workers'))
node.get('column_order'), node.get('num_parallel_workers'),
True, node.get('cache'), node.get('callbacks'))

elif dataset_op == 'ShuffleDataset':
elif dataset_op == 'Shuffle':
pyobj = de.Dataset().shuffle(node.get('buffer_size'))

elif dataset_op == 'BatchDataset':
elif dataset_op == 'Batch':
pyobj = de.Dataset().batch(node['batch_size'], node.get('drop_remainder'))

elif dataset_op == 'CacheDataset':
# Member function cache() is not defined in class Dataset yet.
raise RuntimeError(dataset_op + " is not yet supported.")

elif dataset_op == 'FilterDataset':
# Member function filter() is not defined in class Dataset yet.
raise RuntimeError(dataset_op + " is not yet supported.")

elif dataset_op == 'TakeDataset':
# Member function take() is not defined in class Dataset yet.
raise RuntimeError(dataset_op + " is not yet supported.")

elif dataset_op == 'ZipDataset':
elif dataset_op == 'Zip':
# Create ZipDataset instance, giving dummy input dataset that will be overrided in the caller.
pyobj = de.ZipDataset((de.Dataset(), de.Dataset()))

elif dataset_op == 'ConcatDataset':
# Create ConcatDataset instance, giving dummy input dataset that will be overrided in the caller.
pyobj = de.ConcatDataset((de.Dataset(), de.Dataset()))

elif dataset_op == 'RenameDataset':
elif dataset_op == 'Rename':
pyobj = de.Dataset().rename(node['input_columns'], node['output_columns'])

elif dataset_op == 'ProjectDataset':
pyobj = de.Dataset().project(node['columns'])

elif dataset_op == 'TransferDataset':
pyobj = de.Dataset().to_device()

else:
raise RuntimeError(dataset_op + " is not yet supported by ds.engine.deserialize().")

@@ -388,23 +222,28 @@ def construct_sampler(in_sampler):
"""Instantiate Sampler object based on the information from dictionary['sampler']"""
sampler = None
if in_sampler is not None:
if "num_samples" in in_sampler:
num_samples = check_and_replace_input(in_sampler['num_samples'], 0, None)
sampler_name = in_sampler['sampler_name']
sampler_module = in_sampler['sampler_module']
sampler_module = "mindspore.dataset"
sampler_class = getattr(sys.modules[sampler_module], sampler_name)
if sampler_name == 'DistributedSampler':
sampler = sampler_class(in_sampler['num_shards'], in_sampler['shard_id'], in_sampler.get('shuffle'))
elif sampler_name == 'PKSampler':
sampler = sampler_class(in_sampler['num_val'], in_sampler.get('num_class'), in_sampler('shuffle'))
elif sampler_name == 'RandomSampler':
sampler = sampler_class(in_sampler.get('replacement'), in_sampler.get('num_samples'))
sampler = sampler_class(in_sampler.get('replacement'), num_samples)
elif sampler_name == 'SequentialSampler':
sampler = sampler_class()
sampler = sampler_class(in_sampler.get('start_index'), num_samples)
elif sampler_name == 'SubsetRandomSampler':
sampler = sampler_class(in_sampler['indices'])
sampler = sampler_class(in_sampler['indices'], num_samples)
elif sampler_name == 'WeightedRandomSampler':
sampler = sampler_class(in_sampler['weights'], in_sampler['num_samples'], in_sampler.get('replacement'))
sampler = sampler_class(in_sampler['weights'], num_samples, in_sampler.get('replacement'))
else:
raise ValueError("Sampler type is unknown: {}.".format(sampler_name))
if in_sampler.get("child_sampler"):
for child in in_sampler["child_sampler"]:
sampler.add_child(construct_sampler(child))

return sampler

@@ -413,70 +252,85 @@ def construct_tensor_ops(operations):
"""Instantiate tensor op object(s) based on the information from dictionary['operations']"""
result = []
for op in operations:
op_module = op['tensor_op_module']
op_name = op['tensor_op_name']
op_class = getattr(sys.modules[op_module], op_name)
op_name = op['tensor_op_name'][:-2] # to remove op from the back of the name
op_module_vis = sys.modules["mindspore.dataset.vision.c_transforms"]
op_module_trans = sys.modules["mindspore.dataset.transforms.c_transforms"]

if hasattr(op_module_vis, op_name):
op_class = getattr(op_module_vis, op_name)
elif hasattr(op_module_trans, op_name):
op_class = getattr(op_module_trans, op_name)
else:
raise RuntimeError(op_name + " is not yet supported by deserialize().")

if op_name == 'Decode':
result.append(op_class(op.get('rgb')))

elif op_name == 'Normalize':
result.append(op_class(op['mean'], op['std']))

elif op_name == 'RandomCrop':
result.append(op_class(op['size'], op.get('padding'), op.get('pad_if_needed'),
op.get('fill_value'), Border(op.get('padding_mode'))))

elif op_name == 'RandomHorizontalFlip':
result.append(op_class(op.get('prob')))

elif op_name == 'RandomVerticalFlip':
result.append(op_class(op.get('prob')))
tuple(op.get('fill_value')), Border(to_border_mode(op.get('padding_mode')))))

elif op_name == 'Resize':
result.append(op_class(op['size'], Inter(op.get('interpolation'))))

elif op_name == 'RandomResizedCrop':
result.append(op_class(op['size'], op.get('scale'), op.get('ratio'),
Inter(op.get('interpolation')), op.get('max_attempts')))

elif op_name == 'CenterCrop':
result.append(op_class(op['size']))

elif op_name == 'RandomColorAdjust':
result.append(op_class(op.get('brightness'), op.get('contrast'), op.get('saturation'),
op.get('hue')))

elif op_name == 'RandomRotation':
result.append(op_class(op['degree'], op.get('resample'), op.get('expand'),
op.get('center'), op.get('fill_value')))
result.append(op_class(op['size'], Inter(to_interpolation_mode(op.get('interpolation')))))

elif op_name == 'Rescale':
result.append(op_class(op['rescale'], op['shift']))

elif op_name == 'RandomResize':
result.append(op_class(op['size']))

elif op_name == 'TypeCast':
result.append(op_class(op['data_type']))

elif op_name == 'HWC2CHW':
result.append(op_class())

elif op_name == 'CHW2HWC':
raise ValueError("Tensor op is not supported: {}.".format(op_name))

elif op_name == 'OneHot':
result.append(op_class(op['num_classes']))

elif op_name == 'RandomCropDecodeResize':
result.append(op_class(op['size'], op.get('scale'), op.get('ratio'),
Inter(op.get('interpolation')), op.get('max_attempts')))

elif op_name == 'Pad':
result.append(op_class(op['padding'], op['fill_value'], Border(op['padding_mode'])))

else:
raise ValueError("Tensor op name is unknown: {}.".format(op_name))

return result


def to_shuffle_mode(shuffle):
if shuffle == 2: return "global"
if shuffle == 1: return "file"
return False


def to_interpolation_mode(inter):
return {
0: Inter.LINEAR,
1: Inter.NEAREST,
2: Inter.CUBIC,
3: Inter.AREA
}[inter]


def to_border_mode(border):
return {
0: Border.CONSTANT,
1: Border.EDGE,
2: Border.REFLECT,
3: Border.SYMMETRIC
}[border]


def to_mstype(data_type):
return {
"bool": mstype.bool_,
"int8": mstype.int8,
"int16": mstype.int16,
"int32": mstype.int32,
"int64": mstype.int64,
"uint8": mstype.uint8,
"uint16": mstype.uint16,
"uint32": mstype.uint32,
"uint64": mstype.uint64,
"float16": mstype.float16,
"float32": mstype.float32,
"float64": mstype.float64,
"string": mstype.string
}[data_type]


def check_and_replace_input(input_value, expect, replace):
if input_value == expect:
return replace
return input_value

+ 15
- 2
tests/ut/python/dataset/test_serdes_dataset.py View File

@@ -22,7 +22,7 @@ import os

import numpy as np
from test_minddataset_sampler import add_and_remove_cv_file, get_data, CV_DIR_NAME, CV_FILE_NAME
from util import config_get_set_num_parallel_workers
from util import config_get_set_num_parallel_workers, config_get_set_seed

import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as c
@@ -46,6 +46,8 @@ def test_imagefolder(remove_json_files=True):

# Constructing DE pipeline
sampler = ds.WeightedRandomSampler(weights, 11)
child_sampler = ds.SequentialSampler()
sampler.add_child(child_sampler)
data1 = ds.ImageFolderDataset(data_dir, sampler=sampler)
data1 = data1.repeat(1)
data1 = data1.map(operations=[vision.Decode(True)], input_columns=["image"])
@@ -99,6 +101,9 @@ def test_imagefolder(remove_json_files=True):


def test_mnist_dataset(remove_json_files=True):
"""
Test serdes on mnist dataset pipeline.
"""
data_dir = "../data/dataset/testMnistData"
ds.config.set_seed(1)

@@ -137,6 +142,9 @@ def test_mnist_dataset(remove_json_files=True):


def test_zip_dataset(remove_json_files=True):
"""
Test serdes on zip dataset pipeline.
"""
files = ["../data/dataset/testTFTestAllTypes/test.data"]
schema_file = "../data/dataset/testTFTestAllTypes/datasetSchema.json"
ds.config.set_seed(1)
@@ -178,9 +186,13 @@ def test_zip_dataset(remove_json_files=True):


def test_random_crop():
"""
Test serdes on RandomCrop pipeline.
"""
logger.info("test_random_crop")
DATA_DIR = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"]
SCHEMA_DIR = "../data/dataset/test_tf_file_3_images/datasetSchema.json"
original_seed = config_get_set_seed(1)
original_num_parallel_workers = config_get_set_num_parallel_workers(1)

# First dataset
@@ -209,6 +221,7 @@ def test_random_crop():
_ = item2["image"]

# Restore configuration num_parallel_workers
ds.config.set_seed(original_seed)
ds.config.set_num_parallel_workers(original_num_parallel_workers)


@@ -232,7 +245,7 @@ def delete_json_files():


# Test save load minddataset
def test_minddataset(add_and_remove_cv_file):
def skip_test_minddataset(add_and_remove_cv_file):
"""tutorial for cv minderdataset."""
columns_list = ["data", "file_name", "label"]
num_readers = 4


Loading…
Cancel
Save