Browse Source

Added GetDatasetSize support for AlbumNode and fixed ENABLE_ANDROID macro placement

pull/14023/head
Eric 5 years ago
parent
commit
38944eafc4
8 changed files with 65 additions and 7 deletions
  1. +4
    -2
      mindspore/ccsrc/minddata/dataset/api/vision.cc
  2. +1
    -1
      mindspore/ccsrc/minddata/dataset/engine/datasetops/map_op/map_op.cc
  3. +1
    -1
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc
  4. +37
    -1
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.cc
  5. +10
    -1
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.h
  6. +1
    -1
      mindspore/ccsrc/minddata/dataset/engine/perf/profiling.cc
  7. +2
    -0
      mindspore/ccsrc/minddata/dataset/kernels/ir/vision/vision_ir.cc
  8. +9
    -0
      tests/ut/cpp/dataset/c_api_dataset_album_test.cc

+ 4
- 2
mindspore/ccsrc/minddata/dataset/api/vision.cc View File

@@ -38,7 +38,6 @@ namespace dataset {

// Transform operations for computer vision.
namespace vision {
#ifndef ENABLE_ANDROID
// CONSTRUCTORS FOR API CLASSES TO CREATE VISION TENSOR TRANSFORM OPERATIONS
// (In alphabetical order)

@@ -69,6 +68,7 @@ std::shared_ptr<TensorOperation> Affine::Parse() {
data_->interpolation_, data_->fill_value_);
}

#ifndef ENABLE_ANDROID
// AutoContrast Transform Operation.
struct AutoContrast::Data {
Data(float cutoff, const std::vector<uint32_t> &ignore) : cutoff_(cutoff), ignore_(ignore) {}
@@ -290,7 +290,7 @@ std::shared_ptr<TensorOperation> Normalize::Parse(const MapTargetDevice &env) {
if (env == MapTargetDevice::kAscend310) {
#ifdef ENABLE_ACL
return std::make_shared<DvppNormalizeOperation>(data_->mean_, data_->std_);
#endif
#endif // ENABLE_ACL
}
return std::make_shared<NormalizeOperation>(data_->mean_, data_->std_);
}
@@ -328,6 +328,7 @@ Pad::Pad(std::vector<int32_t> padding, std::vector<uint8_t> fill_value, BorderTy
std::shared_ptr<TensorOperation> Pad::Parse() {
return std::make_shared<PadOperation>(data_->padding_, data_->fill_value_, data_->padding_mode_);
}
#endif // not ENABLE_ANDROID

// RandomAffine Transform Operation.
struct RandomAffine::Data {
@@ -358,6 +359,7 @@ std::shared_ptr<TensorOperation> RandomAffine::Parse() {
data_->shear_ranges_, data_->interpolation_, data_->fill_value_);
}

#ifndef ENABLE_ANDROID
// RandomColor Transform Operation.
struct RandomColor::Data {
Data(float t_lb, float t_ub) : t_lb_(t_lb), t_ub_(t_ub) {}


+ 1
- 1
mindspore/ccsrc/minddata/dataset/engine/datasetops/map_op/map_op.cc View File

@@ -115,7 +115,7 @@ Status MapOp::FetchNextWork(uint32_t worker_id, std::unique_ptr<DataBuffer> *db,

Status MapOp::GenerateWorkerJob(const std::unique_ptr<MapWorkerJob> *worker_job) {
std::shared_ptr<MapJob> map_job = nullptr;
MapTargetDevice prev_target;
MapTargetDevice prev_target = MapTargetDevice::kCpu;
for (size_t i = 0; i < tfuncs_.size(); i++) {
// Currently we only have CPU as the device target
// In the future, we will have heuristic or control from user to select target device


+ 1
- 1
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc View File

@@ -107,7 +107,7 @@ Status AlbumOp::PrescanEntry() {
Path folder(folder_path_);
dirname_offset_ = folder_path_.length();
std::shared_ptr<Path::DirIterator> dirItr = Path::DirIterator::OpenDirectory(&folder);
if (folder.Exists() == false || dirItr == nullptr) {
if (!folder.Exists() || dirItr == nullptr) {
RETURN_STATUS_UNEXPECTED("Invalid file, failed to open folder: " + folder_path_);
}
MS_LOG(INFO) << "Album folder Path found: " << folder_path_ << ".";


+ 37
- 1
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.cc View File

@@ -70,7 +70,7 @@ Status AlbumNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops)
RETURN_IF_NOT_OK(schema->LoadSchemaFile(schema_path_, column_names_));

// Argument that is not exposed to user in the API.
std::set<std::string> extensions = {};
std::set<std::string> extensions = {".json", ".JSON"};
std::shared_ptr<SamplerRT> sampler_rt = nullptr;
RETURN_IF_NOT_OK(sampler_->SamplerBuild(&sampler_rt));

@@ -89,5 +89,41 @@ Status AlbumNode::GetShardId(int32_t *shard_id) {
return Status::OK();
}

// Get the dataset size (number of rows the pipeline will produce for this node).
// \param[in] size_getter Helper used to dry-run the pipeline when the sampler
//            cannot determine the sample count on its own.
// \param[in] estimate Unused here: the Album count is exact, so no estimation
//            shortcut is needed.
// \param[out] dataset_size The computed size.
// \return Status error code; OK on success.
Status AlbumNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                                 int64_t *dataset_size) {
  // Return the cached value if the size was already computed once.
  if (dataset_size_ > 0) {
    *dataset_size = dataset_size_;
    return Status::OK();
  }
  int64_t num_rows = 0;
  // Count album entry files in the dataset directory; each matching file is one row.
  Path folder(dataset_dir_);
  std::shared_ptr<Path::DirIterator> dirItr = Path::DirIterator::OpenDirectory(&folder);
  if (!folder.Exists() || dirItr == nullptr) {
    RETURN_STATUS_UNEXPECTED("Invalid file, failed to open folder: " + dataset_dir_);
  }
  // Must stay in sync with the extension filter used in AlbumNode::Build.
  const std::set<std::string> extensions = {".json", ".JSON"};
  while (dirItr->hasNext()) {
    Path file = dirItr->next();
    if (extensions.find(file.Extension()) != extensions.end()) {
      num_rows += 1;
    }
  }
  // Let the sampler clamp the row count against its num_samples setting;
  // CalculateNumSamples returns -1 when it cannot decide statically.
  std::shared_ptr<SamplerRT> sampler_rt = nullptr;
  RETURN_IF_NOT_OK(sampler_->SamplerBuild(&sampler_rt));
  int64_t sample_size = sampler_rt->CalculateNumSamples(num_rows);
  if (sample_size == -1) {
    // Fall back to a dry run of the pipeline to count rows exactly.
    RETURN_IF_NOT_OK(size_getter->DryRun(shared_from_this(), &sample_size));
  }
  *dataset_size = sample_size;
  // Cache the result so later calls do not rescan the directory.
  dataset_size_ = *dataset_size;
  return Status::OK();
}

} // namespace dataset
} // namespace mindspore

+ 10
- 1
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -61,6 +61,15 @@ class AlbumNode : public MappableSourceNode {
/// \return Status Status::OK() if get shard id successfully
Status GetShardId(int32_t *shard_id) override;

/// \brief Base-class override for GetDatasetSize
/// \param[in] size_getter Shared pointer to DatasetSizeGetter
/// \param[in] estimate This is only supported by some of the ops and it's used to speed up the process of getting
/// dataset size at the expense of accuracy.
/// \param[out] dataset_size the size of the dataset
/// \return Status of the function
Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
int64_t *dataset_size) override;

/// \brief Getter functions
const std::string &DatasetDir() const { return dataset_dir_; }
const std::string &SchemaPath() const { return schema_path_; }


+ 1
- 1
mindspore/ccsrc/minddata/dataset/engine/perf/profiling.cc View File

@@ -207,7 +207,7 @@ uint64_t ProfilingTime::GetCurMilliSecond() {
using std::chrono::duration_cast;
using std::chrono::milliseconds;
using std::chrono::steady_clock;
return duration_cast<milliseconds>(steady_clock::now().time_since_epoch()).count();
return static_cast<uint64_t>(duration_cast<milliseconds>(steady_clock::now().time_since_epoch()).count());
}
} // namespace dataset
} // namespace mindspore

+ 2
- 0
mindspore/ccsrc/minddata/dataset/kernels/ir/vision/vision_ir.cc View File

@@ -474,6 +474,7 @@ Status PadOperation::to_json(nlohmann::json *out_json) {
*out_json = args;
return Status::OK();
}
#endif

// RandomAffineOperation
RandomAffineOperation::RandomAffineOperation(const std::vector<float_t> &degrees,
@@ -586,6 +587,7 @@ Status RandomAffineOperation::to_json(nlohmann::json *out_json) {
return Status::OK();
}

#ifndef ENABLE_ANDROID
// RandomColorOperation.
RandomColorOperation::RandomColorOperation(float t_lb, float t_ub) : t_lb_(t_lb), t_ub_(t_ub) { random_op_ = true; }



+ 9
- 0
tests/ut/cpp/dataset/c_api_dataset_album_test.cc View File

@@ -123,11 +123,20 @@ TEST_F(MindDataTestPipeline, TestAlbumGetters) {

int64_t num_classes = ds->GetNumClasses();
EXPECT_EQ(num_classes, -1);
int64_t num_samples = ds->GetDatasetSize();
EXPECT_EQ(num_samples, 7);

int64_t batch_size = ds->GetBatchSize();
EXPECT_EQ(batch_size, 1);
int64_t repeat_count = ds->GetRepeatCount();
EXPECT_EQ(repeat_count, 1);
EXPECT_EQ(ds->GetColumnNames(), column_names);

// Test get dataset size with num_samples > files in dataset
auto sampler = std::make_shared<SequentialSampler>(0, 12);
std::shared_ptr<Dataset> ds2 = Album(folder_path, schema_file, column_names, false, sampler);
num_samples = ds->GetDatasetSize();
EXPECT_EQ(num_samples, 7);
}

TEST_F(MindDataTestPipeline, TestAlbumDecode) {


Loading…
Cancel
Save