| @@ -72,12 +72,17 @@ PYBIND_REGISTER(SequentialSamplerRT, 1, ([](const py::module *m) { | |||
| .def(py::init<int64_t, int64_t>()); | |||
| })); | |||
| PYBIND_REGISTER(SubsetRandomSamplerRT, 1, ([](const py::module *m) { | |||
| (void)py::class_<SubsetRandomSamplerRT, SamplerRT, std::shared_ptr<SubsetRandomSamplerRT>>( | |||
| PYBIND_REGISTER(SubsetRandomSamplerRT, 2, ([](const py::module *m) { | |||
| (void)py::class_<SubsetRandomSamplerRT, SubsetSamplerRT, std::shared_ptr<SubsetRandomSamplerRT>>( | |||
| *m, "SubsetRandomSampler") | |||
| .def(py::init<int64_t, std::vector<int64_t>>()); | |||
| })); | |||
| PYBIND_REGISTER(SubsetSamplerRT, 1, ([](const py::module *m) { | |||
| (void)py::class_<SubsetSamplerRT, SamplerRT, std::shared_ptr<SubsetSamplerRT>>(*m, "SubsetSampler") | |||
| .def(py::init<int64_t, std::vector<int64_t>>()); | |||
| })); | |||
| PYBIND_REGISTER(WeightedRandomSamplerRT, 1, ([](const py::module *m) { | |||
| (void)py::class_<WeightedRandomSamplerRT, SamplerRT, std::shared_ptr<WeightedRandomSamplerRT>>( | |||
| *m, "WeightedRandomSampler") | |||
| @@ -61,8 +61,9 @@ PYBIND_REGISTER( | |||
| PYBIND_REGISTER( | |||
| ShardSample, 0, ([](const py::module *m) { | |||
| (void)py::class_<mindrecord::ShardSample, mindrecord::ShardOperator, std::shared_ptr<mindrecord::ShardSample>>( | |||
| *m, "MindrecordSubsetRandomSampler") | |||
| .def(py::init<std::vector<int64_t>, uint32_t>()); | |||
| *m, "MindrecordSubsetSampler") | |||
| .def(py::init<std::vector<int64_t>, uint32_t>()) | |||
| .def(py::init<std::vector<int64_t>>()); | |||
| })); | |||
| PYBIND_REGISTER(ShardSequentialSample, 0, ([](const py::module *m) { | |||
| @@ -20,6 +20,7 @@ | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/random_sampler.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/subset_sampler.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/pk_sampler.h" | |||
| @@ -121,6 +122,16 @@ std::shared_ptr<SequentialSamplerObj> SequentialSampler(int64_t start_index, int | |||
| return sampler; | |||
| } | |||
| /// Function to create a Subset Random Sampler. | |||
| std::shared_ptr<SubsetSamplerObj> SubsetSampler(std::vector<int64_t> indices, int64_t num_samples) { | |||
| auto sampler = std::make_shared<SubsetSamplerObj>(std::move(indices), num_samples); | |||
| // Input validation | |||
| if (sampler->ValidateParams().IsError()) { | |||
| return nullptr; | |||
| } | |||
| return sampler; | |||
| } | |||
| /// Function to create a Subset Random Sampler. | |||
| std::shared_ptr<SubsetRandomSamplerObj> SubsetRandomSampler(std::vector<int64_t> indices, int64_t num_samples) { | |||
| auto sampler = std::make_shared<SubsetRandomSamplerObj>(std::move(indices), num_samples); | |||
| @@ -340,11 +351,11 @@ std::shared_ptr<mindrecord::ShardOperator> SequentialSamplerObj::BuildForMindDat | |||
| } | |||
| #endif | |||
| // SubsetRandomSampler | |||
| SubsetRandomSamplerObj::SubsetRandomSamplerObj(std::vector<int64_t> indices, int64_t num_samples) | |||
| // SubsetSampler | |||
| SubsetSamplerObj::SubsetSamplerObj(std::vector<int64_t> indices, int64_t num_samples) | |||
| : indices_(std::move(indices)), num_samples_(num_samples) {} | |||
| Status SubsetRandomSamplerObj::ValidateParams() { | |||
| Status SubsetSamplerObj::ValidateParams() { | |||
| if (num_samples_ < 0) { | |||
| RETURN_STATUS_UNEXPECTED("SubsetRandomSampler: invalid num_samples: " + std::to_string(num_samples_)); | |||
| } | |||
| @@ -352,6 +363,26 @@ Status SubsetRandomSamplerObj::ValidateParams() { | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<SamplerRT> SubsetSamplerObj::SamplerBuild() { | |||
| // runtime sampler object | |||
| auto sampler = std::make_shared<dataset::SubsetSamplerRT>(num_samples_, indices_); | |||
| BuildChildren(sampler); | |||
| return sampler; | |||
| } | |||
| #ifndef ENABLE_ANDROID | |||
| std::shared_ptr<mindrecord::ShardOperator> SubsetSamplerObj::BuildForMindDataset() { | |||
| // runtime mindrecord sampler object | |||
| auto mind_sampler = std::make_shared<mindrecord::ShardSample>(indices_); | |||
| return mind_sampler; | |||
| } | |||
| #endif | |||
| // SubsetRandomSampler | |||
| SubsetRandomSamplerObj::SubsetRandomSamplerObj(std::vector<int64_t> indices, int64_t num_samples) | |||
| : SubsetSamplerObj(std::move(indices), num_samples) {} | |||
| std::shared_ptr<SamplerRT> SubsetRandomSamplerObj::SamplerBuild() { | |||
| // runtime sampler object | |||
| auto sampler = std::make_shared<dataset::SubsetRandomSamplerRT>(num_samples_, indices_); | |||
| @@ -8,6 +8,7 @@ set(DATASET_ENGINE_DATASETOPS_SOURCE_SAMPLER_SRC_FILES | |||
| sampler.cc | |||
| sequential_sampler.cc | |||
| subset_random_sampler.cc | |||
| subset_sampler.cc | |||
| weighted_random_sampler.cc | |||
| ) | |||
| @@ -28,99 +28,31 @@ namespace dataset { | |||
| // Constructor. | |||
| SubsetRandomSamplerRT::SubsetRandomSamplerRT(int64_t num_samples, const std::vector<int64_t> &indices, | |||
| int64_t samples_per_buffer) | |||
| : SamplerRT(num_samples, samples_per_buffer), indices_(indices), sample_id_(0), buffer_id_(0) {} | |||
| : SubsetSamplerRT(num_samples, indices, samples_per_buffer) {} | |||
| // Initialized this Sampler. | |||
| Status SubsetRandomSamplerRT::InitSampler() { | |||
| if (is_initialized) { | |||
| return Status::OK(); | |||
| } | |||
| CHECK_FAIL_RETURN_UNEXPECTED( | |||
| num_rows_ > 0, "Invalid parameter, num_rows must be greater than 0, but got " + std::to_string(num_rows_) + ".\n"); | |||
| // Special value of 0 for num_samples means that the user wants to sample the entire set of data. | |||
| // In this case, the id's are provided by the user. Cap the num_samples on the number of id's given. | |||
| if (num_samples_ == 0 || num_samples_ > static_cast<int64_t>(indices_.size())) { | |||
| num_samples_ = static_cast<int64_t>(indices_.size()); | |||
| } | |||
| // Initialize random generator with seed from config manager | |||
| rand_gen_.seed(GetSeed()); | |||
| if (samples_per_buffer_ > num_samples_) { | |||
| samples_per_buffer_ = num_samples_; | |||
| } | |||
| // num_samples_ could be smaller than the total number of input id's. | |||
| // We will shuffle the full set of id's, but only select the first num_samples_ of them later. | |||
| std::shuffle(indices_.begin(), indices_.end(), rand_gen_); | |||
| is_initialized = true; | |||
| return Status::OK(); | |||
| return SubsetSamplerRT::InitSampler(); | |||
| } | |||
| // Reset the internal variable to the initial state. | |||
| Status SubsetRandomSamplerRT::ResetSampler() { | |||
| // Reset the internal counters. | |||
| sample_id_ = 0; | |||
| buffer_id_ = 0; | |||
| // Randomized the indices again. | |||
| rand_gen_.seed(GetSeed()); | |||
| std::shuffle(indices_.begin(), indices_.end(), rand_gen_); | |||
| if (HasChildSampler()) { | |||
| RETURN_IF_NOT_OK(child_[0]->ResetSampler()); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| // Get the sample ids. | |||
| Status SubsetRandomSamplerRT::GetNextSample(std::unique_ptr<DataBuffer> *out_buffer) { | |||
| // All samples have been drawn | |||
| if (sample_id_ == num_samples_) { | |||
| (*out_buffer) = std::make_unique<DataBuffer>(buffer_id_++, DataBuffer::kDeBFlagEOE); | |||
| } else { | |||
| if (HasChildSampler()) { | |||
| RETURN_IF_NOT_OK(child_[0]->GetNextSample(&child_ids_)); | |||
| } | |||
| (*out_buffer) = std::make_unique<DataBuffer>(buffer_id_++, DataBuffer::kDeBFlagNone); | |||
| std::shared_ptr<Tensor> outputIds; | |||
| int64_t last_id = sample_id_ + samples_per_buffer_; | |||
| // Handling the return all samples at once, and when last draw is not a full batch. | |||
| if (last_id > num_samples_) { | |||
| last_id = num_samples_; | |||
| } | |||
| // Allocate tensor | |||
| RETURN_IF_NOT_OK(CreateSamplerTensor(&outputIds, last_id - sample_id_)); | |||
| // Initialize tensor | |||
| auto id_ptr = outputIds->begin<int64_t>(); | |||
| while (sample_id_ < last_id) { | |||
| if (indices_[sample_id_] >= num_rows_) { | |||
| std::string err_msg = "Generated indice is out of bound, expect range [0, num_data-1], got indice: " + | |||
| std::to_string(indices_[sample_id_]) + ", num_data: " + std::to_string(num_rows_ - 1); | |||
| RETURN_STATUS_UNEXPECTED(err_msg); | |||
| } | |||
| int64_t sampled_id = ((indices_[sample_id_] % num_rows_) + num_rows_) % num_rows_; | |||
| if (HasChildSampler()) { | |||
| RETURN_IF_NOT_OK(GetAssociatedChildId(&sampled_id, sampled_id)); | |||
| } | |||
| *id_ptr = sampled_id; | |||
| id_ptr++; | |||
| sample_id_++; | |||
| } | |||
| // Create a TensorTable from that single tensor and push into DataBuffer | |||
| (*out_buffer)->set_tensor_table(std::make_unique<TensorQTable>(1, TensorRow(1, outputIds))); | |||
| } | |||
| return Status::OK(); | |||
| return SubsetSamplerRT::ResetSampler(); | |||
| } | |||
| void SubsetRandomSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const { | |||
| @@ -134,19 +66,8 @@ void SubsetRandomSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const | |||
| Status SubsetRandomSamplerRT::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| RETURN_IF_NOT_OK(SubsetSamplerRT::to_json(&args)); | |||
| args["sampler_name"] = "SubsetRandomSampler"; | |||
| args["indices"] = indices_; | |||
| args["num_samples"] = num_samples_; | |||
| if (this->HasChildSampler()) { | |||
| std::vector<nlohmann::json> children_args; | |||
| for (auto child : child_) { | |||
| nlohmann::json child_arg; | |||
| RETURN_IF_NOT_OK(child->to_json(&child_arg)); | |||
| children_args.push_back(child_arg); | |||
| } | |||
| args["child_sampler"] = children_args; | |||
| } | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| @@ -21,39 +21,35 @@ | |||
| #include <vector> | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/subset_sampler.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| // Randomly samples elements from a given list of indices, without replacement. | |||
| class SubsetRandomSamplerRT : public SamplerRT { | |||
| /// Randomly samples elements from a given list of indices, without replacement. | |||
| class SubsetRandomSamplerRT : public SubsetSamplerRT { | |||
| public: | |||
| // Constructor. | |||
| // @param num_samples The number of samples to draw. 0 for the full amount. | |||
| // @param indices List of indices from where we will randomly draw samples. | |||
| // @param samples_per_buffer The number of ids we draw on each call to GetNextBuffer(). | |||
| // When samplesPerBuffer=0, GetNextBuffer() will draw all the sample ids and return them at once. | |||
| /// Constructor. | |||
| /// \param num_samples The number of samples to draw. 0 for the full amount. | |||
| /// \param indices List of indices from where we will randomly draw samples. | |||
| /// \param samples_per_buffer The number of ids we draw on each call to GetNextBuffer(). | |||
| /// When samples_per_buffer=0, GetNextBuffer() will draw all the sample ids and return them at once. | |||
| SubsetRandomSamplerRT(int64_t num_samples, const std::vector<int64_t> &indices, | |||
| std::int64_t samples_per_buffer = std::numeric_limits<int64_t>::max()); | |||
| // Destructor. | |||
| /// Destructor. | |||
| ~SubsetRandomSamplerRT() = default; | |||
| // Initialize the sampler. | |||
| // @return Status | |||
| /// Initialize the sampler. | |||
| /// \return Status | |||
| Status InitSampler() override; | |||
| // Reset the internal variable to the initial state and reshuffle the indices. | |||
| // @return Status | |||
| /// Reset the internal variable to the initial state and reshuffle the indices. | |||
| /// \return Status | |||
| Status ResetSampler() override; | |||
| // Get the sample ids. | |||
| // @param[out] out_buffer The address of a unique_ptr to DataBuffer where the sample ids will be placed. | |||
| // @note the sample ids (int64_t) will be placed in one Tensor and be placed into pBuffer. | |||
| Status GetNextSample(std::unique_ptr<DataBuffer> *out_buffer) override; | |||
| // Printer for debugging purposes. | |||
| // @param out - output stream to write to | |||
| // @param show_all - bool to show detailed vs summary | |||
| /// Printer for debugging purposes. | |||
| /// \param out - output stream to write to | |||
| /// \param show_all - bool to show detailed vs summary | |||
| void SamplerPrint(std::ostream &out, bool show_all) const override; | |||
| /// \brief Get the arguments of node | |||
| @@ -62,15 +58,6 @@ class SubsetRandomSamplerRT : public SamplerRT { | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| // A list of indices (already randomized in constructor). | |||
| std::vector<int64_t> indices_; | |||
| // Current sample id. | |||
| int64_t sample_id_; | |||
| // Current buffer id. | |||
| int64_t buffer_id_; | |||
| // A random number generator. | |||
| std::mt19937 rand_gen_; | |||
| }; | |||
| @@ -0,0 +1,148 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/subset_sampler.h" | |||
| #include <algorithm> | |||
| #include <memory> | |||
| #include <string> | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| // Constructor. | |||
| SubsetSamplerRT::SubsetSamplerRT(int64_t num_samples, const std::vector<int64_t> &indices, int64_t samples_per_buffer) | |||
| : SamplerRT(num_samples, samples_per_buffer), indices_(indices), sample_id_(0), buffer_id_(0) {} | |||
| // Initialized this Sampler. | |||
| Status SubsetSamplerRT::InitSampler() { | |||
| if (is_initialized) { | |||
| return Status::OK(); | |||
| } | |||
| CHECK_FAIL_RETURN_UNEXPECTED( | |||
| num_rows_ > 0, "Invalid parameter, num_rows must be greater than 0, but got " + std::to_string(num_rows_) + ".\n"); | |||
| // Special value of 0 for num_samples means that the user wants to sample the entire set of data. | |||
| // In this case, the id's are provided by the user. Cap the num_samples on the number of id's given. | |||
| if (num_samples_ == 0 || num_samples_ > static_cast<int64_t>(indices_.size())) { | |||
| num_samples_ = static_cast<int64_t>(indices_.size()); | |||
| } | |||
| if (samples_per_buffer_ > num_samples_) { | |||
| samples_per_buffer_ = num_samples_; | |||
| } | |||
| is_initialized = true; | |||
| return Status::OK(); | |||
| } | |||
| // Reset the internal variable to the initial state. | |||
| Status SubsetSamplerRT::ResetSampler() { | |||
| // Reset the internal counters. | |||
| sample_id_ = 0; | |||
| buffer_id_ = 0; | |||
| if (HasChildSampler()) { | |||
| RETURN_IF_NOT_OK(child_[0]->ResetSampler()); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| // Get the sample ids. | |||
| Status SubsetSamplerRT::GetNextSample(std::unique_ptr<DataBuffer> *out_buffer) { | |||
| // All samples have been drawn | |||
| if (sample_id_ == num_samples_) { | |||
| (*out_buffer) = std::make_unique<DataBuffer>(buffer_id_++, DataBuffer::kDeBFlagEOE); | |||
| } else { | |||
| if (HasChildSampler()) { | |||
| RETURN_IF_NOT_OK(child_[0]->GetNextSample(&child_ids_)); | |||
| } | |||
| (*out_buffer) = std::make_unique<DataBuffer>(buffer_id_++, DataBuffer::kDeBFlagNone); | |||
| std::shared_ptr<Tensor> outputIds; | |||
| int64_t last_id = sample_id_ + samples_per_buffer_; | |||
| // Handling the return all samples at once, and when last draw is not a full batch. | |||
| if (last_id > num_samples_) { | |||
| last_id = num_samples_; | |||
| } | |||
| // Allocate tensor | |||
| RETURN_IF_NOT_OK(CreateSamplerTensor(&outputIds, last_id - sample_id_)); | |||
| // Initialize tensor | |||
| auto id_ptr = outputIds->begin<int64_t>(); | |||
| while (sample_id_ < last_id) { | |||
| if (indices_[sample_id_] >= num_rows_ || indices_[sample_id_] < 0) { | |||
| std::string err_msg = "Sample ID (" + std::to_string(indices_[sample_id_]) + | |||
| ") is out of bound, expected range [0, " + std::to_string(num_rows_ - 1) + "]"; | |||
| RETURN_STATUS_UNEXPECTED(err_msg); | |||
| } | |||
| int64_t sampled_id = ((indices_[sample_id_] % num_rows_) + num_rows_) % num_rows_; | |||
| if (HasChildSampler()) { | |||
| RETURN_IF_NOT_OK(GetAssociatedChildId(&sampled_id, sampled_id)); | |||
| } | |||
| *id_ptr = sampled_id; | |||
| id_ptr++; | |||
| sample_id_++; | |||
| } | |||
| // Create a TensorTable from that single tensor and push into DataBuffer | |||
| (*out_buffer)->set_tensor_table(std::make_unique<TensorQTable>(1, TensorRow(1, outputIds))); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| void SubsetSamplerRT::SamplerPrint(std::ostream &out, bool show_all) const { | |||
| out << "\nSampler: SubsetSampler"; | |||
| if (show_all) { | |||
| // Call the super class for displaying any common detailed info | |||
| SamplerRT::SamplerPrint(out, show_all); | |||
| // Then add our own info if any | |||
| } | |||
| } | |||
| Status SubsetSamplerRT::to_json(nlohmann::json *out_json) { | |||
| nlohmann::json args; | |||
| args["sampler_name"] = "SubsetSampler"; | |||
| args["indices"] = indices_; | |||
| args["num_samples"] = num_samples_; | |||
| if (this->HasChildSampler()) { | |||
| std::vector<nlohmann::json> children_args; | |||
| for (auto child : child_) { | |||
| nlohmann::json child_arg; | |||
| RETURN_IF_NOT_OK(child->to_json(&child_arg)); | |||
| children_args.push_back(child_arg); | |||
| } | |||
| args["child_sampler"] = children_args; | |||
| } | |||
| *out_json = args; | |||
| return Status::OK(); | |||
| } | |||
| int64_t SubsetSamplerRT::CalculateNumSamples(int64_t num_rows) { | |||
| int64_t child_num_rows = num_rows; | |||
| if (!child_.empty()) { | |||
| child_num_rows = child_[0]->CalculateNumSamples(num_rows); | |||
| } | |||
| int64_t res = (num_samples_ > 0) ? std::min(child_num_rows, num_samples_) : child_num_rows; | |||
| res = std::min(res, static_cast<int64_t>(indices_.size())); | |||
| return res; | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,84 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_SAMPLER_SUBSET_SAMPLER_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_SAMPLER_SUBSET_SAMPLER_H_ | |||
| #include <limits> | |||
| #include <memory> | |||
| #include <vector> | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| /// Samples elements from a given list of indices. | |||
| class SubsetSamplerRT : public SamplerRT { | |||
| public: | |||
| /// Constructor. | |||
| /// \param num_samples The number of elements to sample. 0 for the full amount. | |||
| /// \param indices List of indices. | |||
| /// \param samples_per_buffer The number of ids we draw on each call to GetNextBuffer(). | |||
| /// When samples_per_buffer=0, GetNextBuffer() will draw all the sample ids and return them at once. | |||
| SubsetSamplerRT(int64_t num_samples, const std::vector<int64_t> &indices, | |||
| std::int64_t samples_per_buffer = std::numeric_limits<int64_t>::max()); | |||
| /// Destructor. | |||
| ~SubsetSamplerRT() = default; | |||
| /// Initialize the sampler. | |||
| /// \return Status | |||
| Status InitSampler() override; | |||
| /// Reset the internal variable to the initial state and reshuffle the indices. | |||
| /// \return Status | |||
| Status ResetSampler() override; | |||
| /// Get the sample ids. | |||
| /// \param[out] out_buffer The address of a unique_ptr to DataBuffer where the sample ids will be placed. | |||
| /// @note the sample ids (int64_t) will be placed in one Tensor and be placed into pBuffer. | |||
| Status GetNextSample(std::unique_ptr<DataBuffer> *out_buffer) override; | |||
| /// Printer for debugging purposes. | |||
| /// \param out - output stream to write to | |||
| /// \param show_all - bool to show detailed vs summary | |||
| void SamplerPrint(std::ostream &out, bool show_all) const override; | |||
| /// \brief Get the arguments of node | |||
| /// \param[out] out_json JSON string of all attributes | |||
| /// \return Status of the function | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| /// Calculate num samples. Unlike GetNumSamples, it is not a getter and doesn't necessarily return the value of | |||
| /// num_samples_ | |||
| /// \param num_rows the size of the dataset this sampler will be applied to. | |||
| /// \return number of samples | |||
| int64_t CalculateNumSamples(int64_t num_rows) override; | |||
| protected: | |||
| /// A list of indices (already randomized in constructor). | |||
| std::vector<int64_t> indices_; | |||
| private: | |||
| /// Current sample id. | |||
| int64_t sample_id_; | |||
| /// Current buffer id. | |||
| int64_t buffer_id_; | |||
| }; | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_SAMPLER_SUBSET_SAMPLER_H_ | |||
| @@ -86,6 +86,7 @@ class PKSamplerObj; | |||
| class PreBuiltSamplerObj; | |||
| class RandomSamplerObj; | |||
| class SequentialSamplerObj; | |||
| class SubsetSamplerObj; | |||
| class SubsetRandomSamplerObj; | |||
| class WeightedRandomSamplerObj; | |||
| @@ -127,6 +128,13 @@ std::shared_ptr<RandomSamplerObj> RandomSampler(bool replacement = false, int64_ | |||
| /// \return Shared pointer to the current Sampler. | |||
| std::shared_ptr<SequentialSamplerObj> SequentialSampler(int64_t start_index = 0, int64_t num_samples = 0); | |||
| /// Function to create a Subset Sampler. | |||
| /// \notes Samples the elements from a sequence of indices. | |||
| /// \param[in] indices - A vector sequence of indices. | |||
| /// \param[in] num_samples - The number of samples to draw (default to all elements). | |||
| /// \return Shared pointer to the current Sampler. | |||
| std::shared_ptr<SubsetSamplerObj> SubsetSampler(std::vector<int64_t> indices, int64_t num_samples = 0); | |||
| /// Function to create a Subset Random Sampler. | |||
| /// \notes Samples the elements randomly from a sequence of indices. | |||
| /// \param[in] indices - A vector sequence of indices. | |||
| @@ -293,16 +301,16 @@ class SequentialSamplerObj : public SamplerObj { | |||
| int64_t num_samples_; | |||
| }; | |||
| class SubsetRandomSamplerObj : public SamplerObj { | |||
| class SubsetSamplerObj : public SamplerObj { | |||
| public: | |||
| SubsetRandomSamplerObj(std::vector<int64_t> indices, int64_t num_samples); | |||
| SubsetSamplerObj(std::vector<int64_t> indices, int64_t num_samples); | |||
| ~SubsetRandomSamplerObj() = default; | |||
| ~SubsetSamplerObj() = default; | |||
| std::shared_ptr<SamplerRT> SamplerBuild() override; | |||
| std::shared_ptr<SamplerObj> SamplerCopy() override { | |||
| auto sampler = std::make_shared<SubsetRandomSamplerObj>(indices_, num_samples_); | |||
| auto sampler = std::make_shared<SubsetSamplerObj>(indices_, num_samples_); | |||
| for (auto child : children_) { | |||
| sampler->AddChildSampler(child); | |||
| } | |||
| @@ -315,11 +323,34 @@ class SubsetRandomSamplerObj : public SamplerObj { | |||
| Status ValidateParams() override; | |||
| private: | |||
| protected: | |||
| const std::vector<int64_t> indices_; | |||
| int64_t num_samples_; | |||
| }; | |||
| class SubsetRandomSamplerObj : public SubsetSamplerObj { | |||
| public: | |||
| SubsetRandomSamplerObj(std::vector<int64_t> indices, int64_t num_samples); | |||
| ~SubsetRandomSamplerObj() = default; | |||
| std::shared_ptr<SamplerRT> SamplerBuild() override; | |||
| std::shared_ptr<SamplerObj> SamplerCopy() override { | |||
| auto sampler = std::make_shared<SubsetRandomSamplerObj>(indices_, num_samples_); | |||
| for (auto child : children_) { | |||
| sampler->AddChildSampler(child); | |||
| } | |||
| return sampler; | |||
| } | |||
| #ifndef ENABLE_ANDROID | |||
| std::shared_ptr<mindrecord::ShardOperator> BuildForMindDataset() override; | |||
| #endif | |||
| private: | |||
| }; | |||
| class WeightedRandomSamplerObj : public SamplerObj { | |||
| public: | |||
| explicit WeightedRandomSamplerObj(std::vector<double> weights, int64_t num_samples = 0, bool replacement = true); | |||
| @@ -77,7 +77,7 @@ enum TaskType { | |||
| kCommonTask = 0, | |||
| kPaddedTask = 1, | |||
| }; | |||
| enum SamplerType { kCustomTopNSampler, kCustomTopPercentSampler, kSubsetRandomSampler, kPKSampler }; | |||
| enum SamplerType { kCustomTopNSampler, kCustomTopPercentSampler, kSubsetRandomSampler, kPKSampler, kSubsetSampler }; | |||
| enum ShuffleType { kShuffleCategory, kShuffleSample }; | |||
| @@ -144,7 +144,7 @@ const std::unordered_map<std::string, std::string> kTypesMap = { | |||
| {"float16", "float32"}, {"float32", "float32"}, {"float64", "float64"}, {"string", "string"}}; | |||
| /// \brief split a string using a character | |||
| /// \param[in] field target string | |||
| /// \param[in] separator a character for spliting | |||
| /// \param[in] separator a character for splitting | |||
| /// \return vector type result | |||
| std::vector<std::string> StringSplit(const std::string &field, char separator); | |||
| @@ -34,12 +34,16 @@ class __attribute__((visibility("default"))) ShardSample : public ShardOperator | |||
| ShardSample(int num, int den, int par, int no_of_samples = 0, int offset = -1); | |||
| ShardSample(const std::vector<int64_t> &indices); | |||
| ShardSample(const std::vector<int64_t> &indices, uint32_t seed); | |||
| ~ShardSample() override{}; | |||
| MSRStatus Execute(ShardTask &tasks) override; | |||
| MSRStatus UpdateTasks(ShardTask &tasks, int taking); | |||
| MSRStatus SufExecute(ShardTask &tasks) override; | |||
| int64_t GetNumSamples(int64_t dataset_size, int64_t num_classes) override; | |||
| @@ -49,13 +49,16 @@ ShardSample::ShardSample(int num, int den, int par, int no_of_samples, int offse | |||
| sampler_type_(kCustomTopPercentSampler), | |||
| offset_(offset) {} | |||
| ShardSample::ShardSample(const std::vector<int64_t> &indices, uint32_t seed) | |||
| ShardSample::ShardSample(const std::vector<int64_t> &indices) | |||
| : numerator_(0), | |||
| denominator_(0), | |||
| partition_id_(0), | |||
| no_of_samples_(0), | |||
| indices_(indices), | |||
| sampler_type_(kSubsetRandomSampler) { | |||
| sampler_type_(kSubsetSampler) {} | |||
| ShardSample::ShardSample(const std::vector<int64_t> &indices, uint32_t seed) : ShardSample(indices) { | |||
| sampler_type_ = kSubsetRandomSampler; | |||
| shuffle_op_ = std::make_shared<ShardShuffle>(seed); | |||
| } | |||
| @@ -71,55 +74,17 @@ int64_t ShardSample::GetNumSamples(int64_t dataset_size, int64_t num_classes) { | |||
| return dataset_size / denominator_ * numerator_ + 1; | |||
| } | |||
| } | |||
| if (sampler_type_ == kSubsetRandomSampler) { | |||
| if (sampler_type_ == kSubsetRandomSampler || sampler_type_ == kSubsetSampler) { | |||
| return indices_.size(); | |||
| } | |||
| return 0; | |||
| } | |||
| MSRStatus ShardSample::Execute(ShardTask &tasks) { | |||
| if (offset_ != -1) { | |||
| int64_t old_v = 0; | |||
| int num_rows_ = static_cast<int>(tasks.Size()); | |||
| for (int x = 0; x < denominator_; x++) { | |||
| int samples_per_buffer_ = (num_rows_ + offset_) / denominator_; | |||
| int remainder = (num_rows_ + offset_) % denominator_; | |||
| if (x < remainder) samples_per_buffer_++; | |||
| if (x < offset_) samples_per_buffer_--; | |||
| old_v += samples_per_buffer_; | |||
| // nums_per_shard_ is used to save the current shard's ending index | |||
| nums_per_shard_.push_back(old_v); | |||
| } | |||
| } | |||
| int no_of_categories = static_cast<int>(tasks.categories); | |||
| int total_no = static_cast<int>(tasks.Size()); // make sure task_size | |||
| int taking = 0; | |||
| if (sampler_type_ == kCustomTopNSampler) { // non sharding case constructor #1 | |||
| no_of_samples_ = std::min(no_of_samples_, total_no); | |||
| taking = no_of_samples_ - no_of_samples_ % no_of_categories; | |||
| } else if (sampler_type_ == kSubsetRandomSampler) { | |||
| if (indices_.size() > total_no) { | |||
| MS_LOG(ERROR) << "parameter indices's size is greater than dataset size."; | |||
| return FAILED; | |||
| } | |||
| } else { // constructor TopPercent | |||
| if (numerator_ > 0 && denominator_ > 0 && numerator_ <= denominator_) { | |||
| if (numerator_ == 1 && denominator_ > 1) { // sharding | |||
| taking = (total_no + denominator_ - 1) / denominator_; | |||
| } else { // non sharding | |||
| taking = total_no * numerator_ / denominator_; | |||
| taking -= (taking % no_of_categories); | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << "parameter numerator or denominator is illegal"; | |||
| return FAILED; | |||
| } | |||
| } | |||
| MSRStatus ShardSample::UpdateTasks(ShardTask &tasks, int taking) { | |||
| if (tasks.permutation_.empty()) { | |||
| ShardTask new_tasks; | |||
| total_no = static_cast<int>(tasks.Size()); | |||
| if (sampler_type_ == kSubsetRandomSampler) { | |||
| int total_no = static_cast<int>(tasks.Size()); | |||
| if (sampler_type_ == kSubsetRandomSampler || sampler_type_ == kSubsetSampler) { | |||
| for (int i = 0; i < indices_.size(); ++i) { | |||
| int index = ((indices_[i] % total_no) + total_no) % total_no; | |||
| new_tasks.InsertTask(tasks.GetTaskByID(index)); // different mod result between c and python | |||
| @@ -148,7 +113,7 @@ MSRStatus ShardSample::Execute(ShardTask &tasks) { | |||
| if (taking > static_cast<int>(tasks.permutation_.size())) { | |||
| return FAILED; | |||
| } | |||
| total_no = static_cast<int>(tasks.permutation_.size()); | |||
| int total_no = static_cast<int>(tasks.permutation_.size()); | |||
| int count = 0; | |||
| for (size_t i = partition_id_ * taking; i < (partition_id_ + 1) * taking; i++) { | |||
| if (no_of_samples_ != 0 && count == no_of_samples_) break; | |||
| @@ -160,6 +125,48 @@ MSRStatus ShardSample::Execute(ShardTask &tasks) { | |||
| return SUCCESS; | |||
| } | |||
| MSRStatus ShardSample::Execute(ShardTask &tasks) { | |||
| if (offset_ != -1) { | |||
| int64_t old_v = 0; | |||
| int num_rows_ = static_cast<int>(tasks.Size()); | |||
| for (int x = 0; x < denominator_; x++) { | |||
| int samples_per_buffer_ = (num_rows_ + offset_) / denominator_; | |||
| int remainder = (num_rows_ + offset_) % denominator_; | |||
| if (x < remainder) samples_per_buffer_++; | |||
| if (x < offset_) samples_per_buffer_--; | |||
| old_v += samples_per_buffer_; | |||
| // nums_per_shard_ is used to save the current shard's ending index | |||
| nums_per_shard_.push_back(old_v); | |||
| } | |||
| } | |||
| int no_of_categories = static_cast<int>(tasks.categories); | |||
| int total_no = static_cast<int>(tasks.Size()); // make sure task_size | |||
| int taking = 0; | |||
| if (sampler_type_ == kCustomTopNSampler) { // non sharding case constructor #1 | |||
| no_of_samples_ = std::min(no_of_samples_, total_no); | |||
| taking = no_of_samples_ - no_of_samples_ % no_of_categories; | |||
| } else if (sampler_type_ == kSubsetRandomSampler || sampler_type_ == kSubsetSampler) { | |||
| if (indices_.size() > total_no) { | |||
| MS_LOG(ERROR) << "parameter indices's size is greater than dataset size."; | |||
| return FAILED; | |||
| } | |||
| } else { // constructor TopPercent | |||
| if (numerator_ > 0 && denominator_ > 0 && numerator_ <= denominator_) { | |||
| if (numerator_ == 1 && denominator_ > 1) { // sharding | |||
| taking = (total_no + denominator_ - 1) / denominator_; | |||
| } else { // non sharding | |||
| taking = total_no * numerator_ / denominator_; | |||
| taking -= (taking % no_of_categories); | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << "parameter numerator or denominator is illegal"; | |||
| return FAILED; | |||
| } | |||
| } | |||
| return UpdateTasks(tasks, taking); | |||
| } | |||
| MSRStatus ShardSample::SufExecute(ShardTask &tasks) { | |||
| if (sampler_type_ == kSubsetRandomSampler) { | |||
| if (SUCCESS != (*shuffle_op_)(tasks)) { | |||
| @@ -3321,7 +3321,7 @@ class MindDataset(MappableDataset): | |||
| logger.warning("WARN: global shuffle is not used.") | |||
| if sampler is not None: | |||
| if isinstance(sampler, (samplers.SubsetRandomSampler, samplers.PKSampler, | |||
| if isinstance(sampler, (samplers.SubsetRandomSampler, samplers.SubsetSampler, samplers.PKSampler, | |||
| samplers.DistributedSampler, samplers.RandomSampler, | |||
| samplers.SequentialSampler)) is False: | |||
| raise ValueError("The sampler is not supported yet.") | |||
| @@ -3849,9 +3849,7 @@ class GeneratorDataset(MappableDataset): | |||
| if hasattr(self, "__total_batch__"): | |||
| new_op.__total_batch__ = self.__total_batch__ | |||
| if new_op.sampler is not None and hasattr(self.source, "__getitem__"): | |||
| if isinstance(new_op.sampler, (samplers.SequentialSampler, samplers.DistributedSampler, | |||
| samplers.RandomSampler, samplers.SubsetRandomSampler, | |||
| samplers.WeightedRandomSampler, samplers.Sampler)): | |||
| if isinstance(new_op.sampler, samplers.BuiltinSampler): | |||
| if new_op.num_parallel_workers > 1: | |||
| sample_fn = SamplerFn(self.source, new_op.num_parallel_workers, self.python_multiprocessing) | |||
| new_op.source = (lambda sample_ids: _cpp_sampler_fn_mp(sample_ids, sample_fn)) | |||
| @@ -25,103 +25,6 @@ import mindspore._c_dataengine as cde | |||
| import mindspore.dataset as ds | |||
| class Sampler: | |||
| """ | |||
| Base class for user defined sampler. | |||
| A user defined sampler can be used with any existing dataset with sampler support. | |||
| A required _iter_() method should by overridden by the user for sample index generation. | |||
| An optional reset() method can be overridden for per repeat reset, | |||
| dataset_size and num_samples will be set by dataset once a dataset iterator is created. | |||
| Examples: | |||
| >>> import mindspore.dataset as ds | |||
| >>> | |||
| >>> class ReverseSampler(ds,Sampler): | |||
| >>> def __iter__(self): | |||
| >>> for i in range(self.dataset_size - 1, -1, -1): | |||
| >>> yield i | |||
| >>> | |||
| >>> ds = ds.ImageFolderDataset(path, sampler=ReverseSampler()) | |||
| """ | |||
| def __init__(self, num_samples=None): | |||
| self.dataset_size = 0 | |||
| self.child_sampler = None | |||
| self.num_samples = num_samples | |||
| def __iter__(self): | |||
| """ | |||
| User defined iterator, must be overridden. | |||
| _handshake is guaranteed to be called prior to iterator construction. | |||
| """ | |||
| raise NotImplementedError | |||
| def reset(self): | |||
| """ | |||
| Per repeat reset callback, override this method if necessary | |||
| """ | |||
| # Initialization handshake callback | |||
| # Do not override this method! | |||
| def _handshake(self, ds_size, num_samples): | |||
| self.dataset_size = ds_size | |||
| self.num_samples = num_samples | |||
| # Indices fetcher | |||
| # Do not override this method! | |||
| def _get_indices(self): | |||
| sampler_iter = iter(self) | |||
| ret = [] | |||
| for _ in range(self.num_samples): | |||
| try: | |||
| idx = next(sampler_iter) | |||
| ret.append(idx) | |||
| except StopIteration: | |||
| break | |||
| return np.array(ret) | |||
| # Instance fetcher | |||
| # Do not override this method! | |||
| def create(self): | |||
| num_samples = self.num_samples if self.num_samples is not None else 0 | |||
| c_sampler = cde.PythonSampler(num_samples, self) | |||
| c_child_sampler = self.create_child() | |||
| c_sampler.add_child(c_child_sampler) | |||
| return c_sampler | |||
| def add_child(self, sampler): | |||
| self.child_sampler = sampler | |||
| def get_child(self): | |||
| return self.child_sampler | |||
| def create_child(self): | |||
| c_child_sampler = None | |||
| if self.child_sampler is not None: | |||
| c_child_sampler = self.child_sampler.create() | |||
| return c_child_sampler | |||
| def is_shuffled(self): | |||
| if self.child_sampler is None: | |||
| return False | |||
| return self.child_sampler.is_shuffled() | |||
| def is_sharded(self): | |||
| if self.child_sampler is None: | |||
| return False | |||
| return self.child_sampler.is_sharded() | |||
| def get_num_samples(self): | |||
| if self.num_samples is None: | |||
| return None | |||
| return self._get_indices().size | |||
| class BuiltinSampler: | |||
| """ | |||
| Base class for BuiltinSampler. | |||
| @@ -231,6 +134,89 @@ class BuiltinSampler: | |||
| return self.num_samples | |||
class Sampler(BuiltinSampler):
    """
    Base class for user defined sampler.
    A user defined sampler can be used with any existing dataset with sampler support.
    A required __iter__() method should be overridden by the user for sample index generation.
    An optional reset() method can be overridden for per repeat reset,
    dataset_size and num_samples will be set by dataset once a dataset iterator is created.
    Examples:
        >>> import mindspore.dataset as ds
        >>>
        >>> class ReverseSampler(ds.Sampler):
        >>>     def __iter__(self):
        >>>         for i in range(self.dataset_size - 1, -1, -1):
        >>>             yield i
        >>>
        >>> ds = ds.ImageFolderDataset(path, sampler=ReverseSampler())
    """

    def __init__(self, num_samples=None):
        super().__init__(num_samples)
        # Filled in later by _handshake() once the dataset knows its size.
        self.dataset_size = 0

    def __iter__(self):
        """
        User defined iterator, must be overridden.
        _handshake is guaranteed to be called prior to iterator construction.
        """
        raise NotImplementedError

    def reset(self):
        """
        Per repeat reset callback, override this method if necessary
        """

    # Initialization handshake callback
    # Do not override this method!
    def _handshake(self, ds_size, num_samples):
        self.dataset_size = ds_size
        self.num_samples = num_samples

    # Indices fetcher: drains up to num_samples indices from the user iterator.
    # Do not override this method!
    def _get_indices(self):
        sampler_iter = iter(self)
        ret = []
        for _ in range(self.num_samples):
            try:
                idx = next(sampler_iter)
                ret.append(idx)
            except StopIteration:
                # User iterator exhausted early; return what we have.
                break
        return np.array(ret)

    # Instance fetcher: builds the C++ runtime sampler that wraps this Python object.
    # Do not override this method!
    def create(self):
        num_samples = self.num_samples if self.num_samples is not None else 0
        c_sampler = cde.PythonSampler(num_samples, self)
        c_child_sampler = self.create_child()
        c_sampler.add_child(c_child_sampler)
        return c_sampler

    def is_shuffled(self):
        # A user-defined sampler itself is not considered shuffled; defer to the child.
        if self.child_sampler is None:
            return False
        return self.child_sampler.is_shuffled()

    def is_sharded(self):
        # A user-defined sampler itself is not considered sharded; defer to the child.
        if self.child_sampler is None:
            return False
        return self.child_sampler.is_sharded()

    def get_num_samples(self):
        # NOTE(review): when num_samples is set this runs the user iterator to
        # count the indices it actually yields — confirm this is intended.
        if self.num_samples is None:
            return None
        return self._get_indices().size
| class DistributedSampler(BuiltinSampler): | |||
| """ | |||
| A sampler that accesses a shard of the dataset. | |||
| @@ -518,9 +504,9 @@ class SequentialSampler(BuiltinSampler): | |||
| return self.child_sampler.is_sharded() | |||
| class SubsetRandomSampler(BuiltinSampler): | |||
| class SubsetSampler(BuiltinSampler): | |||
| """ | |||
| Samples the elements randomly from a sequence of indices. | |||
| Samples the elements from a sequence of indices. | |||
| Args: | |||
| indices (list[int]): A sequence of indices. | |||
| @@ -533,8 +519,8 @@ class SubsetRandomSampler(BuiltinSampler): | |||
| >>> | |||
| >>> indices = [0, 1, 2, 3, 7, 88, 119] | |||
| >>> | |||
| >>> # creates a SubsetRandomSampler, will sample from the provided indices | |||
| >>> sampler = ds.SubsetRandomSampler(indices) | |||
| >>> # creates a SubsetSampler, will sample from the provided indices | |||
| >>> sampler = ds.SubsetSampler(indices) | |||
| >>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler) | |||
| """ | |||
| @@ -552,13 +538,13 @@ class SubsetRandomSampler(BuiltinSampler): | |||
| def create(self): | |||
| num_samples = self.num_samples if self.num_samples is not None else 0 | |||
| c_sampler = cde.SubsetRandomSampler(num_samples, self.indices) | |||
| c_sampler = cde.SubsetSampler(num_samples, self.indices) | |||
| c_child_sampler = self.create_child() | |||
| c_sampler.add_child(c_child_sampler) | |||
| return c_sampler | |||
| def is_shuffled(self): | |||
| return True | |||
| return False | |||
| def is_sharded(self): | |||
| if self.child_sampler is None: | |||
| @@ -567,7 +553,7 @@ class SubsetRandomSampler(BuiltinSampler): | |||
| return self.child_sampler.is_sharded() | |||
| def create_for_minddataset(self): | |||
| c_sampler = cde.MindrecordSubsetRandomSampler(self.indices, ds.config.get_seed()) | |||
| c_sampler = cde.MindrecordSubsetSampler(self.indices) | |||
| c_child_sampler = self.create_child_for_minddataset() | |||
| c_sampler.add_child(c_child_sampler) | |||
| return c_sampler | |||
| @@ -580,6 +566,43 @@ class SubsetRandomSampler(BuiltinSampler): | |||
| return min(len(self.indices), num_samples) | |||
class SubsetRandomSampler(SubsetSampler):
    """
    Samples the elements randomly from a sequence of indices.
    Args:
        indices (list[int]): A sequence of indices.
        num_samples (int, optional): Number of elements to sample (default=None, all elements).
    Examples:
        >>> import mindspore.dataset as ds
        >>>
        >>> dataset_dir = "path/to/imagefolder_directory"
        >>>
        >>> indices = [0, 1, 2, 3, 7, 88, 119]
        >>>
        >>> # creates a SubsetRandomSampler, will sample from the provided indices
        >>> sampler = ds.SubsetRandomSampler(indices)
        >>> data = ds.ImageFolderDataset(dataset_dir, num_parallel_workers=8, sampler=sampler)
    """

    def create(self):
        # Build the randomized C++ runtime sampler (overrides the in-order
        # SubsetSampler variant from the parent class).
        num_samples = self.num_samples if self.num_samples is not None else 0
        c_sampler = cde.SubsetRandomSampler(num_samples, self.indices)
        c_child_sampler = self.create_child()
        c_sampler.add_child(c_child_sampler)
        return c_sampler

    def is_shuffled(self):
        # Unlike the parent SubsetSampler, this sampler randomizes the indices.
        return True

    def create_for_minddataset(self):
        # MindRecord variant; passing the global seed makes the random order
        # reproducible under ds.config.set_seed().
        c_sampler = cde.MindrecordSubsetSampler(self.indices, ds.config.get_seed())
        c_child_sampler = self.create_child_for_minddataset()
        c_sampler.add_child(c_child_sampler)
        return c_sampler
| class WeightedRandomSampler(BuiltinSampler): | |||
| """ | |||
| Samples the elements from [0, len(weights) - 1] randomly with the given weights (probabilities). | |||
| @@ -410,9 +410,7 @@ def check_generatordataset(method): | |||
| if sampler is not None: | |||
| if isinstance(sampler, samplers.PKSampler): | |||
| raise ValueError("GeneratorDataset doesn't support PKSampler.") | |||
| if not isinstance(sampler, (samplers.SequentialSampler, samplers.DistributedSampler, | |||
| samplers.RandomSampler, samplers.SubsetRandomSampler, | |||
| samplers.WeightedRandomSampler, samplers.Sampler)): | |||
| if not isinstance(sampler, samplers.BuiltinSampler): | |||
| try: | |||
| iter(sampler) | |||
| except TypeError: | |||
| @@ -119,6 +119,7 @@ SET(DE_UT_SRCS | |||
| status_test.cc | |||
| storage_container_test.cc | |||
| subset_random_sampler_test.cc | |||
| subset_sampler_test.cc | |||
| swap_red_blue_test.cc | |||
| take_op_test.cc | |||
| task_manager_test.cc | |||
| @@ -242,8 +242,11 @@ TEST_F(MindDataTestPipeline, TestMindDataSuccess6) { | |||
| std::shared_ptr<Dataset> ds5 = MindData(file_list, {}, SubsetRandomSampler({0, 1, 2}, 10)); | |||
| EXPECT_NE(ds5, nullptr); | |||
| std::vector<std::shared_ptr<Dataset>> ds = {ds1, ds2, ds3, ds4, ds5}; | |||
| std::vector<int32_t> expected_samples = {5, 5, 2, 3, 3}; | |||
| std::shared_ptr<Dataset> ds6 = MindData(file_list, {}, SubsetSampler({1, 2}, 10)); | |||
| EXPECT_NE(ds5, nullptr); | |||
| std::vector<std::shared_ptr<Dataset>> ds = {ds1, ds2, ds3, ds4, ds5, ds6}; | |||
| std::vector<int32_t> expected_samples = {5, 5, 2, 3, 3, 2}; | |||
| for (int32_t i = 0; i < ds.size(); i++) { | |||
| // Create an iterator over the result of the above dataset | |||
| @@ -42,6 +42,9 @@ TEST_F(MindDataTestPipeline, TestImageFolderWithSamplers) { | |||
| EXPECT_NE(sampl, nullptr); | |||
| std::vector<int64_t> indices = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23}; | |||
| sampl = SubsetSampler(indices); | |||
| EXPECT_NE(sampl, nullptr); | |||
| sampl = SubsetRandomSampler(indices); | |||
| EXPECT_NE(sampl, nullptr); | |||
| @@ -138,7 +141,7 @@ TEST_F(MindDataTestPipeline, TestCalculateNumSamples) { | |||
| EXPECT_NE(sampl4, nullptr); | |||
| std::shared_ptr<SamplerRT> sampler_rt4 = sampl4->SamplerBuild(); | |||
| sampler_rt4->AddChild(sampler_rt3); | |||
| EXPECT_EQ(sampler_rt4->CalculateNumSamples(num_rows), 12); | |||
| EXPECT_EQ(sampler_rt4->CalculateNumSamples(num_rows), 11); | |||
| // Child doesn't have num_samples | |||
| std::shared_ptr<SamplerObj> sampl5 = RandomSampler(false); | |||
| @@ -0,0 +1,144 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "common/common.h" | |||
| #include "gtest/gtest.h" | |||
| #include "minddata/dataset/core/constants.h" | |||
| #include "minddata/dataset/core/tensor.h" | |||
| #include "minddata/dataset/engine/data_buffer.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/subset_sampler.h" | |||
| #include <vector> | |||
| #include <unordered_set> | |||
| using namespace mindspore::dataset; | |||
| class MindDataTestSubsetSampler : public UT::Common { | |||
| public: | |||
| class DummyRandomAccessOp : public RandomAccessOp { | |||
| public: | |||
| DummyRandomAccessOp(int64_t num_rows) { | |||
| num_rows_ = num_rows; // base class | |||
| }; | |||
| }; | |||
| }; | |||
| TEST_F(MindDataTestSubsetSampler, TestAllAtOnce) { | |||
| std::vector<int64_t> in({3, 1, 4, 0, 1}); | |||
| std::unordered_set<int64_t> in_set(in.begin(), in.end()); | |||
| int64_t num_samples = 0; | |||
| SubsetSamplerRT sampler(num_samples, in); | |||
| DummyRandomAccessOp dummyRandomAccessOp(5); | |||
| sampler.HandshakeRandomAccessOp(&dummyRandomAccessOp); | |||
| std::unique_ptr<DataBuffer> db; | |||
| TensorRow row; | |||
| std::vector<int64_t> out; | |||
| ASSERT_EQ(sampler.GetNextSample(&db), Status::OK()); | |||
| db->PopRow(&row); | |||
| for (const auto &t : row) { | |||
| for (auto it = t->begin<int64_t>(); it != t->end<int64_t>(); it++) { | |||
| out.push_back(*it); | |||
| } | |||
| } | |||
| ASSERT_EQ(in.size(), out.size()); | |||
| for (int i = 0; i < in.size(); i++) { | |||
| ASSERT_EQ(in[i], out[i]); | |||
| } | |||
| ASSERT_EQ(sampler.GetNextSample(&db), Status::OK()); | |||
| ASSERT_EQ(db->eoe(), true); | |||
| } | |||
| TEST_F(MindDataTestSubsetSampler, TestGetNextBuffer) { | |||
| int64_t total_samples = 100000 - 5; | |||
| int64_t samples_per_buffer = 10; | |||
| int64_t num_samples = 0; | |||
| std::vector<int64_t> input(total_samples, 1); | |||
| SubsetSamplerRT sampler(num_samples, input, samples_per_buffer); | |||
| DummyRandomAccessOp dummyRandomAccessOp(total_samples); | |||
| sampler.HandshakeRandomAccessOp(&dummyRandomAccessOp); | |||
| std::unique_ptr<DataBuffer> db; | |||
| TensorRow row; | |||
| std::vector<int64_t> out; | |||
| ASSERT_EQ(sampler.GetNextSample(&db), Status::OK()); | |||
| int epoch = 0; | |||
| while (!db->eoe()) { | |||
| epoch++; | |||
| db->PopRow(&row); | |||
| for (const auto &t : row) { | |||
| for (auto it = t->begin<int64_t>(); it != t->end<int64_t>(); it++) { | |||
| out.push_back(*it); | |||
| } | |||
| } | |||
| db.reset(); | |||
| ASSERT_EQ(sampler.GetNextSample(&db), Status::OK()); | |||
| } | |||
| ASSERT_EQ(epoch, (total_samples + samples_per_buffer - 1) / samples_per_buffer); | |||
| ASSERT_EQ(input.size(), out.size()); | |||
| } | |||
| TEST_F(MindDataTestSubsetSampler, TestReset) { | |||
| std::vector<int64_t> in({0, 1, 2, 3, 4}); | |||
| std::unordered_set<int64_t> in_set(in.begin(), in.end()); | |||
| int64_t num_samples = 0; | |||
| SubsetSamplerRT sampler(num_samples, in); | |||
| DummyRandomAccessOp dummyRandomAccessOp(5); | |||
| sampler.HandshakeRandomAccessOp(&dummyRandomAccessOp); | |||
| std::unique_ptr<DataBuffer> db; | |||
| TensorRow row; | |||
| std::vector<int64_t> out; | |||
| ASSERT_EQ(sampler.GetNextSample(&db), Status::OK()); | |||
| db->PopRow(&row); | |||
| for (const auto &t : row) { | |||
| for (auto it = t->begin<int64_t>(); it != t->end<int64_t>(); it++) { | |||
| out.push_back(*it); | |||
| } | |||
| } | |||
| ASSERT_EQ(in.size(), out.size()); | |||
| for (int i = 0; i < in.size(); i++) { | |||
| ASSERT_EQ(in[i], out[i]); | |||
| } | |||
| sampler.ResetSampler(); | |||
| ASSERT_EQ(sampler.GetNextSample(&db), Status::OK()); | |||
| ASSERT_EQ(db->eoe(), false); | |||
| db->PopRow(&row); | |||
| out.clear(); | |||
| for (const auto &t : row) { | |||
| for (auto it = t->begin<int64_t>(); it != t->end<int64_t>(); it++) { | |||
| out.push_back(*it); | |||
| } | |||
| } | |||
| ASSERT_EQ(in.size(), out.size()); | |||
| for (int i = 0; i < in.size(); i++) { | |||
| ASSERT_EQ(in[i], out[i]); | |||
| } | |||
| ASSERT_EQ(sampler.GetNextSample(&db), Status::OK()); | |||
| ASSERT_EQ(db->eoe(), true); | |||
| } | |||
| @@ -61,6 +61,7 @@ def add_and_remove_cv_file(): | |||
| os.remove("{}".format(x)) | |||
| os.remove("{}.db".format(x)) | |||
| def test_cv_minddataset_pk_sample_no_column(add_and_remove_cv_file): | |||
| """tutorial for cv minderdataset.""" | |||
| num_readers = 4 | |||
| @@ -101,6 +102,7 @@ def test_cv_minddataset_pk_sample_basic(add_and_remove_cv_file): | |||
| "-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| def test_cv_minddataset_pk_sample_shuffle(add_and_remove_cv_file): | |||
| """tutorial for cv minderdataset.""" | |||
| columns_list = ["data", "file_name", "label"] | |||
| @@ -142,6 +144,7 @@ def test_cv_minddataset_pk_sample_shuffle_1(add_and_remove_cv_file): | |||
| num_iter += 1 | |||
| assert num_iter == 5 | |||
| def test_cv_minddataset_pk_sample_shuffle_2(add_and_remove_cv_file): | |||
| """tutorial for cv minderdataset.""" | |||
| columns_list = ["data", "file_name", "label"] | |||
| @@ -182,6 +185,7 @@ def test_cv_minddataset_pk_sample_out_of_range_0(add_and_remove_cv_file): | |||
| num_iter += 1 | |||
| assert num_iter == 15 | |||
| def test_cv_minddataset_pk_sample_out_of_range_1(add_and_remove_cv_file): | |||
| """tutorial for cv minderdataset.""" | |||
| columns_list = ["data", "file_name", "label"] | |||
| @@ -201,6 +205,7 @@ def test_cv_minddataset_pk_sample_out_of_range_1(add_and_remove_cv_file): | |||
| num_iter += 1 | |||
| assert num_iter == 15 | |||
| def test_cv_minddataset_pk_sample_out_of_range_2(add_and_remove_cv_file): | |||
| """tutorial for cv minderdataset.""" | |||
| columns_list = ["data", "file_name", "label"] | |||
| @@ -226,22 +231,23 @@ def test_cv_minddataset_subset_random_sample_basic(add_and_remove_cv_file): | |||
| columns_list = ["data", "file_name", "label"] | |||
| num_readers = 4 | |||
| indices = [1, 2, 3, 5, 7] | |||
| sampler = ds.SubsetRandomSampler(indices) | |||
| data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, | |||
| sampler=sampler) | |||
| assert data_set.get_dataset_size() == 5 | |||
| num_iter = 0 | |||
| for item in data_set.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| logger.info( | |||
| "-------------- cv reader basic: {} ------------------------".format(num_iter)) | |||
| logger.info( | |||
| "-------------- item[data]: {} -----------------------------".format(item["data"])) | |||
| logger.info( | |||
| "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) | |||
| logger.info( | |||
| "-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| assert num_iter == 5 | |||
| samplers = (ds.SubsetRandomSampler(indices), ds.SubsetSampler(indices)) | |||
| for sampler in samplers: | |||
| data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, | |||
| sampler=sampler) | |||
| assert data_set.get_dataset_size() == 5 | |||
| num_iter = 0 | |||
| for item in data_set.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| logger.info( | |||
| "-------------- cv reader basic: {} ------------------------".format(num_iter)) | |||
| logger.info( | |||
| "-------------- item[data]: {} -----------------------------".format(item["data"])) | |||
| logger.info( | |||
| "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) | |||
| logger.info( | |||
| "-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| assert num_iter == 5 | |||
| def test_cv_minddataset_subset_random_sample_replica(add_and_remove_cv_file): | |||
| @@ -249,22 +255,23 @@ def test_cv_minddataset_subset_random_sample_replica(add_and_remove_cv_file): | |||
| columns_list = ["data", "file_name", "label"] | |||
| num_readers = 4 | |||
| indices = [1, 2, 2, 5, 7, 9] | |||
| sampler = ds.SubsetRandomSampler(indices) | |||
| data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, | |||
| sampler=sampler) | |||
| assert data_set.get_dataset_size() == 6 | |||
| num_iter = 0 | |||
| for item in data_set.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| logger.info( | |||
| "-------------- cv reader basic: {} ------------------------".format(num_iter)) | |||
| logger.info( | |||
| "-------------- item[data]: {} -----------------------------".format(item["data"])) | |||
| logger.info( | |||
| "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) | |||
| logger.info( | |||
| "-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| assert num_iter == 6 | |||
| samplers = ds.SubsetRandomSampler(indices), ds.SubsetSampler(indices) | |||
| for sampler in samplers: | |||
| data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, | |||
| sampler=sampler) | |||
| assert data_set.get_dataset_size() == 6 | |||
| num_iter = 0 | |||
| for item in data_set.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| logger.info( | |||
| "-------------- cv reader basic: {} ------------------------".format(num_iter)) | |||
| logger.info( | |||
| "-------------- item[data]: {} -----------------------------".format(item["data"])) | |||
| logger.info( | |||
| "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) | |||
| logger.info( | |||
| "-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| assert num_iter == 6 | |||
| def test_cv_minddataset_subset_random_sample_empty(add_and_remove_cv_file): | |||
| @@ -272,22 +279,23 @@ def test_cv_minddataset_subset_random_sample_empty(add_and_remove_cv_file): | |||
| columns_list = ["data", "file_name", "label"] | |||
| num_readers = 4 | |||
| indices = [] | |||
| sampler = ds.SubsetRandomSampler(indices) | |||
| data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, | |||
| sampler=sampler) | |||
| assert data_set.get_dataset_size() == 0 | |||
| num_iter = 0 | |||
| for item in data_set.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| logger.info( | |||
| "-------------- cv reader basic: {} ------------------------".format(num_iter)) | |||
| logger.info( | |||
| "-------------- item[data]: {} -----------------------------".format(item["data"])) | |||
| logger.info( | |||
| "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) | |||
| logger.info( | |||
| "-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| assert num_iter == 0 | |||
| samplers = ds.SubsetRandomSampler(indices), ds.SubsetSampler(indices) | |||
| for sampler in samplers: | |||
| data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, | |||
| sampler=sampler) | |||
| assert data_set.get_dataset_size() == 0 | |||
| num_iter = 0 | |||
| for item in data_set.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| logger.info( | |||
| "-------------- cv reader basic: {} ------------------------".format(num_iter)) | |||
| logger.info( | |||
| "-------------- item[data]: {} -----------------------------".format(item["data"])) | |||
| logger.info( | |||
| "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) | |||
| logger.info( | |||
| "-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| assert num_iter == 0 | |||
| def test_cv_minddataset_subset_random_sample_out_of_range(add_and_remove_cv_file): | |||
| @@ -295,44 +303,46 @@ def test_cv_minddataset_subset_random_sample_out_of_range(add_and_remove_cv_file | |||
| columns_list = ["data", "file_name", "label"] | |||
| num_readers = 4 | |||
| indices = [1, 2, 4, 11, 13] | |||
| sampler = ds.SubsetRandomSampler(indices) | |||
| data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, | |||
| sampler=sampler) | |||
| assert data_set.get_dataset_size() == 5 | |||
| num_iter = 0 | |||
| for item in data_set.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| logger.info( | |||
| "-------------- cv reader basic: {} ------------------------".format(num_iter)) | |||
| logger.info( | |||
| "-------------- item[data]: {} -----------------------------".format(item["data"])) | |||
| logger.info( | |||
| "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) | |||
| logger.info( | |||
| "-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| assert num_iter == 5 | |||
| samplers = ds.SubsetRandomSampler(indices), ds.SubsetSampler(indices) | |||
| for sampler in samplers: | |||
| data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, | |||
| sampler=sampler) | |||
| assert data_set.get_dataset_size() == 5 | |||
| num_iter = 0 | |||
| for item in data_set.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| logger.info( | |||
| "-------------- cv reader basic: {} ------------------------".format(num_iter)) | |||
| logger.info( | |||
| "-------------- item[data]: {} -----------------------------".format(item["data"])) | |||
| logger.info( | |||
| "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) | |||
| logger.info( | |||
| "-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| assert num_iter == 5 | |||
| def test_cv_minddataset_subset_random_sample_negative(add_and_remove_cv_file): | |||
| columns_list = ["data", "file_name", "label"] | |||
| num_readers = 4 | |||
| indices = [1, 2, 4, -1, -2] | |||
| sampler = ds.SubsetRandomSampler(indices) | |||
| data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, | |||
| sampler=sampler) | |||
| assert data_set.get_dataset_size() == 5 | |||
| num_iter = 0 | |||
| for item in data_set.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| logger.info( | |||
| "-------------- cv reader basic: {} ------------------------".format(num_iter)) | |||
| logger.info( | |||
| "-------------- item[data]: {} -----------------------------".format(item["data"])) | |||
| logger.info( | |||
| "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) | |||
| logger.info( | |||
| "-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| assert num_iter == 5 | |||
| samplers = ds.SubsetRandomSampler(indices), ds.SubsetSampler(indices) | |||
| for sampler in samplers: | |||
| data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, | |||
| sampler=sampler) | |||
| assert data_set.get_dataset_size() == 5 | |||
| num_iter = 0 | |||
| for item in data_set.create_dict_iterator(num_epochs=1, output_numpy=True): | |||
| logger.info( | |||
| "-------------- cv reader basic: {} ------------------------".format(num_iter)) | |||
| logger.info( | |||
| "-------------- item[data]: {} -----------------------------".format(item["data"])) | |||
| logger.info( | |||
| "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) | |||
| logger.info( | |||
| "-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| num_iter += 1 | |||
| assert num_iter == 5 | |||
| def test_cv_minddataset_random_sampler_basic(add_and_remove_cv_file): | |||
| @@ -359,6 +369,7 @@ def test_cv_minddataset_random_sampler_basic(add_and_remove_cv_file): | |||
| assert num_iter == 10 | |||
| assert new_dataset != [x['file_name'] for x in data] | |||
| def test_cv_minddataset_random_sampler_repeat(add_and_remove_cv_file): | |||
| columns_list = ["data", "file_name", "label"] | |||
| num_readers = 4 | |||
| @@ -392,6 +403,7 @@ def test_cv_minddataset_random_sampler_repeat(add_and_remove_cv_file): | |||
| assert epoch2_dataset not in (epoch1_dataset, epoch3_dataset) | |||
| assert epoch3_dataset not in (epoch1_dataset, epoch2_dataset) | |||
| def test_cv_minddataset_random_sampler_replacement(add_and_remove_cv_file): | |||
| columns_list = ["data", "file_name", "label"] | |||
| num_readers = 4 | |||
| @@ -412,6 +424,7 @@ def test_cv_minddataset_random_sampler_replacement(add_and_remove_cv_file): | |||
| num_iter += 1 | |||
| assert num_iter == 5 | |||
| def test_cv_minddataset_random_sampler_replacement_false_1(add_and_remove_cv_file): | |||
| columns_list = ["data", "file_name", "label"] | |||
| num_readers = 4 | |||
| @@ -432,6 +445,7 @@ def test_cv_minddataset_random_sampler_replacement_false_1(add_and_remove_cv_fil | |||
| num_iter += 1 | |||
| assert num_iter == 2 | |||
| def test_cv_minddataset_random_sampler_replacement_false_2(add_and_remove_cv_file): | |||
| columns_list = ["data", "file_name", "label"] | |||
| num_readers = 4 | |||
| @@ -472,7 +486,7 @@ def test_cv_minddataset_sequential_sampler_basic(add_and_remove_cv_file): | |||
| logger.info( | |||
| "-------------- item[label]: {} ----------------------------".format(item["label"])) | |||
| assert item['file_name'] == np.array( | |||
| data[num_iter+1]['file_name'], dtype='S') | |||
| data[num_iter + 1]['file_name'], dtype='S') | |||
| num_iter += 1 | |||
| assert num_iter == 4 | |||
| @@ -501,6 +515,7 @@ def test_cv_minddataset_sequential_sampler_offeset(add_and_remove_cv_file): | |||
| num_iter += 1 | |||
| assert num_iter == 10 | |||
| def test_cv_minddataset_sequential_sampler_exceed_size(add_and_remove_cv_file): | |||
| data = get_data(CV_DIR_NAME, True) | |||
| columns_list = ["data", "file_name", "label"] | |||
| @@ -671,7 +686,7 @@ def test_cv_minddataset_split_deterministic(add_and_remove_cv_file): | |||
| num_iter += 1 | |||
| assert num_iter == 2 | |||
| inter_dataset = [x for x in d1_dataset if x in d2_dataset] | |||
| assert inter_dataset == [] # intersection of d1 and d2 | |||
| assert inter_dataset == [] # intersection of d1 and d2 | |||
| def test_cv_minddataset_split_sharding(add_and_remove_cv_file): | |||
| @@ -731,7 +746,7 @@ def test_cv_minddataset_split_sharding(add_and_remove_cv_file): | |||
| assert len(epoch2_dataset) == 4 | |||
| assert len(epoch3_dataset) == 4 | |||
| inter_dataset = [x for x in d1_shard1 if x in epoch1_dataset] | |||
| assert inter_dataset == [] # intersection of d1's shard1 and d1's shard2 | |||
| assert inter_dataset == [] # intersection of d1's shard1 and d1's shard2 | |||
| assert epoch1_dataset not in (epoch2_dataset, epoch3_dataset) | |||
| assert epoch2_dataset not in (epoch1_dataset, epoch3_dataset) | |||
| assert epoch3_dataset not in (epoch1_dataset, epoch2_dataset) | |||
| @@ -777,6 +792,7 @@ def get_data(dir_name, sampler=False): | |||
| continue | |||
| return data_list | |||
| if __name__ == '__main__': | |||
| test_cv_minddataset_pk_sample_no_column(add_and_remove_cv_file) | |||
| test_cv_minddataset_pk_sample_basic(add_and_remove_cv_file) | |||
| @@ -165,7 +165,7 @@ def test_python_sampler(): | |||
| assert list(sp1.get_indices()) == [0, 1, 2, 3, 4] | |||
| def test_subset_sampler(): | |||
| def test_sequential_sampler2(): | |||
| manifest_file = "../data/dataset/testManifestData/test5trainimgs.json" | |||
| map_ = {(172876, 0): 0, (54214, 0): 1, (54214, 1): 2, (173673, 0): 3, (64631, 1): 4} | |||
| @@ -191,6 +191,48 @@ def test_subset_sampler(): | |||
| assert test_config(4, None) == [4] | |||
def test_subset_sampler():
    """Exercise ds.SubsetSampler over a 10-element NumpySlicesDataset.

    Verifies that:
      - the sampler yields exactly the requested indices, in order;
      - num_samples truncates the index list (and may exceed its length);
      - get_dataset_size agrees with the number of yielded samples;
      - out-of-bound indices and num_samples=0 raise with the expected message.
    """
    def test_config(indices, num_samples=None, exception_msg=None):
        # Helper: run one pipeline; assert results when exception_msg is None,
        # otherwise assert that the pipeline raises with the given message.
        def pipeline():
            sampler = ds.SubsetSampler(indices, num_samples)
            data = ds.NumpySlicesDataset(list(range(0, 10)), sampler=sampler)
            dataset_size = data.get_dataset_size()
            return [d[0] for d in data.create_tuple_iterator(num_epochs=1, output_numpy=True)], dataset_size

        if exception_msg is None:
            res, size = pipeline()
            # indices[:None] is the whole list, so this also covers num_samples=None.
            assert indices[:num_samples] == res
            assert len(indices[:num_samples]) == size
        else:
            with pytest.raises(Exception) as error_info:
                pipeline()
            print(str(error_info))
            # Check the message on the raised exception itself (error_info.value),
            # not on the ExceptionInfo wrapper whose repr may truncate long messages.
            assert exception_msg in str(error_info.value)

    test_config([1, 2, 3])
    test_config(list(range(10)))
    test_config([0])
    test_config([9])
    test_config(list(range(0, 10, 2)))
    test_config(list(range(1, 10, 2)))
    test_config(list(range(9, 0, -1)))
    test_config(list(range(9, 0, -2)))
    test_config(list(range(8, 0, -2)))
    test_config([0, 9, 3, 2])
    test_config([0, 0, 0, 0])
    test_config([0])
    test_config([0, 9, 3, 2], num_samples=2)
    test_config([0, 9, 3, 2], num_samples=5)

    test_config([20], exception_msg="Sample ID (20) is out of bound, expected range [0, 9]")
    test_config([10], exception_msg="Sample ID (10) is out of bound, expected range [0, 9]")
    test_config([0, 9, 0, 500], exception_msg="Sample ID (500) is out of bound, expected range [0, 9]")
    test_config([0, 9, -6, 2], exception_msg="Sample ID (-6) is out of bound, expected range [0, 9]")
    # test_config([], exception_msg="Indices list is empty") # temporary until we check with MindDataset
    test_config([0, 9, 3, 2], num_samples=0,
                exception_msg="num_samples should be a positive integer value, but got num_samples: 0.")
| def test_sampler_chain(): | |||
| manifest_file = "../data/dataset/testManifestData/test5trainimgs.json" | |||
| map_ = {(172876, 0): 0, (54214, 0): 1, (54214, 1): 2, (173673, 0): 3, (64631, 1): 4} | |||
| @@ -249,6 +291,7 @@ if __name__ == '__main__': | |||
| test_random_sampler_multi_iter(True) | |||
| test_sampler_py_api() | |||
| test_python_sampler() | |||
| test_sequential_sampler2() | |||
| test_subset_sampler() | |||
| test_sampler_chain() | |||
| test_add_sampler_invalid_input() | |||