/** * Copyright 2020-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_SAMPLERS_H_ #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_SAMPLERS_H_ #include #include // FIXME - This internal IR header will be removed when external API classes are provided #include "minddata/dataset/engine/ir/datasetops/source/samplers/samplers_ir.h" namespace mindspore { namespace dataset { class DistributedSamplerObj; class PKSamplerObj; class PreBuiltSamplerObj; class RandomSamplerObj; class SequentialSamplerObj; class SubsetSamplerObj; class SubsetRandomSamplerObj; class WeightedRandomSamplerObj; /// Function to create a Distributed Sampler. /// \notes A Sampler that access a shard of the dataset. /// \param[in] num_shards - Number of shards to divide the dataset into. /// \param[in] shard_id - Shard ID of the current shard within num_shards. /// \param[in] shuffle - If true, the indices are shuffled. /// \param[in] num_samples - The number of samples to draw (default to all elements). /// \param[in] seed - The seed in use when shuffle is true. /// \param[in] offset - The starting position where access to elements in the dataset begins. /// \param[in] even_dist - If true, each shard would return the same number of rows (default to true). /// If false the total rows returned by all the shards would not have overlap. /// \return Shared pointer to the current Sampler. std::shared_ptr DistributedSampler(int64_t num_shards, int64_t shard_id, bool shuffle = true, int64_t num_samples = 0, uint32_t seed = 1, int64_t offset = -1, bool even_dist = true); /// Function to create a PK Sampler. /// \notes Samples K elements for each P class in the dataset. /// This will sample all classes. /// \param[in] num_val - Number of elements to sample for each class. /// \param[in] shuffle - If true, the class IDs are shuffled. /// \param[in] num_samples - The number of samples to draw (default to all elements). /// \return Shared pointer to the current Sampler. std::shared_ptr PKSampler(int64_t num_val, bool shuffle = false, int64_t num_samples = 0); /// Function to create a Random Sampler. /// \notes Samples the elements randomly. /// \param[in] replacement - If true, put the sample ID back for the next draw. /// \param[in] num_samples - The number of samples to draw (default to all elements). /// \return Shared pointer to the current Sampler. std::shared_ptr RandomSampler(bool replacement = false, int64_t num_samples = 0); /// Function to create a Sequential Sampler. /// \notes Samples the dataset elements sequentially, same as not having a sampler. /// \param[in] start_index - Index to start sampling at (default to start at first id). /// \param[in] num_samples - The number of samples to draw (default to all elements). /// \return Shared pointer to the current Sampler. std::shared_ptr SequentialSampler(int64_t start_index = 0, int64_t num_samples = 0); /// Function to create a Subset Sampler. /// \notes Samples the elements from a sequence of indices. /// \param[in] indices - A vector sequence of indices. /// \param[in] num_samples - The number of samples to draw (default to all elements). /// \return Shared pointer to the current Sampler. std::shared_ptr SubsetSampler(std::vector indices, int64_t num_samples = 0); /// Function to create a Subset Random Sampler. /// \notes Samples the elements randomly from a sequence of indices. /// \param[in] indices - A vector sequence of indices. /// \param[in] num_samples - The number of samples to draw (default to all elements). /// \return Shared pointer to the current Sampler. std::shared_ptr SubsetRandomSampler(std::vector indices, int64_t num_samples = 0); /// Function to create a Weighted Random Sampler. /// \notes Samples the elements from [0, len(weights) - 1] randomly with the given /// weights (probabilities). /// \param[in] weights - A vector sequence of weights, not necessarily summing up to 1. /// \param[in] num_samples - The number of samples to draw (default to all elements). /// \param[in] replacement - If true, put the sample ID back for the next draw. /// \return Shared pointer to the current Sampler. std::shared_ptr WeightedRandomSampler(std::vector weights, int64_t num_samples = 0, bool replacement = true); } // namespace dataset } // namespace mindspore #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_SAMPLERS_H_