Browse Source

[feat][assistant][I3J6V7] add new data operator KMnist

tags/v1.6.0
“uccInf” 4 years ago
parent
commit
92d6d31750
15 changed files with 1213 additions and 0 deletions
  1. +23
    -0
      mindspore/ccsrc/minddata/dataset/api/datasets.cc
  2. +11
    -0
      mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/source/bindings.cc
  3. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/CMakeLists.txt
  4. +80
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/kmnist_op.cc
  5. +65
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/kmnist_op.h
  6. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h
  7. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/CMakeLists.txt
  8. +114
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/kmnist_node.cc
  9. +101
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/kmnist_node.h
  10. +77
    -0
      mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h
  11. +1
    -0
      mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h
  12. +121
    -0
      mindspore/dataset/engine/datasets.py
  13. +1
    -0
      tests/ut/cpp/dataset/CMakeLists.txt
  14. +287
    -0
      tests/ut/cpp/dataset/c_api_dataset_kmnist_test.cc
  15. +329
    -0
      tests/ut/python/dataset/test_datasets_kmnist.py

+ 23
- 0
mindspore/ccsrc/minddata/dataset/api/datasets.cc View File

@@ -103,6 +103,7 @@
#include "minddata/dataset/engine/ir/datasetops/source/fashion_mnist_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/flickr_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/kmnist_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/lj_speech_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/manifest_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/minddata_node.h"
@@ -1212,6 +1213,28 @@ ImageFolderDataset::ImageFolderDataset(const std::vector<char> &dataset_dir, boo
ir_node_ = std::static_pointer_cast<DatasetNode>(ds);
}

// Constructor taking a shared sampler: lowers the API-level sampler to its IR
// form and wraps everything into a KMnistNode IR node.
KMnistDataset::KMnistDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
                             const std::shared_ptr<Sampler> &sampler, const std::shared_ptr<DatasetCache> &cache) {
  // A null sampler is forwarded as a null IR sampler (the node applies its default).
  std::shared_ptr<SamplerObj> sampler_ir = (sampler == nullptr) ? nullptr : sampler->Parse();
  ir_node_ = std::static_pointer_cast<DatasetNode>(
    std::make_shared<KMnistNode>(CharToString(dataset_dir), CharToString(usage), sampler_ir, cache));
}

// Constructor taking a raw (non-owning) sampler pointer.
KMnistDataset::KMnistDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
                             const Sampler *sampler, const std::shared_ptr<DatasetCache> &cache) {
  // A null sampler is forwarded as a null IR sampler (the node applies its default).
  std::shared_ptr<SamplerObj> sampler_ir = (sampler == nullptr) ? nullptr : sampler->Parse();
  ir_node_ = std::static_pointer_cast<DatasetNode>(
    std::make_shared<KMnistNode>(CharToString(dataset_dir), CharToString(usage), sampler_ir, cache));
}

// Constructor taking a sampler by reference_wrapper.
KMnistDataset::KMnistDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
                             const std::reference_wrapper<Sampler> sampler,
                             const std::shared_ptr<DatasetCache> &cache) {
  // A reference_wrapper can never be null, so Parse() is invoked unconditionally.
  std::shared_ptr<SamplerObj> sampler_ir = sampler.get().Parse();
  ir_node_ = std::static_pointer_cast<DatasetNode>(
    std::make_shared<KMnistNode>(CharToString(dataset_dir), CharToString(usage), sampler_ir, cache));
}

LJSpeechDataset::LJSpeechDataset(const std::vector<char> &dataset_dir, const std::shared_ptr<Sampler> &sampler,
const std::shared_ptr<DatasetCache> &cache) {
auto sampler_obj = sampler ? sampler->Parse() : nullptr;


+ 11
- 0
mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/source/bindings.cc View File

@@ -41,6 +41,7 @@
#include "minddata/dataset/engine/ir/datasetops/source/flickr_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/generator_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/kmnist_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/mnist_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/random_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/speech_commands_node.h"
@@ -267,6 +268,16 @@ PYBIND_REGISTER(ImageFolderNode, 2, ([](const py::module *m) {
}));
}));

// Register the KMnist IR node with the Python layer. Parameters are validated
// eagerly inside the init factory so invalid inputs fail at construction time.
PYBIND_REGISTER(KMnistNode, 2, ([](const py::module *m) {
                  (void)py::class_<KMnistNode, DatasetNode, std::shared_ptr<KMnistNode>>(*m, "KMnistNode",
                                                                                        "to create a KMnistNode")
                    .def(py::init([](std::string dataset_dir, std::string usage, py::handle sampler) {
                      // The Python side never passes a cache here; caching is wired up separately.
                      auto kmnist = std::make_shared<KMnistNode>(dataset_dir, usage, toSamplerObj(sampler), nullptr);
                      THROW_IF_ERROR(kmnist->ValidateParams());
                      return kmnist;
                    }));
                }));

PYBIND_REGISTER(LJSpeechNode, 2, ([](const py::module *m) {
(void)py::class_<LJSpeechNode, DatasetNode, std::shared_ptr<LJSpeechNode>>(*m, "LJSpeechNode",
"to create a LJSpeechNode")


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/CMakeLists.txt View File

@@ -19,6 +19,7 @@ set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES
flickr_op.cc
image_folder_op.cc
io_block.cc
kmnist_op.cc
lj_speech_op.cc
mappable_leaf_op.cc
mnist_op.cc


+ 80
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/kmnist_op.cc View File

@@ -0,0 +1,80 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/engine/datasetops/source/kmnist_op.h"

#include <fstream>
#include <iomanip>

#include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/core/tensor_shape.h"
#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
#include "minddata/dataset/engine/execution_tree.h"
#include "utils/ms_utils.h"

namespace mindspore {
namespace dataset {
// KMnistOp delegates entirely to MnistOp: the on-disk idx file format is the
// same, only the dataset name differs (see DatasetName() in the header).
KMnistOp::KMnistOp(const std::string &usage, int32_t num_workers, const std::string &folder_path, int32_t queue_size,
                   std::unique_ptr<DataSchema> data_schema, std::shared_ptr<SamplerRT> sampler)
    : MnistOp(usage, num_workers, folder_path, queue_size, std::move(data_schema), std::move(sampler)) {}

Status KMnistOp::CountTotalRows(const std::string &dir, const std::string &usage, int64_t *count) {
  // Counts samples by building a throwaway KMnistOp, walking every image/label
  // file pair for the given usage, and summing the counts reported by the file
  // headers (via CheckImage()/CheckLabel() inherited from MnistOp).
  RETURN_UNEXPECTED_IF_NULL(count);
  *count = 0;

  // num_samples == 0 is the "read everything" convention for SequentialSamplerRT.
  const int64_t num_samples = 0;
  const int64_t start_index = 0;
  auto sampler = std::make_shared<SequentialSamplerRT>(start_index, num_samples);
  // Schema mirrors the one generated in KMnistNode::Build(): rank-1 uint8 image,
  // scalar uint32 label.
  auto schema = std::make_unique<DataSchema>();
  RETURN_IF_NOT_OK(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1)));
  TensorShape scalar = TensorShape::CreateScalar();
  RETURN_IF_NOT_OK(
    schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar)));
  std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
  int32_t num_workers = cfg->num_parallel_workers();
  int32_t op_connect_size = cfg->op_connector_size();
  auto op = std::make_shared<KMnistOp>(usage, num_workers, dir, op_connect_size, std::move(schema), std::move(sampler));

  // WalkAllFiles() populates image_names_ / label_names_ for the chosen usage.
  RETURN_IF_NOT_OK(op->WalkAllFiles());

  for (size_t i = 0; i < op->image_names_.size(); ++i) {
    std::ifstream image_reader;
    image_reader.open(op->image_names_[i], std::ios::binary);
    CHECK_FAIL_RETURN_UNEXPECTED(image_reader.is_open(),
                                 "Invalid file, failed to open image file: " + op->image_names_[i]);
    std::ifstream label_reader;
    label_reader.open(op->label_names_[i], std::ios::binary);
    CHECK_FAIL_RETURN_UNEXPECTED(label_reader.is_open(),
                                 "Invalid file, failed to open label file: " + op->label_names_[i]);
    // CheckImage()/CheckLabel() presumably validate the idx headers and return
    // the entry counts recorded there (defined in MnistOp; not visible here).
    // Readers are closed before propagating any error so no handle leaks.
    uint32_t num_images;
    Status s = op->CheckImage(op->image_names_[i], &image_reader, &num_images);
    image_reader.close();
    RETURN_IF_NOT_OK(s);

    uint32_t num_labels;
    s = op->CheckLabel(op->label_names_[i], &label_reader, &num_labels);
    label_reader.close();
    RETURN_IF_NOT_OK(s);

    // Every image must pair with exactly one label.
    CHECK_FAIL_RETURN_UNEXPECTED((num_images == num_labels),
                                 "Invalid data, num of images is not equal to num of labels.");
    *count = *count + num_images;
  }

  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 65
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/kmnist_op.h View File

@@ -0,0 +1,65 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_KMNIST_OP_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_KMNIST_OP_H_

#include <algorithm>
#include <memory>
#include <string>
#include <utility>

#include "minddata/dataset/engine/datasetops/source/mnist_op.h"

namespace mindspore {
namespace dataset {
/// \brief Forward declares.
template <typename T>
class Queue;

class KMnistOp : public MnistOp {
 public:
  /// \brief Constructor.
  /// \param[in] usage Usage of this dataset, can be 'train', 'test' or 'all'.
  /// \param[in] num_workers Number of workers reading images in parallel.
  /// \param[in] folder_path Directory of the kmnist dataset.
  /// \param[in] queue_size Connector queue size.
  /// \param[in] data_schema The schema of the kmnist dataset.
  /// \param[in] sampler Tells KMnistOp what to read.
  KMnistOp(const std::string &usage, int32_t num_workers, const std::string &folder_path, int32_t queue_size,
           std::unique_ptr<DataSchema> data_schema, std::shared_ptr<SamplerRT> sampler);

  /// \brief Destructor.
  ~KMnistOp() = default;

  /// \brief Function to count the number of samples in the KMNIST dataset.
  /// \param[in] dir Path to the KMNIST directory.
  /// \param[in] usage Usage of this dataset, can be 'train', 'test' or 'all'.
  /// \param[out] count Output arg that will hold the minimum of the actual dataset size and numSamples.
  /// \return Status The status code returned.
  static Status CountTotalRows(const std::string &dir, const std::string &usage, int64_t *count);

  /// \brief Op name getter.
  /// \return Name of the current Op.
  std::string Name() const override { return "KMnistOp"; }

  /// \brief Dataset name getter.
  /// \param[in] upper Whether to get upper name.
  /// \return Dataset name of the current Op.
  std::string DatasetName(bool upper = false) const override { return upper ? "KMnist" : "kmnist"; }
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_KMNIST_OP_H_

+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h View File

@@ -93,6 +93,7 @@ constexpr char kFashionMnistNode[] = "FashionMnistDataset";
constexpr char kFlickrNode[] = "FlickrDataset";
constexpr char kGeneratorNode[] = "GeneratorDataset";
constexpr char kImageFolderNode[] = "ImageFolderDataset";
constexpr char kKMnistNode[] = "KMnistDataset";
constexpr char kLJSpeechNode[] = "LJSpeechDataset";
constexpr char kManifestNode[] = "ManifestDataset";
constexpr char kMindDataNode[] = "MindDataDataset";


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/CMakeLists.txt View File

@@ -19,6 +19,7 @@ set(DATASET_ENGINE_IR_DATASETOPS_SOURCE_SRC_FILES
fashion_mnist_node.cc
flickr_node.cc
image_folder_node.cc
kmnist_node.cc
lj_speech_node.cc
manifest_node.cc
minddata_node.cc


+ 114
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/kmnist_node.cc View File

@@ -0,0 +1,114 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "minddata/dataset/engine/ir/datasetops/source/kmnist_node.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "minddata/dataset/engine/datasetops/source/kmnist_op.h"
#include "minddata/dataset/util/status.h"

namespace mindspore {
namespace dataset {
// Constructor: only stores configuration; the runtime op is created in Build().
KMnistNode::KMnistNode(const std::string &dataset_dir, const std::string &usage, std::shared_ptr<SamplerObj> sampler,
                       std::shared_ptr<DatasetCache> cache)
    : MappableSourceNode(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {}

std::shared_ptr<DatasetNode> KMnistNode::Copy() {
std::shared_ptr<SamplerObj> sampler = (sampler_ == nullptr) ? nullptr : sampler_->SamplerCopy();
auto node = std::make_shared<KMnistNode>(dataset_dir_, usage_, sampler, cache_);
return node;
}

// Print only the node name; full arguments are available through to_json().
void KMnistNode::Print(std::ostream &out) const { out << Name(); }

Status KMnistNode::ValidateParams() {
  // Common DatasetNode checks run first; the first failing check determines the
  // error the caller sees, so the order below is part of the observable behavior.
  RETURN_IF_NOT_OK(DatasetNode::ValidateParams());
  // Validate the dataset directory path.
  RETURN_IF_NOT_OK(ValidateDatasetDirParam("KMnistNode", dataset_dir_));

  RETURN_IF_NOT_OK(ValidateDatasetSampler("KMnistNode", sampler_));

  // Only the three official splits are accepted.
  RETURN_IF_NOT_OK(ValidateStringValue("KMnistNode", usage_, {"train", "test", "all"}));

  return Status::OK();
}

Status KMnistNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) {
  // Do internal Schema generation.
  // Column "image": rank-1 uint8 tensor; column "label": uint32 scalar.
  auto schema = std::make_unique<DataSchema>();
  RETURN_IF_NOT_OK(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1)));
  TensorShape scalar = TensorShape::CreateScalar();
  RETURN_IF_NOT_OK(
    schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar)));
  // Lower the IR sampler into its runtime counterpart.
  std::shared_ptr<SamplerRT> sampler_rt = nullptr;
  RETURN_IF_NOT_OK(sampler_->SamplerBuild(&sampler_rt));

  auto op = std::make_shared<KMnistOp>(usage_, num_workers_, dataset_dir_, connector_que_size_, std::move(schema),
                                       std::move(sampler_rt));
  // Propagate repeat bookkeeping so the runtime op reports epochs correctly.
  op->SetTotalRepeats(GetTotalRepeats());
  op->SetNumRepeatsPerEpoch(GetNumRepeatsPerEpoch());
  node_ops->push_back(op);

  return Status::OK();
}

// Get the shard id of node.
Status KMnistNode::GetShardId(int32_t *shard_id) {
  // NOTE(review): assumes sampler_ is non-null here — ValidateDatasetSampler()
  // in ValidateParams() presumably guarantees this; confirm the call ordering.
  *shard_id = sampler_->ShardId();

  return Status::OK();
}

// Get Dataset size.
Status KMnistNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                                  int64_t *dataset_size) {
  // Return the cached value when the size was computed previously.
  if (dataset_size_ > 0) {
    *dataset_size = dataset_size_;
    return Status::OK();
  }
  int64_t num_rows, sample_size;
  // Count the raw rows on disk, then let the sampler shrink that number.
  RETURN_IF_NOT_OK(KMnistOp::CountTotalRows(dataset_dir_, usage_, &num_rows));
  std::shared_ptr<SamplerRT> sampler_rt = nullptr;
  RETURN_IF_NOT_OK(sampler_->SamplerBuild(&sampler_rt));
  sample_size = sampler_rt->CalculateNumSamples(num_rows);
  if (sample_size == -1) {
    // -1 means the sampler cannot determine the size statically, so fall back
    // to a dry run of the pipeline.
    RETURN_IF_NOT_OK(size_getter->DryRun(shared_from_this(), &sample_size));
  }
  *dataset_size = sample_size;
  dataset_size_ = *dataset_size;  // Cache for subsequent calls.
  return Status::OK();
}

Status KMnistNode::to_json(nlohmann::json *out_json) {
  // Serialize every constructor argument so the node can be rebuilt from JSON.
  nlohmann::json args, sampler_args;
  RETURN_IF_NOT_OK(sampler_->to_json(&sampler_args));
  args["sampler"] = sampler_args;
  args["num_parallel_workers"] = num_workers_;
  args["dataset_dir"] = dataset_dir_;
  args["usage"] = usage_;
  // The cache entry is emitted only when a cache is configured.
  if (cache_ != nullptr) {
    nlohmann::json cache_args;
    RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
    args["cache"] = cache_args;
  }
  *out_json = args;
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 101
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/kmnist_node.h View File

@@ -0,0 +1,101 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_KMNIST_NODE_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_KMNIST_NODE_H_

#include <memory>
#include <string>
#include <vector>

#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"

namespace mindspore {
namespace dataset {
class KMnistNode : public MappableSourceNode {
 public:
  /// \brief Constructor.
  /// \param[in] dataset_dir Dataset directory of kmnist.
  /// \param[in] usage Usage of this dataset, can be 'train', 'test' or 'all'.
  /// \param[in] sampler Tells KMnistOp what to read.
  /// \param[in] cache Tensor cache to use.
  KMnistNode(const std::string &dataset_dir, const std::string &usage, std::shared_ptr<SamplerObj> sampler,
             std::shared_ptr<DatasetCache> cache);

  /// \brief Destructor.
  ~KMnistNode() = default;

  /// \brief Node name getter.
  /// \return Name of the current node.
  std::string Name() const override { return kKMnistNode; }

  /// \brief Print the description.
  /// \param[in] out The output stream to write output to.
  void Print(std::ostream &out) const override;

  /// \brief Copy the node to a new object.
  /// \return A shared pointer to the new copy.
  std::shared_ptr<DatasetNode> Copy() override;

  /// \brief a base class override function to create the required runtime dataset op objects for this class.
  /// \param[out] node_ops A vector containing shared pointer to the Dataset Ops that this object will create.
  /// \return Status Status::OK() if build successfully.
  Status Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) override;

  /// \brief Parameters validation.
  /// \return Status Status::OK() if all the parameters are valid.
  Status ValidateParams() override;

  /// \brief Get the shard id of node.
  /// \param[out] shard_id The shard id.
  /// \return Status Status::OK() if get shard id successfully.
  Status GetShardId(int32_t *shard_id) override;

  /// \brief Base-class override for GetDatasetSize.
  /// \param[in] size_getter Shared pointer to DatasetSizeGetter.
  /// \param[in] estimate This is only supported by some of the ops and it's used to speed up the process of getting
  ///     dataset size at the expense of accuracy.
  /// \param[out] dataset_size the size of the dataset.
  /// \return Status of the function.
  Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                        int64_t *dataset_size) override;

  /// \brief Getter of the dataset directory.
  const std::string &DatasetDir() const { return dataset_dir_; }

  /// \brief Getter of the dataset usage ('train', 'test' or 'all').
  const std::string &Usage() const { return usage_; }

  /// \brief Get the arguments of node.
  /// \param[out] out_json JSON string of all attributes.
  /// \return Status of the function.
  Status to_json(nlohmann::json *out_json) override;

  /// \brief Sampler getter.
  /// \return SamplerObj of the current node.
  std::shared_ptr<SamplerObj> Sampler() override { return sampler_; }

  /// \brief Sampler setter.
  void SetSampler(std::shared_ptr<SamplerObj> sampler) override { sampler_ = sampler; }

 private:
  std::string dataset_dir_;  // Root directory of the KMNIST files.
  std::string usage_;        // One of 'train', 'test' or 'all'.
  std::shared_ptr<SamplerObj> sampler_;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_KMNIST_NODE_H_

+ 77
- 0
mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h View File

@@ -2510,6 +2510,83 @@ inline std::shared_ptr<ImageFolderDataset> MS_API ImageFolder(const std::string
MapStringToChar(class_indexing), cache);
}

/// \class KMnistDataset.
/// \brief A source dataset for reading and parsing KMnist dataset.
class MS_API KMnistDataset : public Dataset {
 public:
  /// \brief Constructor of KMnistDataset.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] usage Usage of KMNIST, can be "train", "test" or "all".
  /// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not
  ///     given, a `RandomSampler` will be used to randomly iterate the entire dataset.
  /// \param[in] cache Tensor cache to use.
  explicit KMnistDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
                         const std::shared_ptr<Sampler> &sampler, const std::shared_ptr<DatasetCache> &cache);

  /// \brief Constructor of KMnistDataset.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] usage Usage of KMnist, can be "train", "test" or "all".
  /// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset.
  /// \param[in] cache Tensor cache to use.
  explicit KMnistDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage, const Sampler *sampler,
                         const std::shared_ptr<DatasetCache> &cache);

  /// \brief Constructor of KMnistDataset.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] usage Usage of KMnist, can be "train", "test" or "all".
  /// \param[in] sampler Sampler object used to choose samples from the dataset.
  /// \param[in] cache Tensor cache to use.
  explicit KMnistDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
                         const std::reference_wrapper<Sampler> sampler, const std::shared_ptr<DatasetCache> &cache);

  /// \brief Destructor of KMnistDataset.
  ~KMnistDataset() = default;
};

/// \brief Function to create a KMnistDataset.
/// \note The generated dataset has two columns ["image", "label"].
/// \param[in] dataset_dir Path to the root directory that contains the dataset.
/// \param[in] usage Usage of KMNIST, can be "train", "test" or "all" (default = "all").
/// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not
///     given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()).
/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used).
/// \return Shared pointer to the current KMnistDataset.
inline std::shared_ptr<KMnistDataset> MS_API
KMnist(const std::string &dataset_dir, const std::string &usage = "all",
       const std::shared_ptr<Sampler> &sampler = std::make_shared<RandomSampler>(),
       const std::shared_ptr<DatasetCache> &cache = nullptr) {
  return std::make_shared<KMnistDataset>(StringToChar(dataset_dir), StringToChar(usage), sampler, cache);
}

/// \brief Function to create a KMnistDataset.
/// \note The generated dataset has two columns ["image", "label"].
/// \param[in] dataset_dir Path to the root directory that contains the dataset.
/// \param[in] usage Usage of KMnist, can be "train", "test" or "all".
/// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset.
/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used).
/// \return Shared pointer to the current KMnistDataset.
inline std::shared_ptr<KMnistDataset> MS_API KMnist(const std::string &dataset_dir, const std::string &usage,
                                                    const Sampler *sampler,
                                                    const std::shared_ptr<DatasetCache> &cache = nullptr) {
  return std::make_shared<KMnistDataset>(StringToChar(dataset_dir), StringToChar(usage), sampler, cache);
}

/// \brief Function to create a KMnistDataset.
/// \note The generated dataset has two columns ["image", "label"].
/// \param[in] dataset_dir Path to the root directory that contains the dataset.
/// \param[in] usage Usage of KMnist, can be "train", "test" or "all".
/// \param[in] sampler Sampler object used to choose samples from the dataset.
/// \param[in] cache Tensor cache to use. (default=nullptr which means no cache is used).
/// \return Shared pointer to the current KMnistDataset.
inline std::shared_ptr<KMnistDataset> MS_API KMnist(const std::string &dataset_dir, const std::string &usage,
                                                    const std::reference_wrapper<Sampler> sampler,
                                                    const std::shared_ptr<DatasetCache> &cache = nullptr) {
  return std::make_shared<KMnistDataset>(StringToChar(dataset_dir), StringToChar(usage), sampler, cache);
}

/// \class LJSpeechDataset
/// \brief A source dataset for reading and parsing LJSpeech dataset.
class MS_API LJSpeechDataset : public Dataset {


+ 1
- 0
mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h View File

@@ -46,6 +46,7 @@ class MS_API Sampler : std::enable_shared_from_this<Sampler> {
friend class FashionMnistDataset;
friend class FlickrDataset;
friend class ImageFolderDataset;
friend class KMnistDataset;
friend class LJSpeechDataset;
friend class ManifestDataset;
friend class MindDataDataset;


+ 121
- 0
mindspore/dataset/engine/datasets.py View File

@@ -3706,6 +3706,127 @@ class ImageFolderDataset(MappableDataset):
return cde.ImageFolderNode(self.dataset_dir, self.decode, self.sampler, self.extensions, self.class_indexing)


class KMnistDataset(MappableDataset):
    """
    A source dataset for reading and parsing the KMNIST dataset.

    The generated dataset has two columns :py:obj:`[image, label]`.
    The tensor of column :py:obj:`image` is of the uint8 type.
    The tensor of column :py:obj:`label` is a scalar of the uint32 type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        usage (str, optional): Usage of this dataset, can be `train`, `test` or `all`. `train` will read from 60,000
            train samples, `test` will read from 10,000 test samples, `all` will read from all 70,000 samples.
            (default=None, will read all samples)
        num_samples (int, optional): The number of images to be included in the dataset
            (default=None, will read all images).
        num_parallel_workers (int, optional): Number of workers to read the data
            (default=None, will use value set in the config).
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset
            (default=None, expected order behavior shown in the table).
        sampler (Sampler, optional): Object used to choose samples from the
            dataset (default=None, expected order behavior shown in the table).
        num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
            When this argument is specified, `num_samples` reflects the maximum sample number per shard.
        shard_id (int, optional): The shard ID within `num_shards` (default=None). This
            argument can only be specified when `num_shards` is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
            (default=None, which means no cache is used).

    Raises:
        RuntimeError: If `dataset_dir` does not contain data files.
        RuntimeError: If `num_parallel_workers` exceeds the max thread numbers.
        RuntimeError: If `sampler` and `shuffle` are specified at the same time.
        RuntimeError: If `sampler` and sharding are specified at the same time.
        RuntimeError: If `num_shards` is specified but `shard_id` is None.
        RuntimeError: If `shard_id` is specified but `num_shards` is None.
        ValueError: If `shard_id` is invalid (out of range [0, `num_shards`)).

    Note:
        - This dataset can take in a `sampler`. `sampler` and `shuffle` are mutually exclusive.
          The table below shows what input arguments are allowed and their expected behavior.

    .. list-table:: Expected Order Behavior of Using `sampler` and `shuffle`
       :widths: 25 25 50
       :header-rows: 1

       * - Parameter `sampler`
         - Parameter `shuffle`
         - Expected Order Behavior
       * - None
         - None
         - random order
       * - None
         - True
         - random order
       * - None
         - False
         - sequential order
       * - Sampler object
         - None
         - order defined by sampler
       * - Sampler object
         - True
         - not allowed
       * - Sampler object
         - False
         - not allowed

    Examples:
        >>> kmnist_dataset_dir = "/path/to/kmnist_dataset_directory"
        >>>
        >>> # Read 3 samples from KMNIST dataset
        >>> dataset = ds.KMnistDataset(dataset_dir=kmnist_dataset_dir, num_samples=3)
        >>>
        >>> # Note: In kmnist_dataset dataset, each dictionary has keys "image" and "label"

    About KMNIST dataset:

    KMNIST is a dataset, adapted from Kuzushiji Dataset, as a drop-in replacement for MNIST dataset,
    which is the most famous dataset in the machine learning community.

    Here is the original KMNIST dataset structure.
    You can unzip the dataset files into this directory structure and read by MindSpore's API.

    .. code-block::

        .
        └── kmnist_dataset_dir
             ├── t10k-images-idx3-ubyte
             ├── t10k-labels-idx1-ubyte
             ├── train-images-idx3-ubyte
             └── train-labels-idx1-ubyte

    Citation:

    .. code-block::

        @online{clanuwat2018deep,
          author       = {Tarin Clanuwat and Mikel Bober-Irizar and Asanobu Kitamoto and
                          Alex Lamb and Kazuaki Yamamoto and David Ha},
          title        = {Deep Learning for Classical Japanese Literature},
          date         = {2018-12-03},
          year         = {2018},
          eprintclass  = {cs.CV},
          eprinttype   = {arXiv},
          eprint       = {cs.CV/1812.01718},
        }
    """

    @check_mnist_cifar_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=None,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)

        self.dataset_dir = dataset_dir
        # A usage of None means "read every sample" on the C++ side.
        self.usage = replace_none(usage, "all")

    def parse(self, children=None):
        # Build the C++ IR node; `children` is unused because this is a leaf (source) node.
        return cde.KMnistNode(self.dataset_dir, self.usage, self.sampler)


class MnistDataset(MappableDataset):
"""
A source dataset for reading and parsing the MNIST dataset.


+ 1
- 0
tests/ut/cpp/dataset/CMakeLists.txt View File

@@ -29,6 +29,7 @@ SET(DE_UT_SRCS
c_api_dataset_fashion_mnist_test.cc
c_api_dataset_flickr_test.cc
c_api_dataset_iterator_test.cc
c_api_dataset_kmnist_test.cc
c_api_dataset_lj_speech_test.cc
c_api_dataset_manifest_test.cc
c_api_dataset_minddata_test.cc


+ 287
- 0
tests/ut/cpp/dataset/c_api_dataset_kmnist_test.cc View File

@@ -0,0 +1,287 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "common/common.h"
#include "minddata/dataset/include/dataset/datasets.h"

using namespace mindspore::dataset;
using mindspore::dataset::DataType;
using mindspore::dataset::Tensor;
using mindspore::dataset::TensorShape;

// Test fixture for dataset pipeline tests; common setup (e.g. datasets_root_path_)
// is inherited from UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};

/// Feature: KMnistTestDataset.
/// Description: test basic usage of KMnistTestDataset.
/// Expectation: get correct data.
TEST_F(MindDataTestPipeline, TestKMnistTestDataset) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestKMnistTestDataset.";

  // Build a KMnist "test" dataset that draws 10 random samples without replacement.
  std::string folder_path = datasets_root_path_ + "/testMnistData/";
  std::shared_ptr<Dataset> ds = KMnist(folder_path, "test", std::make_shared<RandomSampler>(false, 10));

  EXPECT_NE(ds, nullptr);

  // Creating an iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // The first row must contain both expected columns.
  EXPECT_NE(row.find("image"), row.end());
  EXPECT_NE(row.find("label"), row.end());

  // Drain the pipeline, counting rows as we go.
  uint64_t row_count = 0;
  for (; !row.empty(); ++row_count) {
    auto image = row["image"];
    MS_LOG(INFO) << "Tensor image shape: " << image.Shape();
    ASSERT_OK(iter->GetNextRow(&row));
  }

  EXPECT_EQ(row_count, 10);

  // Manually terminate the pipeline.
  iter->Stop();
}

/// Feature: KMnistTestDatasetWithPipeline.
/// Description: test KMnistTestDataset with pipeline.
/// Expectation: get correct data.
TEST_F(MindDataTestPipeline, TestKMnistTestDatasetWithPipeline) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestKMnistTestDatasetWithPipeline.";

  std::string folder_path = datasets_root_path_ + "/testMnistData/";

  // Build two identical KMnist "test" pipelines, each sampling 10 rows.
  std::shared_ptr<Dataset> ds1 = KMnist(folder_path, "test", std::make_shared<RandomSampler>(false, 10));
  std::shared_ptr<Dataset> ds2 = KMnist(folder_path, "test", std::make_shared<RandomSampler>(false, 10));
  EXPECT_NE(ds1, nullptr);
  EXPECT_NE(ds2, nullptr);

  // Apply a single (no-op) Repeat on each branch to exercise the op.
  ds1 = ds1->Repeat(1);
  EXPECT_NE(ds1, nullptr);
  ds2 = ds2->Repeat(1);
  EXPECT_NE(ds2, nullptr);

  // Project both branches onto the same two columns.
  std::vector<std::string> column_project = {"image", "label"};
  ds1 = ds1->Project(column_project);
  EXPECT_NE(ds1, nullptr);
  ds2 = ds2->Project(column_project);
  EXPECT_NE(ds2, nullptr);

  // Concatenate the branches: 10 + 10 = 20 rows expected downstream.
  ds1 = ds1->Concat({ds2});
  EXPECT_NE(ds1, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds1->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  EXPECT_NE(row.find("image"), row.end());
  EXPECT_NE(row.find("label"), row.end());

  // Drain the concatenated pipeline and count rows.
  uint64_t row_count = 0;
  for (; !row.empty(); ++row_count) {
    auto image = row["image"];
    MS_LOG(INFO) << "Tensor image shape: " << image.Shape();
    ASSERT_OK(iter->GetNextRow(&row));
  }

  EXPECT_EQ(row_count, 20);

  // Manually terminate the pipeline.
  iter->Stop();
}

/// Feature: KMnistIteratorOneColumn.
/// Description: test iterator of KMnistDataset with only the "image" column.
/// Expectation: get correct data.
TEST_F(MindDataTestPipeline, TestKMnistIteratorOneColumn) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestKMnistIteratorOneColumn.";
  // Build a KMnist "all" dataset with 4 random samples, then batch by 2.
  std::string folder_path = datasets_root_path_ + "/testMnistData/";
  std::shared_ptr<Dataset> ds = KMnist(folder_path, "all", std::make_shared<RandomSampler>(false, 4));
  EXPECT_NE(ds, nullptr);

  ds = ds->Batch(2);
  EXPECT_NE(ds, nullptr);

  // Select only the "image" column; num_epochs = -1 means unbounded epochs.
  std::vector<std::string> columns = {"image"};
  std::shared_ptr<Iterator> iter = ds->CreateIterator(columns, -1);
  EXPECT_NE(iter, nullptr);

  std::vector<mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  // Each batch is NHWC: 2 images of 28x28x1.
  std::vector<int64_t> expect_image = {2, 28, 28, 1};

  uint64_t batch_count = 0;
  while (!row.empty()) {
    for (auto &tensor : row) {
      MS_LOG(INFO) << "image shape:" << tensor.Shape();
      EXPECT_EQ(expect_image, tensor.Shape());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    ++batch_count;
  }

  // 4 samples / batch size 2 = 2 batches.
  EXPECT_EQ(batch_count, 2);

  // Manually terminate the pipeline.
  iter->Stop();
}

/// Feature: KMnistTestDatasetSize.
/// Description: test usage of get the size of KMnistTestDataset.
/// Expectation: get correct data.
TEST_F(MindDataTestPipeline, TestGetKMnistTestDatasetSize) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGetKMnistTestDatasetSize.";

  // The "test" split of the bundled testMnistData holds 10000 rows;
  // GetDatasetSize must report that without iterating the pipeline.
  std::string folder_path = datasets_root_path_ + "/testMnistData/";
  std::shared_ptr<Dataset> ds = KMnist(folder_path, "test");
  EXPECT_NE(ds, nullptr);
  EXPECT_EQ(ds->GetDatasetSize(), 10000);
}

/// Feature: KMnistTestDatasetGetters.
/// Description: test DatasetGetters of KMnistTestDataset.
/// Expectation: get correct the value.
TEST_F(MindDataTestPipeline, TestKMnistTestDatasetGetters) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestKMnistTestDatasetGetters.";

  // Create a KMnist Test Dataset
  std::string folder_path = datasets_root_path_ + "/testMnistData/";
  std::shared_ptr<Dataset> ds = KMnist(folder_path, "test");
  EXPECT_NE(ds, nullptr);

  EXPECT_EQ(ds->GetDatasetSize(), 10000);
  std::vector<DataType> types = ToDETypes(ds->GetOutputTypes());
  std::vector<TensorShape> shapes = ToTensorShapeVec(ds->GetOutputShapes());
  std::vector<std::string> column_names = {"image", "label"};
  int64_t num_classes = ds->GetNumClasses();
  // image is a uint8 tensor of shape <28,28,1>; label is a scalar uint32.
  // KMnist does not report a class count, so GetNumClasses() returns -1.
  EXPECT_EQ(types.size(), 2);
  EXPECT_EQ(types[0].ToString(), "uint8");
  EXPECT_EQ(types[1].ToString(), "uint32");
  EXPECT_EQ(shapes.size(), 2);
  EXPECT_EQ(shapes[0].ToString(), "<28,28,1>");
  EXPECT_EQ(shapes[1].ToString(), "<>");
  EXPECT_EQ(num_classes, -1);
  EXPECT_EQ(ds->GetBatchSize(), 1);
  EXPECT_EQ(ds->GetRepeatCount(), 1);

  // Querying the getters again must yield the same answers (results are
  // cached/stable and independent of call order).
  EXPECT_EQ(ds->GetDatasetSize(), 10000);
  EXPECT_EQ(ToDETypes(ds->GetOutputTypes()), types);
  EXPECT_EQ(ToTensorShapeVec(ds->GetOutputShapes()), shapes);
  EXPECT_EQ(ds->GetNumClasses(), -1);

  // Interleave GetColumnNames with the other getters and re-check everything
  // a third time to guard against ordering-dependent state.
  EXPECT_EQ(ds->GetColumnNames(), column_names);
  EXPECT_EQ(ds->GetDatasetSize(), 10000);
  EXPECT_EQ(ToDETypes(ds->GetOutputTypes()), types);
  EXPECT_EQ(ToTensorShapeVec(ds->GetOutputShapes()), shapes);
  EXPECT_EQ(ds->GetBatchSize(), 1);
  EXPECT_EQ(ds->GetRepeatCount(), 1);
  EXPECT_EQ(ds->GetNumClasses(), -1);
  EXPECT_EQ(ds->GetDatasetSize(), 10000);
}

/// Feature: KMnistIteratorWrongColumn.
/// Description: test iterator of KMnistDataset with wrong column.
/// Expectation: get none piece of data.
TEST_F(MindDataTestPipeline, TestKMnistIteratorWrongColumn) {
  // Fix: the log line previously said "TestKMnistIteratorOneColumn"
  // (copy-paste from the sibling test), which made log triage misleading.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestKMnistIteratorWrongColumn.";
  // Create a KMnist Dataset
  std::string folder_path = datasets_root_path_ + "/testMnistData/";
  std::shared_ptr<Dataset> ds = KMnist(folder_path, "all", std::make_shared<RandomSampler>(false, 4));
  EXPECT_NE(ds, nullptr);

  // Pass wrong column name; iterator creation must fail and return nullptr.
  std::vector<std::string> columns = {"digital"};
  std::shared_ptr<Iterator> iter = ds->CreateIterator(columns);
  EXPECT_EQ(iter, nullptr);
}

/// Feature: KMnistDatasetFail.
/// Description: test failure of KMnistDataset.
/// Expectation: get none piece of data.
TEST_F(MindDataTestPipeline, TestKMnistDatasetFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestKMnistDatasetFail.";

  // An empty dataset directory is invalid; node construction itself still
  // succeeds (validation is deferred) ...
  std::shared_ptr<Dataset> ds = KMnist("", "train", std::make_shared<RandomSampler>(false, 10));
  EXPECT_NE(ds, nullptr);

  // ... but building the execution tree must fail, yielding a null iterator.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_EQ(iter, nullptr);
}

/// Feature: KMnistDatasetWithInvalidUsageFail.
/// Description: test KMnistDataset with invalid usage.
/// Expectation: get none piece of data.
TEST_F(MindDataTestPipeline, TestKMnistDatasetWithInvalidUsageFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestKMnistDatasetWithInvalidUsageFail.";

  // "validation" is not one of the accepted usages (train/test/all);
  // node construction succeeds because validation is deferred ...
  std::string folder_path = datasets_root_path_ + "/testMnistData/";
  std::shared_ptr<Dataset> ds = KMnist(folder_path, "validation");
  EXPECT_NE(ds, nullptr);

  // ... but iterator creation must fail and return nullptr.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_EQ(iter, nullptr);
}

/// Feature: KMnistDatasetWithNullSamplerFail.
/// Description: test KMnistDataset with null sampler.
/// Expectation: get none piece of data.
TEST_F(MindDataTestPipeline, TestKMnistDatasetWithNullSamplerFail) {
  // Fix: the log line previously read "TestKMnistUDatasetWithNullSamplerFail"
  // (stray 'U'), so grepping logs for this test's name found nothing.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestKMnistDatasetWithNullSamplerFail.";

  // Create a KMnist Dataset
  std::string folder_path = datasets_root_path_ + "/testMnistData/";
  std::shared_ptr<Dataset> ds = KMnist(folder_path, "all", nullptr);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid KMnist input, sampler cannot be nullptr
  EXPECT_EQ(iter, nullptr);
}

+ 329
- 0
tests/ut/python/dataset/test_datasets_kmnist.py View File

@@ -0,0 +1,329 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Test KMnist dataset operators
"""

import os

import matplotlib.pyplot as plt
import numpy as np
import pytest

import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as vision
from mindspore import log as logger

DATA_DIR = "../data/dataset/testMnistData"


def load_kmnist(path):
    """
    Feature: load_kmnist.
    Description: read the raw KMnist idx files under ``path`` the same way the
        C++ KMnistDataset op does, so outputs can be compared element-wise.
    Expectation: return ``(images, labels)`` numpy arrays; images are uint8,
        shaped (N, 28, 28, 1) and binarized to {0, 255}.
    """
    label_file = os.path.realpath(os.path.join(path, 't10k-labels-idx1-ubyte'))
    image_file = os.path.realpath(os.path.join(path, 't10k-images-idx3-ubyte'))
    with open(label_file, 'rb') as stream:
        stream.read(8)  # skip the idx1 header (magic number + item count)
        labels = np.fromfile(stream, dtype=np.uint8)
    with open(image_file, 'rb') as stream:
        stream.read(16)  # skip the idx3 header (magic + count + rows + cols)
        images = np.fromfile(stream, dtype=np.uint8).reshape(-1, 28, 28, 1)
    images[images > 0] = 255  # Perform binarization to maintain consistency with our API
    return images, labels


def visualize_dataset(images, labels):
    """
    Feature: visualize_dataset.
    Description: render each sample of a KMnist batch side by side with its
        label as the subplot title.
    Expectation: plot images.
    """
    total = len(images)
    for idx, (img, tag) in enumerate(zip(images, labels)):
        plt.subplot(1, total, idx + 1)
        plt.imshow(img.squeeze(), cmap=plt.cm.gray)
        plt.title(tag)
    plt.show()


def test_kmnist_content_check():
    """
    Feature: test_kmnist_content_check.
    Description: validate KMnistDataset image readings against a direct read
        of the raw idx files (load_kmnist).
    Expectation: get correct value.
    """
    logger.info("Test KMnistDataset Op with content check")
    data1 = ds.KMnistDataset(DATA_DIR, num_samples=100, shuffle=False)
    images, labels = load_kmnist(DATA_DIR)
    num_iter = 0
    # in this example, each dictionary has keys "image" and "label"
    # Fix: dropped the image_list/label_list accumulators — they were
    # populated but never used (dead code copied from the visualize test).
    for i, data in enumerate(data1.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(data["image"], images[i])
        np.testing.assert_array_equal(data["label"], labels[i])
        num_iter += 1
    assert num_iter == 100


def test_kmnist_basic():
    """
    Feature: test_kmnist_basic.
    Description: test basic usage of KMnistDataset (full load, num_samples,
        repeat, batch with/without drop_remainder, column names).
    Expectation: get correct data.
    """
    logger.info("Test KMnistDataset Op")

    def count_rows(dataset):
        # Drain the dataset once and count the rows it produced.
        return sum(1 for _ in dataset.create_dict_iterator(num_epochs=1))

    # case 1: test loading whole dataset
    assert count_rows(ds.KMnistDataset(DATA_DIR)) == 10000

    # case 2: test num_samples
    assert count_rows(ds.KMnistDataset(DATA_DIR, num_samples=500)) == 500

    # case 3: test repeat
    assert count_rows(ds.KMnistDataset(DATA_DIR, num_samples=200).repeat(5)) == 1000

    # case 4: test batch with drop_remainder=False
    data4 = ds.KMnistDataset(DATA_DIR, num_samples=100)
    assert data4.get_dataset_size() == 100
    assert data4.get_batch_size() == 1
    data4 = data4.batch(batch_size=7)  # drop_remainder is default to be False
    assert data4.get_dataset_size() == 15
    assert data4.get_batch_size() == 7
    assert count_rows(data4) == 15

    # case 5: test batch with drop_remainder=True
    data5 = ds.KMnistDataset(DATA_DIR, num_samples=100)
    assert data5.get_dataset_size() == 100
    assert data5.get_batch_size() == 1
    data5 = data5.batch(batch_size=7, drop_remainder=True)  # the rest of incomplete batch will be dropped
    assert data5.get_dataset_size() == 14
    assert data5.get_batch_size() == 7
    assert count_rows(data5) == 14

    # case 6: test get_col_names
    data6 = ds.KMnistDataset(DATA_DIR, "train", num_samples=10)
    assert data6.get_col_names() == ["image", "label"]

    # case 7: test batch
    data7 = ds.KMnistDataset(DATA_DIR, num_samples=200).batch(100, drop_remainder=True)
    assert count_rows(data7) == 2


def test_kmnist_pk_sampler():
    """
    Feature: test_kmnist_pk_sampler.
    Description: test usage of KMnistDataset with PKSampler.
    Expectation: get correct data.
    """
    logger.info("Test KMnistDataset Op with PKSampler")
    # PKSampler(3) yields 3 samples per class, in class order, for 10 classes.
    golden = [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
              5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9]
    data = ds.KMnistDataset(DATA_DIR, sampler=ds.PKSampler(3))
    label_list = [item["label"]
                  for item in data.create_dict_iterator(num_epochs=1, output_numpy=True)]
    np.testing.assert_array_equal(golden, label_list)
    assert len(label_list) == 30


def test_kmnist_sequential_sampler():
    """
    Feature: test_kmnist_sequential_sampler.
    Description: test usage of KMnistDataset with SequentialSampler.
    Expectation: get correct data.
    """
    logger.info("Test KMnistDataset Op with SequentialSampler")
    num_samples = 50
    # An explicit SequentialSampler must yield the same order as the
    # shuffle=False + num_samples shortcut.
    data1 = ds.KMnistDataset(DATA_DIR, sampler=ds.SequentialSampler(num_samples=num_samples))
    data2 = ds.KMnistDataset(DATA_DIR, shuffle=False, num_samples=num_samples)
    labels_sampler, labels_shortcut = [], []
    row_count = 0
    for item1, item2 in zip(data1.create_dict_iterator(num_epochs=1), data2.create_dict_iterator(num_epochs=1)):
        labels_sampler.append(item1["label"].asnumpy())
        labels_shortcut.append(item2["label"].asnumpy())
        row_count += 1
    np.testing.assert_array_equal(labels_sampler, labels_shortcut)
    assert row_count == num_samples


def test_kmnist_exception():
    """
    Feature: test_kmnist_exception.
    Description: test error cases for KMnistDataset.
    Expectation: raise exception.
    """
    logger.info("Test error cases for KMnistDataset")
    # sampler is mutually exclusive with shuffle and with sharding options
    error_msg_1 = "sampler and shuffle cannot be specified at the same time"
    with pytest.raises(RuntimeError, match=error_msg_1):
        ds.KMnistDataset(DATA_DIR, shuffle=False, sampler=ds.PKSampler(3))

    error_msg_2 = "sampler and sharding cannot be specified at the same time"
    with pytest.raises(RuntimeError, match=error_msg_2):
        ds.KMnistDataset(DATA_DIR, sampler=ds.PKSampler(3), num_shards=2, shard_id=0)

    # num_shards and shard_id must be supplied together
    error_msg_3 = "num_shards is specified and currently requires shard_id as well"
    with pytest.raises(RuntimeError, match=error_msg_3):
        ds.KMnistDataset(DATA_DIR, num_shards=10)

    error_msg_4 = "shard_id is specified but num_shards is not"
    with pytest.raises(RuntimeError, match=error_msg_4):
        ds.KMnistDataset(DATA_DIR, shard_id=0)

    # shard_id must lie within [0, num_shards)
    error_msg_5 = "Input shard_id is not within the required interval"
    with pytest.raises(ValueError, match=error_msg_5):
        ds.KMnistDataset(DATA_DIR, num_shards=5, shard_id=-1)
    with pytest.raises(ValueError, match=error_msg_5):
        ds.KMnistDataset(DATA_DIR, num_shards=5, shard_id=5)
    with pytest.raises(ValueError, match=error_msg_5):
        ds.KMnistDataset(DATA_DIR, num_shards=2, shard_id=5)

    # num_parallel_workers must be a positive value within the platform limit
    error_msg_6 = "num_parallel_workers exceeds"
    with pytest.raises(ValueError, match=error_msg_6):
        ds.KMnistDataset(DATA_DIR, shuffle=False, num_parallel_workers=0)
    with pytest.raises(ValueError, match=error_msg_6):
        ds.KMnistDataset(DATA_DIR, shuffle=False, num_parallel_workers=256)
    with pytest.raises(ValueError, match=error_msg_6):
        ds.KMnistDataset(DATA_DIR, shuffle=False, num_parallel_workers=-2)

    # shard_id must be an int, not a string
    error_msg_7 = "Argument shard_id"
    with pytest.raises(TypeError, match=error_msg_7):
        ds.KMnistDataset(DATA_DIR, num_shards=2, shard_id="0")

    def exception_func(item):
        # user map function that always fails, to exercise error propagation
        raise Exception("Error occur!")

    # errors raised inside map() must surface as RuntimeError naming the data files
    error_msg_8 = "The corresponding data files"
    with pytest.raises(RuntimeError, match=error_msg_8):
        data = ds.KMnistDataset(DATA_DIR)
        data = data.map(operations=exception_func, input_columns=["image"], num_parallel_workers=1)
        for _ in data.__iter__():
            pass
    with pytest.raises(RuntimeError, match=error_msg_8):
        data = ds.KMnistDataset(DATA_DIR)
        data = data.map(operations=vision.Decode(), input_columns=["image"], num_parallel_workers=1)
        data = data.map(operations=exception_func, input_columns=["image"], num_parallel_workers=1)
        for _ in data.__iter__():
            pass
    with pytest.raises(RuntimeError, match=error_msg_8):
        data = ds.KMnistDataset(DATA_DIR)
        data = data.map(operations=exception_func, input_columns=["label"], num_parallel_workers=1)
        for _ in data.__iter__():
            pass


def test_kmnist_visualize(plot=False):
    """
    Feature: test_kmnist_visualize.
    Description: visualize KMnistDataset results.
    Expectation: get correct data and plot them.
    """
    logger.info("Test KMnistDataset visualization")

    data1 = ds.KMnistDataset(DATA_DIR, num_samples=10, shuffle=False)
    image_list, label_list = [], []
    num_iter = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
        image, label = item["image"], item["label"]
        # every sample must be a 28x28x1 uint8 image with a uint32 label
        assert isinstance(image, np.ndarray)
        assert image.shape == (28, 28, 1)
        assert image.dtype == np.uint8
        assert label.dtype == np.uint32
        image_list.append(image)
        label_list.append("label {}".format(label))
        num_iter += 1
    assert num_iter == 10
    if plot:
        visualize_dataset(image_list, label_list)


def test_kmnist_usage():
    """
    Feature: test_kmnist_usage.
    Description: validate KMnistDataset image readings.
    Expectation: get correct data.
    """
    logger.info("Test KMnistDataset usage flag")

    def run_config(usage, kmnist_path=None):
        # Returns the row count on success, or the error text on failure.
        path = DATA_DIR if kmnist_path is None else kmnist_path
        try:
            data = ds.KMnistDataset(path, usage=usage, shuffle=False)
            return sum(1 for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True))
        except (ValueError, TypeError, RuntimeError) as e:
            return str(e)

    # the bundled test data only provides the t10k (test) files
    assert run_config("test") == 10000
    assert run_config("all") == 10000
    assert "KMnistDataset API can't read the data file (interface mismatch or no data found)" in run_config("train")
    assert "usage is not within the valid set of ['train', 'test', 'all']" in run_config("invalid")
    assert "Argument usage with value ['list'] is not of type [<class 'str'>]" in run_config(["list"])

    # change this directory to the folder that contains all kmnist files
    all_files_path = None
    # the following tests on the entire datasets
    if all_files_path is not None:
        assert run_config("train", all_files_path) == 60000
        assert run_config("test", all_files_path) == 10000
        assert run_config("all", all_files_path) == 70000
        assert ds.KMnistDataset(all_files_path, usage="train").get_dataset_size() == 60000
        assert ds.KMnistDataset(all_files_path, usage="test").get_dataset_size() == 10000
        assert ds.KMnistDataset(all_files_path, usage="all").get_dataset_size() == 70000


# Allow running this test module directly (outside pytest); the visualize
# test is invoked with plot=True only in this interactive mode.
if __name__ == '__main__':
    test_kmnist_content_check()
    test_kmnist_basic()
    test_kmnist_pk_sampler()
    test_kmnist_sequential_sampler()
    test_kmnist_exception()
    test_kmnist_visualize(plot=True)
    test_kmnist_usage()

Loading…
Cancel
Save