Browse Source

[feat] [assistant] [I40GYH] add new dataset loading operator CaltechDataset

tags/v1.6.0
donrichnx 4 years ago
parent
commit
9a0fc0ffc1
26 changed files with 1659 additions and 8 deletions
  1. +24
    -0
      mindspore/ccsrc/minddata/dataset/api/datasets.cc
  2. +12
    -1
      mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/source/bindings.cc
  3. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/CMakeLists.txt
  4. +32
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/caltech_op.cc
  5. +57
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/caltech_op.h
  6. +1
    -1
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc
  7. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h
  8. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/CMakeLists.txt
  9. +135
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/caltech256_node.cc
  10. +108
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/caltech256_node.h
  11. +90
    -3
      mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h
  12. +1
    -0
      mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h
  13. +360
    -3
      mindspore/python/mindspore/dataset/engine/datasets.py
  14. +56
    -0
      mindspore/python/mindspore/dataset/engine/validators.py
  15. +1
    -0
      tests/ut/cpp/dataset/CMakeLists.txt
  16. +207
    -0
      tests/ut/cpp/dataset/c_api_dataset_caltech256_test.cc
  17. BIN
      tests/ut/data/dataset/testCaltech101Data/101_ObjectCategories/apple/image_0001.jpg
  18. BIN
      tests/ut/data/dataset/testCaltech101Data/101_ObjectCategories/apple/image_0002.jpg
  19. BIN
      tests/ut/data/dataset/testCaltech101Data/101_ObjectCategories/banana/image_0001.jpg
  20. BIN
      tests/ut/data/dataset/testCaltech101Data/101_ObjectCategories/banana/image_0002.jpg
  21. BIN
      tests/ut/data/dataset/testCaltech101Data/Annotations/apple/annotation_0001.mat
  22. BIN
      tests/ut/data/dataset/testCaltech101Data/Annotations/apple/annotation_0002.mat
  23. BIN
      tests/ut/data/dataset/testCaltech101Data/Annotations/banana/annotation_0001.mat
  24. BIN
      tests/ut/data/dataset/testCaltech101Data/Annotations/banana/annotation_0002.mat
  25. +349
    -0
      tests/ut/python/dataset/test_datasets_caltech101.py
  26. +223
    -0
      tests/ut/python/dataset/test_datasets_caltech256.py

+ 24
- 0
mindspore/ccsrc/minddata/dataset/api/datasets.cc View File

@@ -90,6 +90,7 @@
// IR leaf nodes disabled for android
#ifndef ENABLE_ANDROID
#include "minddata/dataset/engine/ir/datasetops/source/amazon_review_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/caltech256_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/celeba_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/cifar100_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h"
@@ -914,6 +915,29 @@ AmazonReviewDataset::AmazonReviewDataset(const std::vector<char> &dataset_dir, c
ir_node_ = std::static_pointer_cast<DatasetNode>(ds);
}

// Build the Caltech256 IR node from a shared-pointer sampler (may be null).
Caltech256Dataset::Caltech256Dataset(const std::vector<char> &dataset_dir, bool decode,
                                     const std::shared_ptr<Sampler> &sampler,
                                     const std::shared_ptr<DatasetCache> &cache) {
  std::shared_ptr<SamplerObj> sampler_ir = nullptr;
  if (sampler != nullptr) {
    sampler_ir = sampler->Parse();
  }
  ir_node_ = std::static_pointer_cast<DatasetNode>(
    std::make_shared<Caltech256Node>(CharToString(dataset_dir), decode, sampler_ir, cache));
}

// Build the Caltech256 IR node from a raw-pointer sampler (may be null).
Caltech256Dataset::Caltech256Dataset(const std::vector<char> &dataset_dir, bool decode, const Sampler *sampler,
                                     const std::shared_ptr<DatasetCache> &cache) {
  std::shared_ptr<SamplerObj> sampler_ir = nullptr;
  if (sampler != nullptr) {
    sampler_ir = sampler->Parse();
  }
  ir_node_ = std::static_pointer_cast<DatasetNode>(
    std::make_shared<Caltech256Node>(CharToString(dataset_dir), decode, sampler_ir, cache));
}

// Build the Caltech256 IR node from a sampler passed via std::reference_wrapper
// (always valid, so Parse() is called unconditionally).
Caltech256Dataset::Caltech256Dataset(const std::vector<char> &dataset_dir, bool decode,
                                     const std::reference_wrapper<Sampler> sampler,
                                     const std::shared_ptr<DatasetCache> &cache) {
  Sampler &sampler_ref = sampler.get();
  ir_node_ = std::static_pointer_cast<DatasetNode>(
    std::make_shared<Caltech256Node>(CharToString(dataset_dir), decode, sampler_ref.Parse(), cache));
}

CelebADataset::CelebADataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
const std::shared_ptr<Sampler> &sampler, bool decode,
const std::set<std::vector<char>> &extensions,


+ 12
- 1
mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/source/bindings.cc View File

@@ -27,6 +27,7 @@
// IR leaf nodes
#include "minddata/dataset/engine/ir/datasetops/source/ag_news_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/amazon_review_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/caltech256_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/celeba_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/cifar100_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h"
@@ -79,7 +80,6 @@ namespace dataset {

// PYBIND FOR LEAF NODES
// (In alphabetical order)

PYBIND_REGISTER(AGNewsNode, 2, ([](const py::module *m) {
(void)py::class_<AGNewsNode, DatasetNode, std::shared_ptr<AGNewsNode>>(*m, "AGNewsNode",
"to create an AGNewsNode")
@@ -104,6 +104,17 @@ PYBIND_REGISTER(AmazonReviewNode, 2, ([](const py::module *m) {
}));
}));

// Python binding for Caltech256Node: exposes a (dataset_dir, decode, sampler)
// constructor and validates the node's parameters before handing it to Python.
PYBIND_REGISTER(Caltech256Node, 2, ([](const py::module *m) {
                  (void)py::class_<Caltech256Node, DatasetNode, std::shared_ptr<Caltech256Node>>(
                    *m, "Caltech256Node", "to create a Caltech256Node")
                    .def(py::init([](std::string dataset_dir, bool decode, py::handle sampler) {
                      // Convert the Python sampler handle to a C++ SamplerObj; the
                      // cache argument is not exposed through this binding (nullptr).
                      auto caltech256 =
                        std::make_shared<Caltech256Node>(dataset_dir, decode, toSamplerObj(sampler), nullptr);
                      THROW_IF_ERROR(caltech256->ValidateParams());
                      return caltech256;
                    }));
                }));

PYBIND_REGISTER(CelebANode, 2, ([](const py::module *m) {
(void)py::class_<CelebANode, DatasetNode, std::shared_ptr<CelebANode>>(*m, "CelebANode",
"to create a CelebANode")


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/CMakeLists.txt View File

@@ -6,6 +6,7 @@ set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES
ag_news_op.cc
album_op.cc
amazon_review_op.cc
caltech_op.cc
celeba_op.cc
cifar_op.cc
cityscapes_op.cc


+ 32
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/caltech_op.cc View File

@@ -0,0 +1,32 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/engine/datasetops/source/caltech_op.h"
#include <map>
#include <memory>
#include <set>
#include <utility>
namespace mindspore {
namespace dataset {
// Image file extensions accepted when scanning the dataset directory.
const std::set<std::string> kExts = {".jpg", ".JPEG"};
// Empty class index: class ids are presumably derived from the sub-directory
// names by ImageFolderOp — TODO confirm against ImageFolderOp's contract.
const std::map<std::string, int32_t> kClassIndex = {};
// CaltechOp is a thin wrapper around ImageFolderOp that pins the accepted
// extensions and the class index. NOTE(review): the hard-coded `false` argument
// is presumably ImageFolderOp's recursive flag — confirm against its signature.
CaltechOp::CaltechOp(int32_t num_workers, const std::string &file_dir, int32_t queue_size, bool do_decode,
                     std::unique_ptr<DataSchema> data_schema, std::shared_ptr<SamplerRT> sampler)
    : ImageFolderOp(num_workers, file_dir, queue_size, false, do_decode, kExts, kClassIndex, std::move(data_schema),
                    std::move(sampler)) {}
}  // namespace dataset
}  // namespace mindspore

+ 57
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/caltech_op.h View File

@@ -0,0 +1,57 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at

* http://www.apache.org/licenses/LICENSE-2.0

* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CALTECH_OP_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CALTECH_OP_H_

#include <memory>
#include <string>

#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/engine/data_schema.h"
#include "minddata/dataset/engine/datasetops/parallel_op.h"
#include "minddata/dataset/engine/datasetops/source/image_folder_op.h"
#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"

namespace mindspore {
namespace dataset {
/// \brief Read Caltech256 Dataset.
/// CaltechOp reuses ImageFolderOp's scanning and decoding machinery; it only
/// fixes the accepted extensions (see caltech_op.cc) and the op/dataset names.
class CaltechOp : public ImageFolderOp {
 public:
  /// \brief Constructor.
  /// \param[in] num_workers Num of workers reading images in parallel.
  /// \param[in] file_dir Directory of caltech dataset.
  /// \param[in] queue_size Connector queue size.
  /// \param[in] do_decode Whether to decode the raw data.
  /// \param[in] data_schema Data schema of caltech256 dataset.
  /// \param[in] sampler Sampler tells CaltechOp what to read.
  CaltechOp(int32_t num_workers, const std::string &file_dir, int32_t queue_size, bool do_decode,
            std::unique_ptr<DataSchema> data_schema, std::shared_ptr<SamplerRT> sampler);

  /// \brief Destructor.
  ~CaltechOp() = default;

  /// \brief Op name getter.
  /// \return Name of the current Op.
  std::string Name() const override { return "CaltechOp"; }

  /// \brief DatasetName name getter.
  /// \param[in] upper Whether the returned name begins with uppercase.
  /// \return DatasetName of the current Op.
  // NOTE(review): not marked `override` — confirm whether ImageFolderOp declares
  // a virtual DatasetName(); if so, add `override` to avoid hiding it.
  std::string DatasetName(bool upper = false) const { return upper ? "Caltech" : "caltech"; }
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CALTECH_OP_H_

+ 1
- 1
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc View File

@@ -281,7 +281,7 @@ Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const std::se
" does not exist or permission denied");
}
while (dir_itr->HasNext()) {
if (exts.empty() || exts.find(subdir.Extension()) != exts.end()) {
if (exts.empty() || exts.find(dir_itr->Next().Extension()) != exts.end()) {
++row_cnt;
}
}


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h View File

@@ -79,6 +79,7 @@ constexpr char kZipNode[] = "Zip";
constexpr char kAGNewsNode[] = "AGNewsDataset";
constexpr char kAlbumNode[] = "AlbumDataset";
constexpr char kAmazonReviewNode[] = "AmazonReviewDataset";
constexpr char kCaltech256Node[] = "Caltech256Dataset";
constexpr char kCelebANode[] = "CelebADataset";
constexpr char kCifar100Node[] = "Cifar100Dataset";
constexpr char kCifar10Node[] = "Cifar10Dataset";


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/CMakeLists.txt View File

@@ -6,6 +6,7 @@ set(DATASET_ENGINE_IR_DATASETOPS_SOURCE_SRC_FILES
ag_news_node.cc
album_node.cc
amazon_review_node.cc
caltech256_node.cc
celeba_node.cc
cifar100_node.cc
cifar10_node.cc


+ 135
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/caltech256_node.cc View File

@@ -0,0 +1,135 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/engine/ir/datasetops/source/caltech256_node.h"

#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "minddata/dataset/engine/datasetops/source/caltech_op.h"
#ifndef ENABLE_ANDROID
#include "minddata/dataset/engine/serdes.h"
#endif
#include "minddata/dataset/util/status.h"

namespace mindspore {
namespace dataset {
// Image file extensions counted when computing the dataset size.
const std::set<std::string> kExts = {".jpg", ".JPEG"};

/// \brief Constructor: stores the dataset directory, decode flag and sampler.
/// The default value for `cache` belongs in the header declaration only;
/// repeating `= nullptr` here (as the original did) creates a TU-local default
/// argument, which style guides forbid, so it is removed. The by-value
/// `sampler` is moved into the member instead of copied.
Caltech256Node::Caltech256Node(const std::string &dataset_dir, bool decode, std::shared_ptr<SamplerObj> sampler,
                               std::shared_ptr<DatasetCache> cache)
    : MappableSourceNode(std::move(cache)), dataset_dir_(dataset_dir), decode_(decode), sampler_(std::move(sampler)) {}

std::shared_ptr<DatasetNode> Caltech256Node::Copy() {
std::shared_ptr<SamplerObj> sampler = (sampler_ == nullptr) ? nullptr : sampler_->SamplerCopy();
auto node = std::make_shared<Caltech256Node>(dataset_dir_, decode_, sampler, cache_);
return node;
}

void Caltech256Node::Print(std::ostream &out) const {
  // Stream the description piecewise instead of building a temporary string.
  out << Name() << "(path: " << dataset_dir_ << ", decode: " << (decode_ ? "true" : "false") << ")";
}

Status Caltech256Node::ValidateParams() {
  // Base-class validation first, then the dataset-specific arguments.
  RETURN_IF_NOT_OK(DatasetNode::ValidateParams());
  // Validate the dataset directory argument.
  RETURN_IF_NOT_OK(ValidateDatasetDirParam("Caltech256Node", dataset_dir_));
  // Validate the sampler; later methods (Build, GetShardId) rely on it being usable.
  RETURN_IF_NOT_OK(ValidateDatasetSampler("Caltech256Node", sampler_));
  return Status::OK();
}

Status Caltech256Node::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) {
  // Do internal Schema generation.
  // This arg exists in CaltechOp, but is not externalized (in Python API).
  std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
  TensorShape scalar = TensorShape::CreateScalar();
  // Column "image": rank-1 uint8 tensor (raw or decoded bytes).
  RETURN_IF_NOT_OK(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1)));
  // Column "label": uint32 scalar.
  RETURN_IF_NOT_OK(
    schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar)));
  // Build the runtime sampler from the IR sampler (validated in ValidateParams).
  std::shared_ptr<SamplerRT> sampler_rt = nullptr;
  RETURN_IF_NOT_OK(sampler_->SamplerBuild(&sampler_rt));

  auto op = std::make_shared<CaltechOp>(num_workers_, dataset_dir_, connector_que_size_, decode_, std::move(schema),
                                        std::move(sampler_rt));
  // Propagate repeat bookkeeping so the runtime op tracks epochs correctly.
  op->SetTotalRepeats(GetTotalRepeats());
  op->SetNumRepeatsPerEpoch(GetNumRepeatsPerEpoch());
  node_ops->push_back(op);
  return Status::OK();
}

// Get the shard id of node.
Status Caltech256Node::GetShardId(int32_t *shard_id) {
  // Delegate to the sampler, which owns the sharding information.
  *shard_id = sampler_->ShardId();
  return Status::OK();
}

// Get Dataset size.
// Fix over the original: `int64_t sample_size, num_rows;` declared two
// uninitialized locals on one line; declare each at first use, initialized.
Status Caltech256Node::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                                      int64_t *dataset_size) {
  // Serve the cached value when it has already been computed.
  if (dataset_size_ > 0) {
    *dataset_size = dataset_size_;
    return Status::OK();
  }
  // Count image files on disk that match the accepted extensions.
  int64_t num_rows = 0;
  RETURN_IF_NOT_OK(CaltechOp::CountRowsAndClasses(dataset_dir_, kExts, &num_rows, nullptr, {}));
  std::shared_ptr<SamplerRT> sampler_rt = nullptr;
  RETURN_IF_NOT_OK(sampler_->SamplerBuild(&sampler_rt));
  int64_t sample_size = sampler_rt->CalculateNumSamples(num_rows);
  if (sample_size == -1) {
    // The sampler cannot determine the count statically; fall back to a dry run.
    RETURN_IF_NOT_OK(size_getter->DryRun(shared_from_this(), &sample_size));
  }
  *dataset_size = sample_size;
  dataset_size_ = sample_size;
  return Status::OK();
}

Status Caltech256Node::to_json(nlohmann::json *out_json) {
  // Serialize every constructor argument so from_json() can rebuild an
  // equivalent node.
  nlohmann::json args, sampler_args;
  RETURN_IF_NOT_OK(sampler_->to_json(&sampler_args));
  args["sampler"] = sampler_args;
  args["num_parallel_workers"] = num_workers_;
  args["dataset_dir"] = dataset_dir_;
  args["decode"] = decode_;
  if (cache_ != nullptr) {
    // The cache is optional; emit it only when configured.
    nlohmann::json cache_args;
    RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
    args["cache"] = cache_args;
  }
  *out_json = args;
  return Status::OK();
}

#ifndef ENABLE_ANDROID
Status Caltech256Node::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
  // Verify every mandatory key exists up front so errors name the missing field.
  RETURN_IF_NOT_OK(ValidateParamInJson(json_obj, "num_parallel_workers", kCaltech256Node));
  RETURN_IF_NOT_OK(ValidateParamInJson(json_obj, "dataset_dir", kCaltech256Node));
  RETURN_IF_NOT_OK(ValidateParamInJson(json_obj, "decode", kCaltech256Node));
  RETURN_IF_NOT_OK(ValidateParamInJson(json_obj, "sampler", kCaltech256Node));
  std::string dataset_dir = json_obj["dataset_dir"];
  bool decode = json_obj["decode"];
  // Rebuild the sampler from its serialized form.
  std::shared_ptr<SamplerObj> sampler;
  RETURN_IF_NOT_OK(Serdes::ConstructSampler(json_obj["sampler"], &sampler));
  // Cache is optional in the JSON; stays nullptr when absent.
  std::shared_ptr<DatasetCache> cache = nullptr;
  RETURN_IF_NOT_OK(DatasetCache::from_json(json_obj, &cache));
  *ds = std::make_shared<Caltech256Node>(dataset_dir, decode, sampler, cache);
  (*ds)->SetNumWorkers(json_obj["num_parallel_workers"]);
  return Status::OK();
}
#endif
} // namespace dataset
} // namespace mindspore

+ 108
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/caltech256_node.h View File

@@ -0,0 +1,108 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CALTECH256_NODE_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CALTECH256_NODE_H_

#include <map>
#include <memory>
#include <set>
#include <string>
#include <vector>

#include "minddata/dataset/engine/ir/cache/dataset_cache.h"
#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"

namespace mindspore {
namespace dataset {
/// \class Caltech256Node.
/// \brief A Dataset derived class to represent Caltech256 dataset.
class Caltech256Node : public MappableSourceNode {
 public:
  /// \brief Constructor.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] decode Whether to decode the images after reading.
  /// \param[in] sampler Sampler used to choose samples from the dataset.
  /// \param[in] cache Tensor cache to use (may be nullptr).
  Caltech256Node(const std::string &dataset_dir, bool decode, std::shared_ptr<SamplerObj> sampler,
                 std::shared_ptr<DatasetCache> cache);

  /// \brief Destructor.
  ~Caltech256Node() = default;

  /// \brief Node name getter.
  /// \return Name of the current node.
  std::string Name() const override { return kCaltech256Node; }

  /// \brief Print the description.
  /// \param[out] out The output stream to write output to.
  void Print(std::ostream &out) const override;

  /// \brief Copy the node to a new object.
  /// \return A shared pointer to the new copy.
  std::shared_ptr<DatasetNode> Copy() override;

  /// \brief a base class override function to create the required runtime dataset op objects for this class.
  /// \param[out] node_ops A vector containing shared pointer to the Dataset Ops that this object will create.
  /// \return Status Status::OK() if build successfully.
  Status Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) override;

  /// \brief Parameters validation.
  /// \return Status Status::OK() if all the parameters are valid.
  Status ValidateParams() override;

  /// \brief Get the shard id of node.
  /// \param[out] shard_id The shard id.
  /// \return Status Status::OK() if get shard id successfully.
  Status GetShardId(int32_t *shard_id) override;

  /// \brief Base-class override for GetDatasetSize.
  /// \param[in] size_getter Shared pointer to DatasetSizeGetter.
  /// \param[in] estimate This is only supported by some of the ops and it's used to speed up the process of getting
  ///     dataset size at the expense of accuracy.
  /// \param[out] dataset_size The size of the dataset.
  /// \return Status of the function.
  Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                        int64_t *dataset_size) override;

  /// \brief Getter functions.
  const std::string &DatasetDir() const { return dataset_dir_; }
  bool Decode() const { return decode_; }

  /// \brief Get the arguments of node.
  /// \param[out] out_json JSON string of all attributes.
  /// \return Status of the function.
  Status to_json(nlohmann::json *out_json) override;

#ifndef ENABLE_ANDROID
  /// \brief Function to read dataset in json.
  /// \param[in] json_obj The JSON object to be deserialized.
  /// \param[out] ds Deserialized dataset.
  /// \return Status The status code returned.
  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
#endif

  /// \brief Sampler getter.
  /// \return SamplerObj of the current node.
  std::shared_ptr<SamplerObj> Sampler() override { return sampler_; }

  /// \brief Sampler setter.
  void SetSampler(std::shared_ptr<SamplerObj> sampler) override { sampler_ = sampler; }

 private:
  std::string dataset_dir_;              // root directory of the dataset
  bool decode_;                          // whether to decode images after reading
  std::shared_ptr<SamplerObj> sampler_;  // sampler selecting which samples to read
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CALTECH256_NODE_H_

+ 90
- 3
mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h View File

@@ -1219,6 +1219,93 @@ inline std::shared_ptr<AmazonReviewDataset> MS_API AmazonReview(const std::strin
num_shards, shard_id, cache);
}

/// \class Caltech256Dataset
/// \brief A source dataset for reading and parsing Caltech256 dataset.
/// \note The generated dataset has two columns ["image", "label"].
class MS_API Caltech256Dataset : public Dataset {
 public:
  /// \brief Constructor of Caltech256Dataset.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] decode Decode the images after reading.
  /// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset.
  /// \param[in] cache Tensor cache to use.
  Caltech256Dataset(const std::vector<char> &dataset_dir, bool decode, const std::shared_ptr<Sampler> &sampler,
                    const std::shared_ptr<DatasetCache> &cache);

  /// \brief Constructor of Caltech256Dataset.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] decode Decode the images after reading.
  /// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset.
  /// \param[in] cache Tensor cache to use.
  Caltech256Dataset(const std::vector<char> &dataset_dir, bool decode, const Sampler *sampler,
                    const std::shared_ptr<DatasetCache> &cache);

  /// \brief Constructor of Caltech256Dataset.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] decode Decode the images after reading.
  /// \param[in] sampler Sampler object used to choose samples from the dataset.
  /// \param[in] cache Tensor cache to use.
  Caltech256Dataset(const std::vector<char> &dataset_dir, bool decode, const std::reference_wrapper<Sampler> sampler,
                    const std::shared_ptr<DatasetCache> &cache);

  /// \brief Destructor of Caltech256Dataset.
  ~Caltech256Dataset() = default;
};

/// \brief Function to create a Caltech256Dataset.
/// \note The generated dataset has two columns ["image", "label"].
/// \param[in] dataset_dir Path to the root directory that contains the dataset.
/// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not
///     given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()).
/// \param[in] decode Decode the images after reading (default=false).
/// \param[in] cache Tensor cache to use (default=nullptr, which means no cache is used).
/// \return Shared pointer to the Caltech256Dataset.
/// \par Example
/// \code
///     /* Define dataset path and MindData object */
///     std::string dataset_path = "/path/to/caltech256_dataset_directory";
///     std::shared_ptr<Dataset> ds = Caltech256(dataset_path, true, std::make_shared<RandomSampler>(false, 10));
///
///     /* Create iterator to read dataset */
///     std::shared_ptr<Iterator> iter = ds->CreateIterator();
///     std::unordered_map<std::string, mindspore::MSTensor> row;
///     iter->GetNextRow(&row);
///
///     /* Note: In Caltech256 dataset, each data dictionary has keys "image" and "label" */
///     auto image = row["image"];
/// \endcode
inline std::shared_ptr<Caltech256Dataset> MS_API
Caltech256(const std::string &dataset_dir, const std::shared_ptr<Sampler> &sampler = std::make_shared<RandomSampler>(),
           bool decode = false, const std::shared_ptr<DatasetCache> &cache = nullptr) {
  auto dir_chars = StringToChar(dataset_dir);
  return std::make_shared<Caltech256Dataset>(dir_chars, decode, sampler, cache);
}

/// \brief Function to create a Caltech256Dataset.
/// \note The generated dataset has two columns ["image", "label"].
/// \param[in] dataset_dir Path to the root directory that contains the dataset.
/// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset.
/// \param[in] decode Decode the images after reading (default=false).
/// \param[in] cache Tensor cache to use (default=nullptr, which means no cache is used).
/// \return Shared pointer to the Caltech256Dataset.
inline std::shared_ptr<Caltech256Dataset> MS_API Caltech256(const std::string &dataset_dir, const Sampler *sampler,
                                                            bool decode = false,
                                                            const std::shared_ptr<DatasetCache> &cache = nullptr) {
  auto dir_chars = StringToChar(dataset_dir);
  return std::make_shared<Caltech256Dataset>(dir_chars, decode, sampler, cache);
}

/// \brief Function to create a Caltech256Dataset.
/// \note The generated dataset has two columns ["image", "label"].
/// \param[in] dataset_dir Path to the root directory that contains the dataset.
/// \param[in] sampler Sampler object used to choose samples from the dataset.
/// \param[in] decode Decode the images after reading (default=false).
/// \param[in] cache Tensor cache to use (default=nullptr, which means no cache is used).
/// \return Shared pointer to the Caltech256Dataset.
inline std::shared_ptr<Caltech256Dataset> MS_API Caltech256(const std::string &dataset_dir,
                                                            const std::reference_wrapper<Sampler> sampler,
                                                            bool decode = false,
                                                            const std::shared_ptr<DatasetCache> &cache = nullptr) {
  auto dir_chars = StringToChar(dataset_dir);
  return std::make_shared<Caltech256Dataset>(dir_chars, decode, sampler, cache);
}

/// \class CelebADataset
/// \brief A source dataset for reading and parsing CelebA dataset.
class MS_API CelebADataset : public Dataset {
@@ -2864,7 +2951,7 @@ class MS_API LJSpeechDataset : public Dataset {
};

/// \brief Function to create a LJSpeech Dataset.
/// \notes The generated dataset has four columns ["waveform", "sample_rate", "transcription",
/// \note The generated dataset has four columns ["waveform", "sample_rate", "transcription",
/// "normalized_transcription"].
/// \param[in] dataset_dir Path to the root directory that contains the dataset.
/// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not
@@ -2878,7 +2965,7 @@ LJSpeech(const std::string &dataset_dir, const std::shared_ptr<Sampler> &sampler
}

/// \brief Function to create a LJSpeech Dataset.
/// \notes The generated dataset has four columns ["waveform", "sample_rate", "transcription",
/// \note The generated dataset has four columns ["waveform", "sample_rate", "transcription",
/// "normalized_transcription"].
/// \param[in] dataset_dir Path to the root directory that contains the dataset.
/// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset.
@@ -2890,7 +2977,7 @@ inline std::shared_ptr<LJSpeechDataset> MS_API LJSpeech(const std::string &datas
}

/// \brief Function to create a LJSpeech Dataset.
/// \notes The generated dataset has four columns ["waveform", "sample_rate", "transcription",
/// \note The generated dataset has four columns ["waveform", "sample_rate", "transcription",
/// "normalized_transcription"].
/// \param[in] dataset_dir Path to the root directory that contains the dataset.
/// \param[in] sampler Sampler object used to choose samples from the dataset.


+ 1
- 0
mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h View File

@@ -33,6 +33,7 @@ class SamplerObj;
/// \brief An abstract base class to represent a sampler in the data pipeline.
class MS_API Sampler : std::enable_shared_from_this<Sampler> {
friend class AlbumDataset;
friend class Caltech256Dataset;
friend class CelebADataset;
friend class Cifar10Dataset;
friend class Cifar100Dataset;


+ 360
- 3
mindspore/python/mindspore/dataset/engine/datasets.py View File

@@ -74,8 +74,9 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che
check_photo_tour_dataset, check_ag_news_dataset, check_dbpedia_dataset, check_lj_speech_dataset, \
check_yes_no_dataset, check_speech_commands_dataset, check_tedlium_dataset, check_svhn_dataset, \
check_stl10_dataset, check_yelp_review_dataset, check_penn_treebank_dataset, check_iwslt2016_dataset, \
check_iwslt2017_dataset, check_sogou_news_dataset, check_yahoo_answers_dataset, check_udpos_dataset,\
check_conll2000_dataset, check_amazon_review_dataset, check_semeion_dataset
check_iwslt2017_dataset, check_sogou_news_dataset, check_yahoo_answers_dataset, check_udpos_dataset, \
check_conll2000_dataset, check_amazon_review_dataset, check_semeion_dataset, check_caltech101_dataset, \
check_caltech256_dataset
from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \
get_prefetch_size
from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist
@@ -4822,7 +4823,7 @@ def _check_shm_usage(num_worker, queue_size, max_rowsize, num_queues=1):
threshold_ratio = 0.8
if platform.system().lower() not in {"windows", "darwin"}:
shm_estimate_usage = _get_device_num() * num_worker * num_queues * \
(queue_size + 2) * max_rowsize * 1024 * 1024
(queue_size + 2) * max_rowsize * 1024 * 1024
try:
shm_available = psutil.disk_usage('/dev/shm').free
if shm_estimate_usage >= threshold_ratio * shm_available:
@@ -6555,6 +6556,362 @@ class VOCDataset(MappableDataset):
return self.class_indexing


class _Caltech101Dataset:
"""
Mainly for loading Caltech101 Dataset, and return two rows each time.
"""

def __init__(self, dataset_dir, target_type="category", decode=False):
self.dataset_dir = os.path.realpath(dataset_dir)
self.image_dir = os.path.join(self.dataset_dir, "101_ObjectCategories")
self.annotation_dir = os.path.join(self.dataset_dir, "Annotations")
self.target_type = target_type
if self.target_type == "category":
self.column_names = ["image", "category"]
elif self.target_type == "annotation":
self.column_names = ["image", "annotation"]
else:
self.column_names = ["image", "category", "annotation"]
self.decode = decode
self.classes = sorted(os.listdir(self.image_dir))
if "BACKGROUND_Google" in self.classes:
self.classes.remove("BACKGROUND_Google")
name_map = {"Faces": "Faces_2",
"Faces_easy": "Faces_3",
"Motorbikes": "Motorbikes_16",
"airplanes": "Airplanes_Side_2"}
self.annotation_classes = [name_map[class_name] if class_name in name_map else class_name
for class_name in self.classes]
self.image_index = []
self.image_label = []
for i, image_class in enumerate(self.classes):
sub_dir = os.path.join(self.image_dir, image_class)
if not os.path.isdir(sub_dir) or not os.access(sub_dir, os.R_OK):
continue
num_images = len(os.listdir(sub_dir))
self.image_index.extend(range(1, num_images + 1))
self.image_label.extend(num_images * [i])

def __getitem__(self, index):
image_file = os.path.join(self.image_dir, self.classes[self.image_label[index]],
"image_{:04d}.jpg".format(self.image_index[index]))
if not os.path.exists(image_file):
raise ValueError("The image file {} does not exist or permission denied!".format(image_file))
if self.decode:
image = np.asarray(Image.open(image_file).convert("RGB"))
else:
image = np.fromfile(image_file, dtype=np.uint8)

if self.target_type == "category":
return image, self.image_label[index]
annotation_file = os.path.join(self.annotation_dir, self.annotation_classes[self.image_label[index]],
"annotation_{:04d}.mat".format(self.image_index[index]))
if not os.path.exists(annotation_file):
raise ValueError("The annotation file {} does not exist or permission denied!".format(annotation_file))
annotation = loadmat(annotation_file)["obj_contour"]

if self.target_type == "annotation":
return image, annotation
return image, self.image_label[index], annotation

    def __len__(self):
        # Dataset size equals the number of collected (index, label) pairs.
        return len(self.image_index)


class Caltech101Dataset(GeneratorDataset):
    """
    A source dataset that reads and parses Caltech101 dataset.

    The columns of the generated dataset depend on the value of `target_type`.
    When `target_type` is `category`, the columns are :py:obj:`[image, category]`.
    When `target_type` is `annotation`, the columns are :py:obj:`[image, annotation]`.
    When `target_type` is `all`, the columns are :py:obj:`[image, category, annotation]`.
    The tensor of column :py:obj:`image` is of the uint8 type.
    The tensor of column :py:obj:`category` is of the uint32 type.
    The tensor of column :py:obj:`annotation` is a 2-dimensional ndarray that stores the contour of the image
    and consists of a series of points.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset. This root directory contains two
            subdirectories, one is called 101_ObjectCategories, which stores images,
            and the other is called Annotations, which stores annotations.
        target_type (str, optional): Target of the image. If target_type is "category", return category represents
            the target class. If target_type is "annotation", return annotation.
            If target_type is "all", return category and annotation (default=None, means "category").
        num_samples (int, optional): The number of images to be included in the dataset
            (default=None, all images).
        num_parallel_workers (int, optional): Number of workers to read the data (default=1).
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset
            (default=None, expected order behavior shown in the table).
        decode (bool, optional): Whether or not to decode the images after reading (default=False).
        sampler (Sampler, optional): Object used to choose samples from the
            dataset (default=None, expected order behavior shown in the table).
        num_shards (int, optional): Number of shards that the dataset will be divided
            into (default=None). When this argument is specified, `num_samples` reflects
            the maximum sample number of per shard.
        shard_id (int, optional): The shard ID within num_shards (default=None). This
            argument can only be specified when num_shards is also specified.

    Raises:
        RuntimeError: If dataset_dir does not contain data files.
        ValueError: If target_type is not 'category', 'annotation' or 'all'.
        RuntimeError: If num_parallel_workers exceeds the max thread numbers.
        RuntimeError: If sampler and shuffle are specified at the same time.
        RuntimeError: If sampler and sharding are specified at the same time.
        RuntimeError: If num_shards is specified but shard_id is None.
        RuntimeError: If shard_id is specified but num_shards is None.
        ValueError: If shard_id is invalid (< 0 or >= num_shards).

    Note:
        - This dataset can take in a `sampler`. `sampler` and `shuffle` are mutually exclusive.
          The table below shows what input arguments are allowed and their expected behavior.

    .. list-table:: Expected Order Behavior of Using `sampler` and `shuffle`
       :widths: 25 25 50
       :header-rows: 1

       * - Parameter `sampler`
         - Parameter `shuffle`
         - Expected Order Behavior
       * - None
         - None
         - random order
       * - None
         - True
         - random order
       * - None
         - False
         - sequential order
       * - Sampler object
         - None
         - order defined by sampler
       * - Sampler object
         - True
         - not allowed
       * - Sampler object
         - False
         - not allowed

    Examples:
        >>> caltech101_dataset_directory = "/path/to/caltech101_dataset_directory"
        >>>
        >>> # 1) Read all samples (image files) in caltech101_dataset_directory with 8 threads
        >>> dataset = ds.Caltech101Dataset(dataset_dir=caltech101_dataset_directory, num_parallel_workers=8)
        >>>
        >>> # 2) Read all samples (image files) with the target_type "annotation"
        >>> dataset = ds.Caltech101Dataset(dataset_dir=caltech101_dataset_directory, target_type="annotation")

    About Caltech101Dataset:

    Pictures of objects belonging to 101 categories. About 40 to 800 images per category.
    Most categories have about 50 images. Collected in September 2003 by Fei-Fei Li, Marco Andreetto,
    and Marc 'Aurelio Ranzato. The size of each image is roughly 300 x 200 pixels.
    The official provides the contour data of each object in each picture, which is the annotation.

    .. code-block::

        .
        └── caltech101_dataset_directory
            ├── 101_ObjectCategories
            │    ├── Faces
            │    │    ├── image_0001.jpg
            │    │    ├── image_0002.jpg
            │    │    ...
            │    ├── Faces_easy
            │    │    ├── image_0001.jpg
            │    │    ├── image_0002.jpg
            │    │    ...
            │    ├── ...
            └── Annotations
                 ├── Airplanes_Side_2
                 │    ├── annotation_0001.mat
                 │    ├── annotation_0002.mat
                 │    ...
                 ├── Faces_2
                 │    ├── annotation_0001.mat
                 │    ├── annotation_0002.mat
                 │    ...
                 ├── ...

    Citation:

    .. code-block::

        @article{FeiFei2004LearningGV,
        author    = {Li Fei-Fei and Rob Fergus and Pietro Perona},
        title     = {Learning Generative Visual Models from Few Training Examples:
                    An Incremental Bayesian Approach Tested on 101 Object Categories},
        journal   = {Computer Vision and Pattern Recognition Workshop},
        year      = {2004},
        url       = {http://www.vision.caltech.edu/Image_Datasets/Caltech101/},
        }
    """

    @check_caltech101_dataset
    def __init__(self, dataset_dir, target_type=None, num_samples=None, num_parallel_workers=1,
                 shuffle=None, decode=False, sampler=None, num_shards=None, shard_id=None):
        self.dataset_dir = dataset_dir
        # "category" is the documented default when target_type is omitted.
        self.target_type = replace_none(target_type, "category")
        self.decode = replace_none(decode, False)
        # The Python-side source object also supplies the column names, which
        # depend on target_type (see _Caltech101Dataset).
        dataset = _Caltech101Dataset(self.dataset_dir, self.target_type, self.decode)
        super().__init__(dataset, column_names=dataset.column_names, num_samples=num_samples,
                         num_parallel_workers=num_parallel_workers, shuffle=shuffle, sampler=sampler,
                         num_shards=num_shards, shard_id=shard_id)

    def get_class_indexing(self):
        """
        Get the class index.

        Returns:
            dict, a str-to-int mapping from label name to index.
        """
        # Caltech101 class names in label order (BACKGROUND_Google excluded);
        # the position of each name is its label value, so the mapping can be
        # generated instead of hand-maintaining a 101-entry dict literal.
        class_names = (
            'Faces', 'Faces_easy', 'Leopards', 'Motorbikes', 'accordion', 'airplanes', 'anchor', 'ant',
            'barrel', 'bass', 'beaver', 'binocular', 'bonsai', 'brain', 'brontosaurus', 'buddha',
            'butterfly', 'camera', 'cannon', 'car_side', 'ceiling_fan', 'cellphone', 'chair', 'chandelier',
            'cougar_body', 'cougar_face', 'crab', 'crayfish', 'crocodile', 'crocodile_head', 'cup',
            'dalmatian', 'dollar_bill', 'dolphin', 'dragonfly', 'electric_guitar', 'elephant', 'emu',
            'euphonium', 'ewer', 'ferry', 'flamingo', 'flamingo_head', 'garfield', 'gerenuk', 'gramophone',
            'grand_piano', 'hawksbill', 'headphone', 'hedgehog', 'helicopter', 'ibis', 'inline_skate',
            'joshua_tree', 'kangaroo', 'ketch', 'lamp', 'laptop', 'llama', 'lobster', 'lotus', 'mandolin',
            'mayfly', 'menorah', 'metronome', 'minaret', 'nautilus', 'octopus', 'okapi', 'pagoda', 'panda',
            'pigeon', 'pizza', 'platypus', 'pyramid', 'revolver', 'rhino', 'rooster', 'saxophone',
            'schooner', 'scissors', 'scorpion', 'sea_horse', 'snoopy', 'soccer_ball', 'stapler',
            'starfish', 'stegosaurus', 'stop_sign', 'strawberry', 'sunflower', 'tick', 'trilobite',
            'umbrella', 'watch', 'water_lilly', 'wheelchair', 'wild_cat', 'windsor_chair', 'wrench',
            'yin_yang')
        return {name: label for label, name in enumerate(class_names)}


class Caltech256Dataset(MappableDataset):
    """
    A source dataset that reads and parses Caltech256 dataset.

    The generated dataset has two columns: :py:obj:`[image, label]`.
    The tensor of column :py:obj:`image` is of the uint8 type.
    The tensor of column :py:obj:`label` is of the int32 type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        num_samples (int, optional): The number of images to be included in the dataset
            (default=None, all images).
        num_parallel_workers (int, optional): Number of workers to read the data
            (default=None, set in the config).
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset
            (default=None, expected order behavior shown in the table).
        decode (bool, optional): Whether or not to decode the images after reading (default=False).
        sampler (Sampler, optional): Object used to choose samples from the
            dataset (default=None, expected order behavior shown in the table).
        num_shards (int, optional): Number of shards that the dataset will be divided
            into (default=None). When this argument is specified, `num_samples` reflects
            the maximum sample number of per shard.
        shard_id (int, optional): The shard ID within num_shards (default=None). This
            argument can only be specified when num_shards is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
            (default=None, which means no cache is used).

    Raises:
        RuntimeError: If dataset_dir does not contain data files.
        RuntimeError: If num_parallel_workers exceeds the max thread numbers.
        RuntimeError: If sampler and shuffle are specified at the same time.
        RuntimeError: If sampler and sharding are specified at the same time.
        RuntimeError: If num_shards is specified but shard_id is None.
        RuntimeError: If shard_id is specified but num_shards is None.
        ValueError: If shard_id is invalid (< 0 or >= num_shards).

    Note:
        - This dataset can take in a `sampler`. `sampler` and `shuffle` are mutually exclusive.
          The table below shows what input arguments are allowed and their expected behavior.

    .. list-table:: Expected Order Behavior of Using `sampler` and `shuffle`
       :widths: 25 25 50
       :header-rows: 1

       * - Parameter `sampler`
         - Parameter `shuffle`
         - Expected Order Behavior
       * - None
         - None
         - random order
       * - None
         - True
         - random order
       * - None
         - False
         - sequential order
       * - Sampler object
         - None
         - order defined by sampler
       * - Sampler object
         - True
         - not allowed
       * - Sampler object
         - False
         - not allowed

    Examples:
        >>> caltech256_dataset_dir = "/path/to/caltech256_dataset_directory"
        >>>
        >>> # 1) Read all samples (image files) in caltech256_dataset_dir with 8 threads
        >>> dataset = ds.Caltech256Dataset(dataset_dir=caltech256_dataset_dir, num_parallel_workers=8)

    About Caltech256Dataset:

    Caltech-256 is an object recognition dataset containing 30,607 real-world images, of different sizes,
    spanning 257 classes (256 object classes and an additional clutter class).
    Each class is represented by at least 80 images. The dataset is a superset of the Caltech-101 dataset.

    .. code-block::

        .
        └── caltech256_dataset_directory
             ├── 001.ak47
             │    ├── 001_0001.jpg
             │    ├── 001_0002.jpg
             │    ...
             ├── 002.american-flag
             │    ├── 002_0001.jpg
             │    ├── 002_0002.jpg
             │    ...
             ├── 003.backpack
             │    ├── 003_0001.jpg
             │    ├── 003_0002.jpg
             │    ...
             ├── ...

    Citation:

    .. code-block::

        @article{griffin2007caltech,
        title     = {Caltech-256 object category dataset},
        added-at  = {2021-01-21T02:54:42.000+0100},
        author    = {Griffin, Gregory and Holub, Alex and Perona, Pietro},
        biburl    = {https://www.bibsonomy.org/bibtex/21f746f23ff0307826cca3e3be45f8de7/s364315},
        interhash = {bfe1e648c1778c04baa60f23d1223375},
        intrahash = {1f746f23ff0307826cca3e3be45f8de7},
        publisher = {California Institute of Technology},
        timestamp = {2021-01-21T02:54:42.000+0100},
        year      = {2007}
        }
    """

    @check_caltech256_dataset
    def __init__(self, dataset_dir, num_samples=None, num_parallel_workers=None, shuffle=None, decode=False,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)

        self.dataset_dir = dataset_dir
        self.decode = replace_none(decode, False)

    def parse(self, children=None):
        # Build the C++ IR node; sampling/sharding are handled by self.sampler.
        return cde.Caltech256Node(self.dataset_dir, self.decode, self.sampler)


class CocoDataset(MappableDataset):
"""
A source dataset for reading and parsing COCO dataset.


+ 56
- 0
mindspore/python/mindspore/dataset/engine/validators.py View File

@@ -460,6 +460,62 @@ def check_usps_dataset(method):
return new_method


def check_caltech101_dataset(method):
    """A wrapper that wraps a parameter checker around the original Dataset(Caltech101Dataset)."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        _, param_dict = parse_user_args(method, *args, **kwargs)

        # The dataset root must exist and be a readable directory.
        check_dir(param_dict.get('dataset_dir'))

        # target_type is optional; when given it must name one of the three layouts.
        target_type = param_dict.get('target_type')
        if target_type is not None:
            check_valid_str(target_type, ["category", "annotation", "all"], "target_type")

        # Type-check the optional numeric, boolean and string parameters.
        validate_dataset_param_value(['num_samples', 'num_parallel_workers', 'num_shards', 'shard_id'],
                                     param_dict, int)
        validate_dataset_param_value(['shuffle', 'decode'], param_dict, bool)
        validate_dataset_param_value(['target_type'], param_dict, str)
        check_sampler_shuffle_shard_options(param_dict)

        check_cache_option(param_dict.get('cache'))

        return method(self, *args, **kwargs)

    return new_method


def check_caltech256_dataset(method):
    """A wrapper that wraps a parameter checker around the original Dataset(Caltech256Dataset)."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        _, param_dict = parse_user_args(method, *args, **kwargs)

        # The dataset root must exist and be a readable directory.
        check_dir(param_dict.get('dataset_dir'))

        # Type-check the optional numeric and boolean parameters.
        validate_dataset_param_value(['num_samples', 'num_parallel_workers', 'num_shards', 'shard_id'],
                                     param_dict, int)
        validate_dataset_param_value(['shuffle', 'decode'], param_dict, bool)
        check_sampler_shuffle_shard_options(param_dict)

        check_cache_option(param_dict.get('cache'))

        return method(self, *args, **kwargs)

    return new_method


def check_vocdataset(method):
"""A wrapper that wraps a parameter checker around the original Dataset(VOCDataset)."""



+ 1
- 0
tests/ut/cpp/dataset/CMakeLists.txt View File

@@ -17,6 +17,7 @@ SET(DE_UT_SRCS
c_api_dataset_ag_news_test.cc
c_api_dataset_album_test.cc
c_api_dataset_amazon_review_test.cc
c_api_dataset_caltech256_test.cc
c_api_dataset_cifar_test.cc
c_api_dataset_cityscapes_test.cc
c_api_dataset_clue_test.cc


+ 207
- 0
tests/ut/cpp/dataset/c_api_dataset_caltech256_test.cc View File

@@ -0,0 +1,207 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "common/common.h"
#include "minddata/dataset/include/dataset/datasets.h"

using namespace mindspore::dataset;
using mindspore::dataset::DataType;
using mindspore::dataset::Tensor;
using mindspore::dataset::TensorShape;

// Test fixture for the Caltech256 pipeline UTs below; inherits the common
// dataset-op test setup (e.g. datasets_root_path_) from UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};

/// Feature: Caltech256Dataset
/// Description: basic test of Caltech256Dataset
/// Expectation: the data is processed successfully
TEST_F(MindDataTestPipeline, TestCaltech256Dataset) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaltech256Dataset.";

  // Build a Caltech256 dataset over the shared image-folder test data,
  // sampling 44 rows without replacement.
  std::string dataset_path = datasets_root_path_ + "/testPK/data/";
  std::shared_ptr<Dataset> dataset = Caltech256(dataset_path, std::make_shared<RandomSampler>(false, 44));
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_NE(itr, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(itr->GetNextRow(&row));

  // Both expected output columns must be present.
  EXPECT_NE(row.find("image"), row.end());
  EXPECT_NE(row.find("label"), row.end());

  // Drain the pipeline, counting every produced row.
  uint64_t row_count = 0;
  while (!row.empty()) {
    ++row_count;
    auto image = row["image"];
    MS_LOG(INFO) << "Tensor image shape: " << image.Shape();
    ASSERT_OK(itr->GetNextRow(&row));
  }

  EXPECT_EQ(row_count, 44);

  // Shut the pipeline down explicitly.
  itr->Stop();
}

/// Feature: Caltech256Dataset
/// Description: test Caltech256Dataset in pipeline mode
/// Expectation: the data is processed successfully
TEST_F(MindDataTestPipeline, TestCaltech256DatasetWithPipeline) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaltech256DatasetWithPipeline.";

  // Two independent Caltech256 branches over the same folder, 3 samples each.
  std::string dataset_path = datasets_root_path_ + "/testPK/data/";
  std::shared_ptr<Dataset> branch1 = Caltech256(dataset_path, std::make_shared<RandomSampler>(false, 3));
  std::shared_ptr<Dataset> branch2 = Caltech256(dataset_path, std::make_shared<RandomSampler>(false, 3));
  EXPECT_NE(branch1, nullptr);
  EXPECT_NE(branch2, nullptr);

  // Repeat each branch once (keeps the op in the tree without multiplying rows).
  int32_t repeat_num = 1;
  branch1 = branch1->Repeat(repeat_num);
  EXPECT_NE(branch1, nullptr);
  branch2 = branch2->Repeat(repeat_num);
  EXPECT_NE(branch2, nullptr);

  // Project both branches onto the same column set so they can be concatenated.
  std::vector<std::string> projected_columns = {"image", "label"};
  branch1 = branch1->Project(projected_columns);
  EXPECT_NE(branch1, nullptr);
  branch2 = branch2->Project(projected_columns);
  EXPECT_NE(branch2, nullptr);

  // Concatenate the second branch onto the first.
  branch1 = branch1->Concat({branch2});
  EXPECT_NE(branch1, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> itr = branch1->CreateIterator();
  EXPECT_NE(itr, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(itr->GetNextRow(&row));

  EXPECT_NE(row.find("image"), row.end());
  EXPECT_NE(row.find("label"), row.end());

  // Drain the pipeline, counting every produced row.
  uint64_t row_count = 0;
  while (!row.empty()) {
    ++row_count;
    auto image = row["image"];
    MS_LOG(INFO) << "Tensor image shape: " << image.Shape();
    ASSERT_OK(itr->GetNextRow(&row));
  }

  // 3 samples from each branch -> 6 rows in total.
  EXPECT_EQ(row_count, 6);

  // Shut the pipeline down explicitly.
  itr->Stop();
}

/// Feature: Caltech256Dataset
/// Description: test getting size of Caltech256Dataset
/// Expectation: the size is correct
TEST_F(MindDataTestPipeline, TestCaltech256GetDatasetSize) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaltech256GetDatasetSize.";

  // The shared image-folder test data holds 44 images in total.
  std::string dataset_path = datasets_root_path_ + "/testPK/data/";
  std::shared_ptr<Dataset> dataset = Caltech256(dataset_path);
  EXPECT_NE(dataset, nullptr);
  EXPECT_EQ(dataset->GetDatasetSize(), 44);
}

/// Feature: Caltech256Dataset
/// Description: test Caltech256Dataset with mix getter
/// Expectation: the data is processed successfully
TEST_F(MindDataTestPipeline, TestCaltech256Getters) {
  // Log message fixed to match the actual test name (was "TestCaltech256MixGetter").
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaltech256Getters.";

  // Create a Caltech256 Dataset.
  std::string folder_path = datasets_root_path_ + "/testPK/data/";
  std::shared_ptr<Dataset> ds = Caltech256(folder_path);
  EXPECT_NE(ds, nullptr);

  // First pass: query every getter once and record the answers.
  EXPECT_EQ(ds->GetDatasetSize(), 44);
  std::vector<DataType> types = ToDETypes(ds->GetOutputTypes());
  std::vector<TensorShape> shapes = ToTensorShapeVec(ds->GetOutputShapes());
  std::vector<std::string> column_names = {"image", "label"};
  int64_t num_classes = ds->GetNumClasses();
  EXPECT_EQ(types.size(), 2);
  EXPECT_EQ(types[0].ToString(), "uint8");
  EXPECT_EQ(types[1].ToString(), "int32");
  EXPECT_EQ(shapes.size(), 2);
  EXPECT_EQ(num_classes, 4);
  EXPECT_EQ(ds->GetBatchSize(), 1);
  EXPECT_EQ(ds->GetRepeatCount(), 1);

  // Second pass: repeated queries must return the same answers.
  EXPECT_EQ(ds->GetDatasetSize(), 44);
  EXPECT_EQ(ToDETypes(ds->GetOutputTypes()), types);
  EXPECT_EQ(ToTensorShapeVec(ds->GetOutputShapes()), shapes);
  EXPECT_EQ(ds->GetNumClasses(), 4);

  // Third pass: mix in GetColumnNames to vary the getter access order.
  EXPECT_EQ(ds->GetColumnNames(), column_names);
  EXPECT_EQ(ds->GetDatasetSize(), 44);
  EXPECT_EQ(ToDETypes(ds->GetOutputTypes()), types);
  EXPECT_EQ(ToTensorShapeVec(ds->GetOutputShapes()), shapes);
  EXPECT_EQ(ds->GetBatchSize(), 1);
  EXPECT_EQ(ds->GetRepeatCount(), 1);
  EXPECT_EQ(ds->GetNumClasses(), 4);
  EXPECT_EQ(ds->GetDatasetSize(), 44);
}

/// Feature: Caltech256Dataset
/// Description: test Caltech256Dataset with the fail of reading dataset
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestCaltech256DatasetFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaltech256DatasetFail.";

  // An empty dataset path is invalid, but node construction is lazy and succeeds.
  std::shared_ptr<Dataset> dataset = Caltech256("", std::make_shared<RandomSampler>(false, 10));
  EXPECT_NE(dataset, nullptr);

  // Validation runs when the iterator is built, so it must come back null.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_EQ(itr, nullptr);
}

/// Feature: Caltech256Dataset
/// Description: test Caltech256Dataset with the null sampler
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestCaltech256DatasetWithNullSamplerFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaltech256DatasetWithNullSamplerFail.";

  // A null sampler is invalid, but node construction is lazy and succeeds.
  std::string dataset_path = datasets_root_path_ + "/testPK/data/";
  std::shared_ptr<Dataset> dataset = Caltech256(dataset_path, nullptr);
  EXPECT_NE(dataset, nullptr);

  // Validation runs when the iterator is built, so it must come back null.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_EQ(itr, nullptr);
}

BIN
tests/ut/data/dataset/testCaltech101Data/101_ObjectCategories/apple/image_0001.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 440 kB

BIN
tests/ut/data/dataset/testCaltech101Data/101_ObjectCategories/apple/image_0002.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 432 kB

BIN
tests/ut/data/dataset/testCaltech101Data/101_ObjectCategories/banana/image_0001.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 176 kB

BIN
tests/ut/data/dataset/testCaltech101Data/101_ObjectCategories/banana/image_0002.jpg View File

Before After
Width: 4032  |  Height: 2268  |  Size: 174 kB

BIN
tests/ut/data/dataset/testCaltech101Data/Annotations/apple/annotation_0001.mat View File


BIN
tests/ut/data/dataset/testCaltech101Data/Annotations/apple/annotation_0002.mat View File


BIN
tests/ut/data/dataset/testCaltech101Data/Annotations/banana/annotation_0001.mat View File


BIN
tests/ut/data/dataset/testCaltech101Data/Annotations/banana/annotation_0002.mat View File


+ 349
- 0
tests/ut/python/dataset/test_datasets_caltech101.py View File

@@ -0,0 +1,349 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Test Caltech101 dataset operators
"""
import os

import matplotlib.pyplot as plt
import numpy as np
import pytest
from PIL import Image
from scipy.io import loadmat

import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as c_vision
from mindspore import log as logger

DATASET_DIR = "../data/dataset/testCaltech101Data"
WRONG_DIR = "../data/dataset/notExist"


def get_index_info():
    """Scan the test data on disk and return (image_index, image_label, classes, annotation_classes)."""
    dataset_dir = os.path.realpath(DATASET_DIR)
    image_dir = os.path.join(dataset_dir, "101_ObjectCategories")
    classes = sorted(os.listdir(image_dir))
    # The background clutter class carries no label.
    if "BACKGROUND_Google" in classes:
        classes.remove("BACKGROUND_Google")
    # Four image classes use a different directory name under Annotations/.
    name_map = {"Faces": "Faces_2",
                "Faces_easy": "Faces_3",
                "Motorbikes": "Motorbikes_16",
                "airplanes": "Airplanes_Side_2"}
    annotation_classes = [name_map.get(class_name, class_name) for class_name in classes]
    image_index = []
    image_label = []
    for label, image_class in enumerate(classes):
        class_dir = os.path.join(image_dir, image_class)
        if not os.path.isdir(class_dir) or not os.access(class_dir, os.R_OK):
            continue
        count = len(os.listdir(class_dir))
        # Image files are numbered from 1 within each class directory.
        image_index.extend(range(1, count + 1))
        image_label.extend([label] * count)
    return image_index, image_label, classes, annotation_classes


def load_caltech101(target_type="category", decode=False):
    """
    load Caltech101 data
    """
    dataset_dir = os.path.realpath(DATASET_DIR)
    image_dir = os.path.join(dataset_dir, "101_ObjectCategories")
    annotation_dir = os.path.join(dataset_dir, "Annotations")
    image_index, image_label, classes, annotation_classes = get_index_info()
    images, categories, annotations = [], [], []
    # Load every image, either decoded to RGB or as raw JPEG bytes.
    for index, label in zip(image_index, image_label):
        image_file = os.path.join(image_dir, classes[label], "image_{:04d}.jpg".format(index))
        if not os.path.exists(image_file):
            raise ValueError("The image file {} does not exist or permission denied!".format(image_file))
        if decode:
            images.append(np.asarray(Image.open(image_file).convert("RGB")))
        else:
            images.append(np.fromfile(image_file, dtype=np.uint8))
    if target_type == "category":
        categories = list(image_label)
        return images, categories
    # "annotation" and "all" both need the contour matrices.
    for index, label in zip(image_index, image_label):
        annotation_file = os.path.join(annotation_dir, annotation_classes[label],
                                       "annotation_{:04d}.mat".format(index))
        if not os.path.exists(annotation_file):
            raise ValueError("The annotation file {} does not exist or permission denied!".format(annotation_file))
        annotations.append(loadmat(annotation_file)["obj_contour"])
    if target_type == "annotation":
        return images, annotations
    categories = list(image_label)
    return images, categories, annotations


def visualize_dataset(images, labels):
    """
    Helper function to visualize the dataset samples
    """
    total = len(images)
    for position, (img, lab) in enumerate(zip(images, labels)):
        plt.subplot(1, total, position + 1)
        plt.imshow(img.squeeze())
        plt.title(lab)
    plt.show()


def test_caltech101_content_check():
    """
    Feature: Caltech101Dataset
    Description: check if the image data of caltech101 dataset is read correctly
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech101Dataset Op with content check")
    # Compare the operator's output row-by-row against a pure-Python reload of
    # the same files; shuffle=False keeps both sides in the same order.
    all_data = ds.Caltech101Dataset(DATASET_DIR, target_type="annotation", num_samples=4, shuffle=False, decode=True)
    images, annotations = load_caltech101(target_type="annotation", decode=True)
    num_iter = 0
    for i, data in enumerate(all_data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(data["image"], images[i])
        np.testing.assert_array_equal(data["annotation"], annotations[i])
        num_iter += 1
    assert num_iter == 4

    # target_type="all" additionally yields the integer category column.
    all_data = ds.Caltech101Dataset(DATASET_DIR, target_type="all", num_samples=4, shuffle=False, decode=True)
    images, categories, annotations = load_caltech101(target_type="all", decode=True)
    num_iter = 0
    for i, data in enumerate(all_data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(data["image"], images[i])
        np.testing.assert_array_equal(data["category"], categories[i])
        np.testing.assert_array_equal(data["annotation"], annotations[i])
        num_iter += 1
    assert num_iter == 4


def test_caltech101_basic():
    """
    Feature: Caltech101Dataset
    Description: basic test of Caltech101Dataset
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech101Dataset Op")

    # case 1: test target_type
    # Two identical unshuffled datasets must yield identical categories.
    all_data_1 = ds.Caltech101Dataset(DATASET_DIR, shuffle=False)
    all_data_2 = ds.Caltech101Dataset(DATASET_DIR, shuffle=False)

    num_iter = 0
    for item1, item2 in zip(all_data_1.create_dict_iterator(num_epochs=1, output_numpy=True),
                            all_data_2.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(item1["category"], item2["category"])
        num_iter += 1
    assert num_iter == 4

    # case 2: test decode
    # With decode=True the image column holds identical decoded arrays.
    all_data_1 = ds.Caltech101Dataset(DATASET_DIR, decode=True, shuffle=False)
    all_data_2 = ds.Caltech101Dataset(DATASET_DIR, decode=True, shuffle=False)

    num_iter = 0
    for item1, item2 in zip(all_data_1.create_dict_iterator(num_epochs=1, output_numpy=True),
                            all_data_2.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(item1["image"], item2["image"])
        num_iter += 1
    assert num_iter == 4

    # case 3: test num_samples
    all_data = ds.Caltech101Dataset(DATASET_DIR, num_samples=4)
    num_iter = 0
    for _ in all_data.create_dict_iterator(num_epochs=1):
        num_iter += 1
    assert num_iter == 4

    # case 4: test repeat
    # repeat(2) doubles the number of produced rows.
    all_data = ds.Caltech101Dataset(DATASET_DIR, num_samples=4)
    all_data = all_data.repeat(2)
    num_iter = 0
    for _ in all_data.create_dict_iterator(num_epochs=1):
        num_iter += 1
    assert num_iter == 8

    # case 5: test get_dataset_size, resize and batch
    all_data = ds.Caltech101Dataset(DATASET_DIR, num_samples=4)
    all_data = all_data.map(operations=[c_vision.Decode(), c_vision.Resize((224, 224))], input_columns=["image"],
                            num_parallel_workers=1)

    assert all_data.get_dataset_size() == 4
    assert all_data.get_batch_size() == 1
    # drop_remainder is default to be False
    # 4 samples / batch_size 3 -> 2 batches (the second is partial).
    all_data = all_data.batch(batch_size=3)
    assert all_data.get_batch_size() == 3
    assert all_data.get_dataset_size() == 2

    num_iter = 0
    for _ in all_data.create_dict_iterator(num_epochs=1):
        num_iter += 1
    assert num_iter == 2

    # case 6: test get_class_indexing
    # The mapping is the fixed Caltech101 label table: Faces first, yin_yang last.
    all_data = ds.Caltech101Dataset(DATASET_DIR, num_samples=4)
    class_indexing = all_data.get_class_indexing()
    assert class_indexing["Faces"] == 0
    assert class_indexing["yin_yang"] == 100


def test_caltech101_target_type():
    """
    Feature: Caltech101Dataset
    Description: test Caltech101Dataset with target_type
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech101Dataset Op with target_type")
    # target_type="annotation": two identical unshuffled runs must agree.
    all_data_1 = ds.Caltech101Dataset(DATASET_DIR, target_type="annotation", shuffle=False)
    all_data_2 = ds.Caltech101Dataset(DATASET_DIR, target_type="annotation", shuffle=False)
    num_iter = 0
    for item1, item2 in zip(all_data_1.create_dict_iterator(num_epochs=1, output_numpy=True),
                            all_data_2.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(item1["annotation"], item2["annotation"])
        num_iter += 1
    assert num_iter == 4
    # target_type="all": both category and annotation columns are produced.
    all_data_1 = ds.Caltech101Dataset(DATASET_DIR, target_type="all", shuffle=False)
    all_data_2 = ds.Caltech101Dataset(DATASET_DIR, target_type="all", shuffle=False)
    num_iter = 0
    for item1, item2 in zip(all_data_1.create_dict_iterator(num_epochs=1, output_numpy=True),
                            all_data_2.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(item1["category"], item2["category"])
        np.testing.assert_array_equal(item1["annotation"], item2["annotation"])
        num_iter += 1
    assert num_iter == 4
    # target_type="category": only the category column is checked.
    all_data_1 = ds.Caltech101Dataset(DATASET_DIR, target_type="category", shuffle=False)
    all_data_2 = ds.Caltech101Dataset(DATASET_DIR, target_type="category", shuffle=False)
    num_iter = 0
    for item1, item2 in zip(all_data_1.create_dict_iterator(num_epochs=1, output_numpy=True),
                            all_data_2.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(item1["category"], item2["category"])
        num_iter += 1
    assert num_iter == 4


def test_caltech101_sequential_sampler():
    """
    Feature: Caltech101Dataset
    Description: test Caltech101Dataset with SequentialSampler
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech101Dataset Op with SequentialSampler")
    # A SequentialSampler must produce the same order as shuffle=False.
    num_samples = 4
    sampler_data = ds.Caltech101Dataset(DATASET_DIR, sampler=ds.SequentialSampler(num_samples=num_samples))
    ordered_data = ds.Caltech101Dataset(DATASET_DIR, shuffle=False, num_samples=num_samples)
    labels_a, labels_b = [], []
    row_count = 0
    for row_a, row_b in zip(sampler_data.create_dict_iterator(num_epochs=1),
                            ordered_data.create_dict_iterator(num_epochs=1)):
        labels_a.append(row_a["category"].asnumpy())
        labels_b.append(row_b["category"].asnumpy())
        row_count += 1
    np.testing.assert_array_equal(labels_a, labels_b)
    assert row_count == num_samples


def test_caltech101_exception():
    """
    Feature: Caltech101Dataset
    Description: test error cases for Caltech101Dataset
    Expectation: throw correct error and message
    """
    logger.info("Test error cases for Caltech101Dataset")
    # a sampler is mutually exclusive with shuffle and with sharding
    error_msg_1 = "sampler and shuffle cannot be specified at the same time"
    with pytest.raises(RuntimeError, match=error_msg_1):
        ds.Caltech101Dataset(DATASET_DIR, shuffle=False, sampler=ds.SequentialSampler(1))

    error_msg_2 = "sampler and sharding cannot be specified at the same time"
    with pytest.raises(RuntimeError, match=error_msg_2):
        ds.Caltech101Dataset(DATASET_DIR, sampler=ds.SequentialSampler(1), num_shards=2, shard_id=0)

    # num_shards and shard_id must be given together
    error_msg_3 = "num_shards is specified and currently requires shard_id as well"
    with pytest.raises(RuntimeError, match=error_msg_3):
        ds.Caltech101Dataset(DATASET_DIR, num_shards=10)

    error_msg_4 = "shard_id is specified but num_shards is not"
    with pytest.raises(RuntimeError, match=error_msg_4):
        ds.Caltech101Dataset(DATASET_DIR, shard_id=0)

    # shard_id must lie within [0, num_shards)
    error_msg_5 = "Input shard_id is not within the required interval"
    with pytest.raises(ValueError, match=error_msg_5):
        ds.Caltech101Dataset(DATASET_DIR, num_shards=5, shard_id=-1)

    with pytest.raises(ValueError, match=error_msg_5):
        ds.Caltech101Dataset(DATASET_DIR, num_shards=5, shard_id=5)

    with pytest.raises(ValueError, match=error_msg_5):
        ds.Caltech101Dataset(DATASET_DIR, num_shards=2, shard_id=5)

    # num_parallel_workers must be positive and within the system limit
    error_msg_6 = "num_parallel_workers exceeds"
    with pytest.raises(ValueError, match=error_msg_6):
        ds.Caltech101Dataset(DATASET_DIR, shuffle=False, num_parallel_workers=0)
    with pytest.raises(ValueError, match=error_msg_6):
        ds.Caltech101Dataset(DATASET_DIR, shuffle=False, num_parallel_workers=256)
    with pytest.raises(ValueError, match=error_msg_6):
        ds.Caltech101Dataset(DATASET_DIR, shuffle=False, num_parallel_workers=-2)

    error_msg_7 = "Argument shard_id"
    with pytest.raises(TypeError, match=error_msg_7):
        ds.Caltech101Dataset(DATASET_DIR, num_shards=2, shard_id="0")

    error_msg_8 = "does not exist or is not a directory or permission denied!"
    with pytest.raises(ValueError, match=error_msg_8):
        # BUG FIX: WRONG_DIR was previously passed twice positionally, which
        # fed the bad path into target_type instead of testing the bad
        # dataset_dir path alone.
        all_data = ds.Caltech101Dataset(WRONG_DIR)
        for _ in all_data.create_dict_iterator(num_epochs=1):
            pass

    # the match string is a regex, hence the escaped brackets
    error_msg_9 = "Input target_type is not within the valid set of \\['category', 'annotation', 'all'\\]."
    with pytest.raises(ValueError, match=error_msg_9):
        all_data = ds.Caltech101Dataset(DATASET_DIR, target_type="cate")
        for _ in all_data.create_dict_iterator(num_epochs=1):
            pass


def test_caltech101_visualize(plot=False):
    """
    Feature: Caltech101Dataset
    Description: visualize Caltech101Dataset results
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech101Dataset visualization")

    dataset = ds.Caltech101Dataset(DATASET_DIR, num_samples=4, decode=True, shuffle=False)
    images = []
    labels = []
    row_count = 0
    for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        img = row["image"]
        cat = row["category"]
        images.append(img)
        labels.append("label {}".format(cat))
        # every decoded sample must be an HWC uint8 image with 3 channels
        assert isinstance(img, np.ndarray)
        assert img.ndim == 3
        assert img.shape[-1] == 3
        assert img.dtype == np.uint8
        assert cat.dtype == np.int64
        row_count += 1
    assert row_count == 4
    if plot:
        visualize_dataset(images, labels)


if __name__ == '__main__':
    # Run the full Caltech101 test suite directly (plot enabled so the
    # visualization path is exercised outside of pytest).
    test_caltech101_content_check()
    test_caltech101_basic()
    test_caltech101_target_type()
    test_caltech101_sequential_sampler()
    test_caltech101_exception()
    test_caltech101_visualize(plot=True)

+ 223
- 0
tests/ut/python/dataset/test_datasets_caltech256.py View File

@@ -0,0 +1,223 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Test Caltech256 dataset operators
"""
import numpy as np
import pytest

import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as c_vision
from mindspore import log as logger

IMAGE_DATA_DIR = "../data/dataset/testPK/data"
WRONG_DIR = "../data/dataset/notExist"


def test_caltech256_basic():
    """
    Feature: Caltech256Dataset
    Description: basic test of Caltech256Dataset
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech256Dataset Op")

    # case 1: two identical unshuffled reads must yield matching labels
    left = ds.Caltech256Dataset(IMAGE_DATA_DIR, shuffle=False)
    right = ds.Caltech256Dataset(IMAGE_DATA_DIR, shuffle=False)

    rows = 0
    for lhs, rhs in zip(left.create_dict_iterator(num_epochs=1, output_numpy=True),
                        right.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(lhs["label"], rhs["label"])
        rows += 1
    assert rows == 44

    # case 2: decoding must not change the label stream
    left = ds.Caltech256Dataset(IMAGE_DATA_DIR, decode=True, shuffle=False)
    right = ds.Caltech256Dataset(IMAGE_DATA_DIR, decode=True, shuffle=False)

    rows = 0
    for lhs, rhs in zip(left.create_dict_iterator(num_epochs=1, output_numpy=True),
                        right.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(lhs["label"], rhs["label"])
        rows += 1
    assert rows == 44

    # case 3: num_samples caps the number of rows
    subset = ds.Caltech256Dataset(IMAGE_DATA_DIR, num_samples=4)
    rows = sum(1 for _ in subset.create_dict_iterator(num_epochs=1))
    assert rows == 4

    # case 4: repeat(2) doubles the row count
    subset = ds.Caltech256Dataset(IMAGE_DATA_DIR, num_samples=4)
    subset = subset.repeat(2)
    rows = sum(1 for _ in subset.create_dict_iterator(num_epochs=1))
    assert rows == 8

    # case 5: get_dataset_size, resize and batch
    subset = ds.Caltech256Dataset(IMAGE_DATA_DIR, num_samples=4)
    subset = subset.map(operations=[c_vision.Decode(), c_vision.Resize((224, 224))], input_columns=["image"],
                        num_parallel_workers=1)

    assert subset.get_dataset_size() == 4
    assert subset.get_batch_size() == 1
    # drop_remainder defaults to False, so the partial batch is kept
    subset = subset.batch(batch_size=3)
    assert subset.get_batch_size() == 3
    assert subset.get_dataset_size() == 2

    rows = sum(1 for _ in subset.create_dict_iterator(num_epochs=1))
    assert rows == 2


def test_caltech256_decode():
    """
    Feature: Caltech256Dataset
    Description: validate Caltech256Dataset with decode
    Expectation: the data is processed successfully
    """
    logger.info("Validate Caltech256Dataset with decode")
    # define parameters
    repeat_count = 1

    decoded = ds.Caltech256Dataset(IMAGE_DATA_DIR, decode=True)
    decoded = decoded.repeat(repeat_count)

    row_count = 0
    # each row is a dictionary with keys "image" and "label"
    for row in decoded.create_dict_iterator(num_epochs=1):
        logger.info("image is {}".format(row["image"]))
        logger.info("label is {}".format(row["label"]))
        row_count += 1

    logger.info("Number of data in data1: {}".format(row_count))
    assert row_count == 44


def test_caltech256_sequential_sampler():
    """
    Feature: Caltech256Dataset
    Description: test Caltech256Dataset with SequentialSampler
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech256Dataset Op with SequentialSampler")
    num_samples = 4
    # an explicit SequentialSampler must match an unshuffled truncated read
    sequential = ds.Caltech256Dataset(IMAGE_DATA_DIR,
                                      sampler=ds.SequentialSampler(num_samples=num_samples))
    truncated = ds.Caltech256Dataset(IMAGE_DATA_DIR, shuffle=False, num_samples=num_samples)
    labels_a = []
    labels_b = []
    seen = 0
    for row_a, row_b in zip(sequential.create_dict_iterator(num_epochs=1),
                            truncated.create_dict_iterator(num_epochs=1)):
        labels_a.append(row_a["label"].asnumpy())
        labels_b.append(row_b["label"].asnumpy())
        seen += 1
    np.testing.assert_array_equal(labels_a, labels_b)
    assert seen == num_samples


def test_caltech256_random_sampler():
    """
    Feature: Caltech256Dataset
    Description: test Caltech256Dataset with RandomSampler
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech256Dataset Op with RandomSampler")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    shuffled = ds.Caltech256Dataset(IMAGE_DATA_DIR, sampler=ds.RandomSampler())
    shuffled = shuffled.repeat(repeat_count)

    seen = 0
    # each row is a dictionary with keys "image" and "label"
    for row in shuffled.create_dict_iterator(num_epochs=1):
        logger.info("image is {}".format(row["image"]))
        logger.info("label is {}".format(row["label"]))
        seen += 1

    logger.info("Number of data in data1: {}".format(seen))
    assert seen == 44


def test_caltech256_exception():
    """
    Feature: Caltech256Dataset
    Description: test error cases for Caltech256Dataset
    Expectation: throw correct error and message
    """
    logger.info("Test error cases for Caltech256Dataset")
    # a sampler is mutually exclusive with shuffle and with sharding
    with pytest.raises(RuntimeError, match="sampler and shuffle cannot be specified at the same time"):
        ds.Caltech256Dataset(IMAGE_DATA_DIR, shuffle=False, sampler=ds.SequentialSampler(1))

    with pytest.raises(RuntimeError, match="sampler and sharding cannot be specified at the same time"):
        ds.Caltech256Dataset(IMAGE_DATA_DIR, sampler=ds.SequentialSampler(1), num_shards=2, shard_id=0)

    # num_shards and shard_id must be given together
    with pytest.raises(RuntimeError, match="num_shards is specified and currently requires shard_id as well"):
        ds.Caltech256Dataset(IMAGE_DATA_DIR, num_shards=10)

    with pytest.raises(RuntimeError, match="shard_id is specified but num_shards is not"):
        ds.Caltech256Dataset(IMAGE_DATA_DIR, shard_id=0)

    # shard_id must lie within [0, num_shards)
    bad_shard_msg = "Input shard_id is not within the required interval"
    for shards, shard in ((5, -1), (5, 5), (2, 5)):
        with pytest.raises(ValueError, match=bad_shard_msg):
            ds.Caltech256Dataset(IMAGE_DATA_DIR, num_shards=shards, shard_id=shard)

    # num_parallel_workers must be positive and within the system limit
    bad_workers_msg = "num_parallel_workers exceeds"
    for workers in (0, 256, -2):
        with pytest.raises(ValueError, match=bad_workers_msg):
            ds.Caltech256Dataset(IMAGE_DATA_DIR, shuffle=False, num_parallel_workers=workers)

    with pytest.raises(TypeError, match="Argument shard_id"):
        ds.Caltech256Dataset(IMAGE_DATA_DIR, num_shards=2, shard_id="0")

    with pytest.raises(ValueError, match="does not exist or is not a directory or permission denied!"):
        bad_data = ds.Caltech256Dataset(WRONG_DIR)
        for _ in bad_data.create_dict_iterator(num_epochs=1):
            pass


if __name__ == '__main__':
    # Run every Caltech256 dataset test when this file is executed directly.
    test_caltech256_basic()
    test_caltech256_decode()
    test_caltech256_sequential_sampler()
    test_caltech256_random_sampler()
    test_caltech256_exception()

Loading…
Cancel
Save