| @@ -90,6 +90,7 @@ | |||
| // IR leaf nodes disabled for android | |||
| #ifndef ENABLE_ANDROID | |||
| #include "minddata/dataset/engine/ir/datasetops/source/amazon_review_node.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/source/caltech256_node.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/source/celeba_node.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/source/cifar100_node.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h" | |||
| @@ -914,6 +915,29 @@ AmazonReviewDataset::AmazonReviewDataset(const std::vector<char> &dataset_dir, c | |||
| ir_node_ = std::static_pointer_cast<DatasetNode>(ds); | |||
| } | |||
| Caltech256Dataset::Caltech256Dataset(const std::vector<char> &dataset_dir, bool decode, | |||
| const std::shared_ptr<Sampler> &sampler, | |||
| const std::shared_ptr<DatasetCache> &cache) { | |||
| auto sampler_obj = sampler ? sampler->Parse() : nullptr; | |||
| auto ds = std::make_shared<Caltech256Node>(CharToString(dataset_dir), decode, sampler_obj, cache); | |||
| ir_node_ = std::static_pointer_cast<DatasetNode>(ds); | |||
| } | |||
| Caltech256Dataset::Caltech256Dataset(const std::vector<char> &dataset_dir, bool decode, const Sampler *sampler, | |||
| const std::shared_ptr<DatasetCache> &cache) { | |||
| auto sampler_obj = sampler ? sampler->Parse() : nullptr; | |||
| auto ds = std::make_shared<Caltech256Node>(CharToString(dataset_dir), decode, sampler_obj, cache); | |||
| ir_node_ = std::static_pointer_cast<DatasetNode>(ds); | |||
| } | |||
| Caltech256Dataset::Caltech256Dataset(const std::vector<char> &dataset_dir, bool decode, | |||
| const std::reference_wrapper<Sampler> sampler, | |||
| const std::shared_ptr<DatasetCache> &cache) { | |||
| auto sampler_obj = sampler.get().Parse(); | |||
| auto ds = std::make_shared<Caltech256Node>(CharToString(dataset_dir), decode, sampler_obj, cache); | |||
| ir_node_ = std::static_pointer_cast<DatasetNode>(ds); | |||
| } | |||
| CelebADataset::CelebADataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage, | |||
| const std::shared_ptr<Sampler> &sampler, bool decode, | |||
| const std::set<std::vector<char>> &extensions, | |||
| @@ -27,6 +27,7 @@ | |||
| // IR leaf nodes | |||
| #include "minddata/dataset/engine/ir/datasetops/source/ag_news_node.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/source/amazon_review_node.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/source/caltech256_node.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/source/celeba_node.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/source/cifar100_node.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h" | |||
| @@ -79,7 +80,6 @@ namespace dataset { | |||
| // PYBIND FOR LEAF NODES | |||
| // (In alphabetical order) | |||
| PYBIND_REGISTER(AGNewsNode, 2, ([](const py::module *m) { | |||
| (void)py::class_<AGNewsNode, DatasetNode, std::shared_ptr<AGNewsNode>>(*m, "AGNewsNode", | |||
| "to create an AGNewsNode") | |||
| @@ -104,6 +104,17 @@ PYBIND_REGISTER(AmazonReviewNode, 2, ([](const py::module *m) { | |||
| })); | |||
| })); | |||
// Register Caltech256Node with pybind so the Python layer can construct the IR node.
PYBIND_REGISTER(Caltech256Node, 2, ([](const py::module *m) {
                  (void)py::class_<Caltech256Node, DatasetNode, std::shared_ptr<Caltech256Node>>(
                    *m, "Caltech256Node", "to create a Caltech256Node")
                    // Python-side ctor takes (dataset_dir, decode, sampler); cache is passed as nullptr
                    // here — presumably the cache is attached elsewhere in the pipeline; confirm with callers.
                    .def(py::init([](std::string dataset_dir, bool decode, py::handle sampler) {
                      auto caltech256 =
                        std::make_shared<Caltech256Node>(dataset_dir, decode, toSamplerObj(sampler), nullptr);
                      // Validate eagerly so bad arguments surface at dataset-construction time in Python.
                      THROW_IF_ERROR(caltech256->ValidateParams());
                      return caltech256;
                    }));
                }));
| PYBIND_REGISTER(CelebANode, 2, ([](const py::module *m) { | |||
| (void)py::class_<CelebANode, DatasetNode, std::shared_ptr<CelebANode>>(*m, "CelebANode", | |||
| "to create a CelebANode") | |||
| @@ -6,6 +6,7 @@ set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES | |||
| ag_news_op.cc | |||
| album_op.cc | |||
| amazon_review_op.cc | |||
| caltech_op.cc | |||
| celeba_op.cc | |||
| cifar_op.cc | |||
| cityscapes_op.cc | |||
| @@ -0,0 +1,32 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "minddata/dataset/engine/datasetops/source/caltech_op.h" | |||
| #include <map> | |||
| #include <memory> | |||
| #include <set> | |||
| #include <utility> | |||
| namespace mindspore { | |||
| namespace dataset { | |||
// Image file extensions accepted by the Caltech256 scan; other files are skipped.
// (const at namespace scope => internal linkage, so no ODR clash with other TUs.)
const std::set<std::string> kExts = {".jpg", ".JPEG"};
// Empty mapping: class indices are derived from the folder names by ImageFolderOp.
const std::map<std::string, int32_t> kClassIndex = {};
// Constructor: CaltechOp is a thin specialization of ImageFolderOp configured with the
// Caltech256 extension set and an empty explicit class index.
// NOTE(review): the literal `false` is assumed to be ImageFolderOp's recursive-scan flag —
// confirm against the ImageFolderOp constructor signature.
CaltechOp::CaltechOp(int32_t num_workers, const std::string &file_dir, int32_t queue_size, bool do_decode,
                     std::unique_ptr<DataSchema> data_schema, std::shared_ptr<SamplerRT> sampler)
    : ImageFolderOp(num_workers, file_dir, queue_size, false, do_decode, kExts, kClassIndex, std::move(data_schema),
                    std::move(sampler)) {}
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,57 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CALTECH_OP_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CALTECH_OP_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include "minddata/dataset/core/tensor.h" | |||
| #include "minddata/dataset/engine/data_schema.h" | |||
| #include "minddata/dataset/engine/datasetops/parallel_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/image_folder_op.h" | |||
| #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
/// \brief Read Caltech256 Dataset.
/// Thin subclass of ImageFolderOp; all scanning/decoding work is inherited. Only the
/// op name and dataset name differ.
class CaltechOp : public ImageFolderOp {
 public:
  /// \brief Constructor.
  /// \param[in] num_workers Num of workers reading images in parallel.
  /// \param[in] file_dir Directory of caltech dataset.
  /// \param[in] queue_size Connector queue size.
  /// \param[in] do_decode Whether to decode the raw data.
  /// \param[in] data_schema Data schema of caltech256 dataset (ownership transferred).
  /// \param[in] sampler Sampler tells CaltechOp what to read.
  CaltechOp(int32_t num_workers, const std::string &file_dir, int32_t queue_size, bool do_decode,
            std::unique_ptr<DataSchema> data_schema, std::shared_ptr<SamplerRT> sampler);
  /// \brief Destructor.
  ~CaltechOp() = default;
  /// \brief Op name getter.
  /// \return Name of the current Op.
  std::string Name() const override { return "CaltechOp"; }
  /// \brief DatasetName name getter.
  /// \param[in] upper Whether the returned name begins with uppercase.
  /// \return DatasetName of the current Op.
  // NOTE(review): Name() is marked override but DatasetName is not — if the base class
  // declares DatasetName virtual, this should carry `override` too; confirm against ImageFolderOp.
  std::string DatasetName(bool upper = false) const { return upper ? "Caltech" : "caltech"; }
};
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CALTECH_OP_H_ | |||
| @@ -281,7 +281,7 @@ Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const std::se | |||
| " does not exist or permission denied"); | |||
| } | |||
| while (dir_itr->HasNext()) { | |||
| if (exts.empty() || exts.find(subdir.Extension()) != exts.end()) { | |||
| if (exts.empty() || exts.find(dir_itr->Next().Extension()) != exts.end()) { | |||
| ++row_cnt; | |||
| } | |||
| } | |||
| @@ -79,6 +79,7 @@ constexpr char kZipNode[] = "Zip"; | |||
| constexpr char kAGNewsNode[] = "AGNewsDataset"; | |||
| constexpr char kAlbumNode[] = "AlbumDataset"; | |||
| constexpr char kAmazonReviewNode[] = "AmazonReviewDataset"; | |||
| constexpr char kCaltech256Node[] = "Caltech256Dataset"; | |||
| constexpr char kCelebANode[] = "CelebADataset"; | |||
| constexpr char kCifar100Node[] = "Cifar100Dataset"; | |||
| constexpr char kCifar10Node[] = "Cifar10Dataset"; | |||
| @@ -6,6 +6,7 @@ set(DATASET_ENGINE_IR_DATASETOPS_SOURCE_SRC_FILES | |||
| ag_news_node.cc | |||
| album_node.cc | |||
| amazon_review_node.cc | |||
| caltech256_node.cc | |||
| celeba_node.cc | |||
| cifar100_node.cc | |||
| cifar10_node.cc | |||
| @@ -0,0 +1,135 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "minddata/dataset/engine/ir/datasetops/source/caltech256_node.h" | |||
| #include <map> | |||
| #include <memory> | |||
| #include <set> | |||
| #include <string> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "minddata/dataset/engine/datasetops/source/caltech_op.h" | |||
| #ifndef ENABLE_ANDROID | |||
| #include "minddata/dataset/engine/serdes.h" | |||
| #endif | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| const std::set<std::string> kExts = {".jpg", ".JPEG"}; | |||
| Caltech256Node::Caltech256Node(const std::string &dataset_dir, bool decode, std::shared_ptr<SamplerObj> sampler, | |||
| std::shared_ptr<DatasetCache> cache = nullptr) | |||
| : MappableSourceNode(std::move(cache)), dataset_dir_(dataset_dir), decode_(decode), sampler_(sampler) {} | |||
| std::shared_ptr<DatasetNode> Caltech256Node::Copy() { | |||
| std::shared_ptr<SamplerObj> sampler = (sampler_ == nullptr) ? nullptr : sampler_->SamplerCopy(); | |||
| auto node = std::make_shared<Caltech256Node>(dataset_dir_, decode_, sampler, cache_); | |||
| return node; | |||
| } | |||
| void Caltech256Node::Print(std::ostream &out) const { | |||
| out << (Name() + "(path: " + dataset_dir_ + ", decode: " + (decode_ ? "true" : "false") + ")"); | |||
| } | |||
// Validate user-supplied parameters; any failing check short-circuits with an error Status.
Status Caltech256Node::ValidateParams() {
  RETURN_IF_NOT_OK(DatasetNode::ValidateParams());
  // Dataset directory must exist and be readable.
  RETURN_IF_NOT_OK(ValidateDatasetDirParam("Caltech256Node", dataset_dir_));
  // Sampler must be non-null and internally consistent.
  RETURN_IF_NOT_OK(ValidateDatasetSampler("Caltech256Node", sampler_));
  return Status::OK();
}
// Build the runtime op(s) for this IR node and append them to node_ops.
Status Caltech256Node::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) {
  // Do internal Schema generation.
  // This arg exists in CaltechOp, but is not externalized (in Python API).
  std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
  TensorShape scalar = TensorShape::CreateScalar();
  // Output columns: "image" is a rank-1 uint8 tensor (raw or decoded bytes),
  // "label" is a uint32 scalar.
  RETURN_IF_NOT_OK(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1)));
  RETURN_IF_NOT_OK(
    schema->AddColumn(ColDescriptor("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar)));
  // Convert the IR sampler into its runtime counterpart.
  std::shared_ptr<SamplerRT> sampler_rt = nullptr;
  RETURN_IF_NOT_OK(sampler_->SamplerBuild(&sampler_rt));
  auto op = std::make_shared<CaltechOp>(num_workers_, dataset_dir_, connector_que_size_, decode_, std::move(schema),
                                        std::move(sampler_rt));
  // Propagate repeat bookkeeping from the IR node onto the runtime op.
  op->SetTotalRepeats(GetTotalRepeats());
  op->SetNumRepeatsPerEpoch(GetNumRepeatsPerEpoch());
  node_ops->push_back(op);
  return Status::OK();
}
// Get the shard id of node.
// NOTE(review): sampler_ is dereferenced unchecked — presumably guaranteed non-null
// by ValidateParams running earlier in the pipeline; confirm call ordering.
Status Caltech256Node::GetShardId(int32_t *shard_id) {
  *shard_id = sampler_->ShardId();
  return Status::OK();
}
// Get Dataset size.
// Returns the cached value when available; otherwise counts rows on disk and applies
// the sampler's sample count. Falls back to a pipeline dry run when the sampler
// cannot compute its size statically (CalculateNumSamples == -1).
Status Caltech256Node::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                                      int64_t *dataset_size) {
  if (dataset_size_ > 0) {
    *dataset_size = dataset_size_;
    return Status::OK();
  }
  int64_t sample_size, num_rows;
  // Count image files under dataset_dir_ matching kExts; class counts are not needed (nullptr).
  RETURN_IF_NOT_OK(CaltechOp::CountRowsAndClasses(dataset_dir_, kExts, &num_rows, nullptr, {}));
  std::shared_ptr<SamplerRT> sampler_rt = nullptr;
  RETURN_IF_NOT_OK(sampler_->SamplerBuild(&sampler_rt));
  sample_size = sampler_rt->CalculateNumSamples(num_rows);
  if (sample_size == -1) {
    // Sampler size is data-dependent: execute a dry run to obtain the real count.
    RETURN_IF_NOT_OK(size_getter->DryRun(shared_from_this(), &sample_size));
  }
  *dataset_size = sample_size;
  dataset_size_ = *dataset_size;  // cache for subsequent calls
  return Status::OK();
}
// Serialize this node's arguments to JSON (inverse of from_json).
Status Caltech256Node::to_json(nlohmann::json *out_json) {
  nlohmann::json args, sampler_args;
  RETURN_IF_NOT_OK(sampler_->to_json(&sampler_args));
  args["sampler"] = sampler_args;
  args["num_parallel_workers"] = num_workers_;
  args["dataset_dir"] = dataset_dir_;
  args["decode"] = decode_;
  // Cache settings are serialized only when a cache is attached.
  if (cache_ != nullptr) {
    nlohmann::json cache_args;
    RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
    args["cache"] = cache_args;
  }
  *out_json = args;
  return Status::OK();
}
#ifndef ENABLE_ANDROID
// Deserialize a Caltech256Node from JSON (inverse of to_json).
// Requires keys: num_parallel_workers, dataset_dir, decode, sampler; cache is optional.
Status Caltech256Node::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
  // Fail fast with a descriptive error if a required key is absent.
  RETURN_IF_NOT_OK(ValidateParamInJson(json_obj, "num_parallel_workers", kCaltech256Node));
  RETURN_IF_NOT_OK(ValidateParamInJson(json_obj, "dataset_dir", kCaltech256Node));
  RETURN_IF_NOT_OK(ValidateParamInJson(json_obj, "decode", kCaltech256Node));
  RETURN_IF_NOT_OK(ValidateParamInJson(json_obj, "sampler", kCaltech256Node));
  std::string dataset_dir = json_obj["dataset_dir"];
  bool decode = json_obj["decode"];
  std::shared_ptr<SamplerObj> sampler;
  RETURN_IF_NOT_OK(Serdes::ConstructSampler(json_obj["sampler"], &sampler));
  // DatasetCache::from_json leaves cache as nullptr when no cache entry is present.
  std::shared_ptr<DatasetCache> cache = nullptr;
  RETURN_IF_NOT_OK(DatasetCache::from_json(json_obj, &cache));
  *ds = std::make_shared<Caltech256Node>(dataset_dir, decode, sampler, cache);
  (*ds)->SetNumWorkers(json_obj["num_parallel_workers"]);
  return Status::OK();
}
#endif
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,108 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CALTECH256_NODE_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CALTECH256_NODE_H_ | |||
| #include <map> | |||
| #include <memory> | |||
| #include <set> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "minddata/dataset/engine/ir/cache/dataset_cache.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/dataset_node.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
/// \class Caltech256Node.
/// \brief A Dataset derived class to represent Caltech256 dataset.
class Caltech256Node : public MappableSourceNode {
 public:
  /// \brief Constructor.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] decode Whether to decode the images after reading.
  /// \param[in] sampler Sampler used to choose samples; must be non-null (checked in ValidateParams).
  /// \param[in] cache Tensor cache to use; may be nullptr for no cache.
  Caltech256Node(const std::string &dataset_dir, bool decode, std::shared_ptr<SamplerObj> sampler,
                 std::shared_ptr<DatasetCache> cache);
  /// \brief Destructor.
  ~Caltech256Node() = default;
  /// \brief Node name getter.
  /// \return Name of the current node.
  std::string Name() const override { return kCaltech256Node; }
  /// \brief Print the description.
  /// \param[out] out The output stream to write output to.
  void Print(std::ostream &out) const override;
  /// \brief Copy the node to a new object.
  /// \return A shared pointer to the new copy.
  std::shared_ptr<DatasetNode> Copy() override;
  /// \brief a base class override function to create the required runtime dataset op objects for this class.
  /// \param[out] node_ops A vector containing shared pointer to the Dataset Ops that this object will create.
  /// \return Status Status::OK() if build successfully.
  Status Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) override;
  /// \brief Parameters validation.
  /// \return Status Status::OK() if all the parameters are valid.
  Status ValidateParams() override;
  /// \brief Get the shard id of node.
  /// \param[out] shard_id The shard id.
  /// \return Status Status::OK() if get shard id successfully.
  Status GetShardId(int32_t *shard_id) override;
  /// \brief Base-class override for GetDatasetSize.
  /// \param[in] size_getter Shared pointer to DatasetSizeGetter.
  /// \param[in] estimate This is only supported by some of the ops and it's used to speed up the process of getting
  ///     dataset size at the expense of accuracy.
  /// \param[out] dataset_size The size of the dataset.
  /// \return Status of the function.
  Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                        int64_t *dataset_size) override;
  /// \brief Getter functions.
  const std::string &DatasetDir() const { return dataset_dir_; }
  bool Decode() const { return decode_; }
  /// \brief Get the arguments of node.
  /// \param[out] out_json JSON string of all attributes.
  /// \return Status of the function.
  Status to_json(nlohmann::json *out_json) override;
#ifndef ENABLE_ANDROID
  /// \brief Function to read dataset in json.
  /// \param[in] json_obj The JSON object to be deserialized.
  /// \param[out] ds Deserialized dataset.
  /// \return Status The status code returned.
  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
#endif
  /// \brief Sampler getter.
  /// \return SamplerObj of the current node.
  std::shared_ptr<SamplerObj> Sampler() override { return sampler_; }
  /// \brief Sampler setter.
  /// \param[in] sampler The sampler to replace the current one with.
  void SetSampler(std::shared_ptr<SamplerObj> sampler) override { sampler_ = sampler; }

 private:
  std::string dataset_dir_;  // root directory of the Caltech256 dataset
  bool decode_;              // whether images are decoded when read
  std::shared_ptr<SamplerObj> sampler_;
};
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CALTECH256_NODE_H_ | |||
| @@ -1219,6 +1219,93 @@ inline std::shared_ptr<AmazonReviewDataset> MS_API AmazonReview(const std::strin | |||
| num_shards, shard_id, cache); | |||
| } | |||
/// \class Caltech256Dataset
/// \brief A source dataset for reading and parsing Caltech256 dataset.
/// Each constructor overload differs only in how the sampler is passed; all of them
/// wrap the arguments into a Caltech256Node IR node.
class MS_API Caltech256Dataset : public Dataset {
 public:
  /// \brief Constructor of Caltech256Dataset.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] decode Decode the images after reading.
  /// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset.
  /// \param[in] cache Tensor cache to use.
  Caltech256Dataset(const std::vector<char> &dataset_dir, bool decode, const std::shared_ptr<Sampler> &sampler,
                    const std::shared_ptr<DatasetCache> &cache);
  /// \brief Constructor of Caltech256Dataset.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] decode Decode the images after reading.
  /// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset (non-owning).
  /// \param[in] cache Tensor cache to use.
  Caltech256Dataset(const std::vector<char> &dataset_dir, bool decode, const Sampler *sampler,
                    const std::shared_ptr<DatasetCache> &cache);
  /// \brief Constructor of Caltech256Dataset.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] decode Decode the images after reading.
  /// \param[in] sampler Sampler object used to choose samples from the dataset.
  /// \param[in] cache Tensor cache to use.
  Caltech256Dataset(const std::vector<char> &dataset_dir, bool decode, const std::reference_wrapper<Sampler> sampler,
                    const std::shared_ptr<DatasetCache> &cache);
  /// \brief Destructor of Caltech256Dataset.
  ~Caltech256Dataset() = default;
};
| /// \brief Function to create a Caltech256Dataset. | |||
| /// \note The generated dataset has two columns ["image", "label"]. | |||
| /// \param[in] dataset_dir Path to the root directory that contains the dataset. | |||
| /// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not | |||
| /// given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()). | |||
| /// \param[in] decode Decode the images after reading (default=false). | |||
| /// \param[in] cache Tensor cache to use (default=nullptr, which means no cache is used). | |||
| /// \return Shared pointer to the Caltech256Dataset. | |||
| /// \par Example | |||
| /// \code | |||
| /// /* Define dataset path and MindData object */ | |||
| /// std::string dataset_path = "/path/to/caltech256_dataset_directory"; | |||
| /// std::shared_ptr<Dataset> ds = Caltech256(dataset_path, true, std::make_shared<RandomSampler>(false, 10)); | |||
| /// | |||
| /// /* Create iterator to read dataset */ | |||
| /// std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| /// std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| /// iter->GetNextRow(&row); | |||
| /// | |||
| /// /* Note: In Caltech256 dataset, each data dictionary has keys "image" and "label" */ | |||
| /// auto image = row["image"]; | |||
| /// \endcode | |||
| inline std::shared_ptr<Caltech256Dataset> MS_API | |||
| Caltech256(const std::string &dataset_dir, const std::shared_ptr<Sampler> &sampler = std::make_shared<RandomSampler>(), | |||
| bool decode = false, const std::shared_ptr<DatasetCache> &cache = nullptr) { | |||
| return std::make_shared<Caltech256Dataset>(StringToChar(dataset_dir), decode, sampler, cache); | |||
| } | |||
| /// \brief Function to create a Caltech256Dataset | |||
| /// \note The generated dataset has two columns ["image", "label"]. | |||
| /// \param[in] dataset_dir Path to the root directory that contains the dataset. | |||
| /// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset. | |||
| /// \param[in] decode Decode the images after reading (default=false). | |||
| /// \param[in] cache Tensor cache to use (default=nullptr, which means no cache is used). | |||
| /// \return Shared pointer to the Caltech256Dataset. | |||
| inline std::shared_ptr<Caltech256Dataset> MS_API Caltech256(const std::string &dataset_dir, const Sampler *sampler, | |||
| bool decode = false, | |||
| const std::shared_ptr<DatasetCache> &cache = nullptr) { | |||
| return std::make_shared<Caltech256Dataset>(StringToChar(dataset_dir), decode, sampler, cache); | |||
| } | |||
| /// \brief Function to create a Caltech256Dataset. | |||
| /// \note The generated dataset has two columns ["image", "label"]. | |||
| /// \param[in] dataset_dir Path to the root directory that contains the dataset. | |||
| /// \param[in] sampler Sampler object used to choose samples from the dataset. | |||
| /// \param[in] decode Decode the images after reading (default=false). | |||
| /// \param[in] cache Tensor cache to use (default=nullptr, which means no cache is used). | |||
| /// \return Shared pointer to the Caltech256Dataset. | |||
| inline std::shared_ptr<Caltech256Dataset> MS_API Caltech256(const std::string &dataset_dir, | |||
| const std::reference_wrapper<Sampler> sampler, | |||
| bool decode = false, | |||
| const std::shared_ptr<DatasetCache> &cache = nullptr) { | |||
| return std::make_shared<Caltech256Dataset>(StringToChar(dataset_dir), decode, sampler, cache); | |||
| } | |||
| /// \class CelebADataset | |||
| /// \brief A source dataset for reading and parsing CelebA dataset. | |||
| class MS_API CelebADataset : public Dataset { | |||
| @@ -2864,7 +2951,7 @@ class MS_API LJSpeechDataset : public Dataset { | |||
| }; | |||
| /// \brief Function to create a LJSpeech Dataset. | |||
| /// \notes The generated dataset has four columns ["waveform", "sample_rate", "transcription", | |||
| /// \note The generated dataset has four columns ["waveform", "sample_rate", "transcription", | |||
| /// "normalized_transcription"]. | |||
| /// \param[in] dataset_dir Path to the root directory that contains the dataset. | |||
| /// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not | |||
| @@ -2878,7 +2965,7 @@ LJSpeech(const std::string &dataset_dir, const std::shared_ptr<Sampler> &sampler | |||
| } | |||
| /// \brief Function to create a LJSpeech Dataset. | |||
| /// \notes The generated dataset has four columns ["waveform", "sample_rate", "transcription", | |||
| /// \note The generated dataset has four columns ["waveform", "sample_rate", "transcription", | |||
| /// "normalized_transcription"]. | |||
| /// \param[in] dataset_dir Path to the root directory that contains the dataset. | |||
| /// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset. | |||
| @@ -2890,7 +2977,7 @@ inline std::shared_ptr<LJSpeechDataset> MS_API LJSpeech(const std::string &datas | |||
| } | |||
| /// \brief Function to create a LJSpeech Dataset. | |||
| /// \notes The generated dataset has four columns ["waveform", "sample_rate", "transcription", | |||
| /// \note The generated dataset has four columns ["waveform", "sample_rate", "transcription", | |||
| /// "normalized_transcription"]. | |||
| /// \param[in] dataset_dir Path to the root directory that contains the dataset. | |||
| /// \param[in] sampler Sampler object used to choose samples from the dataset. | |||
| @@ -33,6 +33,7 @@ class SamplerObj; | |||
| /// \brief An abstract base class to represent a sampler in the data pipeline. | |||
| class MS_API Sampler : std::enable_shared_from_this<Sampler> { | |||
| friend class AlbumDataset; | |||
| friend class Caltech256Dataset; | |||
| friend class CelebADataset; | |||
| friend class Cifar10Dataset; | |||
| friend class Cifar100Dataset; | |||
| @@ -74,8 +74,9 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che | |||
| check_photo_tour_dataset, check_ag_news_dataset, check_dbpedia_dataset, check_lj_speech_dataset, \ | |||
| check_yes_no_dataset, check_speech_commands_dataset, check_tedlium_dataset, check_svhn_dataset, \ | |||
| check_stl10_dataset, check_yelp_review_dataset, check_penn_treebank_dataset, check_iwslt2016_dataset, \ | |||
| check_iwslt2017_dataset, check_sogou_news_dataset, check_yahoo_answers_dataset, check_udpos_dataset,\ | |||
| check_conll2000_dataset, check_amazon_review_dataset, check_semeion_dataset | |||
| check_iwslt2017_dataset, check_sogou_news_dataset, check_yahoo_answers_dataset, check_udpos_dataset, \ | |||
| check_conll2000_dataset, check_amazon_review_dataset, check_semeion_dataset, check_caltech101_dataset, \ | |||
| check_caltech256_dataset | |||
| from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \ | |||
| get_prefetch_size | |||
| from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist | |||
| @@ -4822,7 +4823,7 @@ def _check_shm_usage(num_worker, queue_size, max_rowsize, num_queues=1): | |||
| threshold_ratio = 0.8 | |||
| if platform.system().lower() not in {"windows", "darwin"}: | |||
| shm_estimate_usage = _get_device_num() * num_worker * num_queues * \ | |||
| (queue_size + 2) * max_rowsize * 1024 * 1024 | |||
| (queue_size + 2) * max_rowsize * 1024 * 1024 | |||
| try: | |||
| shm_available = psutil.disk_usage('/dev/shm').free | |||
| if shm_estimate_usage >= threshold_ratio * shm_available: | |||
| @@ -6555,6 +6556,362 @@ class VOCDataset(MappableDataset): | |||
| return self.class_indexing | |||
class _Caltech101Dataset:
    """
    Internal generator-style loader for the Caltech101 dataset.

    Depending on `target_type`, each sample yields ("image", "category"),
    ("image", "annotation"), or all three columns.
    """

    def __init__(self, dataset_dir, target_type="category", decode=False):
        # Layout: <dataset_dir>/101_ObjectCategories holds images,
        # <dataset_dir>/Annotations holds .mat contour annotations.
        self.dataset_dir = os.path.realpath(dataset_dir)
        self.image_dir = os.path.join(self.dataset_dir, "101_ObjectCategories")
        self.annotation_dir = os.path.join(self.dataset_dir, "Annotations")
        self.target_type = target_type
        # Any value other than "category"/"annotation" is treated as "all";
        # presumably the public API validator restricts target_type upstream — confirm.
        if self.target_type == "category":
            self.column_names = ["image", "category"]
        elif self.target_type == "annotation":
            self.column_names = ["image", "annotation"]
        else:
            self.column_names = ["image", "category", "annotation"]
        self.decode = decode
        self.classes = sorted(os.listdir(self.image_dir))
        # The background clutter class is excluded from classification.
        if "BACKGROUND_Google" in self.classes:
            self.classes.remove("BACKGROUND_Google")
        # Four image-class folders use different names under Annotations/.
        name_map = {"Faces": "Faces_2",
                    "Faces_easy": "Faces_3",
                    "Motorbikes": "Motorbikes_16",
                    "airplanes": "Airplanes_Side_2"}
        self.annotation_classes = [name_map[class_name] if class_name in name_map else class_name
                                   for class_name in self.classes]
        # Build flat (per-sample) lists of 1-based image numbers and class labels.
        # assumes images in each folder are named image_0001.jpg .. image_NNNN.jpg
        # contiguously — TODO confirm this holds for every release of the dataset.
        self.image_index = []
        self.image_label = []
        for i, image_class in enumerate(self.classes):
            sub_dir = os.path.join(self.image_dir, image_class)
            if not os.path.isdir(sub_dir) or not os.access(sub_dir, os.R_OK):
                continue  # skip unreadable class folders silently
            num_images = len(os.listdir(sub_dir))
            self.image_index.extend(range(1, num_images + 1))
            self.image_label.extend(num_images * [i])

    def __getitem__(self, index):
        # Returns the columns selected by target_type for sample `index`.
        image_file = os.path.join(self.image_dir, self.classes[self.image_label[index]],
                                  "image_{:04d}.jpg".format(self.image_index[index]))
        if not os.path.exists(image_file):
            raise ValueError("The image file {} does not exist or permission denied!".format(image_file))
        if self.decode:
            # Decoded path: RGB pixel array.
            image = np.asarray(Image.open(image_file).convert("RGB"))
        else:
            # Raw path: undecoded JPEG bytes as a uint8 vector.
            image = np.fromfile(image_file, dtype=np.uint8)

        if self.target_type == "category":
            return image, self.image_label[index]
        # Annotation is the "obj_contour" matrix from the per-image .mat file.
        annotation_file = os.path.join(self.annotation_dir, self.annotation_classes[self.image_label[index]],
                                       "annotation_{:04d}.mat".format(self.image_index[index]))
        if not os.path.exists(annotation_file):
            raise ValueError("The annotation file {} does not exist or permission denied!".format(annotation_file))
        annotation = loadmat(annotation_file)["obj_contour"]

        if self.target_type == "annotation":
            return image, annotation
        return image, self.image_label[index], annotation

    def __len__(self):
        # One entry per image discovered at construction time.
        return len(self.image_index)
class Caltech101Dataset(GeneratorDataset):
    """
    A source dataset that reads and parses Caltech101 dataset.

    The columns of the generated dataset depend on the value of `target_type`.
    When `target_type` is `category`, the columns are :py:obj:`[image, category]`.
    When `target_type` is `annotation`, the columns are :py:obj:`[image, annotation]`.
    When `target_type` is `all`, the columns are :py:obj:`[image, category, annotation]`.
    The tensor of column :py:obj:`image` is of the uint8 type.
    The tensor of column :py:obj:`category` is of the uint32 type.
    The tensor of column :py:obj:`annotation` is a 2-dimensional ndarray that stores the contour of the image
    and consists of a series of points.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset. This root directory contains two
            subdirectories, one is called 101_ObjectCategories, which stores images,
            and the other is called Annotations, which stores annotations.
        target_type (str, optional): Target of the image. If target_type is "category", return category represents
            the target class. If target_type is "annotation", return annotation.
            If target_type is "all", return category and annotation (default=None, means "category").
        num_samples (int, optional): The number of images to be included in the dataset
            (default=None, all images).
        num_parallel_workers (int, optional): Number of workers to read the data (default=1).
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset
            (default=None, expected order behavior shown in the table).
        decode (bool, optional): Whether or not to decode the images after reading (default=False).
        sampler (Sampler, optional): Object used to choose samples from the
            dataset (default=None, expected order behavior shown in the table).
        num_shards (int, optional): Number of shards that the dataset will be divided
            into (default=None). When this argument is specified, `num_samples` reflects
            the maximum sample number of per shard.
        shard_id (int, optional): The shard ID within num_shards (default=None). This
            argument can only be specified when num_shards is also specified.

    Raises:
        RuntimeError: If dataset_dir does not contain data files.
        RuntimeError: If target_type is not set correctly.
        RuntimeError: If num_parallel_workers exceeds the max thread numbers.
        RuntimeError: If sampler and shuffle are specified at the same time.
        RuntimeError: If sampler and sharding are specified at the same time.
        RuntimeError: If num_shards is specified but shard_id is None.
        RuntimeError: If shard_id is specified but num_shards is None.
        ValueError: If shard_id is invalid (< 0 or >= num_shards).

    Note:
        - This dataset can take in a `sampler`. `sampler` and `shuffle` are mutually exclusive.
          The table below shows what input arguments are allowed and their expected behavior.

    .. list-table:: Expected Order Behavior of Using `sampler` and `shuffle`
       :widths: 25 25 50
       :header-rows: 1

       * - Parameter `sampler`
         - Parameter `shuffle`
         - Expected Order Behavior
       * - None
         - None
         - random order
       * - None
         - True
         - random order
       * - None
         - False
         - sequential order
       * - Sampler object
         - None
         - order defined by sampler
       * - Sampler object
         - True
         - not allowed
       * - Sampler object
         - False
         - not allowed

    Examples:
        >>> caltech101_dataset_directory = "/path/to/caltech101_dataset_directory"
        >>>
        >>> # 1) Read all samples (image files) in caltech101_dataset_directory with 8 threads
        >>> dataset = ds.Caltech101Dataset(dataset_dir=caltech101_dataset_directory, num_parallel_workers=8)
        >>>
        >>> # 2) Read all samples (image files) with the target_type "annotation"
        >>> dataset = ds.Caltech101Dataset(dataset_dir=caltech101_dataset_directory, target_type="annotation")

    About Caltech101Dataset:

    Pictures of objects belonging to 101 categories. About 40 to 800 images per category.
    Most categories have about 50 images. Collected in September 2003 by Fei-Fei Li, Marco Andreetto,
    and Marc 'Aurelio Ranzato. The size of each image is roughly 300 x 200 pixels.
    The official provides the contour data of each object in each picture, which is the annotation.

    Citation:

    .. code-block::

        @article{FeiFei2004LearningGV,
        author    = {Li Fei-Fei and Rob Fergus and Pietro Perona},
        title     = {Learning Generative Visual Models from Few Training Examples:
                     An Incremental Bayesian Approach Tested on 101 Object Categories},
        journal   = {Computer Vision and Pattern Recognition Workshop},
        year      = {2004},
        url       = {http://www.vision.caltech.edu/Image_Datasets/Caltech101/},
        }
    """

    @check_caltech101_dataset
    def __init__(self, dataset_dir, target_type=None, num_samples=None, num_parallel_workers=1,
                 shuffle=None, decode=False, sampler=None, num_shards=None, shard_id=None):
        self.dataset_dir = dataset_dir
        self.target_type = replace_none(target_type, "category")
        self.decode = replace_none(decode, False)
        dataset = _Caltech101Dataset(self.dataset_dir, self.target_type, self.decode)
        super().__init__(dataset, column_names=dataset.column_names, num_samples=num_samples,
                         num_parallel_workers=num_parallel_workers, shuffle=shuffle, sampler=sampler,
                         num_shards=num_shards, shard_id=shard_id)

    def get_class_indexing(self):
        """
        Get the class index.

        Returns:
            dict, a str-to-int mapping from label name to index.
        """
        # Class names in canonical (index) order; deriving the mapping with
        # enumerate avoids hand-numbering 101 entries (and numbering mistakes).
        class_names = ('Faces', 'Faces_easy', 'Leopards', 'Motorbikes', 'accordion', 'airplanes',
                       'anchor', 'ant', 'barrel', 'bass', 'beaver', 'binocular', 'bonsai',
                       'brain', 'brontosaurus', 'buddha', 'butterfly', 'camera', 'cannon',
                       'car_side', 'ceiling_fan', 'cellphone', 'chair', 'chandelier',
                       'cougar_body', 'cougar_face', 'crab', 'crayfish', 'crocodile',
                       'crocodile_head', 'cup', 'dalmatian', 'dollar_bill', 'dolphin',
                       'dragonfly', 'electric_guitar', 'elephant', 'emu', 'euphonium', 'ewer',
                       'ferry', 'flamingo', 'flamingo_head', 'garfield', 'gerenuk', 'gramophone',
                       'grand_piano', 'hawksbill', 'headphone', 'hedgehog', 'helicopter', 'ibis',
                       'inline_skate', 'joshua_tree', 'kangaroo', 'ketch', 'lamp', 'laptop',
                       'llama', 'lobster', 'lotus', 'mandolin', 'mayfly', 'menorah',
                       'metronome', 'minaret', 'nautilus', 'octopus', 'okapi', 'pagoda',
                       'panda', 'pigeon', 'pizza', 'platypus', 'pyramid', 'revolver',
                       'rhino', 'rooster', 'saxophone', 'schooner', 'scissors', 'scorpion',
                       'sea_horse', 'snoopy', 'soccer_ball', 'stapler', 'starfish',
                       'stegosaurus', 'stop_sign', 'strawberry', 'sunflower', 'tick',
                       'trilobite', 'umbrella', 'watch', 'water_lilly', 'wheelchair', 'wild_cat',
                       'windsor_chair', 'wrench', 'yin_yang')
        return {name: index for index, name in enumerate(class_names)}
class Caltech256Dataset(MappableDataset):
    """
    A source dataset that reads and parses Caltech256 dataset.

    The generated dataset has two columns: :py:obj:`[image, label]`.
    The tensor of column :py:obj:`image` is of the uint8 type, and the tensor
    of column :py:obj:`label` is of the uint32 type.

    Args:
        dataset_dir (str): Path to the root directory that contains the dataset.
        num_samples (int, optional): The number of images to be included in the dataset
            (default=None, all images).
        num_parallel_workers (int, optional): Number of workers to read the data
            (default=None, set in the config).
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset
            (default=None, expected order behavior shown in the table).
        decode (bool, optional): Whether or not to decode the images after reading (default=False).
        sampler (Sampler, optional): Object used to choose samples from the
            dataset (default=None, expected order behavior shown in the table).
        num_shards (int, optional): Number of shards that the dataset will be divided
            into (default=None). When this argument is specified, `num_samples` reflects
            the maximum sample number of per shard.
        shard_id (int, optional): The shard ID within num_shards (default=None). This
            argument can only be specified when num_shards is also specified.
        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
            (default=None, which means no cache is used).

    Raises:
        RuntimeError: If dataset_dir does not contain data files.
        RuntimeError: If num_parallel_workers exceeds the max thread numbers.
        RuntimeError: If sampler and shuffle are specified at the same time.
        RuntimeError: If sampler and sharding are specified at the same time.
        RuntimeError: If num_shards is specified but shard_id is None.
        RuntimeError: If shard_id is specified but num_shards is None.
        ValueError: If shard_id is invalid (< 0 or >= num_shards).

    Note:
        - This dataset can take in a `sampler`. `sampler` and `shuffle` are mutually exclusive.
          The table below shows what input arguments are allowed and their expected behavior.

    .. list-table:: Expected Order Behavior of Using `sampler` and `shuffle`
       :widths: 25 25 50
       :header-rows: 1

       * - Parameter `sampler`
         - Parameter `shuffle`
         - Expected Order Behavior
       * - None
         - None
         - random order
       * - None
         - True
         - random order
       * - None
         - False
         - sequential order
       * - Sampler object
         - None
         - order defined by sampler
       * - Sampler object
         - True
         - not allowed
       * - Sampler object
         - False
         - not allowed

    Examples:
        >>> caltech256_dataset_dir = "/path/to/caltech256_dataset_directory"
        >>>
        >>> # 1) Read all samples (image files) in caltech256_dataset_dir with 8 threads
        >>> dataset = ds.Caltech256Dataset(dataset_dir=caltech256_dataset_dir, num_parallel_workers=8)

    About Caltech256Dataset:

    Caltech-256 is an object recognition dataset containing 30,607 real-world images, of different sizes,
    spanning 257 classes (256 object classes and an additional clutter class).
    Each class is represented by at least 80 images. The dataset is a superset of the Caltech-101 dataset.

    Citation:

    .. code-block::

        @article{griffin2007caltech,
        title     = {Caltech-256 object category dataset},
        added-at  = {2021-01-21T02:54:42.000+0100},
        author    = {Griffin, Gregory and Holub, Alex and Perona, Pietro},
        biburl    = {https://www.bibsonomy.org/bibtex/21f746f23ff0307826cca3e3be45f8de7/s364315},
        interhash = {bfe1e648c1778c04baa60f23d1223375},
        intrahash = {1f746f23ff0307826cca3e3be45f8de7},
        publisher = {California Institute of Technology},
        timestamp = {2021-01-21T02:54:42.000+0100},
        year      = {2007}
        }
    """

    @check_caltech256_dataset
    def __init__(self, dataset_dir, num_samples=None, num_parallel_workers=None, shuffle=None, decode=False,
                 sampler=None, num_shards=None, shard_id=None, cache=None):
        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        self.decode = replace_none(decode, False)

    def parse(self, children=None):
        # Sampling/sharding and cache wiring live in the MappableDataset base;
        # the IR node only needs the path, the decode flag and the resolved sampler.
        return cde.Caltech256Node(self.dataset_dir, self.decode, self.sampler)
| class CocoDataset(MappableDataset): | |||
| """ | |||
| A source dataset for reading and parsing COCO dataset. | |||
| @@ -460,6 +460,62 @@ def check_usps_dataset(method): | |||
| return new_method | |||
def check_caltech101_dataset(method):
    """A wrapper that wraps a parameter checker around the original Dataset(Caltech101Dataset)."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        _, param_dict = parse_user_args(method, *args, **kwargs)

        # The dataset root must exist and be readable.
        check_dir(param_dict.get('dataset_dir'))

        # target_type is optional; when present it must be one of the known modes.
        target_type = param_dict.get('target_type')
        if target_type is not None:
            check_valid_str(target_type, ["category", "annotation", "all"], "target_type")

        validate_dataset_param_value(['num_samples', 'num_parallel_workers', 'num_shards', 'shard_id'],
                                     param_dict, int)
        validate_dataset_param_value(['shuffle', 'decode'], param_dict, bool)
        validate_dataset_param_value(['target_type'], param_dict, str)
        check_sampler_shuffle_shard_options(param_dict)

        check_cache_option(param_dict.get('cache'))

        return method(self, *args, **kwargs)

    return new_method
def check_caltech256_dataset(method):
    """A wrapper that wraps a parameter checker around the original Dataset(Caltech256Dataset)."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        _, param_dict = parse_user_args(method, *args, **kwargs)

        # The dataset root must exist and be readable.
        check_dir(param_dict.get('dataset_dir'))

        validate_dataset_param_value(['num_samples', 'num_parallel_workers', 'num_shards', 'shard_id'],
                                     param_dict, int)
        validate_dataset_param_value(['shuffle', 'decode'], param_dict, bool)
        check_sampler_shuffle_shard_options(param_dict)
        check_cache_option(param_dict.get('cache'))

        return method(self, *args, **kwargs)

    return new_method
| def check_vocdataset(method): | |||
| """A wrapper that wraps a parameter checker around the original Dataset(VOCDataset).""" | |||
| @@ -17,6 +17,7 @@ SET(DE_UT_SRCS | |||
| c_api_dataset_ag_news_test.cc | |||
| c_api_dataset_album_test.cc | |||
| c_api_dataset_amazon_review_test.cc | |||
| c_api_dataset_caltech256_test.cc | |||
| c_api_dataset_cifar_test.cc | |||
| c_api_dataset_cityscapes_test.cc | |||
| c_api_dataset_clue_test.cc | |||
| @@ -0,0 +1,207 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "common/common.h" | |||
| #include "minddata/dataset/include/dataset/datasets.h" | |||
| using namespace mindspore::dataset; | |||
| using mindspore::dataset::DataType; | |||
| using mindspore::dataset::Tensor; | |||
| using mindspore::dataset::TensorShape; | |||
// Test fixture for MindData pipeline tests; all setup (e.g. datasets_root_path_)
// is inherited from UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};
| /// Feature: Caltech256Dataset | |||
| /// Description: basic test of Caltech256Dataset | |||
| /// Expectation: the data is processed successfully | |||
| TEST_F(MindDataTestPipeline, TestCaltech256Dataset) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaltech256Dataset."; | |||
| // Create a Caltech256 Dataset. | |||
| std::string folder_path = datasets_root_path_ + "/testPK/data/"; | |||
| std::shared_ptr<Dataset> ds = Caltech256(folder_path, std::make_shared<RandomSampler>(false, 44)); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset. | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row. | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| EXPECT_NE(row.find("image"), row.end()); | |||
| EXPECT_NE(row.find("label"), row.end()); | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| i++; | |||
| auto image = row["image"]; | |||
| MS_LOG(INFO) << "Tensor image shape: " << image.Shape(); | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| } | |||
| EXPECT_EQ(i, 44); | |||
| // Manually terminate the pipeline. | |||
| iter->Stop(); | |||
| } | |||
| /// Feature: Caltech256Dataset | |||
| /// Description: test Caltech256Dataset in pipeline mode | |||
| /// Expectation: the data is processed successfully | |||
| TEST_F(MindDataTestPipeline, TestCaltech256DatasetWithPipeline) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaltech256DatasetWithPipeline."; | |||
| // Create two Caltech256 Dataset. | |||
| std::string folder_path = datasets_root_path_ + "/testPK/data/"; | |||
| std::shared_ptr<Dataset> ds1 = Caltech256(folder_path, std::make_shared<RandomSampler>(false, 3)); | |||
| std::shared_ptr<Dataset> ds2 = Caltech256(folder_path, std::make_shared<RandomSampler>(false, 3)); | |||
| EXPECT_NE(ds1, nullptr); | |||
| EXPECT_NE(ds2, nullptr); | |||
| // Create two Repeat operation on ds. | |||
| int32_t repeat_num = 1; | |||
| ds1 = ds1->Repeat(repeat_num); | |||
| EXPECT_NE(ds1, nullptr); | |||
| repeat_num = 1; | |||
| ds2 = ds2->Repeat(repeat_num); | |||
| EXPECT_NE(ds2, nullptr); | |||
| // Create two Project operation on ds. | |||
| std::vector<std::string> column_project = {"image", "label"}; | |||
| ds1 = ds1->Project(column_project); | |||
| EXPECT_NE(ds1, nullptr); | |||
| ds2 = ds2->Project(column_project); | |||
| EXPECT_NE(ds2, nullptr); | |||
| // Create a Concat operation on the ds. | |||
| ds1 = ds1->Concat({ds2}); | |||
| EXPECT_NE(ds1, nullptr); | |||
| // Create an iterator over the result of the above dataset. | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds1->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row. | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| EXPECT_NE(row.find("image"), row.end()); | |||
| EXPECT_NE(row.find("label"), row.end()); | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| i++; | |||
| auto image = row["image"]; | |||
| MS_LOG(INFO) << "Tensor image shape: " << image.Shape(); | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| } | |||
| EXPECT_EQ(i, 6); | |||
| // Manually terminate the pipeline. | |||
| iter->Stop(); | |||
| } | |||
/// Feature: Caltech256Dataset
/// Description: test getting size of Caltech256Dataset
/// Expectation: the size is correct
TEST_F(MindDataTestPipeline, TestCaltech256GetDatasetSize) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaltech256GetDatasetSize.";

  // Create a Caltech256 Dataset.
  std::string folder_path = datasets_root_path_ + "/testPK/data/";
  std::shared_ptr<Dataset> ds = Caltech256(folder_path);
  EXPECT_NE(ds, nullptr);

  // The shared testPK/data fixture holds 44 images; the size getter should
  // report that without requiring the full pipeline to run.
  EXPECT_EQ(ds->GetDatasetSize(), 44);
}
| /// Feature: Caltech256Dataset | |||
| /// Description: test Caltech256Dataset with mix getter | |||
| /// Expectation: the data is processed successfully | |||
| TEST_F(MindDataTestPipeline, TestCaltech256Getters) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaltech256MixGetter."; | |||
| // Create a Caltech256 Dataset. | |||
| std::string folder_path = datasets_root_path_ + "/testPK/data/"; | |||
| std::shared_ptr<Dataset> ds = Caltech256(folder_path); | |||
| EXPECT_NE(ds, nullptr); | |||
| EXPECT_EQ(ds->GetDatasetSize(), 44); | |||
| std::vector<DataType> types = ToDETypes(ds->GetOutputTypes()); | |||
| std::vector<TensorShape> shapes = ToTensorShapeVec(ds->GetOutputShapes()); | |||
| std::vector<std::string> column_names = {"image", "label"}; | |||
| int64_t num_classes = ds->GetNumClasses(); | |||
| EXPECT_EQ(types.size(), 2); | |||
| EXPECT_EQ(types[0].ToString(), "uint8"); | |||
| EXPECT_EQ(types[1].ToString(), "int32"); | |||
| EXPECT_EQ(shapes.size(), 2); | |||
| EXPECT_EQ(num_classes, 4); | |||
| EXPECT_EQ(ds->GetBatchSize(), 1); | |||
| EXPECT_EQ(ds->GetRepeatCount(), 1); | |||
| EXPECT_EQ(ds->GetDatasetSize(), 44); | |||
| EXPECT_EQ(ToDETypes(ds->GetOutputTypes()), types); | |||
| EXPECT_EQ(ToTensorShapeVec(ds->GetOutputShapes()), shapes); | |||
| EXPECT_EQ(ds->GetNumClasses(), 4); | |||
| EXPECT_EQ(ds->GetColumnNames(), column_names); | |||
| EXPECT_EQ(ds->GetDatasetSize(), 44); | |||
| EXPECT_EQ(ToDETypes(ds->GetOutputTypes()), types); | |||
| EXPECT_EQ(ToTensorShapeVec(ds->GetOutputShapes()), shapes); | |||
| EXPECT_EQ(ds->GetBatchSize(), 1); | |||
| EXPECT_EQ(ds->GetRepeatCount(), 1); | |||
| EXPECT_EQ(ds->GetNumClasses(), 4); | |||
| EXPECT_EQ(ds->GetDatasetSize(), 44); | |||
| } | |||
/// Feature: Caltech256Dataset
/// Description: test Caltech256Dataset with the fail of reading dataset
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestCaltech256DatasetFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaltech256DatasetFail.";

  // Create a Caltech256 Dataset with an empty (invalid) directory path.
  // Validation is lazy, so the dataset handle itself is still non-null.
  std::shared_ptr<Dataset> ds = Caltech256("", std::make_shared<RandomSampler>(false, 10));
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid Caltech256 input.
  EXPECT_EQ(iter, nullptr);
}
/// Feature: Caltech256Dataset
/// Description: test Caltech256Dataset with the null sampler
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestCaltech256DatasetWithNullSamplerFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaltech256DatasetWithNullSamplerFail.";

  // Create a Caltech256 Dataset with an explicitly null sampler.
  // Validation is lazy, so the dataset handle itself is still non-null.
  std::string folder_path = datasets_root_path_ + "/testPK/data/";
  std::shared_ptr<Dataset> ds = Caltech256(folder_path, nullptr);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid Caltech256 input, sampler cannot be nullptr.
  EXPECT_EQ(iter, nullptr);
}
| @@ -0,0 +1,349 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Test Caltech101 dataset operators | |||
| """ | |||
| import os | |||
| import matplotlib.pyplot as plt | |||
| import numpy as np | |||
| import pytest | |||
| from PIL import Image | |||
| from scipy.io import loadmat | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as c_vision | |||
| from mindspore import log as logger | |||
| DATASET_DIR = "../data/dataset/testCaltech101Data" | |||
| WRONG_DIR = "../data/dataset/notExist" | |||
def get_index_info():
    """
    Walk the test dataset directory and rebuild the sample index.

    Returns:
        tuple: (image_index, image_label, classes, annotation_classes), where
        image_index holds the 1-based image number within each class directory,
        image_label holds the class index for each sample, classes is the sorted
        class-name list (clutter class removed), and annotation_classes holds
        the (sometimes different) directory names under Annotations/.
    """
    dataset_dir = os.path.realpath(DATASET_DIR)
    image_dir = os.path.join(dataset_dir, "101_ObjectCategories")
    classes = sorted(os.listdir(image_dir))
    # The clutter class is excluded, mirroring the dataset implementation.
    if "BACKGROUND_Google" in classes:
        classes.remove("BACKGROUND_Google")
    # Four categories use a different directory name under Annotations/.
    name_map = {"Faces": "Faces_2",
                "Faces_easy": "Faces_3",
                "Motorbikes": "Motorbikes_16",
                "airplanes": "Airplanes_Side_2"}
    annotation_classes = [name_map.get(class_name, class_name) for class_name in classes]
    image_index = []
    image_label = []
    for i, c in enumerate(classes):
        sub_dir = os.path.join(image_dir, c)
        if not os.path.isdir(sub_dir) or not os.access(sub_dir, os.R_OK):
            continue
        num_images = len(os.listdir(sub_dir))
        image_index.extend(range(1, num_images + 1))
        image_label.extend(num_images * [i])
    return image_index, image_label, classes, annotation_classes
def load_caltech101(target_type="category", decode=False):
    """
    Load Caltech101 samples directly from disk as the reference for comparison.

    Args:
        target_type (str): "category", "annotation" or "all".
        decode (bool): decode images to RGB arrays instead of raw bytes.

    Returns:
        tuple of lists matching the dataset columns for the given target_type.

    Raises:
        ValueError: if an expected image or annotation file is missing.
    """
    dataset_dir = os.path.realpath(DATASET_DIR)
    image_dir = os.path.join(dataset_dir, "101_ObjectCategories")
    annotation_dir = os.path.join(dataset_dir, "Annotations")
    image_index, image_label, classes, annotation_classes = get_index_info()
    images, annotations = [], []
    num_images = len(image_index)
    for i in range(num_images):
        image_file = os.path.join(image_dir, classes[image_label[i]], "image_{:04d}.jpg".format(image_index[i]))
        if not os.path.exists(image_file):
            raise ValueError("The image file {} does not exist or permission denied!".format(image_file))
        if decode:
            image = np.asarray(Image.open(image_file).convert("RGB"))
        else:
            image = np.fromfile(image_file, dtype=np.uint8)
        images.append(image)
    # The category column is simply the label list in sample order.
    categories = list(image_label)
    if target_type == "category":
        return images, categories
    for i in range(num_images):
        annotation_file = os.path.join(annotation_dir, annotation_classes[image_label[i]],
                                       "annotation_{:04d}.mat".format(image_index[i]))
        if not os.path.exists(annotation_file):
            raise ValueError("The annotation file {} does not exist or permission denied!".format(annotation_file))
        annotations.append(loadmat(annotation_file)["obj_contour"])
    if target_type == "annotation":
        return images, annotations
    return images, categories, annotations
def visualize_dataset(images, labels):
    """
    Helper function to visualize the dataset samples
    """
    sample_count = len(images)
    # One subplot per sample, laid out in a single row.
    for idx, (image, label) in enumerate(zip(images, labels)):
        plt.subplot(1, sample_count, idx + 1)
        plt.imshow(image.squeeze())
        plt.title(label)
    plt.show()
def test_caltech101_content_check():
    """
    Feature: Caltech101Dataset
    Description: check if the image data of caltech101 dataset is read correctly
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech101Dataset Op with content check")
    # Compare the "annotation" pipeline output against the reference loader.
    annotation_data = ds.Caltech101Dataset(DATASET_DIR, target_type="annotation", num_samples=4,
                                           shuffle=False, decode=True)
    images, annotations = load_caltech101(target_type="annotation", decode=True)
    row_count = 0
    for idx, row in enumerate(annotation_data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(row["image"], images[idx])
        np.testing.assert_array_equal(row["annotation"], annotations[idx])
        row_count += 1
    assert row_count == 4

    # Compare the "all" pipeline output (image, category and annotation columns).
    full_data = ds.Caltech101Dataset(DATASET_DIR, target_type="all", num_samples=4, shuffle=False, decode=True)
    images, categories, annotations = load_caltech101(target_type="all", decode=True)
    row_count = 0
    for idx, row in enumerate(full_data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(row["image"], images[idx])
        np.testing.assert_array_equal(row["category"], categories[idx])
        np.testing.assert_array_equal(row["annotation"], annotations[idx])
        row_count += 1
    assert row_count == 4
def test_caltech101_basic():
    """
    Feature: Caltech101Dataset
    Description: basic test of Caltech101Dataset
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech101Dataset Op")
    # case 1: test target_type
    dataset_a = ds.Caltech101Dataset(DATASET_DIR, shuffle=False)
    dataset_b = ds.Caltech101Dataset(DATASET_DIR, shuffle=False)
    count = 0
    for row_a, row_b in zip(dataset_a.create_dict_iterator(num_epochs=1, output_numpy=True),
                            dataset_b.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(row_a["category"], row_b["category"])
        count += 1
    assert count == 4
    # case 2: test decode
    dataset_a = ds.Caltech101Dataset(DATASET_DIR, decode=True, shuffle=False)
    dataset_b = ds.Caltech101Dataset(DATASET_DIR, decode=True, shuffle=False)
    count = 0
    for row_a, row_b in zip(dataset_a.create_dict_iterator(num_epochs=1, output_numpy=True),
                            dataset_b.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(row_a["image"], row_b["image"])
        count += 1
    assert count == 4
    # case 3: test num_samples
    dataset = ds.Caltech101Dataset(DATASET_DIR, num_samples=4)
    assert sum(1 for _ in dataset.create_dict_iterator(num_epochs=1)) == 4
    # case 4: test repeat
    dataset = ds.Caltech101Dataset(DATASET_DIR, num_samples=4).repeat(2)
    assert sum(1 for _ in dataset.create_dict_iterator(num_epochs=1)) == 8
    # case 5: test get_dataset_size, resize and batch
    dataset = ds.Caltech101Dataset(DATASET_DIR, num_samples=4)
    dataset = dataset.map(operations=[c_vision.Decode(), c_vision.Resize((224, 224))], input_columns=["image"],
                          num_parallel_workers=1)
    assert dataset.get_dataset_size() == 4
    assert dataset.get_batch_size() == 1
    # drop_remainder is default to be False
    dataset = dataset.batch(batch_size=3)
    assert dataset.get_batch_size() == 3
    assert dataset.get_dataset_size() == 2
    assert sum(1 for _ in dataset.create_dict_iterator(num_epochs=1)) == 2
    # case 6: test get_class_indexing
    dataset = ds.Caltech101Dataset(DATASET_DIR, num_samples=4)
    class_indexing = dataset.get_class_indexing()
    assert class_indexing["Faces"] == 0
    assert class_indexing["yin_yang"] == 100
def test_caltech101_target_type():
    """
    Feature: Caltech101Dataset
    Description: test Caltech101Dataset with target_type
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech101Dataset Op with target_type")

    def compare_pipelines(target_type, columns):
        # Two identically-configured, non-shuffled pipelines must produce
        # identical values for every listed output column.
        first = ds.Caltech101Dataset(DATASET_DIR, target_type=target_type, shuffle=False)
        second = ds.Caltech101Dataset(DATASET_DIR, target_type=target_type, shuffle=False)
        rows = 0
        for left, right in zip(first.create_dict_iterator(num_epochs=1, output_numpy=True),
                               second.create_dict_iterator(num_epochs=1, output_numpy=True)):
            for column in columns:
                np.testing.assert_array_equal(left[column], right[column])
            rows += 1
        assert rows == 4

    compare_pipelines("annotation", ["annotation"])
    compare_pipelines("all", ["category", "annotation"])
    compare_pipelines("category", ["category"])
def test_caltech101_sequential_sampler():
    """
    Feature: Caltech101Dataset
    Description: test Caltech101Dataset with SequentialSampler
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech101Dataset Op with SequentialSampler")
    num_samples = 4
    # A SequentialSampler must visit the same rows, in the same order, as a
    # plain non-shuffled read capped at the same number of samples.
    sampled = ds.Caltech101Dataset(DATASET_DIR, sampler=ds.SequentialSampler(num_samples=num_samples))
    unshuffled = ds.Caltech101Dataset(DATASET_DIR, shuffle=False, num_samples=num_samples)
    sampled_labels, plain_labels = [], []
    rows = 0
    for left, right in zip(sampled.create_dict_iterator(num_epochs=1),
                           unshuffled.create_dict_iterator(num_epochs=1)):
        sampled_labels.append(left["category"].asnumpy())
        plain_labels.append(right["category"].asnumpy())
        rows += 1
    np.testing.assert_array_equal(sampled_labels, plain_labels)
    assert rows == num_samples
def test_caltech101_exception():
    """
    Feature: Caltech101Dataset
    Description: test error cases for Caltech101Dataset
    Expectation: throw correct error and message
    """
    logger.info("Test error cases for Caltech101Dataset")
    # Mutually exclusive / incomplete sharding and sampler configurations.
    error_msg_1 = "sampler and shuffle cannot be specified at the same time"
    with pytest.raises(RuntimeError, match=error_msg_1):
        ds.Caltech101Dataset(DATASET_DIR, shuffle=False, sampler=ds.SequentialSampler(1))
    error_msg_2 = "sampler and sharding cannot be specified at the same time"
    with pytest.raises(RuntimeError, match=error_msg_2):
        ds.Caltech101Dataset(DATASET_DIR, sampler=ds.SequentialSampler(1), num_shards=2, shard_id=0)
    error_msg_3 = "num_shards is specified and currently requires shard_id as well"
    with pytest.raises(RuntimeError, match=error_msg_3):
        ds.Caltech101Dataset(DATASET_DIR, num_shards=10)
    error_msg_4 = "shard_id is specified but num_shards is not"
    with pytest.raises(RuntimeError, match=error_msg_4):
        ds.Caltech101Dataset(DATASET_DIR, shard_id=0)
    # shard_id must lie in [0, num_shards).
    error_msg_5 = "Input shard_id is not within the required interval"
    with pytest.raises(ValueError, match=error_msg_5):
        ds.Caltech101Dataset(DATASET_DIR, num_shards=5, shard_id=-1)
    with pytest.raises(ValueError, match=error_msg_5):
        ds.Caltech101Dataset(DATASET_DIR, num_shards=5, shard_id=5)
    with pytest.raises(ValueError, match=error_msg_5):
        ds.Caltech101Dataset(DATASET_DIR, num_shards=2, shard_id=5)
    error_msg_6 = "num_parallel_workers exceeds"
    with pytest.raises(ValueError, match=error_msg_6):
        ds.Caltech101Dataset(DATASET_DIR, shuffle=False, num_parallel_workers=0)
    with pytest.raises(ValueError, match=error_msg_6):
        ds.Caltech101Dataset(DATASET_DIR, shuffle=False, num_parallel_workers=256)
    with pytest.raises(ValueError, match=error_msg_6):
        ds.Caltech101Dataset(DATASET_DIR, shuffle=False, num_parallel_workers=-2)
    error_msg_7 = "Argument shard_id"
    with pytest.raises(TypeError, match=error_msg_7):
        ds.Caltech101Dataset(DATASET_DIR, num_shards=2, shard_id="0")
    error_msg_8 = "does not exist or is not a directory or permission denied!"
    with pytest.raises(ValueError, match=error_msg_8):
        # Pass ONLY the bad directory. The original test passed WRONG_DIR a
        # second time as the positional target_type, which made the expected
        # error depend on which argument the validator happened to check first.
        all_data = ds.Caltech101Dataset(WRONG_DIR)
        for _ in all_data.create_dict_iterator(num_epochs=1):
            pass
    error_msg_9 = "Input target_type is not within the valid set of \\['category', 'annotation', 'all'\\]."
    with pytest.raises(ValueError, match=error_msg_9):
        all_data = ds.Caltech101Dataset(DATASET_DIR, target_type="cate")
        for _ in all_data.create_dict_iterator(num_epochs=1):
            pass
def test_caltech101_visualize(plot=False):
    """
    Feature: Caltech101Dataset
    Description: visualize Caltech101Dataset results
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech101Dataset visualization")
    dataset = ds.Caltech101Dataset(DATASET_DIR, num_samples=4, decode=True, shuffle=False)
    image_list, category_list = [], []
    for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        image, category = row["image"], row["category"]
        image_list.append(image)
        category_list.append("label {}".format(category))
        # Decoded images must be HWC uint8 with 3 channels; labels are int64.
        assert isinstance(image, np.ndarray)
        assert image.ndim == 3
        assert image.shape[-1] == 3
        assert image.dtype == np.uint8
        assert category.dtype == np.int64
    assert len(image_list) == 4
    if plot:
        visualize_dataset(image_list, category_list)
if __name__ == '__main__':
    # Run the Caltech101 test suite directly, enabling plots for the visual check.
    for case in (test_caltech101_content_check,
                 test_caltech101_basic,
                 test_caltech101_target_type,
                 test_caltech101_sequential_sampler,
                 test_caltech101_exception):
        case()
    test_caltech101_visualize(plot=True)
| @@ -0,0 +1,223 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Test Caltech256 dataset operators | |||
| """ | |||
| import numpy as np | |||
| import pytest | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as c_vision | |||
| from mindspore import log as logger | |||
| IMAGE_DATA_DIR = "../data/dataset/testPK/data" | |||
| WRONG_DIR = "../data/dataset/notExist" | |||
def test_caltech256_basic():
    """
    Feature: Caltech256Dataset
    Description: basic test of Caltech256Dataset
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech256Dataset Op")
    # case 1: test read all data
    dataset_a = ds.Caltech256Dataset(IMAGE_DATA_DIR, shuffle=False)
    dataset_b = ds.Caltech256Dataset(IMAGE_DATA_DIR, shuffle=False)
    count = 0
    for row_a, row_b in zip(dataset_a.create_dict_iterator(num_epochs=1, output_numpy=True),
                            dataset_b.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(row_a["label"], row_b["label"])
        count += 1
    assert count == 44
    # case 2: test decode
    dataset_a = ds.Caltech256Dataset(IMAGE_DATA_DIR, decode=True, shuffle=False)
    dataset_b = ds.Caltech256Dataset(IMAGE_DATA_DIR, decode=True, shuffle=False)
    count = 0
    for row_a, row_b in zip(dataset_a.create_dict_iterator(num_epochs=1, output_numpy=True),
                            dataset_b.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(row_a["label"], row_b["label"])
        count += 1
    assert count == 44
    # case 3: test num_samples
    dataset = ds.Caltech256Dataset(IMAGE_DATA_DIR, num_samples=4)
    assert sum(1 for _ in dataset.create_dict_iterator(num_epochs=1)) == 4
    # case 4: test repeat
    dataset = ds.Caltech256Dataset(IMAGE_DATA_DIR, num_samples=4).repeat(2)
    assert sum(1 for _ in dataset.create_dict_iterator(num_epochs=1)) == 8
    # case 5: test get_dataset_size, resize and batch
    dataset = ds.Caltech256Dataset(IMAGE_DATA_DIR, num_samples=4)
    dataset = dataset.map(operations=[c_vision.Decode(), c_vision.Resize((224, 224))], input_columns=["image"],
                          num_parallel_workers=1)
    assert dataset.get_dataset_size() == 4
    assert dataset.get_batch_size() == 1
    # drop_remainder is default to be False
    dataset = dataset.batch(batch_size=3)
    assert dataset.get_batch_size() == 3
    assert dataset.get_dataset_size() == 2
    assert sum(1 for _ in dataset.create_dict_iterator(num_epochs=1)) == 2
def test_caltech256_decode():
    """
    Feature: Caltech256Dataset
    Description: validate Caltech256Dataset with decode
    Expectation: the data is processed successfully
    """
    logger.info("Validate Caltech256Dataset with decode")
    # define parameters
    repeat_count = 1
    dataset = ds.Caltech256Dataset(IMAGE_DATA_DIR, decode=True).repeat(repeat_count)
    rows = 0
    # each data is a dictionary with keys "image" and "label"
    for row in dataset.create_dict_iterator(num_epochs=1):
        logger.info("image is {}".format(row["image"]))
        logger.info("label is {}".format(row["label"]))
        rows += 1
    logger.info("Number of data in data1: {}".format(rows))
    assert rows == 44
def test_caltech256_sequential_sampler():
    """
    Feature: Caltech256Dataset
    Description: test Caltech256Dataset with SequentialSampler
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech256Dataset Op with SequentialSampler")
    num_samples = 4
    # A SequentialSampler must visit the same rows, in the same order, as a
    # plain non-shuffled read capped at the same number of samples.
    sampled = ds.Caltech256Dataset(IMAGE_DATA_DIR, sampler=ds.SequentialSampler(num_samples=num_samples))
    unshuffled = ds.Caltech256Dataset(IMAGE_DATA_DIR, shuffle=False, num_samples=num_samples)
    sampled_labels, plain_labels = [], []
    rows = 0
    for left, right in zip(sampled.create_dict_iterator(num_epochs=1),
                           unshuffled.create_dict_iterator(num_epochs=1)):
        sampled_labels.append(left["label"].asnumpy())
        plain_labels.append(right["label"].asnumpy())
        rows += 1
    np.testing.assert_array_equal(sampled_labels, plain_labels)
    assert rows == num_samples
def test_caltech256_random_sampler():
    """
    Feature: Caltech256Dataset
    Description: test Caltech256Dataset with RandomSampler
    Expectation: the data is processed successfully
    """
    logger.info("Test Caltech256Dataset Op with RandomSampler")
    # define parameters
    repeat_count = 1
    # apply dataset operations: a RandomSampler still yields every row once.
    dataset = ds.Caltech256Dataset(IMAGE_DATA_DIR, sampler=ds.RandomSampler()).repeat(repeat_count)
    rows = 0
    # each data is a dictionary with keys "image" and "label"
    for row in dataset.create_dict_iterator(num_epochs=1):
        logger.info("image is {}".format(row["image"]))
        logger.info("label is {}".format(row["label"]))
        rows += 1
    logger.info("Number of data in data1: {}".format(rows))
    assert rows == 44
def test_caltech256_exception():
    """
    Feature: Caltech256Dataset
    Description: test error cases for Caltech256Dataset
    Expectation: throw correct error and message
    """
    logger.info("Test error cases for Caltech256Dataset")
    error_msg_1 = "sampler and shuffle cannot be specified at the same time"
    with pytest.raises(RuntimeError, match=error_msg_1):
        ds.Caltech256Dataset(IMAGE_DATA_DIR, shuffle=False, sampler=ds.SequentialSampler(1))
    error_msg_2 = "sampler and sharding cannot be specified at the same time"
    with pytest.raises(RuntimeError, match=error_msg_2):
        ds.Caltech256Dataset(IMAGE_DATA_DIR, sampler=ds.SequentialSampler(1), num_shards=2, shard_id=0)
    error_msg_3 = "num_shards is specified and currently requires shard_id as well"
    with pytest.raises(RuntimeError, match=error_msg_3):
        ds.Caltech256Dataset(IMAGE_DATA_DIR, num_shards=10)
    error_msg_4 = "shard_id is specified but num_shards is not"
    with pytest.raises(RuntimeError, match=error_msg_4):
        ds.Caltech256Dataset(IMAGE_DATA_DIR, shard_id=0)
    # shard_id must lie in [0, num_shards); probe both sides of the interval.
    error_msg_5 = "Input shard_id is not within the required interval"
    for num_shards, shard_id in ((5, -1), (5, 5), (2, 5)):
        with pytest.raises(ValueError, match=error_msg_5):
            ds.Caltech256Dataset(IMAGE_DATA_DIR, num_shards=num_shards, shard_id=shard_id)
    error_msg_6 = "num_parallel_workers exceeds"
    for workers in (0, 256, -2):
        with pytest.raises(ValueError, match=error_msg_6):
            ds.Caltech256Dataset(IMAGE_DATA_DIR, shuffle=False, num_parallel_workers=workers)
    error_msg_7 = "Argument shard_id"
    with pytest.raises(TypeError, match=error_msg_7):
        ds.Caltech256Dataset(IMAGE_DATA_DIR, num_shards=2, shard_id="0")
    error_msg_8 = "does not exist or is not a directory or permission denied!"
    with pytest.raises(ValueError, match=error_msg_8):
        bad_data = ds.Caltech256Dataset(WRONG_DIR)
        for _ in bad_data.create_dict_iterator(num_epochs=1):
            pass
if __name__ == '__main__':
    # Run the Caltech256 test suite directly, in declaration order.
    for case in (test_caltech256_basic,
                 test_caltech256_decode,
                 test_caltech256_sequential_sampler,
                 test_caltech256_random_sampler,
                 test_caltech256_exception):
        case()