Browse Source

[feat][assistant][I40GXN] add new loader SogouNews

tags/v1.6.0
li-qiyao 4 years ago
parent
commit
f4bfcb8c25
18 changed files with 1368 additions and 1 deletions
  1. +9
    -0
      mindspore/ccsrc/minddata/dataset/api/datasets.cc
  2. +13
    -0
      mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/source/bindings.cc
  3. +2
    -0
      mindspore/ccsrc/minddata/dataset/audio/kernels/audio_utils.cc
  4. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/CMakeLists.txt
  5. +56
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sogou_news_op.cc
  6. +71
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sogou_news_op.h
  7. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h
  8. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/CMakeLists.txt
  9. +189
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/sogou_news_node.cc
  10. +136
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/sogou_news_node.h
  11. +50
    -0
      mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h
  12. +88
    -1
      mindspore/dataset/engine/datasets.py
  13. +27
    -0
      mindspore/dataset/engine/validators.py
  14. +1
    -0
      tests/ut/cpp/dataset/CMakeLists.txt
  15. +532
    -0
      tests/ut/cpp/dataset/c_api_dataset_sogou_news_test.cc
  16. +3
    -0
      tests/ut/data/dataset/testSogouNews/test.csv
  17. +3
    -0
      tests/ut/data/dataset/testSogouNews/train.csv
  18. +185
    -0
      tests/ut/python/dataset/test_datasets_sogou_news.py

+ 9
- 0
mindspore/ccsrc/minddata/dataset/api/datasets.cc View File

@@ -115,6 +115,7 @@
#include "minddata/dataset/engine/ir/datasetops/source/qmnist_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/random_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/sbu_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/sogou_news_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/speech_commands_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/stl10_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/tedlium_node.h"
@@ -1634,6 +1635,14 @@ SBUDataset::SBUDataset(const std::vector<char> &dataset_dir, bool decode, const
ir_node_ = std::static_pointer_cast<DatasetNode>(ds);
}

// Construct the public SogouNewsDataset: the char-vector arguments are converted with CharToString and
// a SogouNewsNode IR node built from them is stored as this dataset's ir_node_.
SogouNewsDataset::SogouNewsDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
                                   int64_t num_samples, ShuffleMode shuffle, int32_t num_shards, int32_t shard_id,
                                   const std::shared_ptr<DatasetCache> &cache) {
  const auto dir_str = CharToString(dataset_dir);
  const auto usage_str = CharToString(usage);
  std::shared_ptr<SogouNewsNode> node =
    std::make_shared<SogouNewsNode>(dir_str, usage_str, num_samples, shuffle, num_shards, shard_id, cache);
  // Store the node through its base-class pointer type.
  ir_node_ = std::static_pointer_cast<DatasetNode>(node);
}

SpeechCommandsDataset::SpeechCommandsDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
const std::shared_ptr<Sampler> &sampler,
const std::shared_ptr<DatasetCache> &cache) {


+ 13
- 0
mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/source/bindings.cc View File

@@ -63,6 +63,7 @@
#include "minddata/dataset/engine/ir/datasetops/source/places365_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/qmnist_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/sbu_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/sogou_news_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/tf_record_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/usps_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/voc_node.h"
@@ -445,6 +446,18 @@ PYBIND_REGISTER(SBUNode, 2, ([](const py::module *m) {
}));
}));

// Python binding for SogouNewsNode. The init factory builds the IR node with a null cache and runs
// ValidateParams() right away; THROW_IF_ERROR handles a failed validation status.
PYBIND_REGISTER(SogouNewsNode, 2, ([](const py::module *m) {
                  (void)py::class_<SogouNewsNode, DatasetNode, std::shared_ptr<SogouNewsNode>>(
                    *m, "SogouNewsNode", "to create a SogouNewsNode")
                    .def(py::init([](std::string dataset_dir, std::string usage, int64_t num_samples, int32_t shuffle,
                                     int32_t num_shards, int32_t shard_id) {
                      // The shuffle level arrives as an integer and is mapped by toShuffleMode.
                      auto node = std::make_shared<SogouNewsNode>(dataset_dir, usage, num_samples,
                                                                  toShuffleMode(shuffle), num_shards, shard_id,
                                                                  nullptr);
                      THROW_IF_ERROR(node->ValidateParams());
                      return node;
                    }));
                }));

PYBIND_REGISTER(SpeechCommandsNode, 2, ([](const py::module *m) {
(void)py::class_<SpeechCommandsNode, DatasetNode, std::shared_ptr<SpeechCommandsNode>>(
*m, "SpeechCommandsNode", "to create a SpeechCommandsNode")


+ 2
- 0
mindspore/ccsrc/minddata/dataset/audio/kernels/audio_utils.cc View File

@@ -790,6 +790,8 @@ Status DetectPitchFrequency(const std::shared_ptr<Tensor> &input, std::shared_pt
Status GenerateWaveTable(std::shared_ptr<Tensor> *output, const DataType &type, Modulation modulation,
int32_t table_size, float min, float max, float phase) {
RETURN_UNEXPECTED_IF_NULL(output);
CHECK_FAIL_RETURN_UNEXPECTED(table_size > 0,
"table_size must be more than 0, but got: " + std::to_string(table_size));
int32_t phase_offset = static_cast<int32_t>(phase / PI / 2 * table_size + 0.5);
// get the offset of the i-th
std::vector<int32_t> point;


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/CMakeLists.txt View File

@@ -31,6 +31,7 @@ set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES
qmnist_op.cc
random_data_op.cc
sbu_op.cc
sogou_news_op.cc
speech_commands_op.cc
stl10_op.cc
tedlium_op.cc


+ 56
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sogou_news_op.cc View File

@@ -0,0 +1,56 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/engine/datasetops/source/sogou_news_op.h"
#include <fstream>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "debug/common.h"
namespace mindspore {
namespace dataset {
// SogouNewsOp adds no state of its own: every argument is forwarded to the CsvOp base constructor,
// which takes the file list, delimiter and column description first and the runtime settings afterwards.
SogouNewsOp::SogouNewsOp(int32_t num_workers, int64_t num_samples, int32_t worker_connector_size,
                         int32_t op_connector_size, bool shuffle_files, int32_t num_devices, int32_t device_id,
                         char field_delim, const std::vector<std::shared_ptr<BaseRecord>> &column_default,
                         const std::vector<std::string> &column_name,
                         const std::vector<std::string> &sogou_news_files_list)
    : CsvOp(sogou_news_files_list, field_delim, column_default, column_name,
            num_workers, num_samples, worker_connector_size, op_connector_size,
            shuffle_files, num_devices, device_id) {
}
// Write a description of this op to a stream, typically for debugging.
// \param[out] out The output stream to write to.
// \param[in] show_all If false, only the common one-line summary is written; if true, the sample count,
//     device information, shuffle setting and the list of input files are written as well.
void SogouNewsOp::Print(std::ostream &out, bool show_all) const {
  // Both modes start with the common information printed by ParallelOp.
  ParallelOp::Print(out, show_all);
  if (!show_all) {
    out << "\n";
    return;
  }
  // Detailed mode: append the op-specific settings.
  out << "\nSample count: " << total_rows_ << "\nDevice id: " << device_id_ << "\nNumber of devices: " << num_devices_
      << "\nShuffle files: " << ((shuffle_files_) ? "yes" : "no") << "\nSogouNews files list:\n";
  // Range-for avoids comparing a signed index against the container's unsigned size().
  for (const auto &file : csv_files_list_) {
    out << " " << file;
  }
  out << "\n\n";
}
} // namespace dataset
} // namespace mindspore

+ 71
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sogou_news_op.h View File

@@ -0,0 +1,71 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_SOGOU_NEWS_OP_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_SOGOU_NEWS_OP_H_
#include <memory>
#include <string>
#include <vector>
#include "minddata/dataset/engine/datasetops/source/csv_op.h"
namespace mindspore {
namespace dataset {
class JaggedConnector;
/// \class SogouNewsOp
/// \brief Dataset op for the SogouNews dataset, implemented as a thin specialization of CsvOp.
class SogouNewsOp : public CsvOp {
 public:
  /// \brief Constructor of SogouNewsOp.
  /// \param[in] num_workers Number of worker threads reading data from sogou_news files.
  /// \param[in] num_samples The number of samples to be included in the dataset.
  /// \param[in] worker_connector_size Size of each internal queue.
  /// \param[in] op_connector_size Size of each queue in the connector that the child operator pulls from.
  /// \param[in] shuffle_files Whether or not to shuffle the files before reading data.
  /// \param[in] num_devices Number of devices that the dataset should be divided into.
  /// \param[in] device_id The device ID within num_devices.
  /// \param[in] field_delim A char that indicates the delimiter to separate fields.
  /// \param[in] column_default List of default values for the CSV fields; each item describes the type
  ///     (float, int or string) and default value of one column.
  /// \param[in] column_name List of column names of the dataset.
  /// \param[in] sogou_news_files_list List of file paths for the dataset files.
  SogouNewsOp(int32_t num_workers, int64_t num_samples, int32_t worker_connector_size, int32_t op_connector_size,
              bool shuffle_files, int32_t num_devices, int32_t device_id, char field_delim,
              const std::vector<std::shared_ptr<BaseRecord>> &column_default,
              const std::vector<std::string> &column_name, const std::vector<std::string> &sogou_news_files_list);

  /// \brief Destructor.
  ~SogouNewsOp() = default;

  /// \brief Write a description of this op to a stream, typically for debugging.
  /// \param[out] out The output stream to write output to.
  /// \param[in] show_all True to print all details, false to print only a summary.
  void Print(std::ostream &out, bool show_all) const override;

  /// \brief Dataset name getter.
  /// \param[in] upper True to get the capitalized form "SogouNews", false to get "sogou news".
  /// \return Name of the dataset handled by this op.
  std::string DatasetName(bool upper = false) const {
    if (upper) {
      return "SogouNews";
    }
    return "sogou news";
  }

  /// \brief Op name getter.
  /// \return Name of the current Op.
  std::string Name() const override { return "SogouNewsOp"; }
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_SOGOU_NEWS_OP_H_

+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h View File

@@ -106,6 +106,7 @@ constexpr char kPlaces365Node[] = "Places365Dataset";
constexpr char kQMnistNode[] = "QMnistDataset";
constexpr char kRandomNode[] = "RandomDataset";
constexpr char kSBUNode[] = "SBUDataset";
constexpr char kSogouNewsNode[] = "SogouNewsDataset";
constexpr char kSpeechCommandsNode[] = "SpeechCommandsDataset";
constexpr char kSTL10Node[] = "STL10Dataset";
constexpr char kTedliumNode[] = "TedliumDataset";


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/CMakeLists.txt View File

@@ -32,6 +32,7 @@ set(DATASET_ENGINE_IR_DATASETOPS_SOURCE_SRC_FILES
qmnist_node.cc
random_node.cc
sbu_node.cc
sogou_news_node.cc
speech_commands_node.cc
stl10_node.cc
tedlium_node.cc


+ 189
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/sogou_news_node.cc View File

@@ -0,0 +1,189 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/engine/ir/datasetops/source/sogou_news_node.h"
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
#include "minddata/dataset/util/path.h"
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace dataset {
// Construct a SogouNewsNode.
// The list of files to read is derived immediately from `usage` and `dataset_dir` via WalkAllFiles.
// Members are initialized in the order in which the class declares them, which is the order the
// compiler uses regardless of how the initializer list is written.
SogouNewsNode::SogouNewsNode(const std::string &dataset_dir, const std::string &usage, int64_t num_samples,
                             ShuffleMode shuffle, int32_t num_shards, int32_t shard_id,
                             const std::shared_ptr<DatasetCache> &cache)
    // `cache` is a const reference, so it is passed as-is; std::move on it would not move anything.
    : NonMappableSourceNode(cache),
      dataset_dir_(dataset_dir),
      usage_(usage),
      num_samples_(num_samples),
      shuffle_(shuffle),
      num_shards_(num_shards),
      shard_id_(shard_id),
      // Uses the constructor arguments rather than the members, so it does not depend on init order.
      sogou_news_files_list_(WalkAllFiles(usage, dataset_dir)) {
  // Update the num_shards_ in global context. this number is only used for now by auto_num_worker_pass.
  // User discretion is advised. Auto_num_worker_pass is currently an experimental feature which can still work
  // if the num_shards_ isn't 100% correct. The reason behind is for now, PreBuildSampler doesn't offer a way to
  // return num_shards. Once PreBuildSampler is phased out, this can be cleaned up.
  GlobalContext::config_manager()->set_num_shards_for_auto_num_workers(num_shards_);
}
// Create a new SogouNewsNode from this node's current parameters and cache.
// The file list is not copied; the new node's constructor recomputes it.
std::shared_ptr<DatasetNode> SogouNewsNode::Copy() {
  return std::make_shared<SogouNewsNode>(dataset_dir_, usage_, num_samples_, shuffle_, num_shards_, shard_id_,
                                         cache_);
}
// Write a one-line description of the node: its name, whether a cache is attached, and the sharding
// parameters. The text is assembled first and written to the stream in a single insertion.
void SogouNewsNode::Print(std::ostream &out) const {
  const std::string has_cache = (cache_ != nullptr) ? "true" : "false";
  std::string description = Name();
  description += "(cache: " + has_cache;
  description += ", num_shards: " + std::to_string(num_shards_);
  description += ", shard_id: " + std::to_string(shard_id_);
  description += ")";
  out << description;
}
// Validate the user-supplied parameters of this node.
// The checks run in the order written: base-class validation, dataset directory, usage string,
// sample count and finally the shard parameters. Each RETURN_IF_NOT_OK presumably returns the
// failing Status immediately, so later checks are skipped once one fails.
// \return Status::OK() if every check passes.
Status SogouNewsNode::ValidateParams() {
  RETURN_IF_NOT_OK(DatasetNode::ValidateParams());
  RETURN_IF_NOT_OK(ValidateDatasetDirParam("SogouNewsNode", dataset_dir_));
  // usage must be one of the three listed values.
  RETURN_IF_NOT_OK(ValidateStringValue("SogouNewsNode", usage_, {"train", "test", "all"}));
  // Only negative sample counts are rejected here; 0 passes this check.
  if (num_samples_ < 0) {
    std::string err_msg = "SogouNewsNode: Invalid number of samples: " + std::to_string(num_samples_);
    // NOTE(review): judging by its name, LOG_AND_RETURN_STATUS_SYNTAX_ERROR also logs the message,
    // which would make this explicit MS_LOG redundant — confirm against the macro definition.
    MS_LOG(ERROR) << err_msg;
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  RETURN_IF_NOT_OK(ValidateDatasetShardParams("SogouNewsNode", num_shards_, shard_id_));
  return Status::OK();
}
// Create the runtime op(s) for this node and append them to `node_ops`.
// A SogouNewsOp is always created. When shuffle_ is ShuffleMode::kGlobal a ShuffleOp is created as
// well and pushed before the SogouNewsOp.
// \param[out] node_ops Vector that receives the created ops; must not be null.
// \return Status::OK() on success, otherwise the error of the first failing step.
Status SogouNewsNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) {
  RETURN_UNEXPECTED_IF_NULL(node_ops);
  bool shuffle_files = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles);

  // All three columns are read as strings with an empty-string default.
  const std::vector<std::string> column_name = {"index", "title", "content"};
  std::vector<std::shared_ptr<SogouNewsOp::BaseRecord>> column_default;
  column_default.reserve(column_name.size());
  for (size_t i = 0; i < column_name.size(); ++i) {
    column_default.push_back(std::make_shared<SogouNewsOp::Record<std::string>>(SogouNewsOp::STRING, ""));
  }
  char field_delim = ',';

  // NOTE(review): the op receives the files in the order produced by WalkAllFiles (train before test
  // for usage "all"). The previous code also built a sorted copy of the list but only used its size,
  // so that copy has been dropped — confirm that the unsorted order is the intended one.
  auto sogou_news_op = std::make_shared<SogouNewsOp>(num_workers_, num_samples_, worker_connector_size_,
                                                     connector_que_size_, shuffle_files, num_shards_, shard_id_,
                                                     field_delim, column_default, column_name, sogou_news_files_list_);
  RETURN_IF_NOT_OK(sogou_news_op->Init());

  // If a global shuffle is used for SogouNews, it will inject a shuffle op over the SogouNews.
  // But, if there is a cache in the tree, we do not need the global shuffle and the shuffle op should not be
  // built. This is achieved in the cache transform pass where we call MakeSimpleProducer to reset SogouNews
  // shuffle option to false.
  if (shuffle_ == ShuffleMode::kGlobal) {
    // Inject ShuffleOp.
    std::shared_ptr<DatasetOp> shuffle_op = nullptr;
    int64_t num_rows = 0;
    // First, get the number of rows in the dataset.
    RETURN_IF_NOT_OK(SogouNewsOp::CountAllFileRows(sogou_news_files_list_, false, &num_rows));
    // Add the shuffle op after this op.
    RETURN_IF_NOT_OK(
      AddShuffleOp(sogou_news_files_list_.size(), num_shards_, num_rows, 0, connector_que_size_, &shuffle_op));
    shuffle_op->SetTotalRepeats(GetTotalRepeats());
    shuffle_op->SetNumRepeatsPerEpoch(GetNumRepeatsPerEpoch());
    node_ops->push_back(shuffle_op);
  }
  sogou_news_op->SetTotalRepeats(GetTotalRepeats());
  sogou_news_op->SetNumRepeatsPerEpoch(GetNumRepeatsPerEpoch());
  node_ops->push_back(sogou_news_op);
  return Status::OK();
}
// Report the shard id configured on this node.
// \param[out] shard_id Receives shard_id_; must not be null.
// \return Status::OK() on success, or an error status if `shard_id` is null.
Status SogouNewsNode::GetShardId(int32_t *shard_id) {
  // Guard the out-pointer before writing through it.
  RETURN_UNEXPECTED_IF_NULL(shard_id);
  *shard_id = shard_id_;
  return Status::OK();
}
// Compute the number of rows this node produces for one shard.
// A previously computed positive size is returned from dataset_size_ without recounting. Otherwise the
// rows of all files are counted, divided by num_shards_ (rounded up) and, if num_samples_ is positive,
// capped at num_samples_. The result is stored in dataset_size_ for later calls.
// \param[in] size_getter Not used by this implementation.
// \param[in] estimate Not used by this implementation.
// \param[out] dataset_size Receives the computed size; must not be null.
// \return Status::OK() on success, otherwise the error from counting the rows or from the null check.
Status SogouNewsNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                                     int64_t *dataset_size) {
  RETURN_UNEXPECTED_IF_NULL(dataset_size);
  if (dataset_size_ > 0) {
    *dataset_size = dataset_size_;
    return Status::OK();
  }
  // Initialized so that the value is well defined even if the counter leaves it untouched.
  int64_t num_rows = 0;
  RETURN_IF_NOT_OK(SogouNewsOp::CountAllFileRows(sogou_news_files_list_, false, &num_rows));
  // Rows per shard, rounded up so that a partially filled last shard is still counted.
  num_rows = static_cast<int64_t>(ceil(num_rows / (1.0 * num_shards_)));
  *dataset_size = num_samples_ > 0 ? std::min(num_rows, num_samples_) : num_rows;
  dataset_size_ = *dataset_size;
  return Status::OK();
}
// Serialize the arguments of this node to JSON.
// The cache is serialized under the "cache" key only when a cache is attached.
// \param[out] out_json Receives the JSON object; must not be null.
// \return Status::OK() on success, otherwise the error from serializing the cache or from the null check.
Status SogouNewsNode::to_json(nlohmann::json *out_json) {
  // Guard the out-pointer before any work is done.
  RETURN_UNEXPECTED_IF_NULL(out_json);
  nlohmann::json args;
  args["num_parallel_workers"] = num_workers_;
  args["dataset_dir"] = dataset_dir_;
  args["usage"] = usage_;
  args["num_samples"] = num_samples_;
  args["shuffle"] = shuffle_;
  args["num_shards"] = num_shards_;
  args["shard_id"] = shard_id_;
  if (cache_ != nullptr) {
    nlohmann::json cache_args;
    RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
    args["cache"] = cache_args;
  }
  *out_json = args;
  return Status::OK();
}
// Select a sampler matching this node's sample count, shuffle setting and shard parameters.
// \param[out] sampler Receives the sampler chosen by SelectSampler; must not be null.
// \return Status::OK() on success, or an error status if `sampler` is null.
Status SogouNewsNode::SetupSamplerForCache(std::shared_ptr<SamplerObj> *sampler) {
  RETURN_UNEXPECTED_IF_NULL(sampler);
  // Both kGlobal and kFiles count as "shuffle" for the purpose of picking the sampler.
  bool shuffle_files = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles);
  *sampler = SelectSampler(num_samples_, shuffle_files, num_shards_, shard_id_);
  return Status::OK();
}
// Reset sampling-related options: no sample limit (num_samples_ = 0), no shuffling, and a single
// shard with id 0.
// \return Always Status::OK().
Status SogouNewsNode::MakeSimpleProducer() {
  num_samples_ = 0;
  shuffle_ = ShuffleMode::kFalse;
  num_shards_ = 1;
  shard_id_ = 0;
  return Status::OK();
}
std::vector<std::string> SogouNewsNode::WalkAllFiles(const std::string &usage, const std::string &dataset_dir) {
std::vector<std::string> sogou_news_files_list;
Path train_prefix("train.csv");
Path test_prefix("test.csv");
Path dir(dataset_dir);
if (usage == "train") {
Path temp_path = dir / train_prefix;
sogou_news_files_list.push_back(temp_path.ToString());
} else if (usage == "test") {
Path temp_path = dir / test_prefix;
sogou_news_files_list.push_back(temp_path.ToString());
} else {
Path temp_path = dir / train_prefix;
sogou_news_files_list.push_back(temp_path.ToString());
Path temp_path1 = dir / test_prefix;
sogou_news_files_list.push_back(temp_path1.ToString());
}
return sogou_news_files_list;
}
} // namespace dataset
} // namespace mindspore

+ 136
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/sogou_news_node.h View File

@@ -0,0 +1,136 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_SOGOU_NEWS_NODE_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_SOGOU_NEWS_NODE_H_
#include <memory>
#include <string>
#include <vector>
#include "minddata/dataset/engine/datasetops/source/sogou_news_op.h"
#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
namespace mindspore {
namespace dataset {
/// \class SogouNewsNode
/// \brief IR node representing the SogouNews dataset; a non-mappable source node.
class SogouNewsNode : public NonMappableSourceNode {
 public:
  /// \brief Constructor of SogouNewsNode.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] usage Part of dataset of SogouNews, can be "train", "test" or "all" data.
  /// \param[in] num_samples The number of samples to be included in the dataset.
  /// \param[in] shuffle The mode for shuffling data every epoch.
  ///     Can be any of:
  ///     ShuffleMode::kFalse - No shuffling is performed.
  ///     ShuffleMode::kFiles - Shuffle files only.
  ///     ShuffleMode::kGlobal - Shuffle both the files and samples.
  /// \param[in] num_shards Number of shards that the dataset should be divided into.
  /// \param[in] shard_id The shard ID within num_shards. This argument should be
  ///     specified only when num_shards is also specified.
  /// \param[in] cache Tensor cache to use.
  SogouNewsNode(const std::string &dataset_dir, const std::string &usage, int64_t num_samples, ShuffleMode shuffle,
                int32_t num_shards, int32_t shard_id, const std::shared_ptr<DatasetCache> &cache);

  /// \brief Destructor.
  ~SogouNewsNode() = default;

  /// \brief Node name getter.
  /// \return Name of the current node.
  std::string Name() const override { return kSogouNewsNode; }

  /// \brief Print the description.
  /// \param[out] out The output stream to write output to.
  void Print(std::ostream &out) const override;

  /// \brief Copy the node to a new object.
  /// \return A shared pointer to the new copy.
  std::shared_ptr<DatasetNode> Copy() override;

  /// \brief A base class override function to create the required runtime dataset op objects for this class.
  /// \param[out] node_ops A vector containing shared pointer to the Dataset Ops that this object will create.
  /// \return Status Status::OK() if build successfully.
  Status Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) override;

  /// \brief Parameters validation.
  /// \return Status Status::OK() if all the parameters are valid.
  Status ValidateParams() override;

  /// \brief Get the shard id of node.
  /// \param[out] shard_id Receives the shard id.
  /// \return Status Status::OK() if get shard id successfully.
  Status GetShardId(int32_t *shard_id) override;

  /// \brief Base-class override for GetDatasetSize.
  /// \param[in] size_getter Shared pointer to DatasetSizeGetter.
  /// \param[in] estimate This is only supported by some of the ops and it's used to speed up the process of getting
  ///     dataset size at the expense of accuracy.
  /// \param[out] dataset_size The size of the dataset.
  /// \return Status of the function.
  Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                        int64_t *dataset_size) override;

  /// \brief Getter functions.
  const std::string &DatasetDir() const { return dataset_dir_; }
  const std::string &Usage() const { return usage_; }
  int64_t NumSamples() const { return num_samples_; }
  ShuffleMode Shuffle() const { return shuffle_; }
  int32_t NumShards() const { return num_shards_; }
  int32_t ShardId() const { return shard_id_; }

  /// \brief Get the arguments of node.
  /// \param[out] out_json JSON string of all attributes.
  /// \return Status of the function.
  Status to_json(nlohmann::json *out_json) override;

  /// \brief SogouNews by itself is a non-mappable dataset that does not support sampling.
  ///     However, if a cache operator is injected at some other place higher in the tree, that cache can
  ///     inherit this sampler from the leaf, providing sampling support from the caching layer.
  ///     That is why we setup the sampler for a leaf node that does not use sampling.
  /// Note: This function is common among NonMappableSourceNode and should be promoted to its parent class.
  /// \param[out] sampler Receives the sampler that was set up.
  /// \return Status of the function.
  Status SetupSamplerForCache(std::shared_ptr<SamplerObj> *sampler) override;

  /// \brief If a cache has been added into the ascendant tree over this SogouNews node, then the cache will be
  ///     executing a sampler for fetching the data. As such, any options in the SogouNews node need to be reset
  ///     to their defaults so that this node will produce the full set of data into the cache.
  /// Note: This function is common among NonMappableSourceNode and should be promoted to its parent class.
  /// \return Status of the function.
  Status MakeSimpleProducer() override;

  /// \brief Generate a list of read file names according to usage.
  /// \param[in] usage Part of dataset of SogouNews.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \return std::vector<std::string> A list of read file names.
  std::vector<std::string> WalkAllFiles(const std::string &usage, const std::string &dataset_dir);

 private:
  std::string dataset_dir_;  // Root directory of the dataset.
  std::string usage_;        // "train", "test" or "all".
  // NOTE(review): field_delim_, column_defaults_ and column_names_ are not initialized by the constructor
  // and are not referenced by any method in sogou_news_node.cc (Build() uses local values instead) —
  // confirm whether they can be removed.
  char field_delim_;
  std::vector<std::shared_ptr<CsvBase>> column_defaults_;
  std::vector<std::string> column_names_;
  int64_t num_samples_;
  ShuffleMode shuffle_;
  int32_t num_shards_;
  int32_t shard_id_;
  std::vector<std::string> sogou_news_files_list_;  // File paths produced by WalkAllFiles in the constructor.
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_SOGOU_NEWS_NODE_H_

+ 50
- 0
mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h View File

@@ -3786,6 +3786,56 @@ inline std::shared_ptr<SBUDataset> MS_API SBU(const std::string &dataset_dir, bo
return std::make_shared<SBUDataset>(StringToChar(dataset_dir), decode, sampler, cache);
}

/// \class SogouNewsDataset
/// \brief A source dataset for reading and parsing Sogou News dataset.
class MS_API SogouNewsDataset : public Dataset {
 public:
  /// \brief Constructor of SogouNewsDataset.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] usage Part of dataset of SogouNews, can be "train", "test" or "all".
  /// \param[in] num_samples The number of samples to be included in the dataset.
  /// \param[in] shuffle The mode for shuffling data every epoch.
  ///     Can be any of:
  ///     ShuffleMode::kFalse - No shuffling is performed.
  ///     ShuffleMode::kFiles - Shuffle files only.
  ///     ShuffleMode::kGlobal - Shuffle both the files and samples.
  /// \param[in] num_shards Number of shards that the dataset should be divided into.
  /// \param[in] shard_id The shard ID within num_shards. This argument should be
  ///     specified only when num_shards is also specified.
  /// \param[in] cache Tensor cache to use.
  SogouNewsDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage, int64_t num_samples,
                   ShuffleMode shuffle, int32_t num_shards, int32_t shard_id,
                   const std::shared_ptr<DatasetCache> &cache);

  /// \brief Destructor of SogouNewsDataset.
  ~SogouNewsDataset() = default;
};

/// \brief Function to create a SogouNewsDataset.
/// \note The generated dataset has three columns ["index", "title", "content"]. Their types are all string.
/// \param[in] dataset_dir Path to the root directory that contains the dataset.
/// \param[in] usage Part of dataset of SogouNews, can be "train", "test" or "all" data (Default = "all").
/// \param[in] num_samples The number of samples to be included in the dataset
///     (Default = 0, means all samples).
/// \param[in] shuffle The mode for shuffling data every epoch (Default = ShuffleMode::kGlobal).
///     Can be any of:
///     ShuffleMode::kFalse - No shuffling is performed.
///     ShuffleMode::kFiles - Shuffle files only.
///     ShuffleMode::kGlobal - Shuffle both the files and samples.
/// \param[in] num_shards Number of shards that the dataset should be divided into (Default = 1).
/// \param[in] shard_id The shard ID within num_shards. This argument should be
///     specified only when num_shards is also specified (Default = 0).
/// \param[in] cache Tensor cache to use (Default = nullptr, which means no cache is used).
/// \return Shared pointer to the SogouNewsDataset.
inline std::shared_ptr<SogouNewsDataset> MS_API SogouNews(const std::string &dataset_dir,
                                                          const std::string &usage = "all", int64_t num_samples = 0,
                                                          ShuffleMode shuffle = ShuffleMode::kGlobal,
                                                          int32_t num_shards = 1, int32_t shard_id = 0,
                                                          const std::shared_ptr<DatasetCache> &cache = nullptr) {
  // The string arguments are converted to the char-vector form the constructor expects.
  const auto dir_chars = StringToChar(dataset_dir);
  const auto usage_chars = StringToChar(usage);
  return std::make_shared<SogouNewsDataset>(dir_chars, usage_chars, num_samples, shuffle, num_shards, shard_id,
                                            cache);
}

/// \class SpeechCommandsDataset.
/// \brief A source dataset that reads and parses SpeechCommands dataset.
class MS_API SpeechCommandsDataset : public Dataset {


+ 88
- 1
mindspore/dataset/engine/datasets.py View File

@@ -72,7 +72,7 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che
check_photo_tour_dataset, check_ag_news_dataset, check_dbpedia_dataset, check_lj_speech_dataset, \
check_yes_no_dataset, check_speech_commands_dataset, check_tedlium_dataset, check_svhn_dataset, \
check_stl10_dataset, check_yelp_review_dataset, check_penn_treebank_dataset, check_iwslt2016_dataset, \
check_iwslt2017_dataset
check_iwslt2017_dataset, check_sogou_news_dataset
from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \
get_prefetch_size, get_auto_offload
from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist
@@ -7156,6 +7156,93 @@ class SBUDataset(MappableDataset):
return cde.SBUNode(self.dataset_dir, self.decode, self.sampler)


class SogouNewsDataset(SourceDataset):
"""
A source dataset that reads and parses Sogou News dataset.

The generated dataset has three columns: :py:obj:`[index, title, content]`.
The tensor of column :py:obj:`index` is of the string type.
The tensor of column :py:obj:`title` is of the string type.
The tensor of column :py:obj:`content` is of the string type.

Args:
dataset_dir (str): Path to the root directory that contains the dataset.
usage (str, optional): Usage of this dataset, can be `train`, `test` or `all` .
`train` will read from 450,000 train samples, `test` will read from 60,000 test samples,
`all` will read from all 510,000 samples (default=None, all samples).
num_samples (int, optional): Number of samples (rows) to read (default=None, read all samples).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:

- Shuffle.GLOBAL: Shuffle both the files and samples.

- Shuffle.FILES: Shuffle files only.
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
When this argument is specified, `num_samples` reflects the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
num_parallel_workers (int, optional): Number of workers to read the data
(default=None, number set in the config).
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
(default=None, which means no cache is used).

Raises:
RuntimeError: If dataset_dir does not contain data files.
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
RuntimeError: If num_shards is specified but shard_id is None.
RuntimeError: If shard_id is specified but num_shards is None.

Examples:
>>> sogou_news_dataset_dir = "/path/to/sogou_news_dataset_dir"
>>> dataset = ds.SogouNewsDataset(dataset_files=sogou_news_dataset_dir, usage='all')

About SogouNews Dataset:

SogouNews dataset includes 3 columns, corresponding to class index (1 to 5), title and content. The title and
content are escaped using double quotes ("), and any internal double quote is escaped by 2 double quotes ("").
New lines are escaped by a backslash followed with an "n" character, that is "\n".

You can unzip the dataset files into the following structure and read by MindSpore's API:

.. code-block::

.
└── sogou_news_dir
├── classes.txt
├── readme.txt
├── test.csv
└── train.csv

Citation:

.. code-block::

@misc{zhang2015characterlevel,
title={Character-level Convolutional Networks for Text Classification},
author={Xiang Zhang and Junbo Zhao and Yann LeCun},
year={2015},
eprint={1509.01626},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
"""

@check_sogou_news_dataset
def __init__(self, dataset_dir, usage=None, num_samples=None, shuffle=Shuffle.GLOBAL, num_shards=None,
             shard_id=None, num_parallel_workers=None, cache=None):
    # Worker, sampling, shuffling, sharding and cache options are forwarded to the parent constructor.
    super().__init__(
        num_parallel_workers=num_parallel_workers,
        num_samples=num_samples,
        shuffle=shuffle,
        num_shards=num_shards,
        shard_id=shard_id,
        cache=cache,
    )
    # 'all' is handed to replace_none as the substitute for a usage of None.
    self.usage = replace_none(usage, 'all')
    self.dataset_dir = dataset_dir

def parse(self, children=None):
    # Build the cde.SogouNewsNode from the settings stored on this dataset; `children` is not used here.
    node_args = (self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag,
                 self.num_shards, self.shard_id)
    return cde.SogouNewsNode(*node_args)


class _Flowers102Dataset:
"""
Mainly for loading Flowers102 Dataset, and return one row each time.


+ 27
- 0
mindspore/dataset/engine/validators.py View File

@@ -348,6 +348,33 @@ def check_sbu_dataset(method):
return new_method


def check_sogou_news_dataset(method):
    """Decorate a SogouNewsDataset method so that its arguments are validated before it runs."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        _, params = parse_user_args(method, *args, **kwargs)

        # The directory is checked first, then the optional usage string.
        check_dir(params.get('dataset_dir'))

        split = params.get('usage')
        if split is not None:
            check_valid_str(split, ["train", "test", "all"], "usage")

        # Optional integer arguments, followed by the sampler/shuffle/shard option check.
        optional_int_params = ['num_samples', 'num_parallel_workers', 'num_shards', 'shard_id']
        validate_dataset_param_value(optional_int_params, params, int)
        check_sampler_shuffle_shard_options(params)

        check_cache_option(params.get('cache'))

        return method(self, *args, **kwargs)

    return new_method


def check_tfrecorddataset(method):
"""A wrapper that wraps a parameter checker around the original Dataset(TFRecordDataset)."""



+ 1
- 0
tests/ut/cpp/dataset/CMakeLists.txt View File

@@ -42,6 +42,7 @@ SET(DE_UT_SRCS
c_api_dataset_randomdata_test.cc
c_api_dataset_save.cc
c_api_dataset_sbu_test.cc
c_api_dataset_sogou_news_test.cc
c_api_dataset_speech_commands_test.cc
c_api_dataset_stl10_test.cc
c_api_dataset_tedlium_test.cc


+ 532
- 0
tests/ut/cpp/dataset/c_api_dataset_sogou_news_test.cc View File

@@ -0,0 +1,532 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "common/common.h"
#include "minddata/dataset/core/global_context.h"
#include "minddata/dataset/include/dataset/datasets.h"
using namespace mindspore::dataset;
/// Test fixture for the SogouNews pipeline tests; it adds no members to UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {};
/// Feature: Test SogouNews Dataset.
/// Description: read SogouNewsDataset data and get data.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestSogouNewsDatasetBasic) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSogouNewsDatasetBasic.";
  std::string dataset_dir = datasets_root_path_ + "/testSogouNews/";
  std::vector<std::string> column_names = {"index", "title", "content"};
  std::shared_ptr<Dataset> ds = SogouNews(dataset_dir, "test", 0, ShuffleMode::kFalse);
  // Fatal check: ds is dereferenced by the next statement.
  ASSERT_NE(ds, nullptr);
  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal check: iter is dereferenced below.
  ASSERT_NE(iter, nullptr);
  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("index"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"1", "Make history", "Su Bingtian's 100m breakthrough\\n 9.83"},
    {"4", "Tesla price", "Tesla reduced its price by 70000 yuan"},
    {"1", "Opening ceremony of the 14th National Games", "On the evening of September 15, Beijing time, "
     "the 14th games of the people's Republic of China opened in Xi'an Olympic Sports Center Stadium, "
     "Shaanxi Province. Yang Qian, the first gold medalist of the Chinese delegation in the Tokyo Olympic"
     " Games and a Post-00 shooter, lit the main torch platform. From then on, to September 27, the 14th"
     " National Games flame will burn here for 12 days."}
  };
  uint64_t i = 0;
  while (row.size() != 0) {
    // Fail cleanly instead of indexing past expected_result if the dataset yields extra rows.
    ASSERT_LT(i, expected_result.size());
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  // Expect 3 samples
  EXPECT_EQ(i, 3);
  // Manually terminate the pipeline
  iter->Stop();
}
/// Feature: Test SogouNews Dataset(usage=all).
/// Description: read train data and test data.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestSogouNewsDatasetUsageAll) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSogouNewsDatasetUsageAll.";
  std::string dataset_dir = datasets_root_path_ + "/testSogouNews/";
  std::vector<std::string> column_names = {"index", "title", "content"};
  std::shared_ptr<Dataset> ds = SogouNews(dataset_dir, "all", 0, ShuffleMode::kFalse);
  // Fatal check: ds is dereferenced by the next statement.
  ASSERT_NE(ds, nullptr);
  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal check: iter is dereferenced below.
  ASSERT_NE(iter, nullptr);
  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("index"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"1", "Jefferson commented on thick eyebrow: he has the top five talents in the league, but he is not the"
     " top five", "They say he has the talent of the top five in the league. The talent of the top five in the"
     " league is one of the most disrespectful statements. I say he has the talent of the top five in the league,"
     " but he is not the top five players because the top five players play every night."},
    {"1", "Make history", "Su Bingtian's 100m breakthrough\\n 9.83"},
    {"3", "Group pictures: Liu Shishi's temperament in early autumn released a large piece of micro curly long"
     " hair, elegant, lazy, gentle and capable", "Liu Shishi's latest group of cover magazine blockbusters are"
     " released. In the photos, Liu Shishi's long hair is slightly curly, or camel colored belted woolen coat,"
     " or plaid suit, which is gentle and elegant and beautiful to a new height."},
    {"4", "Tesla price", "Tesla reduced its price by 70000 yuan"},
    {"3", "Ni Ni deduces elegant retro style in different styles", "Ni Ni's latest group of magazine cover"
     " blockbusters released that wearing gift hats is cool, retro, unique and full of fashion expression."},
    {"1", "Opening ceremony of the 14th National Games", "On the evening of September 15, Beijing time, "
     "the 14th games of the people's Republic of China opened in Xi'an Olympic Sports Center Stadium, "
     "Shaanxi Province. Yang Qian, the first gold medalist of the Chinese delegation in the Tokyo Olympic"
     " Games and a Post-00 shooter, lit the main torch platform. From then on, to September 27, the 14th"
     " National Games flame will burn here for 12 days."}
  };
  uint64_t i = 0;
  while (row.size() != 0) {
    // Fail cleanly instead of indexing past expected_result if the dataset yields extra rows.
    ASSERT_LT(i, expected_result.size());
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  // Expect 6 samples
  EXPECT_EQ(i, 6);
  // Manually terminate the pipeline
  iter->Stop();
}
/// Feature: Test Getters.
/// Description: checks the dataset size and the column names reported for the "test" usage.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestSogouNewsGetters) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSogouNewsGetters.";
  std::string dataset_dir = datasets_root_path_ + "/testSogouNews/";
  std::shared_ptr<Dataset> ds = SogouNews(dataset_dir, "test", 0, ShuffleMode::kFalse);
  std::vector<std::string> column_names = {"index", "title", "content"};
  // Fatal check: ds is dereferenced by the getter calls below.
  ASSERT_NE(ds, nullptr);
  EXPECT_EQ(ds->GetDatasetSize(), 3);
  EXPECT_EQ(ds->GetColumnNames(), column_names);
}
/// Feature: Test SogouNews Dataset(num_samples = 3).
/// Description: test whether the interface meets expectations when NumSamples is equal to 3.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestSogouNewsNumSamples) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSogouNewsNumSamples.";
  std::string dataset_dir = datasets_root_path_ + "/testSogouNews/";
  std::vector<std::string> column_names = {"index", "title", "content"};
  std::shared_ptr<Dataset> ds = SogouNews(dataset_dir, "test", 3, ShuffleMode::kFalse);
  // Fatal check: ds is dereferenced by the next statement.
  ASSERT_NE(ds, nullptr);
  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal check: iter is dereferenced below.
  ASSERT_NE(iter, nullptr);
  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("index"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"1", "Make history", "Su Bingtian's 100m breakthrough\\n 9.83"},
    {"4", "Tesla price", "Tesla reduced its price by 70000 yuan"},
    {"1", "Opening ceremony of the 14th National Games", "On the evening of September 15, Beijing time, "
     "the 14th games of the people's Republic of China opened in Xi'an Olympic Sports Center Stadium, "
     "Shaanxi Province. Yang Qian, the first gold medalist of the Chinese delegation in the Tokyo Olympic"
     " Games and a Post-00 shooter, lit the main torch platform. From then on, to September 27, the 14th"
     " National Games flame will burn here for 12 days."}
  };
  uint64_t i = 0;
  while (row.size() != 0) {
    // Fail cleanly instead of indexing past expected_result if the dataset yields extra rows.
    ASSERT_LT(i, expected_result.size());
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  // Expect 3 samples
  EXPECT_EQ(i, 3);
  // Manually terminate the pipeline
  iter->Stop();
}
/// Feature: Test SogouNewsDataset in distribution.
/// Description: test interface in a distributed state (num_shards = 2, shard_id = 0).
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestSogouNewsDatasetDistribution) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSogouNewsDatasetDistribution.";
  std::string dataset_dir = datasets_root_path_ + "/testSogouNews/";
  std::vector<std::string> column_names = {"index", "title", "content"};
  std::shared_ptr<Dataset> ds = SogouNews(dataset_dir, "test", 0, ShuffleMode::kFalse, 2, 0);
  // Fatal check: ds is dereferenced by the next statement.
  ASSERT_NE(ds, nullptr);
  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal check: iter is dereferenced below.
  ASSERT_NE(iter, nullptr);
  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("index"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"1", "Make history", "Su Bingtian's 100m breakthrough\\n 9.83"},
    {"4", "Tesla price", "Tesla reduced its price by 70000 yuan"},
    {"1", "Opening ceremony of the 14th National Games", "On the evening of September 15, Beijing time, "
     "the 14th games of the people's Republic of China opened in Xi'an Olympic Sports Center Stadium, "
     "Shaanxi Province. Yang Qian, the first gold medalist of the Chinese delegation in the Tokyo Olympic"
     " Games and a Post-00 shooter, lit the main torch platform. From then on, to September 27, the 14th"
     " National Games flame will burn here for 12 days."}
  };
  uint64_t i = 0;
  while (row.size() != 0) {
    // Fail cleanly instead of indexing past expected_result if the dataset yields extra rows.
    ASSERT_LT(i, expected_result.size());
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  // Expect 2 samples
  EXPECT_EQ(i, 2);
  // Manually terminate the pipeline
  iter->Stop();
}
/// Feature: Error Test.
/// Description: test the wrong input.
/// Expectation: unable to read in data.
TEST_F(MindDataTestPipeline, TestSogouNewsDatasetFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSogouNewsDatasetFail.";
  std::string dataset_dir = datasets_root_path_ + "/testSogouNews/";
  std::string invalid_dataset_dir = "./NotExistFile";
  // Each dataset object is checked with ASSERT_NE because it is dereferenced right after the check.

  // Test an empty dataset_dir
  std::shared_ptr<Dataset> ds0 = SogouNews("", "test", 0);
  ASSERT_NE(ds0, nullptr);
  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter0 = ds0->CreateIterator();
  // Expect failure: invalid SogouNews input
  EXPECT_EQ(iter0, nullptr);

  // Test a dataset_dir path that is not expected to exist
  std::shared_ptr<Dataset> ds1 = SogouNews(invalid_dataset_dir);
  ASSERT_NE(ds1, nullptr);
  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter1 = ds1->CreateIterator();
  // Expect failure: invalid SogouNews input
  EXPECT_EQ(iter1, nullptr);

  // Test invalid num_samples < 0
  std::shared_ptr<Dataset> ds2 = SogouNews(dataset_dir, "test", -1, ShuffleMode::kFalse);
  ASSERT_NE(ds2, nullptr);
  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter2 = ds2->CreateIterator();
  // Expect failure: invalid SogouNews input
  EXPECT_EQ(iter2, nullptr);

  // Test invalid num_shards < 1
  std::shared_ptr<Dataset> ds3 = SogouNews(dataset_dir, "test", 0, ShuffleMode::kFalse, 0);
  ASSERT_NE(ds3, nullptr);
  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter3 = ds3->CreateIterator();
  // Expect failure: invalid SogouNews input
  EXPECT_EQ(iter3, nullptr);

  // Test invalid shard_id >= num_shards
  std::shared_ptr<Dataset> ds4 = SogouNews(dataset_dir, "test", 0, ShuffleMode::kFalse, 2, 2);
  ASSERT_NE(ds4, nullptr);
  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter4 = ds4->CreateIterator();
  // Expect failure: invalid SogouNews input
  EXPECT_EQ(iter4, nullptr);
}
/// Feature: Test SogouNews Dataset(ShuffleMode=kFiles).
/// Description: test SogouNews Dataset interface with different ShuffleMode.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestSogouNewsDatasetShuffleFilesA) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSogouNewsDatasetShuffleFilesA.";
  // Set configuration
  uint32_t original_seed = GlobalContext::config_manager()->seed();
  uint32_t original_num_parallel_workers = GlobalContext::config_manager()->num_parallel_workers();
  MS_LOG(DEBUG) << "ORIGINAL seed: " << original_seed << ", num_parallel_workers: " << original_num_parallel_workers;
  // Restore the global configuration on every exit path: a failing ASSERT_* returns from the test body early,
  // which would otherwise leave the modified seed and worker count in place for later tests.
  struct ConfigRestorer {
    uint32_t seed;
    uint32_t num_parallel_workers;
    ~ConfigRestorer() {
      GlobalContext::config_manager()->set_seed(seed);
      GlobalContext::config_manager()->set_num_parallel_workers(num_parallel_workers);
    }
  } config_restorer{original_seed, original_num_parallel_workers};
  GlobalContext::config_manager()->set_seed(130);
  GlobalContext::config_manager()->set_num_parallel_workers(4);
  std::string dataset_dir = datasets_root_path_ + "/testSogouNews/";
  std::vector<std::string> column_names = {"index", "title", "content"};
  std::shared_ptr<Dataset> ds = SogouNews(dataset_dir, "all", 0, ShuffleMode::kFiles);
  // Fatal check: ds is dereferenced by the next statement.
  ASSERT_NE(ds, nullptr);
  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal check: iter is dereferenced below.
  ASSERT_NE(iter, nullptr);
  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("index"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"1", "Make history", "Su Bingtian's 100m breakthrough\\n 9.83"},
    {"1", "Jefferson commented on thick eyebrow: he has the top five talents in the league, but he is not the"
     " top five", "They say he has the talent of the top five in the league. The talent of the top five in the"
     " league is one of the most disrespectful statements. I say he has the talent of the top five in the league,"
     " but he is not the top five players because the top five players play every night."},
    {"4", "Tesla price", "Tesla reduced its price by 70000 yuan"},
    {"3", "Group pictures: Liu Shishi's temperament in early autumn released a large piece of micro curly long"
     " hair, elegant, lazy, gentle and capable", "Liu Shishi's latest group of cover magazine blockbusters are"
     " released. In the photos, Liu Shishi's long hair is slightly curly, or camel colored belted woolen coat,"
     " or plaid suit, which is gentle and elegant and beautiful to a new height."},
    {"1", "Opening ceremony of the 14th National Games", "On the evening of September 15, Beijing time, "
     "the 14th games of the people's Republic of China opened in Xi'an Olympic Sports Center Stadium, "
     "Shaanxi Province. Yang Qian, the first gold medalist of the Chinese delegation in the Tokyo Olympic"
     " Games and a Post-00 shooter, lit the main torch platform. From then on, to September 27, the 14th"
     " National Games flame will burn here for 12 days."},
    {"3", "Ni Ni deduces elegant retro style in different styles", "Ni Ni's latest group of magazine cover"
     " blockbusters released that wearing gift hats is cool, retro, unique and full of fashion expression."}
  };
  uint64_t i = 0;
  while (row.size() != 0) {
    // Fail cleanly instead of indexing past expected_result if the dataset yields extra rows.
    ASSERT_LT(i, expected_result.size());
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  // Expect 6 samples
  EXPECT_EQ(i, 6);
  // Manually terminate the pipeline
  iter->Stop();
  // The configuration is restored by config_restorer when the test body exits.
}
/// Feature: Test SogouNews Dataset(ShuffleMode=kInfile).
/// Description: test SogouNews Dataset interface with different ShuffleMode.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestSogouNewsDatasetShuffleFilesB) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSogouNewsDatasetShuffleFilesB.";
  // Set configuration
  uint32_t original_seed = GlobalContext::config_manager()->seed();
  uint32_t original_num_parallel_workers = GlobalContext::config_manager()->num_parallel_workers();
  MS_LOG(DEBUG) << "ORIGINAL seed: " << original_seed << ", num_parallel_workers: " << original_num_parallel_workers;
  // Restore the global configuration on every exit path: a failing ASSERT_* returns from the test body early,
  // which would otherwise leave the modified seed and worker count in place for later tests.
  struct ConfigRestorer {
    uint32_t seed;
    uint32_t num_parallel_workers;
    ~ConfigRestorer() {
      GlobalContext::config_manager()->set_seed(seed);
      GlobalContext::config_manager()->set_num_parallel_workers(num_parallel_workers);
    }
  } config_restorer{original_seed, original_num_parallel_workers};
  GlobalContext::config_manager()->set_seed(130);
  GlobalContext::config_manager()->set_num_parallel_workers(4);
  std::string dataset_dir = datasets_root_path_ + "/testSogouNews/";
  std::vector<std::string> column_names = {"index", "title", "content"};
  std::shared_ptr<Dataset> ds = SogouNews(dataset_dir, "all", 0, ShuffleMode::kInfile);
  // Fatal check: ds is dereferenced by the next statement.
  ASSERT_NE(ds, nullptr);
  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal check: iter is dereferenced below.
  ASSERT_NE(iter, nullptr);
  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("index"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"1", "Jefferson commented on thick eyebrow: he has the top five talents in the league, but he is not the"
     " top five", "They say he has the talent of the top five in the league. The talent of the top five in the"
     " league is one of the most disrespectful statements. I say he has the talent of the top five in the league,"
     " but he is not the top five players because the top five players play every night."},
    {"1", "Make history", "Su Bingtian's 100m breakthrough\\n 9.83"},
    {"3", "Group pictures: Liu Shishi's temperament in early autumn released a large piece of micro curly long"
     " hair, elegant, lazy, gentle and capable", "Liu Shishi's latest group of cover magazine blockbusters are"
     " released. In the photos, Liu Shishi's long hair is slightly curly, or camel colored belted woolen coat,"
     " or plaid suit, which is gentle and elegant and beautiful to a new height."},
    {"4", "Tesla price", "Tesla reduced its price by 70000 yuan"},
    {"3", "Ni Ni deduces elegant retro style in different styles", "Ni Ni's latest group of magazine cover"
     " blockbusters released that wearing gift hats is cool, retro, unique and full of fashion expression."},
    {"1", "Opening ceremony of the 14th National Games", "On the evening of September 15, Beijing time, "
     "the 14th games of the people's Republic of China opened in Xi'an Olympic Sports Center Stadium, "
     "Shaanxi Province. Yang Qian, the first gold medalist of the Chinese delegation in the Tokyo Olympic"
     " Games and a Post-00 shooter, lit the main torch platform. From then on, to September 27, the 14th"
     " National Games flame will burn here for 12 days."}
  };
  uint64_t i = 0;
  while (row.size() != 0) {
    // Fail cleanly instead of indexing past expected_result if the dataset yields extra rows.
    ASSERT_LT(i, expected_result.size());
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  // Expect 6 samples
  EXPECT_EQ(i, 6);
  // Manually terminate the pipeline
  iter->Stop();
  // The configuration is restored by config_restorer when the test body exits.
}
/// Feature: Test SogouNews Dataset(ShuffleMode=kGlobal).
/// Description: test SogouNews Dataset interface with different ShuffleMode.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestSogouNewsDatasetShuffleFilesGlobal) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSogouNewsDatasetShuffleFilesGlobal.";
  // Set configuration
  uint32_t original_seed = GlobalContext::config_manager()->seed();
  uint32_t original_num_parallel_workers = GlobalContext::config_manager()->num_parallel_workers();
  MS_LOG(DEBUG) << "ORIGINAL seed: " << original_seed << ", num_parallel_workers: " << original_num_parallel_workers;
  // Restore the global configuration on every exit path: a failing ASSERT_* returns from the test body early,
  // which would otherwise leave the modified seed and worker count in place for later tests.
  struct ConfigRestorer {
    uint32_t seed;
    uint32_t num_parallel_workers;
    ~ConfigRestorer() {
      GlobalContext::config_manager()->set_seed(seed);
      GlobalContext::config_manager()->set_num_parallel_workers(num_parallel_workers);
    }
  } config_restorer{original_seed, original_num_parallel_workers};
  GlobalContext::config_manager()->set_seed(130);
  GlobalContext::config_manager()->set_num_parallel_workers(4);
  std::string dataset_dir = datasets_root_path_ + "/testSogouNews/";
  std::vector<std::string> column_names = {"index", "title", "content"};
  std::shared_ptr<Dataset> ds = SogouNews(dataset_dir, "all", 0, ShuffleMode::kGlobal);
  // Fatal check: ds is dereferenced by the next statement.
  ASSERT_NE(ds, nullptr);
  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Fatal check: iter is dereferenced below.
  ASSERT_NE(iter, nullptr);
  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("index"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"1", "Make history", "Su Bingtian's 100m breakthrough\\n 9.83"},
    {"1", "Jefferson commented on thick eyebrow: he has the top five talents in the league, but he is not the"
     " top five", "They say he has the talent of the top five in the league. The talent of the top five in the"
     " league is one of the most disrespectful statements. I say he has the talent of the top five in the league,"
     " but he is not the top five players because the top five players play every night."},
    {"4", "Tesla price", "Tesla reduced its price by 70000 yuan"},
    {"3", "Ni Ni deduces elegant retro style in different styles", "Ni Ni's latest group of magazine cover"
     " blockbusters released that wearing gift hats is cool, retro, unique and full of fashion expression."},
    {"3", "Group pictures: Liu Shishi's temperament in early autumn released a large piece of micro curly long"
     " hair, elegant, lazy, gentle and capable", "Liu Shishi's latest group of cover magazine blockbusters are"
     " released. In the photos, Liu Shishi's long hair is slightly curly, or camel colored belted woolen coat,"
     " or plaid suit, which is gentle and elegant and beautiful to a new height."},
    {"1", "Opening ceremony of the 14th National Games", "On the evening of September 15, Beijing time, "
     "the 14th games of the people's Republic of China opened in Xi'an Olympic Sports Center Stadium, "
     "Shaanxi Province. Yang Qian, the first gold medalist of the Chinese delegation in the Tokyo Olympic"
     " Games and a Post-00 shooter, lit the main torch platform. From then on, to September 27, the 14th"
     " National Games flame will burn here for 12 days."}
  };
  uint64_t i = 0;
  while (row.size() != 0) {
    // Fail cleanly instead of indexing past expected_result if the dataset yields extra rows.
    ASSERT_LT(i, expected_result.size());
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  // Expect 6 samples
  EXPECT_EQ(i, 6);
  // Manually terminate the pipeline
  iter->Stop();
  // The configuration is restored by config_restorer when the test body exits.
}

+ 3
- 0
tests/ut/data/dataset/testSogouNews/test.csv View File

@@ -0,0 +1,3 @@
"1","Make history","Su Bingtian's 100m breakthrough\n 9.83"
"4","Tesla price","Tesla reduced its price by 70000 yuan"
"1","Opening ceremony of the 14th National Games","On the evening of September 15, Beijing time, the 14th games of the people's Republic of China opened in Xi'an Olympic Sports Center Stadium, Shaanxi Province. Yang Qian, the first gold medalist of the Chinese delegation in the Tokyo Olympic Games and a Post-00 shooter, lit the main torch platform. From then on, to September 27, the 14th National Games flame will burn here for 12 days."

+ 3
- 0
tests/ut/data/dataset/testSogouNews/train.csv View File

@@ -0,0 +1,3 @@
"1","Jefferson commented on thick eyebrow: he has the top five talents in the league, but he is not the top five","They say he has the talent of the top five in the league. The talent of the top five in the league is one of the most disrespectful statements. I say he has the talent of the top five in the league, but he is not the top five players because the top five players play every night."
"3","Group pictures: Liu Shishi's temperament in early autumn released a large piece of micro curly long hair, elegant, lazy, gentle and capable","Liu Shishi's latest group of cover magazine blockbusters are released. In the photos, Liu Shishi's long hair is slightly curly, or camel colored belted woolen coat, or plaid suit, which is gentle and elegant and beautiful to a new height."
"3","Ni Ni deduces elegant retro style in different styles","Ni Ni's latest group of magazine cover blockbusters released that wearing gift hats is cool, retro, unique and full of fashion expression."

+ 185
- 0
tests/ut/python/dataset/test_datasets_sogou_news.py View File

@@ -0,0 +1,185 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import mindspore.dataset as ds
DATA_SOGOU_NEWS_DIR = '../data/dataset/testSogouNews/'
def test_sogou_news_dataset_basic():
    """
    Feature: Test SogouNews Dataset.
    Description: read the 'test' usage without shuffling, repeat it twice and skip the first two rows.
    Expectation: four rows are returned.
    """
    pipeline = ds.SogouNewsDataset(DATA_SOGOU_NEWS_DIR, usage='test', shuffle=False).repeat(2).skip(2)
    rows = list(pipeline.create_dict_iterator(num_epochs=1, output_numpy=True))
    assert len(rows) == 4
def test_sogou_news_dataset_all():
    """
    Feature: Test SogouNews Dataset.
    Description: read the 'all' usage without shuffling and flatten every row into index, title, content.
    Expectation: the decoded values match the expected list, in order.
    """
    expected = [
        "1", "Jefferson commented on thick eyebrow: he has the top five talents in the league, but he "
        "is not the top five", "They say he has the talent of the top five in the league. The talent "
        "of the top five in the league is one of the most disrespectful statements. I say he has the "
        "talent of the top five in the league, but he is not the top five players because the top five "
        "players play every night.",
        "1", "Make history", "Su Bingtian's 100m breakthrough\\n 9.83",
        "3", "Group pictures: Liu Shishi's temperament in early autumn released a large piece of micro "
        "curly long hair, elegant, lazy, gentle and capable", "Liu Shishi's latest group of cover "
        "magazine blockbusters are released. In the photos, Liu Shishi's long hair is slightly curly, "
        "or camel colored belted woolen coat, or plaid suit, which is gentle and elegant and beautiful "
        "to a new height.",
        "4", "Tesla price", "Tesla reduced its price by 70000 yuan",
        "3", "Ni Ni deduces elegant retro style in different styles", "Ni Ni's latest group of magazine "
        "cover blockbusters released that wearing gift hats is cool, retro, unique and full of fashion "
        "expression.",
        "1", "Opening ceremony of the 14th National Games", "On the evening of September 15, Beijing "
        "time, the 14th games of the people's Republic of China opened in Xi'an Olympic Sports Center "
        "Stadium, Shaanxi Province. Yang Qian, the first gold medalist of the Chinese delegation in "
        "the Tokyo Olympic Games and a Post-00 shooter, lit the main torch platform. From then on, "
        "to September 27, the 14th National Games flame will burn here for 12 days."]
    dataset = ds.SogouNewsDataset(DATA_SOGOU_NEWS_DIR, usage='all', shuffle=False)
    actual = []
    for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        # Each column holds a single bytes item; decode it to str for comparison.
        for column in ('index', 'title', 'content'):
            actual.append(row[column].item().decode("utf8"))
    assert actual == expected
def test_sogou_news_dataset_quoted():
    """
    Feature: Test get the SogouNews Dataset.
    Description: read the 'test' usage without shuffling and flatten every row into index, title, content.
    Expectation: the decoded values match the expected list, in order.
    """
    expected = [
        "1", "Make history", "Su Bingtian's 100m breakthrough\\n 9.83",
        "4", "Tesla price", "Tesla reduced its price by 70000 yuan",
        "1", "Opening ceremony of the 14th National Games", "On the evening of September 15, Beijing time"
        ", the 14th games of the people's Republic of China opened in Xi'an Olympic Sports Center "
        "Stadium, Shaanxi Province. Yang Qian, the first gold medalist of the Chinese delegation in the"
        " Tokyo Olympic Games and a Post-00 shooter, lit the main torch platform. From then on, to "
        "September 27, the 14th National Games flame will burn here for 12 days."]
    dataset = ds.SogouNewsDataset(DATA_SOGOU_NEWS_DIR, usage='test', shuffle=False)
    actual = []
    for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        # Each column holds a single bytes item; decode it to str for comparison.
        for column in ('index', 'title', 'content'):
            actual.append(row[column].item().decode("utf8"))
    assert actual == expected
def test_sogou_news_dataset_usage_all():
    """
    Feature: SogouNewsDataset with usage='all'.
    Description: Iterate the dataset built with usage='all' and collect every row.
    Expectation: Exactly six rows are produced.
    """
    dataset = ds.SogouNewsDataset(DATA_SOGOU_NEWS_DIR, usage='all', shuffle=False)
    rows = [row for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True)]
    assert len(rows) == 6
def test_sogou_news_dataset_get_datasetsize():
    """
    Feature: Dataset getters.
    Description: Call get_dataset_size on the 'test' split of SogouNewsDataset.
    Expectation: The reported size is 3.
    """
    dataset = ds.SogouNewsDataset(DATA_SOGOU_NEWS_DIR, usage='test', shuffle=False)
    assert dataset.get_dataset_size() == 3
def test_sogou_news_dataset_distribution():
    """
    Feature: SogouNewsDataset with sharding.
    Description: Read the 'test' split as shard 0 of 2 and count the rows the iterator yields.
    Expectation: Two rows are produced for this shard.
    """
    shard = ds.SogouNewsDataset(DATA_SOGOU_NEWS_DIR, usage='test', shuffle=False, num_shards=2, shard_id=0)
    yielded = sum(1 for _ in shard.create_dict_iterator(num_epochs=1, output_numpy=True))
    assert yielded == 2
def test_sogou_news_dataset_num_samples():
    """
    Feature: SogouNewsDataset with num_samples=2.
    Description: Read the 'test' split with num_samples=2 and count the rows the iterator yields.
    Expectation: Two rows are produced.
    """
    limited = ds.SogouNewsDataset(DATA_SOGOU_NEWS_DIR, usage='test', shuffle=False, num_samples=2)
    seen = 0
    # enumerate from 1 so the last value is the running row total.
    for seen, _ in enumerate(limited.create_dict_iterator(num_epochs=1, output_numpy=True), start=1):
        pass
    assert seen == 2
def test_sogou_news_dataset_exception():
    """
    Feature: Error handling in a map operation on SogouNewsDataset.
    Description: Map a Python function that always raises onto the index, title and content
        columns in turn, then iterate the resulting pipeline.
    Expectation: Each iteration raises RuntimeError whose message contains the PyFunc failure text.
    """
    def exception_func(item):
        raise Exception("Error occur!")

    expected_fragment = "map operation: [PyFunc] failed. The corresponding data files"
    for column in ("index", "title", "content"):
        try:
            pipeline = ds.SogouNewsDataset(DATA_SOGOU_NEWS_DIR, usage='test', shuffle=False)
            pipeline = pipeline.map(operations=exception_func, input_columns=[column], num_parallel_workers=1)
            for _ in pipeline.create_dict_iterator():
                pass
            # Reaching this line means the failing map did not raise; AssertionError is not
            # caught by the RuntimeError handler below, so the test fails.
            assert False
        except RuntimeError as err:
            assert expected_fragment in str(err)
if __name__ == "__main__":
    # Run every test once, in the listed order, when the file is executed as a script.
    for run_test in (
            test_sogou_news_dataset_basic,
            test_sogou_news_dataset_all,
            test_sogou_news_dataset_quoted,
            test_sogou_news_dataset_usage_all,
            test_sogou_news_dataset_get_datasetsize,
            test_sogou_news_dataset_distribution,
            test_sogou_news_dataset_num_samples,
            test_sogou_news_dataset_exception,
    ):
        run_test()

Loading…
Cancel
Save