Browse Source

[feat][assistant][I3T96N]add new loader YelpReview

tags/v1.6.0
lufei9026 4 years ago
parent
commit
2947f418b1
19 changed files with 1430 additions and 10 deletions
  1. +9
    -0
      mindspore/ccsrc/minddata/dataset/api/datasets.cc
  2. +13
    -0
      mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/source/bindings.cc
  3. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/CMakeLists.txt
  4. +56
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/yelp_review_op.cc
  5. +71
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/source/yelp_review_op.h
  6. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h
  7. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/CMakeLists.txt
  8. +187
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/yelp_review_node.cc
  9. +136
    -0
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/yelp_review_node.h
  10. +51
    -0
      mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h
  11. +130
    -10
      mindspore/dataset/engine/datasets.py
  12. +28
    -0
      mindspore/dataset/engine/validators.py
  13. +1
    -0
      tests/ut/cpp/dataset/CMakeLists.txt
  14. +583
    -0
      tests/ut/cpp/dataset/c_api_dataset_yelp_review_test.cc
  15. +2
    -0
      tests/ut/data/dataset/testYelpReview/full/test.csv
  16. +3
    -0
      tests/ut/data/dataset/testYelpReview/full/train.csv
  17. +2
    -0
      tests/ut/data/dataset/testYelpReview/polarity/test.csv
  18. +3
    -0
      tests/ut/data/dataset/testYelpReview/polarity/train.csv
  19. +152
    -0
      tests/ut/python/dataset/test_datasets_yelp_review.py

+ 9
- 0
mindspore/ccsrc/minddata/dataset/api/datasets.cc View File

@@ -119,6 +119,7 @@
#include "minddata/dataset/engine/ir/datasetops/source/tf_record_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/usps_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/voc_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/yelp_review_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/yes_no_node.h"
#endif

@@ -1642,6 +1643,14 @@ TFRecordDataset::TFRecordDataset(const std::vector<std::vector<char>> &dataset_f
ir_node_ = std::static_pointer_cast<DatasetNode>(ds);
}

// Constructor of YelpReviewDataset: builds the YelpReview IR node and stores it
// as this dataset's root so that later tree construction can consume it.
YelpReviewDataset::YelpReviewDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
                                     int64_t num_samples, ShuffleMode shuffle, int32_t num_shards, int32_t shard_id,
                                     const std::shared_ptr<DatasetCache> &cache) {
  auto yelp_review_node = std::make_shared<YelpReviewNode>(CharToString(dataset_dir), CharToString(usage), num_samples,
                                                           shuffle, num_shards, shard_id, cache);
  ir_node_ = std::static_pointer_cast<DatasetNode>(yelp_review_node);
}

YesNoDataset::YesNoDataset(const std::vector<char> &dataset_dir, const std::shared_ptr<Sampler> &sampler,
const std::shared_ptr<DatasetCache> &cache) {
auto sampler_obj = sampler ? sampler->Parse() : nullptr;


+ 13
- 0
mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/source/bindings.cc View File

@@ -48,6 +48,7 @@
#include "minddata/dataset/engine/ir/datasetops/source/stl10_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/tedlium_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/yelp_review_node.h"
#include "minddata/dataset/engine/ir/datasetops/source/yes_no_node.h"

// IR leaf nodes disabled for android
@@ -496,6 +497,18 @@ PYBIND_REGISTER(VOCNode, 2, ([](const py::module *m) {
}));
}));

// Python binding for YelpReviewNode: exposes the IR node class so the Python-side
// YelpReviewDataset can construct it directly.
PYBIND_REGISTER(YelpReviewNode, 2, ([](const py::module *m) {
                  (void)py::class_<YelpReviewNode, DatasetNode, std::shared_ptr<YelpReviewNode>>(
                    *m, "YelpReviewNode", "to create a YelpReviewNode")
                    .def(py::init([](std::string dataset_dir, std::string usage, int64_t num_samples, int32_t shuffle,
                                     int32_t num_shards, int32_t shard_id) {
                      // Cache is always nullptr here; Python attaches a cache through a separate IR pass.
                      std::shared_ptr<YelpReviewNode> yelp_review = std::make_shared<YelpReviewNode>(
                        dataset_dir, usage, num_samples, toShuffleMode(shuffle), num_shards, shard_id, nullptr);
                      // Validate eagerly so bad arguments raise at construction time on the Python side.
                      THROW_IF_ERROR(yelp_review->ValidateParams());
                      return yelp_review;
                    }));
                }));

PYBIND_REGISTER(YesNoNode, 2, ([](const py::module *m) {
(void)py::class_<YesNoNode, DatasetNode, std::shared_ptr<YesNoNode>>(*m, "YesNoNode",
"to create a YesNoNode")


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/CMakeLists.txt View File

@@ -34,6 +34,7 @@ set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES
tedlium_op.cc
text_file_op.cc
usps_op.cc
yelp_review_op.cc
yes_no_op.cc
)



+ 56
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/yelp_review_op.cc View File

@@ -0,0 +1,56 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "minddata/dataset/engine/datasetops/source/yelp_review_op.h"

#include <fstream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "debug/common.h"

namespace mindspore {
namespace dataset {
// YelpReview files are plain comma-separated CSV with a fixed ["label", "text"]
// layout, so the constructor simply forwards everything to the CsvOp base class,
// which performs all of the actual reading and parsing.
YelpReviewOp::YelpReviewOp(int32_t num_workers, int64_t num_samples, int32_t worker_connector_size,
                           int32_t op_connector_size, bool shuffle_files, int32_t num_devices, int32_t device_id,
                           char field_delim, const std::vector<std::shared_ptr<BaseRecord>> &column_default,
                           const std::vector<std::string> &column_name,
                           const std::vector<std::string> &yelp_review_files_list)
    : CsvOp(yelp_review_files_list, field_delim, column_default, column_name, num_workers, num_samples,
            worker_connector_size, op_connector_size, shuffle_files, num_devices, device_id) {}

// A print method typically used for debugging. When show_all is false only the
// common one-liner summary is emitted; otherwise full op state plus the file
// list is dumped.
void YelpReviewOp::Print(std::ostream &out, bool show_all) const {
  if (!show_all) {
    // Call the super class for displaying any common 1-liner info.
    ParallelOp::Print(out, show_all);
    // Then show any custom derived-internal 1-liner info for this op.
    out << "\n";
  } else {
    // Call the super class for displaying any common detailed info.
    ParallelOp::Print(out, show_all);
    // Then show any custom derived-internal stuff.
    out << "\nSample count: " << total_rows_ << "\nDevice id: " << device_id_ << "\nNumber of devices: " << num_devices_
        << "\nShuffle files: " << ((shuffle_files_) ? "yes" : "no") << "\nYelpReview files list:\n";
    // Range-for avoids the signed/unsigned comparison the indexed `int` loop had
    // (csv_files_list_.size() is unsigned).
    for (const auto &file : csv_files_list_) {
      out << " " << file;
    }
    out << "\n\n";
  }
}
} // namespace dataset
} // namespace mindspore

+ 71
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/source/yelp_review_op.h View File

@@ -0,0 +1,71 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_YELP_REVIEW_OP_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_YELP_REVIEW_OP_H_

#include <memory>
#include <string>
#include <vector>

#include "minddata/dataset/engine/datasetops/source/csv_op.h"

namespace mindspore {
namespace dataset {
class JaggedConnector;

/// \class YelpReviewOp
/// \brief An operator derived from CsvOp to read the YelpReview dataset; all CSV
///     parsing is inherited, only naming/printing is specialized here.
class YelpReviewOp : public CsvOp {
 public:
  /// \brief Constructor of YelpReviewOp.
  /// \param[in] num_workers Number of worker threads reading data from yelp_review files.
  /// \param[in] num_samples The number of samples to be included in the dataset.
  /// \param[in] worker_connector_size Size of each internal queue.
  /// \param[in] op_connector_size Size of each queue in the connector that the child operator pulls from.
  /// \param[in] shuffle_files Whether or not to shuffle the files before reading data.
  /// \param[in] num_devices Number of devices that the dataset should be divided into.
  /// \param[in] device_id The device ID within num_devices.
  /// \param[in] field_delim A char that indicates the delimiter to separate fields.
  /// \param[in] column_default List of default values for the CSV field (default={}). Each item in the list is
  ///     either a valid type (float, int, or string).
  /// \param[in] column_name List of column names of the dataset.
  /// \param[in] yelp_review_files_list List of file paths for the dataset files.
  YelpReviewOp(int32_t num_workers, int64_t num_samples, int32_t worker_connector_size, int32_t op_connector_size,
               bool shuffle_files, int32_t num_devices, int32_t device_id, char field_delim,
               const std::vector<std::shared_ptr<BaseRecord>> &column_default,
               const std::vector<std::string> &column_name, const std::vector<std::string> &yelp_review_files_list);

  /// \brief Destructor.
  ~YelpReviewOp() = default;

  /// \brief A print method typically used for debugging.
  /// \param[out] out The output stream to write output to.
  /// \param[in] show_all A bool to control if you want to show all info or just a summary.
  void Print(std::ostream &out, bool show_all) const override;

  /// \brief DatasetName name getter.
  /// \param[in] upper A bool to control if you want to return uppercase or lowercase Op name.
  /// \return DatasetName of the current Op.
  std::string DatasetName(bool upper = false) const { return upper ? "YelpReview" : "yelp review"; }

  /// \brief Op name getter.
  /// \return Name of the current Op.
  std::string Name() const override { return "YelpReviewOp"; }
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_YELP_REVIEW_OP_H_

+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h View File

@@ -110,6 +110,7 @@ constexpr char kTextFileNode[] = "TextFileDataset";
constexpr char kTFRecordNode[] = "TFRecordDataset";
constexpr char kUSPSNode[] = "USPSDataset";
constexpr char kVOCNode[] = "VOCDataset";
constexpr char kYelpReviewNode[] = "YelpReviewDataset";
constexpr char kYesNoNode[] = "YesNoDataset";

Status AddShuffleOp(int64_t num_files, int64_t num_devices, int64_t num_rows, int64_t total_rows,


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/CMakeLists.txt View File

@@ -36,6 +36,7 @@ set(DATASET_ENGINE_IR_DATASETOPS_SOURCE_SRC_FILES
tf_record_node.cc
usps_node.cc
voc_node.cc
yelp_review_node.cc
yes_no_node.cc
)



+ 187
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/yelp_review_node.cc View File

@@ -0,0 +1,187 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "minddata/dataset/engine/ir/datasetops/source/yelp_review_node.h"

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

#include "minddata/dataset/util/path.h"
#include "minddata/dataset/util/status.h"

namespace mindspore {
namespace dataset {
// Constructor: stores all user parameters and eagerly expands the list of files
// to read for the requested usage.
YelpReviewNode::YelpReviewNode(const std::string &dataset_dir, const std::string &usage, int64_t num_samples,
                               ShuffleMode shuffle, int32_t num_shards, int32_t shard_id,
                               const std::shared_ptr<DatasetCache> &cache)
    : NonMappableSourceNode(std::move(cache)),
      // Initializer list follows member declaration order to avoid -Wreorder warnings.
      // WalkAllFiles reads only the constructor parameters, so the ordering is behavior-neutral.
      dataset_dir_(dataset_dir),
      usage_(usage),
      num_samples_(num_samples),
      shuffle_(shuffle),
      num_shards_(num_shards),
      shard_id_(shard_id),
      yelp_review_files_list_(WalkAllFiles(usage, dataset_dir)) {
  // Update the num_shards_ in global context. This number is only used for now by auto_num_worker_pass.
  // User discretion is advised. Auto_num_worker_pass is currently an experimental feature which can still work
  // if the num_shards_ isn't 100% correct. The reason behind is for now, PreBuildSampler doesn't offer a way to
  // return num_shards. Once PreBuildSampler is phased out, this can be cleaned up.
  GlobalContext::config_manager()->set_num_shards_for_auto_num_workers(num_shards_);
}

// Create a deep copy of this node carrying all of its configuration.
std::shared_ptr<DatasetNode> YelpReviewNode::Copy() {
  return std::make_shared<YelpReviewNode>(dataset_dir_, usage_, num_samples_, shuffle_, num_shards_, shard_id_, cache_);
}

// Print a one-line description of this node (name, cache flag, sharding info).
void YelpReviewNode::Print(std::ostream &out) const {
  out << Name() << "(cache: " << (cache_ != nullptr ? "true" : "false") << ", num_shards: " << std::to_string(num_shards_)
      << ", shard_id: " << std::to_string(shard_id_) << ")";
}

// Validate all user-supplied parameters; the first failing check determines the
// error returned, so the check order below is part of the observable behavior.
Status YelpReviewNode::ValidateParams() {
  RETURN_IF_NOT_OK(DatasetNode::ValidateParams());
  // dataset_dir_ must be an existing, readable directory.
  RETURN_IF_NOT_OK(ValidateDatasetDirParam("YelpReviewNode", dataset_dir_));
  // Only these three usages are supported; WalkAllFiles relies on this.
  RETURN_IF_NOT_OK(ValidateStringValue("YelpReviewNode", usage_, {"train", "test", "all"}));
  // 0 means "all samples"; negative counts are rejected.
  if (num_samples_ < 0) {
    std::string err_msg = "YelpReviewNode: Invalid number of samples: " + std::to_string(num_samples_);
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  RETURN_IF_NOT_OK(ValidateDatasetShardParams("YelpReviewNode", num_shards_, shard_id_));
  return Status::OK();
}

// Build the runtime ops for this node: a YelpReviewOp, plus a ShuffleOp on top
// when a global shuffle was requested.
Status YelpReviewNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) {
  bool shuffle_files = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles);

  // Sort the dataset files in a lexicographical order so every shard/worker sees
  // the files in the same deterministic order.
  std::vector<std::string> sorted_dataset_files = yelp_review_files_list_;
  std::sort(sorted_dataset_files.begin(), sorted_dataset_files.end());

  // Both columns ("label" and "text") are strings defaulting to "" for missing fields.
  std::vector<std::shared_ptr<YelpReviewOp::BaseRecord>> column_default;
  column_default.push_back(std::make_shared<YelpReviewOp::Record<std::string>>(YelpReviewOp::STRING, ""));
  column_default.push_back(std::make_shared<YelpReviewOp::Record<std::string>>(YelpReviewOp::STRING, ""));

  std::vector<std::string> column_name = {"label", "text"};
  char field_delim = ',';
  // Construct the op from the SORTED list so the order actually read matches the
  // deterministic order computed above (previously the unsorted list was passed,
  // leaving the sort effectively unused). This matches the other text-based
  // source nodes.
  std::shared_ptr<YelpReviewOp> yelp_review_op = std::make_shared<YelpReviewOp>(
    num_workers_, num_samples_, worker_connector_size_, connector_que_size_, shuffle_files, num_shards_, shard_id_,
    field_delim, column_default, column_name, sorted_dataset_files);
  RETURN_IF_NOT_OK(yelp_review_op->Init());

  // If a global shuffle is used for YelpReview, it will inject a shuffle op over the YelpReview.
  // But, if there is a cache in the tree, we do not need the global shuffle and the shuffle op should not be
  // built. This is achieved in the cache transform pass where we call MakeSimpleProducer to reset YelpReview's
  // shuffle option to false.
  if (shuffle_ == ShuffleMode::kGlobal) {
    // Inject ShuffleOp.
    std::shared_ptr<DatasetOp> shuffle_op = nullptr;
    int64_t num_rows = 0;

    // First, get the number of rows in the dataset.
    RETURN_IF_NOT_OK(YelpReviewOp::CountAllFileRows(sorted_dataset_files, false, &num_rows));
    // Add the shuffle op after this op.
    RETURN_IF_NOT_OK(
      AddShuffleOp(sorted_dataset_files.size(), num_shards_, num_rows, 0, connector_que_size_, &shuffle_op));
    shuffle_op->SetTotalRepeats(GetTotalRepeats());
    shuffle_op->SetNumRepeatsPerEpoch(GetNumRepeatsPerEpoch());
    node_ops->push_back(shuffle_op);
  }
  yelp_review_op->SetTotalRepeats(GetTotalRepeats());
  yelp_review_op->SetNumRepeatsPerEpoch(GetNumRepeatsPerEpoch());
  node_ops->push_back(yelp_review_op);
  return Status::OK();
}

// Return the shard id configured for this node.
Status YelpReviewNode::GetShardId(int32_t *shard_id) {
  *shard_id = shard_id_;
  return Status::OK();
}

// Compute (and cache) the dataset size: total rows, divided evenly (ceiling)
// across shards, then capped by num_samples_ when one was requested.
Status YelpReviewNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                                      int64_t *dataset_size) {
  // Serve the memoized value if a previous call already computed it.
  if (dataset_size_ > 0) {
    *dataset_size = dataset_size_;
    return Status::OK();
  }

  int64_t total_rows = 0;
  RETURN_IF_NOT_OK(YelpReviewOp::CountAllFileRows(yelp_review_files_list_, false, &total_rows));
  // Each shard receives the ceiling share of the rows.
  total_rows = static_cast<int64_t>(ceil(total_rows / (1.0 * num_shards_)));
  const int64_t requested = num_samples_;
  *dataset_size = (requested > 0) ? std::min(total_rows, requested) : total_rows;
  dataset_size_ = *dataset_size;
  return Status::OK();
}

// Serialize this node's configuration to JSON (used for pipeline save/restore).
Status YelpReviewNode::to_json(nlohmann::json *out_json) {
  nlohmann::json args;
  args["num_parallel_workers"] = num_workers_;
  args["dataset_dir"] = dataset_dir_;
  args["usage"] = usage_;
  args["num_samples"] = num_samples_;
  args["shuffle"] = shuffle_;
  args["num_shards"] = num_shards_;
  args["shard_id"] = shard_id_;
  // The cache is only serialized when one is attached.
  if (cache_ != nullptr) {
    nlohmann::json cache_args;
    RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
    args["cache"] = cache_args;
  }
  *out_json = args;
  return Status::OK();
}

// Provide a sampler equivalent to this node's shard/shuffle settings so that an
// ancestor cache op can sample on behalf of this non-mappable source.
Status YelpReviewNode::SetupSamplerForCache(std::shared_ptr<SamplerObj> *sampler) {
  const bool files_shuffled = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles);
  *sampler = SelectSampler(num_samples_, files_shuffled, num_shards_, shard_id_);
  return Status::OK();
}

// Reset shard/shuffle/sample options to their defaults so this node produces the
// full, unsharded, unshuffled dataset (the ancestor cache handles sampling).
Status YelpReviewNode::MakeSimpleProducer() {
  num_shards_ = 1;
  shard_id_ = 0;
  num_samples_ = 0;
  shuffle_ = ShuffleMode::kFalse;
  return Status::OK();
}

std::vector<std::string> YelpReviewNode::WalkAllFiles(const std::string &usage, const std::string &dataset_dir) {
std::vector<std::string> yelp_review_files_list;
Path train_prefix("train.csv");
Path test_prefix("test.csv");
Path dir(dataset_dir);

if (usage == "train") {
Path temp_path = dir / train_prefix;
yelp_review_files_list.push_back(temp_path.ToString());
} else if (usage == "test") {
Path temp_path = dir / test_prefix;
yelp_review_files_list.push_back(temp_path.ToString());
} else {
Path temp_path = dir / train_prefix;
yelp_review_files_list.push_back(temp_path.ToString());
Path temp_path1 = dir / test_prefix;
yelp_review_files_list.push_back(temp_path1.ToString());
}
return yelp_review_files_list;
}
} // namespace dataset
} // namespace mindspore

+ 136
- 0
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/yelp_review_node.h View File

@@ -0,0 +1,136 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_YELP_REVIEW_NODE_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_YELP_REVIEW_NODE_H_

#include <memory>
#include <string>
#include <vector>

#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
#include "minddata/dataset/engine/datasetops/source/yelp_review_op.h"

namespace mindspore {
namespace dataset {
/// \class YelpReviewNode
/// \brief A Node derived class to represent YelpReview Node.
class YelpReviewNode : public NonMappableSourceNode {
 public:
  /// \brief Constructor of YelpReviewNode.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] usage Part of dataset of YelpReview, can be "train", "test" or "all" data.
  /// \param[in] num_samples The number of samples to be included in the dataset.
  /// \param[in] shuffle The mode for shuffling data every epoch.
  ///     Can be any of:
  ///     ShuffleMode::kFalse - No shuffling is performed.
  ///     ShuffleMode::kFiles - Shuffle files only.
  ///     ShuffleMode::kGlobal - Shuffle both the files and samples.
  /// \param[in] num_shards Number of shards that the dataset should be divided into.
  /// \param[in] shard_id The shard ID within num_shards. This argument should be
  ///     specified only when num_shards is also specified.
  /// \param[in] cache Tensor cache to use.
  YelpReviewNode(const std::string &dataset_dir, const std::string &usage, int64_t num_samples, ShuffleMode shuffle,
                 int32_t num_shards, int32_t shard_id, const std::shared_ptr<DatasetCache> &cache);

  /// \brief Destructor.
  ~YelpReviewNode() = default;

  /// \brief Node name getter.
  /// \return Name of the current node.
  std::string Name() const override { return kYelpReviewNode; }

  /// \brief Print the description.
  /// \param[out] out The output stream to write output to.
  void Print(std::ostream &out) const override;

  /// \brief Copy the node to a new object.
  /// \return A shared pointer to the new copy.
  std::shared_ptr<DatasetNode> Copy() override;

  /// \brief A base class override function to create the required runtime dataset op objects for this class.
  /// \param node_ops A vector containing shared pointer to the Dataset Ops that this object will create.
  /// \return Status Status::OK() if build successfully.
  Status Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) override;

  /// \brief Parameters validation.
  /// \return Status Status::OK() if all the parameters are valid.
  Status ValidateParams() override;

  /// \brief Get the shard id of node.
  /// \param[in] shard_id The shard id.
  /// \return Status Status::OK() if get shard id successfully.
  Status GetShardId(int32_t *shard_id) override;

  /// \brief Base-class override for GetDatasetSize.
  /// \param[in] size_getter Shared pointer to DatasetSizeGetter.
  /// \param[in] estimate This is only supported by some of the ops and it's used to speed up the process of getting
  ///     dataset size at the expense of accuracy.
  /// \param[out] dataset_size The size of the dataset.
  /// \return Status of the function.
  Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                        int64_t *dataset_size) override;

  /// \brief Getter functions.
  const std::string &DatasetDir() const { return dataset_dir_; }
  const std::string &Usage() const { return usage_; }
  int64_t NumSamples() const { return num_samples_; }
  ShuffleMode Shuffle() const { return shuffle_; }
  int32_t NumShards() const { return num_shards_; }
  int32_t ShardId() const { return shard_id_; }

  /// \brief Get the arguments of node.
  /// \param[out] out_json JSON string of all attributes.
  /// \return Status of the function.
  Status to_json(nlohmann::json *out_json) override;

  /// \brief YelpReview by itself is a non-mappable dataset that does not support sampling.
  ///     However, if a cache operator is injected at some other place higher in the tree, that cache can
  ///     inherit this sampler from the leaf, providing sampling support from the caching layer.
  ///     That is why we setup the sampler for a leaf node that does not use sampling.
  ///     Note: This function is common among NonMappableSourceNode and should be promoted to its parent class.
  /// \param[in] sampler The sampler to setup.
  /// \return Status of the function.
  Status SetupSamplerForCache(std::shared_ptr<SamplerObj> *sampler) override;

  /// \brief If a cache has been added into the ascendant tree over this YelpReview node, then the cache will be
  ///     executing a sampler for fetching the data. As such, any options in the YelpReview node need to be reset
  ///     to their defaults so that this YelpReview node will produce the full set of data into the cache.
  ///     Note: This function is common among NonMappableSourceNode and should be promoted to its parent class.
  /// \return Status of the function.
  Status MakeSimpleProducer() override;

  /// \brief Generate a list of read file names according to usage.
  /// \param[in] usage Part of dataset of YelpReview.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \return std::vector<std::string> A list of read file names.
  std::vector<std::string> WalkAllFiles(const std::string &usage, const std::string &dataset_dir);

 private:
  // NOTE: the original declaration carried unused members (field_delim_, column_defaults_,
  // column_names_); they were never initialized or read, so they have been removed.
  std::string dataset_dir_;
  std::string usage_;
  int64_t num_samples_;
  ShuffleMode shuffle_;
  int32_t num_shards_;
  int32_t shard_id_;
  std::vector<std::string> yelp_review_files_list_;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_YELP_REVIEW_NODE_H_

+ 51
- 0
mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h View File

@@ -4261,6 +4261,57 @@ inline std::shared_ptr<VOCDataset> MS_API VOC(const std::string &dataset_dir, co
MapStringToChar(class_indexing), decode, sampler, cache, extra_metadata);
}

/// \class YelpReviewDataset
/// \brief A source dataset for reading and parsing Yelp Review dataset.
class MS_API YelpReviewDataset : public Dataset {
 public:
  /// \brief Constructor of YelpReviewDataset.
  /// \param[in] dataset_dir Path to the root directory that contains the dataset.
  /// \param[in] usage Part of dataset of YelpReview, can be "train", "test" or "all".
  /// \param[in] num_samples The number of samples to be included in the dataset.
  /// \param[in] shuffle The mode for shuffling data every epoch.
  ///     Can be any of:
  ///     ShuffleMode::kFalse - No shuffling is performed.
  ///     ShuffleMode::kFiles - Shuffle files only.
  ///     ShuffleMode::kGlobal - Shuffle both the files and samples.
  /// \param[in] num_shards Number of shards that the dataset should be divided into.
  /// \param[in] shard_id The shard ID within num_shards. This argument should be
  ///     specified only when num_shards is also specified.
  /// \param[in] cache Tensor cache to use.
  YelpReviewDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage, int64_t num_samples,
                    ShuffleMode shuffle, int32_t num_shards, int32_t shard_id,
                    const std::shared_ptr<DatasetCache> &cache);

  /// Destructor of YelpReviewDataset.
  ~YelpReviewDataset() = default;
};

/// \brief Function to create a YelpReviewDataset.
/// \note This dataset includes polarity and full, which can be read according to your own needs.
/// \note The generated dataset has two columns ["label", "text"]. Their types are all string.
/// \param[in] dataset_dir Path to the root directory that contains the dataset.
/// \param[in] usage Part of dataset of YelpReview, can be "train", "test" or "all" (default="all").
/// \param[in] num_samples The number of samples to be included in the dataset
///     (Default = 0, means all samples).
/// \param[in] shuffle The mode for shuffling data every epoch (Default=ShuffleMode.kGlobal).
///     Can be any of:
///     ShuffleMode::kFalse - No shuffling is performed.
///     ShuffleMode::kFiles - Shuffle files only.
///     ShuffleMode::kGlobal - Shuffle both the files and samples.
/// \param[in] num_shards Number of shards that the dataset should be divided into (Default = 1).
/// \param[in] shard_id The shard ID within num_shards. This argument should be
///     specified only when num_shards is also specified (Default = 0).
/// \param[in] cache Tensor cache to use (default=nullptr, which means no cache is used).
/// \return Shared pointer to the YelpReviewDataset.
inline std::shared_ptr<YelpReviewDataset> MS_API YelpReview(const std::string &dataset_dir,
                                                            const std::string &usage = "all", int64_t num_samples = 0,
                                                            ShuffleMode shuffle = ShuffleMode::kGlobal,
                                                            int32_t num_shards = 1, int32_t shard_id = 0,
                                                            const std::shared_ptr<DatasetCache> &cache = nullptr) {
  // Convert std::string arguments to the char-vector form used across the ABI boundary.
  auto dir_chars = StringToChar(dataset_dir);
  auto usage_chars = StringToChar(usage);
  return std::make_shared<YelpReviewDataset>(dir_chars, usage_chars, num_samples, shuffle, num_shards, shard_id,
                                             cache);
}

/// \class YesNoDataset.
/// \brief A source dataset for reading and parsing YesNo dataset.
class MS_API YesNoDataset : public Dataset {


+ 130
- 10
mindspore/dataset/engine/datasets.py View File

@@ -71,7 +71,7 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che
check_sbu_dataset, check_qmnist_dataset, check_emnist_dataset, check_fake_image_dataset, check_places365_dataset, \
check_photo_tour_dataset, check_ag_news_dataset, check_dbpedia_dataset, check_lj_speech_dataset, \
check_yes_no_dataset, check_speech_commands_dataset, check_tedlium_dataset, check_svhn_dataset, \
check_stl10_dataset
check_stl10_dataset, check_yelp_review_dataset
from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \
get_prefetch_size, get_auto_offload
from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist
@@ -8637,6 +8637,129 @@ class DIV2KDataset(MappableDataset):
return cde.DIV2KNode(self.dataset_dir, self.usage, self.downgrade, self.scale, self.decode, self.sampler)


class YelpReviewDataset(SourceDataset):
"""
A source dataset that reads and parses Yelp Review Polarity and Yelp Review Full dataset.

The generated dataset has two columns: :py:obj:`[label, text]`.
The tensor of column :py:obj:`label` is of the string type.
The tensor of column :py:obj:`text` is of the string type.

Args:
dataset_dir (str): Path to the root directory that contains the dataset.
usage (str, optional): Usage of this dataset, can be `train`, `test` or `all`.
For Polarity, `train` will read from 560,000 train samples, `test` will read from 38,000 test samples,
`all` will read from all 598,000 samples.
For Full, `train` will read from 650,000 train samples, `test` will read from 50,000 test samples,
`all` will read from all 700,000 samples (default=None, all samples).
num_samples (int, optional): Number of samples (rows) to read (default=None, reads all samples).
shuffle (Union[bool, Shuffle level], optional): Perform reshuffling of the data every epoch
(default=Shuffle.GLOBAL).
If shuffle is False, no shuffling will be performed;
If shuffle is True, the behavior is the same as setting shuffle to be Shuffle.GLOBAL
Otherwise, there are two levels of shuffling:

- Shuffle.GLOBAL: Shuffle both the files and samples.

- Shuffle.FILES: Shuffle files only.
num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
When this argument is specified, `num_samples` reflects the max sample number of per shard.
shard_id (int, optional): The shard ID within num_shards (default=None). This
argument can only be specified when num_shards is also specified.
num_parallel_workers (int, optional): Number of workers to read the data
(default=None, number set in the config).
cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
(default=None, which means no cache is used).

Raises:
RuntimeError: If dataset_dir does not contain data files.
RuntimeError: If num_parallel_workers exceeds the max thread numbers.
RuntimeError: If num_shards is specified but shard_id is None.
RuntimeError: If shard_id is specified but num_shards is None.

Examples:
>>> yelp_review_dataset_dir = "/path/to/yelp_review_dataset_dir"
>>> dataset = ds.YelpReviewDataset(dataset_dir=yelp_review_dataset_dir, usage='all')

About YelpReview Dataset:

The Yelp Review Full dataset consists of reviews from Yelp. It is extracted from the Yelp Dataset Challenge 2015
data, and it is mainly used for text classification.

The Yelp Review Polarity dataset is constructed from the above dataset, by considering stars 1 and 2 negative, and 3
and 4 positive.

The directory structures of these two datasets are the same.
You can unzip the dataset files into the following structure and read by MindSpore's API:

.. code-block::

.
└── yelp_review_dir
├── train.csv
├── test.csv
└── readme.txt

Citation:

For Yelp Review Polarity:

.. code-block::

@article{zhangCharacterlevelConvolutionalNetworks2015,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1509.01626},
primaryClass = {cs},
title = {Character-Level {{Convolutional Networks}} for {{Text Classification}}},
abstract = {This article offers an empirical exploration on the use of character-level convolutional networks
(ConvNets) for text classification. We constructed several large-scale datasets to show that
character-level convolutional networks could achieve state-of-the-art or competitive results.
Comparisons are offered against traditional models such as bag of words, n-grams and their TFIDF
variants, and deep learning models such as word-based ConvNets and recurrent neural networks.},
journal = {arXiv:1509.01626 [cs]},
author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann},
month = sep,
year = {2015},
}

Citation:

For Yelp Review Full:

.. code-block::

@article{zhangCharacterlevelConvolutionalNetworks2015,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1509.01626},
primaryClass = {cs},
title = {Character-Level {{Convolutional Networks}} for {{Text Classification}}},
abstract = {This article offers an empirical exploration on the use of character-level convolutional networks
(ConvNets) for text classification. We constructed several large-scale datasets to show that
character-level convolutional networks could achieve state-of-the-art or competitive results.
Comparisons are offered against traditional models such as bag of words, n-grams and their TFIDF
variants, and deep learning models such as word-based ConvNets and recurrent neural networks.},
journal = {arXiv:1509.01626 [cs]},
author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann},
month = sep,
year = {2015},
}
"""

    # Validates dataset_dir, usage, counts and shard options before __init__ runs.
    @check_yelp_review_dataset
    def __init__(self, dataset_dir, usage=None, num_samples=None, shuffle=Shuffle.GLOBAL, num_shards=None,
                 shard_id=None, num_parallel_workers=None, cache=None):
        # Common source-dataset plumbing (workers, sampling, shuffling, sharding, cache)
        # is handled by the base class; only YelpReview-specific state is stored here.
        super().__init__(num_parallel_workers=num_parallel_workers, num_samples=num_samples, shuffle=shuffle,
                         num_shards=num_shards, shard_id=shard_id, cache=cache)
        self.dataset_dir = dataset_dir
        # Default to 'all' when usage is not given, matching the validator's allowed values.
        self.usage = replace_none(usage, 'all')

def parse(self, children=None):
return cde.YelpReviewNode(self.dataset_dir, self.usage, self.num_samples, self.shuffle_flag,
self.num_shards, self.shard_id)


class YesNoDataset(MappableDataset):
"""
A source dataset for reading and parsing the YesNo dataset.
@@ -8833,20 +8956,17 @@ class TedliumDataset(MappableDataset):
- not allowed

Examples:
>>> tedlium_dataset_dir = "/path/to/tedlium_dataset_directory"
>>> tedlium_dataset_release = ["release1", "release2", "release3"]
>>>
>>> # 1) Get all train samples from TEDLIUM_release1 dataset in sequence.
>>> dataset = ds.TedliumDataset(dataset_dir=tedlium_dataset_dir, release=tedlium_dataset_release[0],
... shuffle=False)
>>> dataset = ds.TedliumDataset(dataset_dir="/path/to/tedlium1_dataset_directory",
... release="release1", shuffle=False)
>>>
>>> # 2) Randomly select 10 samples from TEDLIUM_release2 dataset.
>>> dataset = ds.TedliumDataset(dataset_dir=tedlium_dataset_dir, release=tedlium_dataset_release[1],
... num_samples=10, shuffle=True)
>>> dataset = ds.TedliumDataset(dataset_dir="/path/to/tedlium2_dataset_directory",
... release="release2", num_samples=10, shuffle=True)
>>>
>>> # 3) Get samples from TEDLIUM_release-3 dataset for shard 0 in a 2-way distributed training.
>>> dataset = ds.TedliumDataset(dataset_dir=tedlium_dataset_dir, release=tedlium_dataset_release[2],
... num_shards=2, shard_id=0)
>>> dataset = ds.TedliumDataset(dataset_dir="/path/to/tedlium3_dataset_directory",
... release="release3", num_shards=2, shard_id=0)
>>>
>>> # In TEDLIUM dataset, each dictionary has keys : waveform, sample_rate, transcript, talk_id,
>>> # speaker_id and identifier.


+ 28
- 0
mindspore/dataset/engine/validators.py View File

@@ -1839,6 +1839,34 @@ def check_dbpedia_dataset(method):
return new_method


def check_yelp_review_dataset(method):
    """A wrapper that wraps a parameter checker around the original Dataset(YelpReviewDataset)."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        _, param_dict = parse_user_args(method, *args, **kwargs)

        # The dataset directory must exist and be accessible.
        check_dir(param_dict.get('dataset_dir'))

        # usage is optional; when supplied it must name a supported split.
        usage = param_dict.get('usage')
        if usage is not None:
            check_valid_str(usage, ["train", "test", "all"], "usage")

        # Count-style parameters must be ints within the allowed range.
        int_params = ['num_samples', 'num_parallel_workers', 'num_shards', 'shard_id']
        validate_dataset_param_value(int_params, param_dict, int)

        # shuffle / num_shards / shard_id must be mutually consistent.
        check_sampler_shuffle_shard_options(param_dict)

        check_cache_option(param_dict.get('cache'))

        return method(self, *args, **kwargs)

    return new_method


def check_yes_no_dataset(method):
"""A wrapper that wraps a parameter checker around the original Dataset(YesNoDataset)."""



+ 1
- 0
tests/ut/cpp/dataset/CMakeLists.txt View File

@@ -47,6 +47,7 @@ SET(DE_UT_SRCS
c_api_dataset_tfrecord_test.cc
c_api_dataset_usps_test.cc
c_api_dataset_voc_test.cc
c_api_dataset_yelp_review_test.cc
c_api_dataset_yes_no_test.cc
c_api_datasets_test.cc
c_api_epoch_ctrl_test.cc


+ 583
- 0
tests/ut/cpp/dataset/c_api_dataset_yelp_review_test.cc View File

@@ -0,0 +1,583 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "common/common.h"
#include "minddata/dataset/core/global_context.h"
#include "minddata/dataset/include/dataset/datasets.h"

using namespace mindspore::dataset;

// Test fixture for dataset C++ API pipeline tests; inherits datasets_root_path_
// and other helpers from UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};

/// Feature: Test YelpReviewPolarity Dataset.
/// Description: read YelpReviewPolarityDataset data and get data.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestYelpReviewPolarityDatasetBasic) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestYelpReviewPolarityDatasetBasic.";

  std::string dataset_dir = datasets_root_path_ + "/testYelpReview/polarity";
  std::vector<std::string> column_names = {"label", "text"};

  // num_samples = 0 reads every sample; shuffling disabled for a deterministic order.
  std::shared_ptr<Dataset> ds = YelpReview(dataset_dir, "test", 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("label"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"2", "\\\"Yelp\\\" service was very good.\\n"},
    {"1", "\\\"Yelp\\\" service was very bad.\\n"}
  };

  uint64_t i = 0;
  while (row.size() != 0) {
    // size_t index avoids a signed/unsigned comparison with column_names.size().
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 2 samples.
  EXPECT_EQ(i, 2);

  // Manually terminate the pipeline.
  iter->Stop();
}

/// Feature: Test YelpReviewFull Dataset.
/// Description: read YelpReviewFullDataset data and get data.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestYelpReviewFullDatasetBasic) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestYelpReviewFullDatasetBasic.";

  std::string dataset_dir = datasets_root_path_ + "/testYelpReview/full";
  std::vector<std::string> column_names = {"label", "text"};

  // num_samples = 0 reads every sample; shuffling disabled for a deterministic order.
  std::shared_ptr<Dataset> ds = YelpReview(dataset_dir, "test", 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("label"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"1", "\\\"YelpFull\\\" service was very good.\\n"},
    {"1", "\\\"YelpFull\\\" service was very bad.\\n"}
  };

  uint64_t i = 0;
  while (row.size() != 0) {
    // size_t index avoids a signed/unsigned comparison with column_names.size().
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 2 samples.
  EXPECT_EQ(i, 2);

  // Manually terminate the pipeline.
  iter->Stop();
}

/// Feature: Test YelpReviewPolarity Dataset(usage=all).
/// Description: read train data and test data.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestYelpReviewDatasetUsageAll) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestYelpReviewDatasetUsageAll.";

  std::string dataset_dir = datasets_root_path_ + "/testYelpReview/polarity";
  std::vector<std::string> column_names = {"label", "text"};

  // usage "all" reads train.csv and test.csv together (3 + 2 = 5 rows).
  std::shared_ptr<Dataset> ds = YelpReview(dataset_dir, "all", 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("label"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"1", "The food today is terrible.\\n"},
    {"2", "\\\"Yelp\\\" service was very good.\\n"},
    {"2", "The food is delicious today.\\n"},
    {"1", "\\\"Yelp\\\" service was very bad.\\n"},
    {"1", "Today's drink tastes bad.\\n"}
  };

  uint64_t i = 0;
  while (row.size() != 0) {
    // size_t index avoids a signed/unsigned comparison with column_names.size().
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 5 samples.
  EXPECT_EQ(i, 5);

  // Manually terminate the pipeline.
  iter->Stop();
}

/// Feature: Test Getters.
/// Description: includes tests for shape, type, size.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestYelpReviewDatasetGetters) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestYelpReviewDatasetGetters.";

  // Non-shuffled reader over the 2-row polarity "test" split.
  std::string folder_path = datasets_root_path_ + "/testYelpReview/polarity";
  std::shared_ptr<Dataset> dataset = YelpReview(folder_path, "test", 0, ShuffleMode::kFalse);
  std::vector<std::string> column_names = {"label", "text"};
  EXPECT_NE(dataset, nullptr);

  // Both output columns are scalar strings.
  std::vector<DataType> out_types = ToDETypes(dataset->GetOutputTypes());
  std::vector<TensorShape> out_shapes = ToTensorShapeVec(dataset->GetOutputShapes());
  EXPECT_EQ(out_types.size(), 2);
  for (const auto &type : out_types) {
    EXPECT_EQ(type.ToString(), "string");
  }
  EXPECT_EQ(out_shapes.size(), 2);
  for (const auto &shape : out_shapes) {
    EXPECT_EQ(shape.ToString(), "<>");
  }

  // Default pipeline parameters and dataset metadata.
  EXPECT_EQ(dataset->GetBatchSize(), 1);
  EXPECT_EQ(dataset->GetRepeatCount(), 1);
  EXPECT_EQ(dataset->GetDatasetSize(), 2);
  EXPECT_EQ(dataset->GetColumnNames(), column_names);
}

/// Feature: Test YelpReview Dataset(num_samples = 2).
/// Description: test whether the interface meets expectations when NumSamples is equal to 2.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestYelpReviewDatasetNumSamples) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestYelpReviewDatasetNumSamples.";

  std::string dataset_dir = datasets_root_path_ + "/testYelpReview/polarity";
  std::vector<std::string> column_names = {"label", "text"};

  // Cap the read at 2 samples (the whole "test" split here).
  std::shared_ptr<Dataset> ds = YelpReview(dataset_dir, "test", 2, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("label"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"2", "\\\"Yelp\\\" service was very good.\\n"},
    {"1", "\\\"Yelp\\\" service was very bad.\\n"}
  };

  uint64_t i = 0;
  while (row.size() != 0) {
    // size_t index avoids a signed/unsigned comparison with column_names.size().
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 2 samples.
  EXPECT_EQ(i, 2);

  // Manually terminate the pipeline.
  iter->Stop();
}

/// Feature: Test YelpReviewDataset in distribution.
/// Description: test interface in a distributed state.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestYelpReviewDatasetDistribution) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestYelpReviewDatasetDistribution.";

  // Create a YelpReviewDataset sharded 2 ways; this process reads shard 0 only.
  std::string dataset_dir = datasets_root_path_ + "/testYelpReview/polarity";
  std::vector<std::string> column_names = {"label", "text"};

  std::shared_ptr<Dataset> ds = YelpReview(dataset_dir, "test", 0, ShuffleMode::kFalse, 2, 0);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("label"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"2", "\\\"Yelp\\\" service was very good.\\n"},
    {"1", "\\\"Yelp\\\" service was very bad.\\n"}
  };

  uint64_t i = 0;
  while (row.size() != 0) {
    // size_t index avoids a signed/unsigned comparison with column_names.size().
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 1 sample: 2 rows split across 2 shards.
  EXPECT_EQ(i, 1);

  // Manually terminate the pipeline.
  iter->Stop();
}

/// Feature: Error Test.
/// Description: test the wrong input.
/// Expectation: unable to read in data.
TEST_F(MindDataTestPipeline, TestYelpReviewDatasetFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestYelpReviewDatasetFail.";

  std::string dataset_dir = datasets_root_path_ + "/testYelpReview/polarity";

  // Build one dataset per invalid argument. Node construction itself never fails;
  // the error surfaces when the iterator (execution tree) is created.
  std::vector<std::shared_ptr<Dataset>> invalid_datasets = {
    YelpReview("NotExistFile", "test", 0, ShuffleMode::kFalse),        // dataset dir does not exist
    YelpReview(dataset_dir, "invalid_usage", 0, ShuffleMode::kFalse),  // usage not in {train, test, all}
    YelpReview(dataset_dir, "test", -1, ShuffleMode::kFalse),          // num_samples < -1 is invalid
    YelpReview(dataset_dir, "test", 0, ShuffleMode::kFalse, 0),        // num_shards < 1 is invalid
    YelpReview(dataset_dir, "test", 0, ShuffleMode::kFalse, 2, 2)      // shard_id >= num_shards is invalid
  };

  for (const auto &invalid_ds : invalid_datasets) {
    EXPECT_NE(invalid_ds, nullptr);
    // Expect failure: invalid YelpReview input.
    std::shared_ptr<Iterator> iter = invalid_ds->CreateIterator();
    EXPECT_EQ(iter, nullptr);
  }
}

/// Feature: Test YelpReview Dataset.
/// Description: test YelpReview Dataset interface in pipeline.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TestYelpReviewDatasetBasicWithPipeline) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestYelpReviewDatasetBasicWithPipeline.";

  // Two readers over the same polarity "test" split (2 rows each).
  std::string folder_path = datasets_root_path_ + "/testYelpReview/polarity";
  std::shared_ptr<Dataset> branch1 = YelpReview(folder_path, "test", 0, ShuffleMode::kFalse);
  std::shared_ptr<Dataset> branch2 = YelpReview(folder_path, "test", 0, ShuffleMode::kFalse);
  EXPECT_NE(branch1, nullptr);
  EXPECT_NE(branch2, nullptr);

  // Repeat one branch twice and the other three times: 2*2 + 3*2 = 10 rows total.
  branch1 = branch1->Repeat(2);
  EXPECT_NE(branch1, nullptr);
  branch2 = branch2->Repeat(3);
  EXPECT_NE(branch2, nullptr);

  // Keep only the "label" column on both branches, then concatenate them.
  std::vector<std::string> column_project = {"label"};
  branch1 = branch1->Project(column_project);
  EXPECT_NE(branch1, nullptr);
  branch2 = branch2->Project(column_project);
  EXPECT_NE(branch2, nullptr);
  branch1 = branch1->Concat({branch2});
  EXPECT_NE(branch1, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = branch1->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and count the rows.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("label"), row.end());

  uint64_t row_count = 0;
  while (row.size() != 0) {
    auto text = row["label"];
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    row_count++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // Expect 10 samples.
  EXPECT_EQ(row_count, 10);

  // Manually terminate the pipeline.
  iter->Stop();
}

/// Feature: Test YelpReview Dataset(ShuffleMode=kFiles).
/// Description: test YelpReview Dataset interface with different ShuffleMode.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TesYelpReviewDatasetShuffleFilesA) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TesYelpReviewDatasetShuffleFilesA.";

  // Set configuration. A fixed seed makes the shuffled file order reproducible.
  uint32_t original_seed = GlobalContext::config_manager()->seed();
  uint32_t original_num_parallel_workers = GlobalContext::config_manager()->num_parallel_workers();
  MS_LOG(DEBUG) << "ORIGINAL seed: " << original_seed << ", num_parallel_workers: " << original_num_parallel_workers;
  GlobalContext::config_manager()->set_seed(130);
  GlobalContext::config_manager()->set_num_parallel_workers(4);

  std::string dataset_dir = datasets_root_path_ + "/testYelpReview/polarity";
  std::vector<std::string> column_names = {"label", "text"};

  // kFiles shuffles the file order only; rows inside each file keep their order.
  std::shared_ptr<Dataset> ds = YelpReview(dataset_dir, "all", 0, ShuffleMode::kFiles);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row. The order below is fixed by seed 130.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("label"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"2", "\\\"Yelp\\\" service was very good.\\n"},
    {"1", "The food today is terrible.\\n"},
    {"1", "\\\"Yelp\\\" service was very bad.\\n"},
    {"2", "The food is delicious today.\\n"},
    {"1", "Today's drink tastes bad.\\n"}
  };

  uint64_t i = 0;
  while (row.size() != 0) {
    // size_t index avoids a signed/unsigned comparison with column_names.size().
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  // Expect 5 samples.
  EXPECT_EQ(i, 5);

  // Manually terminate the pipeline.
  iter->Stop();

  // Restore configuration.
  GlobalContext::config_manager()->set_seed(original_seed);
  GlobalContext::config_manager()->set_num_parallel_workers(original_num_parallel_workers);
}

/// Feature: Test YelpReview Dataset(ShuffleMode=kInfile).
/// Description: test YelpReview Dataset interface with different ShuffleMode.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TesYelpReviewDatasetShuffleFilesB) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TesYelpReviewDatasetShuffleFilesB.";

  // Set configuration. A fixed seed makes the shuffled order reproducible.
  uint32_t original_seed = GlobalContext::config_manager()->seed();
  uint32_t original_num_parallel_workers = GlobalContext::config_manager()->num_parallel_workers();
  MS_LOG(DEBUG) << "ORIGINAL seed: " << original_seed << ", num_parallel_workers: " << original_num_parallel_workers;
  GlobalContext::config_manager()->set_seed(130);
  GlobalContext::config_manager()->set_num_parallel_workers(4);

  std::string dataset_dir = datasets_root_path_ + "/testYelpReview/polarity";
  std::vector<std::string> column_names = {"label", "text"};

  std::shared_ptr<Dataset> ds = YelpReview(dataset_dir, "all", 0, ShuffleMode::kInfile);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row. The order below is fixed by seed 130.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("label"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"1", "The food today is terrible.\\n"},
    {"2", "\\\"Yelp\\\" service was very good.\\n"},
    {"2", "The food is delicious today.\\n"},
    {"1", "\\\"Yelp\\\" service was very bad.\\n"},
    {"1", "Today's drink tastes bad.\\n"}
  };

  uint64_t i = 0;
  while (row.size() != 0) {
    // size_t index avoids a signed/unsigned comparison with column_names.size().
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 5 samples.
  EXPECT_EQ(i, 5);

  // Manually terminate the pipeline.
  iter->Stop();

  // Restore configuration.
  GlobalContext::config_manager()->set_seed(original_seed);
  GlobalContext::config_manager()->set_num_parallel_workers(original_num_parallel_workers);
}

/// Feature: Test YelpReview Dataset(ShuffleMode=kGlobal).
/// Description: test YelpReview Dataset interface with different ShuffleMode.
/// Expectation: the data is processed successfully.
TEST_F(MindDataTestPipeline, TesYelpReviewDatasetShuffleFilesGlobal) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TesYelpReviewDatasetShuffleFilesGlobal.";

  // Set configuration. A fixed seed makes the global shuffle reproducible.
  uint32_t original_seed = GlobalContext::config_manager()->seed();
  uint32_t original_num_parallel_workers = GlobalContext::config_manager()->num_parallel_workers();
  MS_LOG(DEBUG) << "ORIGINAL seed: " << original_seed << ", num_parallel_workers: " << original_num_parallel_workers;
  GlobalContext::config_manager()->set_seed(130);
  GlobalContext::config_manager()->set_num_parallel_workers(4);

  std::string dataset_dir = datasets_root_path_ + "/testYelpReview/polarity";
  std::vector<std::string> column_names = {"label", "text"};

  std::shared_ptr<Dataset> ds = YelpReview(dataset_dir, "train", 0, ShuffleMode::kGlobal);
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row. The order below is fixed by seed 130.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  EXPECT_NE(row.find("label"), row.end());
  std::vector<std::vector<std::string>> expected_result = {
    {"1", "The food today is terrible.\\n"},
    {"1", "Today's drink tastes bad.\\n"},
    {"2", "The food is delicious today.\\n"}
  };

  uint64_t i = 0;
  while (row.size() != 0) {
    // size_t index avoids a signed/unsigned comparison with column_names.size().
    for (size_t j = 0; j < column_names.size(); j++) {
      auto text = row[column_names[j]];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), expected_result[i][j].c_str());
    }
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 3 samples.
  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline.
  iter->Stop();

  // Restore configuration.
  GlobalContext::config_manager()->set_seed(original_seed);
  GlobalContext::config_manager()->set_num_parallel_workers(original_num_parallel_workers);
}

+ 2
- 0
tests/ut/data/dataset/testYelpReview/full/test.csv View File

@@ -0,0 +1,2 @@
1,"\""YelpFull\"" service was very good.\n"
1,"\""YelpFull\"" service was very bad.\n"

+ 3
- 0
tests/ut/data/dataset/testYelpReview/full/train.csv View File

@@ -0,0 +1,3 @@
5,Yelpfull's drink tastes bad.\n
2,Yelpfull's food is terrible.\n
4,Yelpful's service was very good.\n

+ 2
- 0
tests/ut/data/dataset/testYelpReview/polarity/test.csv View File

@@ -0,0 +1,2 @@
2,"\""Yelp\"" service was very good.\n"
1,"\""Yelp\"" service was very bad.\n"

+ 3
- 0
tests/ut/data/dataset/testYelpReview/polarity/train.csv View File

@@ -0,0 +1,3 @@
1,The food today is terrible.\n
2,The food is delicious today.\n
1,Today's drink tastes bad.\n

+ 152
- 0
tests/ut/python/dataset/test_datasets_yelp_review.py View File

@@ -0,0 +1,152 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import mindspore.dataset as ds

# Roots of the small CSV fixtures for the two YelpReview variants
# (each holds a train.csv and a test.csv).
DATA_POLARITY_DIR = '../data/dataset/testYelpReview/polarity'
DATA_FULL_DIR = '../data/dataset/testYelpReview/full'


def test_yelp_review_polarity_dataset_basic():
    """
    Feature: Test YelpReviewPolarity Dataset.
    Description: read data from a single file.
    Expectation: the data is processed successfully.
    """
    # repeat(2) doubles the split, skip(2) drops the first copy: 2 rows remain.
    data = ds.YelpReviewDataset(DATA_POLARITY_DIR, usage='test', shuffle=False)
    data = data.repeat(2).skip(2)
    rows = list(data.create_dict_iterator(num_epochs=1, output_numpy=True))
    assert len(rows) == 2


def test_yelp_review_full_dataset_basic():
    """
    Feature: Test YelpReviewFull Dataset.
    Description: read data from a single file.
    Expectation: the data is processed successfully.
    """
    # repeat(2) doubles the split, skip(2) drops the first copy: 2 rows remain.
    data = ds.YelpReviewDataset(DATA_FULL_DIR, usage='test', shuffle=False)
    data = data.repeat(2).skip(2)
    rows = list(data.create_dict_iterator(num_epochs=1, output_numpy=True))
    assert len(rows) == 2


def test_yelp_review_dataset_quoted():
    """
    Feature: Test get the YelpReview Dataset.
    Description: read YelpReviewPolarityDataset data and get data.
    Expectation: the data is processed successfully.
    """
    data = ds.YelpReviewDataset(DATA_POLARITY_DIR, usage='test', shuffle=False)
    decoded = []
    for row in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        decoded.append(row['label'].item().decode("utf8"))
        decoded.append(row['text'].item().decode("utf8"))
    # Quotes and escapes in the CSV must survive the read unchanged.
    assert decoded == ["2", "\\\"Yelp\\\" service was very good.\\n",
                       "1", "\\\"Yelp\\\" service was very bad.\\n"]


def test_yelp_review_dataset_usage_all():
    """
    Feature: Test YelpReviewPolarity Dataset(usage=all).
    Description: read train data and test data.
    Expectation: the data is processed successfully.
    """
    # usage='all' reads train.csv (3 rows) and test.csv (2 rows) together.
    data = ds.YelpReviewDataset(DATA_POLARITY_DIR, usage='all', shuffle=False)
    row_count = sum(1 for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True))
    assert row_count == 5


def test_yelp_review_dataset_get_datasetsize():
    """
    Feature: Test Getters.
    Description: test get_dataset_size of YelpReview dataset.
    Expectation: the data is processed successfully.
    """
    data = ds.YelpReviewDataset(DATA_POLARITY_DIR, usage='test', shuffle=False)
    # The polarity test split fixture contains exactly 2 rows.
    assert data.get_dataset_size() == 2


def test_yelp_review_dataset_distribution():
    """
    Feature: Test YelpReviewDataset in distribution.
    Description: test in a distributed state.
    Expectation: the data is processed successfully.
    """
    # 2 rows split across 2 shards -> shard 0 sees exactly 1 row.
    data = ds.YelpReviewDataset(DATA_POLARITY_DIR, usage='test', shuffle=False, num_shards=2, shard_id=0)
    shard_rows = sum(1 for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True))
    assert shard_rows == 1


def test_yelp_review_dataset_num_samples():
    """
    Feature: Test YelpReview Dataset(num_samples = 2).
    Description: test get num_samples.
    Expectation: the data is processed successfully.
    """
    data = ds.YelpReviewDataset(DATA_POLARITY_DIR, usage='test', shuffle=False, num_samples=2)
    sampled = sum(1 for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True))
    assert sampled == 2


def test_yelp_review_dataset_exception():
    """
    Feature: Error Test.
    Description: test the wrong input.
    Expectation: unable to read in data.
    """
    def exception_func(item):
        raise Exception("Error occur!")

    # A PyFunc that raises on either column must surface as a RuntimeError
    # that names the failing map operation.
    for column in ["label", "text"]:
        try:
            data = ds.YelpReviewDataset(DATA_POLARITY_DIR, usage='test', shuffle=False)
            data = data.map(operations=exception_func, input_columns=[column], num_parallel_workers=1)
            for _ in data.create_dict_iterator():
                pass
            assert False
        except RuntimeError as e:
            assert "map operation: [PyFunc] failed. The corresponding data files" in str(e)


if __name__ == "__main__":
test_yelp_review_polarity_dataset_basic()
test_yelp_review_full_dataset_basic()
test_yelp_review_dataset_quoted()
test_yelp_review_dataset_usage_all()
test_yelp_review_dataset_get_datasetsize()
test_yelp_review_dataset_distribution()
test_yelp_review_dataset_num_samples()
test_yelp_review_dataset_exception()

Loading…
Cancel
Save