Browse Source

!11248 dataset: Use int32_t for text's vocab_size

From: @cathwong
Reviewed-by: @mikef,@robingrosman
Signed-off-by:
tags/v1.2.0-rc1
mindspore-ci-bot Gitee 4 years ago
parent
commit
8ef663fa68
11 changed files with 14 additions and 14 deletions
  1. +1
    -1
      mindspore/ccsrc/minddata/dataset/api/datasets.cc
  2. +1
    -1
      mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc
  3. +1
    -1
      mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc
  4. +1
    -1
      mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.cc
  5. +2
    -2
      mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h
  6. +1
    -1
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.cc
  7. +2
    -2
      mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h
  8. +2
    -2
      mindspore/ccsrc/minddata/dataset/include/datasets.h
  9. +1
    -1
      mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc
  10. +1
    -1
      mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h
  11. +1
    -1
      tests/ut/cpp/dataset/build_vocab_test.cc

+ 1
- 1
mindspore/ccsrc/minddata/dataset/api/datasets.cc View File

@@ -569,7 +569,7 @@ std::shared_ptr<Dataset> Dataset::SetNumWorkers(int32_t num_workers) {

#ifndef ENABLE_ANDROID
std::shared_ptr<SentencePieceVocab> Dataset::BuildSentencePieceVocab(
const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage,
const std::vector<std::string> &col_names, int32_t vocab_size, float character_coverage,
SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params) {
auto vocab = std::make_shared<SentencePieceVocab>();
auto ds = std::make_shared<BuildSentenceVocabNode>(IRNode(), vocab, col_names, vocab_size, character_coverage,


+ 1
- 1
mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc View File

@@ -388,7 +388,7 @@ PYBIND_REGISTER(BuildSentenceVocabNode, 2, ([](const py::module *m) {
(void)py::class_<BuildSentenceVocabNode, DatasetNode, std::shared_ptr<BuildSentenceVocabNode>>(
*m, "BuildSentenceVocabNode", "to create a BuildSentenceVocabNode")
.def(py::init([](std::shared_ptr<DatasetNode> self, std::shared_ptr<SentencePieceVocab> vocab,
const std::vector<std::string> &col_names, uint32_t vocab_size,
const std::vector<std::string> &col_names, int32_t vocab_size,
float character_coverage, SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params) {
auto build_sentence_vocab = std::make_shared<BuildSentenceVocabNode>(


+ 1
- 1
mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc View File

@@ -54,7 +54,7 @@ PYBIND_REGISTER(SentencePieceVocab, 0, ([](const py::module *m) {
(void)py::class_<SentencePieceVocab, std::shared_ptr<SentencePieceVocab>>(*m, "SentencePieceVocab")
.def(py::init<>())
.def_static("from_file",
[](const py::list &paths, const int vocab_size, const float character_coverage,
[](const py::list &paths, const int32_t vocab_size, const float character_coverage,
const SentencePieceModel model_type, const py::dict &params) {
std::shared_ptr<SentencePieceVocab> v;
std::vector<std::string> path_list;


+ 1
- 1
mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.cc View File

@@ -23,7 +23,7 @@
namespace mindspore {
namespace dataset {
BuildSentencePieceVocabOp::BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab,
std::vector<std::string> col_names, uint32_t vocab_size,
std::vector<std::string> col_names, int32_t vocab_size,
float character_coverage, SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params,
int32_t op_conn_size)


+ 2
- 2
mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h View File

@@ -134,7 +134,7 @@ class BuildSentencePieceVocabOp : public PipelineOp {
};

BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, std::vector<std::string> col_names,
uint32_t vocab_size, float character_coverage, SentencePieceModel model_type,
int32_t vocab_size, float character_coverage, SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params, int32_t op_conn_size);

~BuildSentencePieceVocabOp() = default;
@@ -174,7 +174,7 @@ class BuildSentencePieceVocabOp : public PipelineOp {
private:
bool read_done_;
Status ret_status_;
uint32_t vocab_size_;
int32_t vocab_size_;
float character_coverage_;
SentencePieceModel model_type_;
std::unordered_map<std::string, std::string> params_;


+ 1
- 1
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.cc View File

@@ -30,7 +30,7 @@ namespace dataset {

BuildSentenceVocabNode::BuildSentenceVocabNode(std::shared_ptr<DatasetNode> child,
std::shared_ptr<SentencePieceVocab> vocab,
const std::vector<std::string> &col_names, uint32_t vocab_size,
const std::vector<std::string> &col_names, int32_t vocab_size,
float character_coverage, SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params)
: vocab_(vocab),


+ 2
- 2
mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h View File

@@ -33,7 +33,7 @@ class BuildSentenceVocabNode : public DatasetNode {
public:
/// \brief Constructor
BuildSentenceVocabNode(std::shared_ptr<DatasetNode> child, std::shared_ptr<SentencePieceVocab> vocab,
const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage,
const std::vector<std::string> &col_names, int32_t vocab_size, float character_coverage,
SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params);

/// \brief Destructor
@@ -75,7 +75,7 @@ class BuildSentenceVocabNode : public DatasetNode {
private:
std::shared_ptr<SentencePieceVocab> vocab_;
std::vector<std::string> col_names_;
uint32_t vocab_size_;
int32_t vocab_size_;
float character_coverage_;
SentencePieceModel model_type_;
std::unordered_map<std::string, std::string> params_;


+ 2
- 2
mindspore/ccsrc/minddata/dataset/include/datasets.h View File

@@ -225,7 +225,7 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
/// \brief Function to create a SentencePieceVocab from source dataset
/// \notes Build a SentencePieceVocab from a dataset.
/// \param[in] col_names Column names to get words from. It can be a vector of column names
/// \param[in] vocab_size Vocabulary size. The type is uint32
/// \param[in] vocab_size Vocabulary size.
/// \param[in] character_coverage Percentage of characters covered by the model, must be between
/// 0.98 and 1.0. Good defaults are: 0.9995 for languages with rich character sets like
/// Japanese or Chinese character sets, and 1.0 for other languages with small character sets.
@@ -233,7 +233,7 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
/// The input sentence must be pretokenized when using word type.
/// \param[in] params A map containing additional option parameters for the sentencepiece library
std::shared_ptr<SentencePieceVocab> BuildSentencePieceVocab(
const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage,
const std::vector<std::string> &col_names, int32_t vocab_size, float character_coverage,
SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params);

/// \brief Function to create a Vocab from source dataset


+ 1
- 1
mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc View File

@@ -28,7 +28,7 @@ namespace dataset {

SentencePieceVocab::SentencePieceVocab() : model_proto_("") {}

Status SentencePieceVocab::BuildFromFile(const std::vector<std::string> &path_list, const int vocab_size,
Status SentencePieceVocab::BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size,
const float character_coverage, const SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params,
std::shared_ptr<SentencePieceVocab> *vocab) {


+ 1
- 1
mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h View File

@@ -29,7 +29,7 @@ namespace dataset {

class SentencePieceVocab {
public:
static Status BuildFromFile(const std::vector<std::string> &path_list, const int vocab_size,
static Status BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size,
const float character_coverage, const SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params,
std::shared_ptr<SentencePieceVocab> *vocab);


+ 1
- 1
tests/ut/cpp/dataset/build_vocab_test.cc View File

@@ -225,7 +225,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail2) {
std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();

// Expected failure: vocab_size shoule be either -1 or positive integer
// Expected failure: vocab_size should be either -1 or positive integer
Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab);
EXPECT_NE(s, Status::OK());
}


Loading…
Cancel
Save