From 4b1ccf9dd5676c001f00ee3baacbb4b72abb434a Mon Sep 17 00:00:00 2001 From: Cathy Wong Date: Wed, 13 Jan 2021 17:32:33 -0500 Subject: [PATCH] dataset: Use int32_t for text's vocab_size --- mindspore/ccsrc/minddata/dataset/api/datasets.cc | 2 +- .../api/python/bindings/dataset/include/datasets_bindings.cc | 2 +- .../dataset/api/python/bindings/dataset/text/bindings.cc | 2 +- .../engine/datasetops/build_sentence_piece_vocab_op.cc | 2 +- .../dataset/engine/datasetops/build_sentence_piece_vocab_op.h | 4 ++-- .../engine/ir/datasetops/build_sentence_piece_vocab_node.cc | 2 +- .../engine/ir/datasetops/build_sentence_piece_vocab_node.h | 4 ++-- mindspore/ccsrc/minddata/dataset/include/datasets.h | 4 ++-- mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc | 2 +- mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h | 2 +- tests/ut/cpp/dataset/build_vocab_test.cc | 2 +- 11 files changed, 14 insertions(+), 14 deletions(-) diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc index 15300fb2a5..c24f9a79c9 100644 --- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc +++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc @@ -569,7 +569,7 @@ std::shared_ptr Dataset::SetNumWorkers(int32_t num_workers) { #ifndef ENABLE_ANDROID std::shared_ptr Dataset::BuildSentencePieceVocab( - const std::vector &col_names, uint32_t vocab_size, float character_coverage, + const std::vector &col_names, int32_t vocab_size, float character_coverage, SentencePieceModel model_type, const std::unordered_map &params) { auto vocab = std::make_shared(); auto ds = std::make_shared(IRNode(), vocab, col_names, vocab_size, character_coverage, diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc index 48274411af..b3288a9ee4 100644 --- 
a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/include/datasets_bindings.cc @@ -388,7 +388,7 @@ PYBIND_REGISTER(BuildSentenceVocabNode, 2, ([](const py::module *m) { (void)py::class_>( *m, "BuildSentenceVocabNode", "to create a BuildSentenceVocabNode") .def(py::init([](std::shared_ptr self, std::shared_ptr vocab, - const std::vector &col_names, uint32_t vocab_size, + const std::vector &col_names, int32_t vocab_size, float character_coverage, SentencePieceModel model_type, const std::unordered_map &params) { auto build_sentence_vocab = std::make_shared( diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc index d746fa298f..ef367b76fd 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/bindings.cc @@ -54,7 +54,7 @@ PYBIND_REGISTER(SentencePieceVocab, 0, ([](const py::module *m) { (void)py::class_>(*m, "SentencePieceVocab") .def(py::init<>()) .def_static("from_file", - [](const py::list &paths, const int vocab_size, const float character_coverage, + [](const py::list &paths, const int32_t vocab_size, const float character_coverage, const SentencePieceModel model_type, const py::dict &params) { std::shared_ptr v; std::vector path_list; diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.cc index ce328efcb1..540e8db408 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.cc @@ -23,7 +23,7 @@ namespace mindspore { namespace dataset { 
BuildSentencePieceVocabOp::BuildSentencePieceVocabOp(std::shared_ptr vocab, - std::vector col_names, uint32_t vocab_size, + std::vector col_names, int32_t vocab_size, float character_coverage, SentencePieceModel model_type, const std::unordered_map &params, int32_t op_conn_size) diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h index 0ff81b9b9f..073122e1ce 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h @@ -134,7 +134,7 @@ class BuildSentencePieceVocabOp : public PipelineOp { }; BuildSentencePieceVocabOp(std::shared_ptr vocab, std::vector col_names, - uint32_t vocab_size, float character_coverage, SentencePieceModel model_type, + int32_t vocab_size, float character_coverage, SentencePieceModel model_type, const std::unordered_map &params, int32_t op_conn_size); ~BuildSentencePieceVocabOp() = default; @@ -174,7 +174,7 @@ class BuildSentencePieceVocabOp : public PipelineOp { private: bool read_done_; Status ret_status_; - uint32_t vocab_size_; + int32_t vocab_size_; float character_coverage_; SentencePieceModel model_type_; std::unordered_map params_; diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.cc index 6677ec8bde..86b1f91a69 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.cc @@ -30,7 +30,7 @@ namespace dataset { BuildSentenceVocabNode::BuildSentenceVocabNode(std::shared_ptr child, std::shared_ptr vocab, - const std::vector &col_names, uint32_t vocab_size, + const std::vector &col_names, int32_t vocab_size, float 
character_coverage, SentencePieceModel model_type, const std::unordered_map &params) : vocab_(vocab), diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h index a689e05689..7a073a1e24 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h +++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/build_sentence_piece_vocab_node.h @@ -33,7 +33,7 @@ class BuildSentenceVocabNode : public DatasetNode { public: /// \brief Constructor BuildSentenceVocabNode(std::shared_ptr child, std::shared_ptr vocab, - const std::vector &col_names, uint32_t vocab_size, float character_coverage, + const std::vector &col_names, int32_t vocab_size, float character_coverage, SentencePieceModel model_type, const std::unordered_map &params); /// \brief Destructor @@ -75,7 +75,7 @@ class BuildSentenceVocabNode : public DatasetNode { private: std::shared_ptr vocab_; std::vector col_names_; - uint32_t vocab_size_; + int32_t vocab_size_; float character_coverage_; SentencePieceModel model_type_; std::unordered_map params_; diff --git a/mindspore/ccsrc/minddata/dataset/include/datasets.h b/mindspore/ccsrc/minddata/dataset/include/datasets.h index b5ad1ec77d..1c2648574f 100644 --- a/mindspore/ccsrc/minddata/dataset/include/datasets.h +++ b/mindspore/ccsrc/minddata/dataset/include/datasets.h @@ -225,7 +225,7 @@ class Dataset : public std::enable_shared_from_this { /// \brief Function to create a SentencePieceVocab from source dataset /// \notes Build a SentencePieceVocab from a dataset. /// \param[in] col_names Column names to get words from. It can be a vector of column names - /// \param[in] vocab_size Vocabulary size. The type is uint32 + /// \param[in] vocab_size Vocabulary size. 
/// \param[in] character_coverage Percentage of characters covered by the model, must be between /// 0.98 and 1.0 Good defaults are: 0.9995 for languages with rich character sets like /// Japanese or Chinese character sets, and 1.0 for other languages with small character sets. @@ -233,7 +233,7 @@ class Dataset : public std::enable_shared_from_this { /// The input sentence must be pretokenized when using word type. /// \param[in] params A vector contains more option parameters of sentencepiece library std::shared_ptr BuildSentencePieceVocab( - const std::vector &col_names, uint32_t vocab_size, float character_coverage, + const std::vector &col_names, int32_t vocab_size, float character_coverage, SentencePieceModel model_type, const std::unordered_map &params); /// \brief Function to create a Vocab from source dataset diff --git a/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc b/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc index d9935112f4..bdbede8292 100644 --- a/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc +++ b/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.cc @@ -28,7 +28,7 @@ namespace dataset { SentencePieceVocab::SentencePieceVocab() : model_proto_("") {} -Status SentencePieceVocab::BuildFromFile(const std::vector &path_list, const int vocab_size, +Status SentencePieceVocab::BuildFromFile(const std::vector &path_list, const int32_t vocab_size, const float character_coverage, const SentencePieceModel model_type, const std::unordered_map &params, std::shared_ptr *vocab) { diff --git a/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h b/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h index 2e78bc8ce6..1c6a678f6b 100644 --- a/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h +++ b/mindspore/ccsrc/minddata/dataset/text/sentence_piece_vocab.h @@ -29,7 +29,7 @@ namespace dataset { class SentencePieceVocab { public: - static Status BuildFromFile(const std::vector &path_list, const 
int vocab_size, + static Status BuildFromFile(const std::vector &path_list, const int32_t vocab_size, const float character_coverage, const SentencePieceModel model_type, const std::unordered_map &params, std::shared_ptr *vocab); diff --git a/tests/ut/cpp/dataset/build_vocab_test.cc b/tests/ut/cpp/dataset/build_vocab_test.cc index c50da85c56..a0d42e6f89 100644 --- a/tests/ut/cpp/dataset/build_vocab_test.cc +++ b/tests/ut/cpp/dataset/build_vocab_test.cc @@ -225,7 +225,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail2) { std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; std::shared_ptr vocab = std::make_shared(); - // Expected failure: vocab_size shoule be either -1 or positive integer + // Expected failure: vocab_size should be either -1 or positive integer Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab); EXPECT_NE(s, Status::OK()); }