From: @cathwong Reviewed-by: @mikef,@robingrosman Signed-off-by:tags/v1.2.0-rc1
| @@ -569,7 +569,7 @@ std::shared_ptr<Dataset> Dataset::SetNumWorkers(int32_t num_workers) { | |||||
| #ifndef ENABLE_ANDROID | #ifndef ENABLE_ANDROID | ||||
| std::shared_ptr<SentencePieceVocab> Dataset::BuildSentencePieceVocab( | std::shared_ptr<SentencePieceVocab> Dataset::BuildSentencePieceVocab( | ||||
| const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage, | |||||
| const std::vector<std::string> &col_names, int32_t vocab_size, float character_coverage, | |||||
| SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params) { | SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params) { | ||||
| auto vocab = std::make_shared<SentencePieceVocab>(); | auto vocab = std::make_shared<SentencePieceVocab>(); | ||||
| auto ds = std::make_shared<BuildSentenceVocabNode>(IRNode(), vocab, col_names, vocab_size, character_coverage, | auto ds = std::make_shared<BuildSentenceVocabNode>(IRNode(), vocab, col_names, vocab_size, character_coverage, | ||||
| @@ -388,7 +388,7 @@ PYBIND_REGISTER(BuildSentenceVocabNode, 2, ([](const py::module *m) { | |||||
| (void)py::class_<BuildSentenceVocabNode, DatasetNode, std::shared_ptr<BuildSentenceVocabNode>>( | (void)py::class_<BuildSentenceVocabNode, DatasetNode, std::shared_ptr<BuildSentenceVocabNode>>( | ||||
| *m, "BuildSentenceVocabNode", "to create a BuildSentenceVocabNode") | *m, "BuildSentenceVocabNode", "to create a BuildSentenceVocabNode") | ||||
| .def(py::init([](std::shared_ptr<DatasetNode> self, std::shared_ptr<SentencePieceVocab> vocab, | .def(py::init([](std::shared_ptr<DatasetNode> self, std::shared_ptr<SentencePieceVocab> vocab, | ||||
| const std::vector<std::string> &col_names, uint32_t vocab_size, | |||||
| const std::vector<std::string> &col_names, int32_t vocab_size, | |||||
| float character_coverage, SentencePieceModel model_type, | float character_coverage, SentencePieceModel model_type, | ||||
| const std::unordered_map<std::string, std::string> &params) { | const std::unordered_map<std::string, std::string> &params) { | ||||
| auto build_sentence_vocab = std::make_shared<BuildSentenceVocabNode>( | auto build_sentence_vocab = std::make_shared<BuildSentenceVocabNode>( | ||||
| @@ -54,7 +54,7 @@ PYBIND_REGISTER(SentencePieceVocab, 0, ([](const py::module *m) { | |||||
| (void)py::class_<SentencePieceVocab, std::shared_ptr<SentencePieceVocab>>(*m, "SentencePieceVocab") | (void)py::class_<SentencePieceVocab, std::shared_ptr<SentencePieceVocab>>(*m, "SentencePieceVocab") | ||||
| .def(py::init<>()) | .def(py::init<>()) | ||||
| .def_static("from_file", | .def_static("from_file", | ||||
| [](const py::list &paths, const int vocab_size, const float character_coverage, | |||||
| [](const py::list &paths, const int32_t vocab_size, const float character_coverage, | |||||
| const SentencePieceModel model_type, const py::dict &params) { | const SentencePieceModel model_type, const py::dict &params) { | ||||
| std::shared_ptr<SentencePieceVocab> v; | std::shared_ptr<SentencePieceVocab> v; | ||||
| std::vector<std::string> path_list; | std::vector<std::string> path_list; | ||||
| @@ -23,7 +23,7 @@ | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace dataset { | namespace dataset { | ||||
| BuildSentencePieceVocabOp::BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, | BuildSentencePieceVocabOp::BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, | ||||
| std::vector<std::string> col_names, uint32_t vocab_size, | |||||
| std::vector<std::string> col_names, int32_t vocab_size, | |||||
| float character_coverage, SentencePieceModel model_type, | float character_coverage, SentencePieceModel model_type, | ||||
| const std::unordered_map<std::string, std::string> &params, | const std::unordered_map<std::string, std::string> &params, | ||||
| int32_t op_conn_size) | int32_t op_conn_size) | ||||
| @@ -134,7 +134,7 @@ class BuildSentencePieceVocabOp : public PipelineOp { | |||||
| }; | }; | ||||
| BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, std::vector<std::string> col_names, | BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, std::vector<std::string> col_names, | ||||
| uint32_t vocab_size, float character_coverage, SentencePieceModel model_type, | |||||
| int32_t vocab_size, float character_coverage, SentencePieceModel model_type, | |||||
| const std::unordered_map<std::string, std::string> &params, int32_t op_conn_size); | const std::unordered_map<std::string, std::string> &params, int32_t op_conn_size); | ||||
| ~BuildSentencePieceVocabOp() = default; | ~BuildSentencePieceVocabOp() = default; | ||||
| @@ -174,7 +174,7 @@ class BuildSentencePieceVocabOp : public PipelineOp { | |||||
| private: | private: | ||||
| bool read_done_; | bool read_done_; | ||||
| Status ret_status_; | Status ret_status_; | ||||
| uint32_t vocab_size_; | |||||
| int32_t vocab_size_; | |||||
| float character_coverage_; | float character_coverage_; | ||||
| SentencePieceModel model_type_; | SentencePieceModel model_type_; | ||||
| std::unordered_map<std::string, std::string> params_; | std::unordered_map<std::string, std::string> params_; | ||||
| @@ -30,7 +30,7 @@ namespace dataset { | |||||
| BuildSentenceVocabNode::BuildSentenceVocabNode(std::shared_ptr<DatasetNode> child, | BuildSentenceVocabNode::BuildSentenceVocabNode(std::shared_ptr<DatasetNode> child, | ||||
| std::shared_ptr<SentencePieceVocab> vocab, | std::shared_ptr<SentencePieceVocab> vocab, | ||||
| const std::vector<std::string> &col_names, uint32_t vocab_size, | |||||
| const std::vector<std::string> &col_names, int32_t vocab_size, | |||||
| float character_coverage, SentencePieceModel model_type, | float character_coverage, SentencePieceModel model_type, | ||||
| const std::unordered_map<std::string, std::string> &params) | const std::unordered_map<std::string, std::string> &params) | ||||
| : vocab_(vocab), | : vocab_(vocab), | ||||
| @@ -33,7 +33,7 @@ class BuildSentenceVocabNode : public DatasetNode { | |||||
| public: | public: | ||||
| /// \brief Constructor | /// \brief Constructor | ||||
| BuildSentenceVocabNode(std::shared_ptr<DatasetNode> child, std::shared_ptr<SentencePieceVocab> vocab, | BuildSentenceVocabNode(std::shared_ptr<DatasetNode> child, std::shared_ptr<SentencePieceVocab> vocab, | ||||
| const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage, | |||||
| const std::vector<std::string> &col_names, int32_t vocab_size, float character_coverage, | |||||
| SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params); | SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params); | ||||
| /// \brief Destructor | /// \brief Destructor | ||||
| @@ -75,7 +75,7 @@ class BuildSentenceVocabNode : public DatasetNode { | |||||
| private: | private: | ||||
| std::shared_ptr<SentencePieceVocab> vocab_; | std::shared_ptr<SentencePieceVocab> vocab_; | ||||
| std::vector<std::string> col_names_; | std::vector<std::string> col_names_; | ||||
| uint32_t vocab_size_; | |||||
| int32_t vocab_size_; | |||||
| float character_coverage_; | float character_coverage_; | ||||
| SentencePieceModel model_type_; | SentencePieceModel model_type_; | ||||
| std::unordered_map<std::string, std::string> params_; | std::unordered_map<std::string, std::string> params_; | ||||
| @@ -225,7 +225,7 @@ class Dataset : public std::enable_shared_from_this<Dataset> { | |||||
| /// \brief Function to create a SentencePieceVocab from source dataset | /// \brief Function to create a SentencePieceVocab from source dataset | ||||
| /// \notes Build a SentencePieceVocab from a dataset. | /// \notes Build a SentencePieceVocab from a dataset. | ||||
| /// \param[in] col_names Column names to get words from. It can be a vector of column names | /// \param[in] col_names Column names to get words from. It can be a vector of column names | ||||
| /// \param[in] vocab_size Vocabulary size. The type is uint32 | |||||
| /// \param[in] vocab_size Vocabulary size. | |||||
| /// \param[in] character_coverage Percentage of characters covered by the model, must be between | /// \param[in] character_coverage Percentage of characters covered by the model, must be between | ||||
| /// 0.98 and 1.0 Good defaults are: 0.9995 for languages with rich character sets like | /// 0.98 and 1.0 Good defaults are: 0.9995 for languages with rich character sets like | ||||
| /// Japanese or Chinese character sets, and 1.0 for other languages with small character sets. | /// Japanese or Chinese character sets, and 1.0 for other languages with small character sets. | ||||
| @@ -233,7 +233,7 @@ class Dataset : public std::enable_shared_from_this<Dataset> { | |||||
| /// The input sentence must be pretokenized when using word type. | /// The input sentence must be pretokenized when using word type. | ||||
| /// \param[in] params A vector contains more option parameters of sentencepiece library | /// \param[in] params A vector contains more option parameters of sentencepiece library | ||||
| std::shared_ptr<SentencePieceVocab> BuildSentencePieceVocab( | std::shared_ptr<SentencePieceVocab> BuildSentencePieceVocab( | ||||
| const std::vector<std::string> &col_names, uint32_t vocab_size, float character_coverage, | |||||
| const std::vector<std::string> &col_names, int32_t vocab_size, float character_coverage, | |||||
| SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params); | SentencePieceModel model_type, const std::unordered_map<std::string, std::string> &params); | ||||
| /// \brief Function to create a Vocab from source dataset | /// \brief Function to create a Vocab from source dataset | ||||
| @@ -28,7 +28,7 @@ namespace dataset { | |||||
| SentencePieceVocab::SentencePieceVocab() : model_proto_("") {} | SentencePieceVocab::SentencePieceVocab() : model_proto_("") {} | ||||
| Status SentencePieceVocab::BuildFromFile(const std::vector<std::string> &path_list, const int vocab_size, | |||||
| Status SentencePieceVocab::BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size, | |||||
| const float character_coverage, const SentencePieceModel model_type, | const float character_coverage, const SentencePieceModel model_type, | ||||
| const std::unordered_map<std::string, std::string> &params, | const std::unordered_map<std::string, std::string> &params, | ||||
| std::shared_ptr<SentencePieceVocab> *vocab) { | std::shared_ptr<SentencePieceVocab> *vocab) { | ||||
| @@ -29,7 +29,7 @@ namespace dataset { | |||||
| class SentencePieceVocab { | class SentencePieceVocab { | ||||
| public: | public: | ||||
| static Status BuildFromFile(const std::vector<std::string> &path_list, const int vocab_size, | |||||
| static Status BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size, | |||||
| const float character_coverage, const SentencePieceModel model_type, | const float character_coverage, const SentencePieceModel model_type, | ||||
| const std::unordered_map<std::string, std::string> &params, | const std::unordered_map<std::string, std::string> &params, | ||||
| std::shared_ptr<SentencePieceVocab> *vocab); | std::shared_ptr<SentencePieceVocab> *vocab); | ||||
| @@ -225,7 +225,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail2) { | |||||
| std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; | std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; | ||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | ||||
| // Expected failure: vocab_size shoule be either -1 or positive integer | |||||
| // Expected failure: vocab_size should be either -1 or positive integer | |||||
| Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab); | Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab); | ||||
| EXPECT_NE(s, Status::OK()); | EXPECT_NE(s, Status::OK()); | ||||
| } | } | ||||