From a8675f8227e00509d574ed66f8bf303d42b9588e Mon Sep 17 00:00:00 2001 From: alex-yuyue Date: Tue, 19 Jan 2021 09:09:51 -0500 Subject: [PATCH] TextOp decoupling Signed-off-by: alex-yuyue --- .../ccsrc/minddata/dataset/api/CMakeLists.txt | 2 +- .../bindings/dataset/text/kernels/bindings.cc | 205 -------------- .../dataset/text/kernels/ir/bindings.cc | 267 ++++++++++++++++++ mindspore/ccsrc/minddata/dataset/api/text.cc | 35 ++- .../ccsrc/minddata/dataset/include/text.h | 3 + mindspore/dataset/core/validator_helpers.py | 4 + mindspore/dataset/engine/datasets.py | 9 +- mindspore/dataset/engine/validators.py | 4 - mindspore/dataset/text/transforms.py | 141 ++++++--- mindspore/dataset/vision/c_transforms.py | 80 +++--- tests/ut/cpp/dataset/c_api_text_test.cc | 223 +++++++++++++++ tests/ut/python/dataset/test_vocab.py | 3 +- 12 files changed, 670 insertions(+), 306 deletions(-) delete mode 100644 mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/bindings.cc create mode 100644 mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc diff --git a/mindspore/ccsrc/minddata/dataset/api/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/api/CMakeLists.txt index 8396ea524c..1bc16a4291 100644 --- a/mindspore/ccsrc/minddata/dataset/api/CMakeLists.txt +++ b/mindspore/ccsrc/minddata/dataset/api/CMakeLists.txt @@ -18,7 +18,7 @@ if(ENABLE_PYTHON) python/bindings/dataset/kernels/ir/bindings.cc python/bindings/dataset/kernels/ir/image/bindings.cc python/bindings/dataset/text/bindings.cc - python/bindings/dataset/text/kernels/bindings.cc + python/bindings/dataset/text/kernels/ir/bindings.cc python/bindings/mindrecord/include/bindings.cc python/pybind_conversion.cc python/pybind_register.cc diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/bindings.cc deleted file mode 100644 index efcfa640f3..0000000000 --- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/bindings.cc +++ /dev/null @@ -1,205 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "pybind11/pybind11.h" -#include "pybind11/stl.h" -#include "pybind11/stl_bind.h" -#include "minddata/dataset/api/python/pybind_register.h" - -#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h" -#include "minddata/dataset/text/kernels/lookup_op.h" -#include "minddata/dataset/text/kernels/ngram_op.h" -#include "minddata/dataset/text/kernels/sliding_window_op.h" -#include "minddata/dataset/text/kernels/to_number_op.h" -#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h" -#include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h" -#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h" -#include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h" - -#ifdef ENABLE_ICU4C -#include "minddata/dataset/text/kernels/basic_tokenizer_op.h" -#include "minddata/dataset/text/kernels/bert_tokenizer_op.h" -#include "minddata/dataset/text/kernels/case_fold_op.h" -#include "minddata/dataset/text/kernels/normalize_utf8_op.h" -#include "minddata/dataset/text/kernels/regex_replace_op.h" -#include "minddata/dataset/text/kernels/regex_tokenizer_op.h" -#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h" -#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" -#endif - -namespace mindspore { -namespace dataset { - -#ifdef ENABLE_ICU4C - -PYBIND_REGISTER(BasicTokenizerOp, 1, ([](const py::module *m) { - (void)py::class_>(*m, - "BasicTokenizerOp") - .def(py::init()); - })); - -PYBIND_REGISTER(WhitespaceTokenizerOp, 1, ([](const py::module *m) { - (void)py::class_>( - *m, "WhitespaceTokenizerOp") - .def(py::init()); - })); - -PYBIND_REGISTER(UnicodeScriptTokenizerOp, 1, ([](const py::module *m) { - (void)py::class_>( - *m, "UnicodeScriptTokenizerOp") - .def(py::init<>()) - .def(py::init()); - })); - -PYBIND_REGISTER( - CaseFoldOp, 1, ([](const py::module *m) { - (void)py::class_>(*m, "CaseFoldOp").def(py::init<>()); - })); - -PYBIND_REGISTER(NormalizeUTF8Op, 1, ([](const py::module *m) { - (void)py::class_>(*m, "NormalizeUTF8Op") - .def(py::init<>()) - .def(py::init()); - })); - -PYBIND_REGISTER(RegexReplaceOp, 1, ([](const py::module *m) { - (void)py::class_>(*m, "RegexReplaceOp") - .def(py::init()); - })); - -PYBIND_REGISTER(RegexTokenizerOp, 1, ([](const py::module *m) { - (void)py::class_>(*m, - "RegexTokenizerOp") - .def(py::init()); - })); - -PYBIND_REGISTER(BertTokenizerOp, 1, ([](const py::module *m) { - (void)py::class_>(*m, "BertTokenizerOp") - .def(py::init &, const std::string &, const int &, const std::string &, - const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>()); - })); - -PYBIND_REGISTER(NormalizeForm, 0, ([](const py::module *m) { - (void)py::enum_(*m, "NormalizeForm", py::arithmetic()) - .value("DE_NORMALIZE_NONE", NormalizeForm::kNone) - .value("DE_NORMALIZE_NFC", NormalizeForm::kNfc) - .value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc) - .value("DE_NORMALIZE_NFD", NormalizeForm::kNfd) - .value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd) - .export_values(); - })); - -#endif - -PYBIND_REGISTER(JiebaTokenizerOp, 1, ([](const py::module *m) { - (void)py::class_>(*m, - "JiebaTokenizerOp") - .def(py::init()) - .def("add_word", [](JiebaTokenizerOp &self, const std::string word, int freq) { - THROW_IF_ERROR(self.AddWord(word, freq)); - }); - })); - -PYBIND_REGISTER(UnicodeCharTokenizerOp, 1, ([](const py::module *m) { - (void)py::class_>( - *m, "UnicodeCharTokenizerOp") - .def(py::init()); - })); - -PYBIND_REGISTER(LookupOp, 1, ([](const py::module *m) { - 
(void)py::class_>(*m, "LookupOp") - .def(py::init([](std::shared_ptr vocab, const py::object &py_word, - const DataType &data_type) { - if (vocab == nullptr) { - THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, "vocab object type is incorrect or null.")); - } - if (py_word.is_none()) { - return std::make_shared(vocab, Vocab::kNoTokenExists, data_type); - } - std::string word = py::reinterpret_borrow(py_word); - WordIdType default_id = vocab->Lookup(word); - if (default_id == Vocab::kNoTokenExists) { - THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, - "default unknown token: " + word + " doesn't exist in vocab.")); - } - return std::make_shared(vocab, default_id, data_type); - })); - })); - -PYBIND_REGISTER(NgramOp, 1, ([](const py::module *m) { - (void)py::class_>(*m, "NgramOp") - .def(py::init &, int32_t, int32_t, const std::string &, - const std::string &, const std::string &>()); - })); - -PYBIND_REGISTER(WordpieceTokenizerOp, 1, ([](const py::module *m) { - (void)py::class_>( - *m, "WordpieceTokenizerOp") - .def(py::init &, const std::string &, const int &, const std::string &, - const bool &>()); - })); - -PYBIND_REGISTER(SlidingWindowOp, 1, ([](const py::module *m) { - (void)py::class_>(*m, "SlidingWindowOp") - .def(py::init()); - })); - -PYBIND_REGISTER( - SentencePieceTokenizerOp, 1, ([](const py::module *m) { - (void)py::class_>( - *m, "SentencePieceTokenizerOp") - .def( - py::init &, const SPieceTokenizerLoadType, const SPieceTokenizerOutType>()) - .def(py::init()); - })); - -PYBIND_REGISTER(ToNumberOp, 1, ([](const py::module *m) { - (void)py::class_>(*m, "ToNumberOp") - .def(py::init()) - .def(py::init()); - })); - -PYBIND_REGISTER(TruncateSequencePairOp, 1, ([](const py::module *m) { - (void)py::class_>( - *m, "TruncateSequencePairOp") - .def(py::init()); - })); - -PYBIND_REGISTER(JiebaMode, 0, ([](const py::module *m) { - (void)py::enum_(*m, "JiebaMode", py::arithmetic()) - .value("DE_JIEBA_MIX", JiebaMode::kMix) - .value("DE_JIEBA_MP", JiebaMode::kMp) - .value("DE_JIEBA_HMM", JiebaMode::kHmm) - .export_values(); - })); - -PYBIND_REGISTER(SPieceTokenizerOutType, 0, ([](const py::module *m) { - (void)py::enum_(*m, "SPieceTokenizerOutType", py::arithmetic()) - .value("DE_SPIECE_TOKENIZER_OUTTYPE_KString", SPieceTokenizerOutType::kString) - .value("DE_SPIECE_TOKENIZER_OUTTYPE_KINT", SPieceTokenizerOutType::kInt) - .export_values(); - })); - -PYBIND_REGISTER(SPieceTokenizerLoadType, 0, ([](const py::module *m) { - (void)py::enum_(*m, "SPieceTokenizerLoadType", py::arithmetic()) - .value("DE_SPIECE_TOKENIZER_LOAD_KFILE", SPieceTokenizerLoadType::kFile) - .value("DE_SPIECE_TOKENIZER_LOAD_KMODEL", SPieceTokenizerLoadType::kModel) - .export_values(); - })); - -} // namespace dataset -} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc new file mode 100644 index 0000000000..48121570af --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc @@ -0,0 +1,267 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "pybind11/stl_bind.h" +#include "minddata/dataset/api/python/pybind_register.h" +#include "minddata/dataset/include/text.h" +#include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h" +#include "minddata/dataset/text/sentence_piece_vocab.h" +#include "minddata/dataset/text/vocab.h" + +namespace mindspore { +namespace dataset { + +#ifdef ENABLE_ICU4C + +PYBIND_REGISTER( + BasicTokenizerOperation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "BasicTokenizerOperation") + .def(py::init([](bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, + bool preserve_unused_token, bool with_offsets) { + auto basic_tokenizer = std::make_shared( + lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets); + THROW_IF_ERROR(basic_tokenizer->ValidateParams()); + return basic_tokenizer; + })); + })); + +PYBIND_REGISTER( + BertTokenizerOperation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "BertTokenizerOperation") + .def(py::init([](const std::shared_ptr &vocab, const std::string &suffix_indicator, + int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case, + bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token, + bool with_offsets) { + auto bert_tokenizer = std::make_shared( + vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case, keep_whitespace, normalize_form, + preserve_unused_token, with_offsets); + THROW_IF_ERROR(bert_tokenizer->ValidateParams()); + return bert_tokenizer; + })); + })); + +PYBIND_REGISTER(CaseFoldOperation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "CaseFoldOperation") + .def(py::init([]() { + auto case_fold = std::make_shared(); + THROW_IF_ERROR(case_fold->ValidateParams()); + return case_fold; + })); + })); + +PYBIND_REGISTER( + NormalizeUTF8Operation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "NormalizeUTF8Operation") + .def(py::init([](NormalizeForm normalize_form) { + auto normalize_utf8 = std::make_shared(normalize_form); + THROW_IF_ERROR(normalize_utf8->ValidateParams()); + return normalize_utf8; + })); + })); + +PYBIND_REGISTER( + RegexReplaceOperation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "RegexReplaceOperation") + .def(py::init([](std::string pattern, std::string replace, bool replace_all) { + auto regex_replace = std::make_shared(pattern, replace, replace_all); + THROW_IF_ERROR(regex_replace->ValidateParams()); + return regex_replace; + })); + })); + +PYBIND_REGISTER( + RegexTokenizerOperation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "RegexTokenizerOperation") + .def( + py::init([](const std::string &delim_pattern, const std::string &keep_delim_pattern, const bool &with_offsets) { + auto regex_tokenizer = + std::make_shared(delim_pattern, keep_delim_pattern, with_offsets); + THROW_IF_ERROR(regex_tokenizer->ValidateParams()); + return regex_tokenizer; + })); + })); + +PYBIND_REGISTER(UnicodeScriptTokenizerOperation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, 
"UnicodeScriptTokenizerOperation") + .def(py::init([](bool keep_whitespace, bool with_offsets) { + auto unicode_script_tokenizer = + std::make_shared(keep_whitespace, with_offsets); + THROW_IF_ERROR(unicode_script_tokenizer->ValidateParams()); + return unicode_script_tokenizer; + })); + })); + +PYBIND_REGISTER(WhitespaceTokenizerOperation, 1, ([](const py::module *m) { + (void)py::class_>(*m, + "WhitespaceTokenizerOperation") + .def(py::init([](bool with_offsets) { + auto whitespace_tokenizer = std::make_shared(with_offsets); + THROW_IF_ERROR(whitespace_tokenizer->ValidateParams()); + return whitespace_tokenizer; + })); + })); + +PYBIND_REGISTER(NormalizeForm, 0, ([](const py::module *m) { + (void)py::enum_(*m, "NormalizeForm", py::arithmetic()) + .value("DE_NORMALIZE_NONE", NormalizeForm::kNone) + .value("DE_NORMALIZE_NFC", NormalizeForm::kNfc) + .value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc) + .value("DE_NORMALIZE_NFD", NormalizeForm::kNfd) + .value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd) + .export_values(); + })); +#endif + +PYBIND_REGISTER( + JiebaTokenizerOperation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "JiebaTokenizerOperation") + .def( + py::init([](const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode, bool with_offsets) { + auto jieba_tokenizer = std::make_shared(hmm_path, mp_path, mode, with_offsets); + THROW_IF_ERROR(jieba_tokenizer->ValidateParams()); + return jieba_tokenizer; + })) + .def("add_word", [](text::JiebaTokenizerOperation &self, const std::string word, int64_t freq) { + THROW_IF_ERROR(self.AddWord(word, freq)); + }); + })); + +PYBIND_REGISTER(LookupOperation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "LookupOperation") + .def(py::init([](const std::shared_ptr &vocab, const std::string &unknown_token, + const std::string &data_type) { + auto lookup = std::make_shared(vocab, unknown_token, data_type); + THROW_IF_ERROR(lookup->ValidateParams()); + return lookup; + })); + })); + +PYBIND_REGISTER(NgramOperation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "NgramOperation") + .def( + py::init([](const std::vector &ngrams, const std::pair &left_pad, + const std::pair &right_pad, const std::string &separator) { + auto ngram = std::make_shared(ngrams, left_pad, right_pad, separator); + THROW_IF_ERROR(ngram->ValidateParams()); + return ngram; + })); + })); + +PYBIND_REGISTER( + SentencePieceTokenizerOperation, 1, ([](const py::module *m) { + (void)py::class_>(*m, "SentencePieceTokenizerOperation") + .def(py::init([](const std::shared_ptr &vocab, SPieceTokenizerOutType out_type) { + auto SentencePieceTokenizer = std::make_shared(vocab, out_type); + THROW_IF_ERROR(SentencePieceTokenizer->ValidateParams()); + return SentencePieceTokenizer; + })) + .def(py::init([](const std::string &vocab_path, SPieceTokenizerOutType out_type) { + auto sentence_piece_tokenizer = std::make_shared(vocab_path, out_type); + THROW_IF_ERROR(sentence_piece_tokenizer->ValidateParams()); + return sentence_piece_tokenizer; + })); + })); + +PYBIND_REGISTER( + SlidingWindowOperation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "SlidingWindowOperation") + .def(py::init([](const int32_t width, const int32_t axis) { + auto sliding_window = std::make_shared(width, axis); + THROW_IF_ERROR(sliding_window->ValidateParams()); + return sliding_window; + })); + })); + +PYBIND_REGISTER(ToNumberOperation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "ToNumberOperation") + .def(py::init([](std::string data_type) { + 
auto to_number = std::make_shared(data_type); + THROW_IF_ERROR(to_number->ValidateParams()); + return to_number; + })); + })); + +PYBIND_REGISTER(TruncateSequencePairOperation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "TruncateSequencePairOperation") + .def(py::init([](int32_t max_length) { + auto truncate_sequence_pair = std::make_shared(max_length); + THROW_IF_ERROR(truncate_sequence_pair->ValidateParams()); + return truncate_sequence_pair; + })); + })); + +PYBIND_REGISTER(UnicodeCharTokenizerOperation, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "UnicodeCharTokenizerOperation") + .def(py::init([](bool with_offsets) { + auto unicode_char_tokenizer = std::make_shared(with_offsets); + THROW_IF_ERROR(unicode_char_tokenizer->ValidateParams()); + return unicode_char_tokenizer; + })); + })); + +// TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++ +PYBIND_REGISTER(WordpieceTokenizerOp, 1, ([](const py::module *m) { + (void)py::class_>( + *m, "WordpieceTokenizerOp") + .def(py::init &, const std::string &, const int &, const std::string &, + const bool &>()); + })); + +PYBIND_REGISTER(JiebaMode, 0, ([](const py::module *m) { + (void)py::enum_(*m, "JiebaMode", py::arithmetic()) + .value("DE_JIEBA_MIX", JiebaMode::kMix) + .value("DE_JIEBA_MP", JiebaMode::kMp) + .value("DE_JIEBA_HMM", JiebaMode::kHmm) + .export_values(); + })); + +PYBIND_REGISTER(SPieceTokenizerLoadType, 0, ([](const py::module *m) { + (void)py::enum_(*m, "SPieceTokenizerLoadType", py::arithmetic()) + .value("DE_SPIECE_TOKENIZER_LOAD_KFILE", SPieceTokenizerLoadType::kFile) + .value("DE_SPIECE_TOKENIZER_LOAD_KMODEL", SPieceTokenizerLoadType::kModel) + .export_values(); + })); + +PYBIND_REGISTER(SPieceTokenizerOutType, 0, ([](const py::module *m) { + (void)py::enum_(*m, "SPieceTokenizerOutType", py::arithmetic()) + .value("DE_SPIECE_TOKENIZER_OUTTYPE_KString", SPieceTokenizerOutType::kString) + .value("DE_SPIECE_TOKENIZER_OUTTYPE_KINT", SPieceTokenizerOutType::kInt) + .export_values(); + })); + +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/api/text.cc b/mindspore/ccsrc/minddata/dataset/api/text.cc index 5981331007..2544f83522 100644 --- a/mindspore/ccsrc/minddata/dataset/api/text.cc +++ b/mindspore/ccsrc/minddata/dataset/api/text.cc @@ -314,9 +314,31 @@ Status JiebaTokenizerOperation::ValidateParams() { std::shared_ptr JiebaTokenizerOperation::Build() { std::shared_ptr tensor_op = std::make_shared(hmm_path_, mp_path_, mode_, with_offsets_); + for (auto &word : words_list_) { + Status rc = tensor_op->AddWord(word.first, word.second); + if (rc.IsError()) { + MS_LOG(ERROR) << rc; + return {}; + } + } return tensor_op; } +Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) { + if (word.empty()) { + std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided."; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + if (freq < 0) { + std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0."; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } + words_list_.emplace_back(word, freq); + return Status::OK(); +} + // LookupOperation LookupOperation::LookupOperation(const std::shared_ptr &vocab, const std::string &unknown_token, const std::string &data_type) @@ -330,12 +352,13 @@ Status LookupOperation::ValidateParams() { MS_LOG(ERROR) << err_msg; 
RETURN_STATUS_SYNTAX_ERROR(err_msg); } - - default_id_ = vocab_->Lookup(unknown_token_); - if (default_id_ == Vocab::kNoTokenExists) { - std::string err_msg = "Lookup: \"" + unknown_token_ + "\" doesn't exist in vocab."; - MS_LOG(ERROR) << err_msg; - RETURN_STATUS_SYNTAX_ERROR(err_msg); + if (!unknown_token_.empty()) { + default_id_ = vocab_->Lookup(unknown_token_); + if (default_id_ == Vocab::kNoTokenExists) { + std::string err_msg = "Lookup: \"" + unknown_token_ + "\" doesn't exist in vocab."; + MS_LOG(ERROR) << err_msg; + RETURN_STATUS_SYNTAX_ERROR(err_msg); + } } if (!IsTypeNumeric(data_type_)) { diff --git a/mindspore/ccsrc/minddata/dataset/include/text.h b/mindspore/ccsrc/minddata/dataset/include/text.h index d1e8de8c02..bf5522fe67 100644 --- a/mindspore/ccsrc/minddata/dataset/include/text.h +++ b/mindspore/ccsrc/minddata/dataset/include/text.h @@ -331,11 +331,14 @@ class JiebaTokenizerOperation : public TensorOperation { std::string Name() const override { return kJiebaTokenizerOperation; } + Status AddWord(const std::string &word, int64_t freq = 0); + private: std::string hmm_path_; std::string mp_path_; JiebaMode mode_; bool with_offsets_; + std::vector> words_list_; }; class LookupOperation : public TensorOperation { diff --git a/mindspore/dataset/core/validator_helpers.py b/mindspore/dataset/core/validator_helpers.py index 21bdb256ce..46a16d100e 100644 --- a/mindspore/dataset/core/validator_helpers.py +++ b/mindspore/dataset/core/validator_helpers.py @@ -383,3 +383,7 @@ def check_tensor_op(param, param_name): """check whether param is a tensor op or a callable Python function""" if not isinstance(param, cde.TensorOp) and not callable(param) and not getattr(param, 'parse', None): raise TypeError("{0} is neither a c_transform op (TensorOperation) nor a callable pyfunc.".format(param_name)) + + +def replace_none(value, default): + return value if value is not None else default diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py index bb7da16795..b3e86b9765 100644 --- a/mindspore/dataset/engine/datasets.py +++ b/mindspore/dataset/engine/datasets.py @@ -55,9 +55,10 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset, \ check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \ check_random_dataset, check_split, check_bucket_batch_by_length, check_cluedataset, check_save, check_csvdataset, \ - check_paddeddataset, check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send, replace_none + check_paddeddataset, check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send from ..core.config import get_callback_timeout, _init_device_info from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist +from ..core.validator_helpers import replace_none try: context = import_module("mindspore.context") @@ -372,7 +373,7 @@ class Dataset: Args: condition_name (str): The condition name that is used to toggle sending next row. num_batch (int): the number of batches without blocking at the start of each epoch. - callback (function): The callback funciton that will be invoked when sync_update is called. + callback (function): The callback function that will be invoked when sync_update is called. Returns: SyncWaitDataset, dataset added a blocking condition. @@ -398,7 +399,7 @@ class Dataset: 1. 
Make a shuffle buffer that contains the first buffer_size rows. 2. Randomly select an element from the shuffle buffer to be the next row - propogated to the child node. + propagated to the child node. 3. Get the next row (if any) from the parent node and put it in the shuffle buffer. 4. Repeat steps 2 and 3 until there are no more rows left in the shuffle buffer. @@ -1718,7 +1719,7 @@ class MappableDataset(SourceDataset): - The sum of split sizes < K, the difference will be added to the first split. - The sum of split sizes > K, the difference will be removed from the first large - enough split such that it will have atleast 1 row after removing the difference. + enough split such that it will have at least 1 row after removing the difference. randomize (bool, optional): Determines whether or not to split the data randomly (default=True). If True, the data will be randomly split. Otherwise, each split will be created with diff --git a/mindspore/dataset/engine/validators.py b/mindspore/dataset/engine/validators.py index 939af41f75..724207d5ae 100644 --- a/mindspore/dataset/engine/validators.py +++ b/mindspore/dataset/engine/validators.py @@ -1323,7 +1323,3 @@ def check_to_device_send(method): return method(self, *args, **kwargs) return new_method - - -def replace_none(value, default): - return value if value is not None else default diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py index 987d7e3ab9..358c9a7a29 100644 --- a/mindspore/dataset/text/transforms.py +++ b/mindspore/dataset/text/transforms.py @@ -58,9 +58,13 @@ from .validators import check_lookup, check_jieba_add_dict, \ check_wordpiece_tokenizer, check_regex_tokenizer, check_basic_tokenizer, check_ngram, check_pair_truncate, \ check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow from ..core.datatypes import mstype_to_detype +from ..core.validator_helpers import replace_none +class TextTensorOperation: + def parse(self): + raise NotImplementedError("TextTensorOperation has to implement parse method.") -class Lookup(cde.LookupOp): +class Lookup(TextTensorOperation): """ Lookup operator that looks up a word to an id. @@ -82,10 +86,15 @@ class Lookup(cde.LookupOp): @check_lookup def __init__(self, vocab, unknown_token=None, data_type=mstype.int32): - super().__init__(vocab, unknown_token, mstype_to_detype(data_type)) + self.vocab = vocab + self.unknown_token = replace_none(unknown_token, '') + self.data_type = data_type + + def parse(self): + return cde.LookupOperation(self.vocab, self.unknown_token, str(mstype_to_detype(self.data_type))) -class SlidingWindow(cde.SlidingWindowOp): +class SlidingWindow(TextTensorOperation): """ TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension axis is a slice of data starting at the corresponding position, with a specified width. @@ -114,10 +123,14 @@ class SlidingWindow(cde.SlidingWindowOp): @check_slidingwindow def __init__(self, width, axis=0): - super().__init__(width, axis) + self.width = width + self.axis = axis + + def parse(self): + return cde.SlidingWindowOperation(self.width, self.axis) -class Ngram(cde.NgramOp): +class Ngram(TextTensorOperation): """ TensorOp to generate n-gram from a 1-D string Tensor. 
@@ -145,7 +158,13 @@ class Ngram(cde.NgramOp): @check_ngram def __init__(self, n, left_pad=("", 0), right_pad=("", 0), separator=" "): - super().__init__(n, left_pad[1], right_pad[1], left_pad[0], right_pad[0], separator) + self.ngrams = n + self.left_pad = left_pad + self.right_pad = right_pad + self.separator = separator + + def parse(self): + return cde.NgramOperation(self.ngrams, self.left_pad, self.right_pad, self.separator) DE_C_INTER_JIEBA_MODE = { @@ -155,7 +174,7 @@ DE_C_INTER_JIEBA_MODE = { } -class JiebaTokenizer(cde.JiebaTokenizerOp): +class JiebaTokenizer(TextTensorOperation): """ Tokenize Chinese string into words based on dictionary. @@ -196,11 +215,19 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): self.mode = mode self.__check_path__(hmm_path) + self.hmm_path = hmm_path self.__check_path__(mp_path) + self.mp_path = mp_path self.with_offsets = with_offsets - super().__init__(hmm_path, mp_path, - DE_C_INTER_JIEBA_MODE[mode], - self.with_offsets) + self.words = [] + + def parse(self): + jieba_tokenizer = cde.JiebaTokenizerOperation(self.hmm_path, self.mp_path, + DE_C_INTER_JIEBA_MODE[self.mode], + self.with_offsets) + for word in self.words: + jieba_tokenizer.add_word(word[0], word[1]) + return jieba_tokenizer @check_jieba_add_word def add_word(self, word, freq=None): @@ -225,9 +252,9 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): """ if freq is None: - super().add_word(word, 0) + self.words.append((word, 0)) else: - super().add_word(word, freq) + self.words.append((word, freq)) @check_jieba_add_dict def add_dict(self, user_dict): @@ -308,7 +335,7 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): " jieba mode file {} is not exist.".format(model_path)) -class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp): +class UnicodeCharTokenizer(TextTensorOperation): """ Tokenize a scalar tensor of UTF-8 string to Unicode characters. @@ -332,9 +359,12 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp): @check_with_offsets def __init__(self, with_offsets=False): self.with_offsets = with_offsets - super().__init__(self.with_offsets) + def parse(self): + return cde.UnicodeCharTokenizerOperation(self.with_offsets) + +# TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++ class WordpieceTokenizer(cde.WordpieceTokenizerOp): """ Tokenize scalar token or 1-D tokens to 1-D subword tokens. @@ -386,7 +416,7 @@ DE_C_INTER_SENTENCEPIECE_OUTTYPE = { } -class SentencePieceTokenizer(cde.SentencePieceTokenizerOp): +class SentencePieceTokenizer(TextTensorOperation): """ Tokenize scalar token or 1-D tokens to tokens by sentencepiece. 
@@ -404,19 +434,15 @@ class SentencePieceTokenizer(cde.SentencePieceTokenizerOp): """ def __init__(self, mode, out_type): + self.mode = mode self.out_type = out_type - if isinstance(mode, str): - model_path, model_filename = os.path.split(mode) - super().__init__(model_path, model_filename, - DE_C_INTER_SENTENCEPIECE_LOADTYPE[SPieceTokenizerLoadType.FILE], - DE_C_INTER_SENTENCEPIECE_OUTTYPE[out_type]) - elif isinstance(mode, cde.SentencePieceVocab): - super().__init__(mode, DE_C_INTER_SENTENCEPIECE_LOADTYPE[SPieceTokenizerLoadType.MODEL], - DE_C_INTER_SENTENCEPIECE_OUTTYPE[out_type]) + + def parse(self): + return cde.SentencePieceTokenizerOperation(self.mode, DE_C_INTER_SENTENCEPIECE_OUTTYPE[self.out_type]) if platform.system().lower() != 'windows': - class WhitespaceTokenizer(cde.WhitespaceTokenizerOp): + class WhitespaceTokenizer(TextTensorOperation): """ Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces, such as: ' ', '\\\\t', '\\\\r', '\\\\n'. @@ -444,10 +470,12 @@ if platform.system().lower() != 'windows': @check_with_offsets def __init__(self, with_offsets=False): self.with_offsets = with_offsets - super().__init__(self.with_offsets) + + def parse(self): + return cde.WhitespaceTokenizerOperation(self.with_offsets) - class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp): + class UnicodeScriptTokenizer(TextTensorOperation): """ Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries. @@ -475,12 +503,16 @@ if platform.system().lower() != 'windows': @check_unicode_script_tokenizer def __init__(self, keep_whitespace=False, with_offsets=False): + keep_whitespace = replace_none(keep_whitespace, False) + with_offsets = replace_none(with_offsets, False) self.keep_whitespace = keep_whitespace self.with_offsets = with_offsets - super().__init__(self.keep_whitespace, self.with_offsets) + def parse(self): + return cde.UnicodeScriptTokenizerOperation(self.keep_whitespace, self.with_offsets) - class CaseFold(cde.CaseFoldOp): + + class CaseFold(TextTensorOperation): """ Apply case fold operation on UTF-8 string tensor. @@ -494,6 +526,9 @@ if platform.system().lower() != 'windows': >>> data1 = data1.map(operations=case_op) """ + def parse(self): + return cde.CaseFoldOperation() + DE_C_INTER_NORMALIZE_FORM = { NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE, @@ -504,7 +539,7 @@ if platform.system().lower() != 'windows': } - class NormalizeUTF8(cde.NormalizeUTF8Op): + class NormalizeUTF8(TextTensorOperation): """ Apply normalize operation on UTF-8 string tensor. @@ -534,11 +569,14 @@ if platform.system().lower() != 'windows': if not isinstance(normalize_form, NormalizeForm): raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.") + normalize_form = replace_none(normalize_form, NormalizeForm.NFKC) self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form] - super().__init__(self.normalize_form) + + def parse(self): + return cde.NormalizeUTF8Operation(self.normalize_form) - class RegexReplace(cde.RegexReplaceOp): + class RegexReplace(TextTensorOperation): """ Replace UTF-8 string tensor with 'replace' according to regular expression 'pattern'. 
@@ -566,10 +604,12 @@ if platform.system().lower() != 'windows': self.pattern = pattern self.replace = replace self.replace_all = replace_all - super().__init__(self.pattern, self.replace, self.replace_all) + def parse(self): + return cde.RegexReplaceOperation(self.pattern, self.replace, self.replace_all) - class RegexTokenizer(cde.RegexTokenizerOp): + + class RegexTokenizer(TextTensorOperation): """ Tokenize a scalar tensor of UTF-8 string by regex expression pattern. @@ -606,10 +646,12 @@ if platform.system().lower() != 'windows': self.delim_pattern = delim_pattern self.keep_delim_pattern = keep_delim_pattern self.with_offsets = with_offsets - super().__init__(self.delim_pattern, self.keep_delim_pattern, self.with_offsets) + + def parse(self): + return cde.RegexTokenizerOperation(self.delim_pattern, self.keep_delim_pattern, self.with_offsets) - class BasicTokenizer(cde.BasicTokenizerOp): + class BasicTokenizer(TextTensorOperation): """ Tokenize a scalar tensor of UTF-8 string by specific rules. @@ -661,11 +703,13 @@ if platform.system().lower() != 'windows': self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] self.preserve_unused_token = preserve_unused_token self.with_offsets = with_offsets - super().__init__(self.lower_case, self.keep_whitespace, self.normalization_form, - self.preserve_unused_token, self.with_offsets) + def parse(self): + return cde.BasicTokenizerOperation(self.lower_case, self.keep_whitespace, self.normalization_form, + self.preserve_unused_token, self.with_offsets) - class BertTokenizer(cde.BertTokenizerOp): + + class BertTokenizer(TextTensorOperation): """ Tokenizer used for Bert text process. @@ -725,12 +769,14 @@ if platform.system().lower() != 'windows': self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] self.preserve_unused_token = preserve_unused_token self.with_offsets = with_offsets - super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token, - self.lower_case, self.keep_whitespace, self.normalization_form, - self.preserve_unused_token, self.with_offsets) + + def parse(self): + return cde.BertTokenizerOperation(self.vocab, self.suffix_indicator, self.max_bytes_per_token, + self.unknown_token, self.lower_case, self.keep_whitespace, + self.normalization_form, self.preserve_unused_token, self.with_offsets) -class TruncateSequencePair(cde.TruncateSequencePairOp): +class TruncateSequencePair(TextTensorOperation): """ Truncate a pair of rank-1 tensors such that the total length is less than max_length. @@ -757,10 +803,13 @@ class TruncateSequencePair(cde.TruncateSequencePairOp): @check_pair_truncate def __init__(self, max_length): - super().__init__(max_length) + self.max_length = max_length + def parse(self): + return cde.TruncateSequencePairOperation(self.max_length) -class ToNumber(cde.ToNumberOp): + +class ToNumber(TextTensorOperation): """ Tensor operation to convert every element of a string tensor to a number. 
@@ -789,7 +838,9 @@ class ToNumber(cde.ToNumberOp): def __init__(self, data_type): data_type = mstype_to_detype(data_type) self.data_type = str(data_type) - super().__init__(data_type) + + def parse(self): + return cde.ToNumberOperation(self.data_type) class PythonTokenizer: diff --git a/mindspore/dataset/vision/c_transforms.py b/mindspore/dataset/vision/c_transforms.py index f5c0589a04..c83eaf618a 100644 --- a/mindspore/dataset/vision/c_transforms.py +++ b/mindspore/dataset/vision/c_transforms.py @@ -81,11 +81,11 @@ def parse_padding(padding): padding = tuple(padding) return padding -class TensorOperation: +class ImageTensorOperation: def parse(self): - raise NotImplementedError("TensorOperation has to implement parse method.") + raise NotImplementedError("ImageTensorOperation has to implement parse method.") -class AutoContrast(TensorOperation): +class AutoContrast(ImageTensorOperation): """ Apply automatic contrast on input image. @@ -112,7 +112,7 @@ class AutoContrast(TensorOperation): return cde.AutoContrastOperation(self.cutoff, self.ignore) -class RandomSharpness(TensorOperation): +class RandomSharpness(ImageTensorOperation): """ Adjust the sharpness of the input image by a fixed or random degree. Degree of 0.0 gives a blurred image, degree of 1.0 gives the original image, and degree of 2.0 gives a sharpened image. @@ -140,7 +140,7 @@ class RandomSharpness(TensorOperation): return cde.RandomSharpnessOperation(self.degrees) -class Equalize(TensorOperation): +class Equalize(ImageTensorOperation): """ Apply histogram equalization on input image. @@ -153,7 +153,7 @@ class Equalize(TensorOperation): return cde.EqualizeOperation() -class Invert(TensorOperation): +class Invert(ImageTensorOperation): """ Apply invert on input image in RGB mode. @@ -166,7 +166,7 @@ class Invert(TensorOperation): return cde.InvertOperation() -class Decode(TensorOperation): +class Decode(ImageTensorOperation): """ Decode the input image in RGB mode. @@ -203,7 +203,7 @@ class Decode(TensorOperation): return cde.DecodeOperation(self.rgb) -class CutMixBatch(TensorOperation): +class CutMixBatch(ImageTensorOperation): """ Apply CutMix transformation on input batch of images and labels. Note that you need to make labels into one-hot format and batch before calling this function. @@ -235,7 +235,7 @@ class CutMixBatch(TensorOperation): return cde.CutMixBatchOperation(DE_C_IMAGE_BATCH_FORMAT[self.image_batch_format], self.alpha, self.prob) -class CutOut(TensorOperation): +class CutOut(ImageTensorOperation): """ Randomly cut (mask) out a given number of square patches from the input NumPy image array. @@ -258,7 +258,7 @@ class CutOut(TensorOperation): return cde.CutOutOperation(self.length, self.num_patches) -class MixUpBatch(TensorOperation): +class MixUpBatch(ImageTensorOperation): """ Apply MixUp transformation on input batch of images and labels. Each image is multiplied by a random weight (lambda) and then added to a randomly selected image from the batch multiplied by (1 - lambda). The same formula is also @@ -286,7 +286,7 @@ class MixUpBatch(TensorOperation): return cde.MixUpBatchOperation(self.alpha) -class Normalize(TensorOperation): +class Normalize(ImageTensorOperation): """ Normalize the input image with respect to mean and standard deviation. 
@@ -333,7 +333,7 @@ class Normalize(TensorOperation): return cde.NormalizeOperation(self.mean, self.std) -class NormalizePad(TensorOperation): +class NormalizePad(ImageTensorOperation): """ Normalize the input image with respect to mean and standard deviation then pad an extra channel with value zero. @@ -380,7 +380,7 @@ class NormalizePad(TensorOperation): return cde.NormalizePadOperation(self.mean, self.std, self.dtype) -class RandomAffine(TensorOperation): +class RandomAffine(ImageTensorOperation): """ Apply Random affine transformation to the input image. @@ -486,7 +486,7 @@ class RandomAffine(TensorOperation): self.fill_value) -class RandomCrop(TensorOperation): +class RandomCrop(ImageTensorOperation): """ Crop the input image at a random location. @@ -551,7 +551,7 @@ class RandomCrop(TensorOperation): return cde.RandomCropOperation(self.size, self.padding, self.pad_if_needed, self.fill_value, border_type) -class RandomCropWithBBox(TensorOperation): +class RandomCropWithBBox(ImageTensorOperation): """ Crop the input image at a random location and adjust bounding boxes accordingly. @@ -615,7 +615,7 @@ class RandomCropWithBBox(TensorOperation): border_type) -class RandomHorizontalFlip(TensorOperation): +class RandomHorizontalFlip(ImageTensorOperation): """ Flip the input image horizontally, randomly with a given probability. @@ -636,7 +636,7 @@ class RandomHorizontalFlip(TensorOperation): return cde.RandomHorizontalFlipOperation(self.prob) -class RandomHorizontalFlipWithBBox(TensorOperation): +class RandomHorizontalFlipWithBBox(ImageTensorOperation): """ Flip the input image horizontally, randomly with a given probability and adjust bounding boxes accordingly. @@ -657,7 +657,7 @@ class RandomHorizontalFlipWithBBox(TensorOperation): return cde.RandomHorizontalFlipWithBBoxOperation(self.prob) -class RandomPosterize(TensorOperation): +class RandomPosterize(ImageTensorOperation): """ Reduce the number of bits for each color channel. @@ -685,7 +685,7 @@ class RandomPosterize(TensorOperation): return cde.RandomPosterizeOperation(bits) -class RandomVerticalFlip(TensorOperation): +class RandomVerticalFlip(ImageTensorOperation): """ Flip the input image vertically, randomly with a given probability. @@ -706,7 +706,7 @@ class RandomVerticalFlip(TensorOperation): return cde.RandomVerticalFlipOperation(self.prob) -class RandomVerticalFlipWithBBox(TensorOperation): +class RandomVerticalFlipWithBBox(ImageTensorOperation): """ Flip the input image vertically, randomly with a given probability and adjust bounding boxes accordingly. @@ -727,7 +727,7 @@ class RandomVerticalFlipWithBBox(TensorOperation): return cde.RandomVerticalFlipWithBBoxOperation(self.prob) -class BoundingBoxAugment(TensorOperation): +class BoundingBoxAugment(ImageTensorOperation): """ Apply a given image transform on a random selection of bounding box regions of a given image. @@ -760,7 +760,7 @@ class BoundingBoxAugment(TensorOperation): return cde.BoundingBoxAugmentOperation(transform, self.ratio) -class Resize(TensorOperation): +class Resize(ImageTensorOperation): """ Resize the input image to the given size. @@ -816,7 +816,7 @@ class Resize(TensorOperation): return cde.ResizeOperation(self.size, DE_C_INTER_MODE[self.interpolation]) -class ResizeWithBBox(TensorOperation): +class ResizeWithBBox(ImageTensorOperation): """ Resize the input image to the given size and adjust bounding boxes accordingly. 
@@ -855,7 +855,7 @@ class ResizeWithBBox(TensorOperation): return cde.ResizeWithBBoxOperation(size, DE_C_INTER_MODE[self.interpolation]) -class RandomResizedCropWithBBox(TensorOperation): +class RandomResizedCropWithBBox(ImageTensorOperation): """ Crop the input image to a random size and aspect ratio and adjust bounding boxes accordingly. @@ -904,7 +904,7 @@ class RandomResizedCropWithBBox(TensorOperation): DE_C_INTER_MODE[self.interpolation], self.max_attempts) -class RandomResizedCrop(TensorOperation): +class RandomResizedCrop(ImageTensorOperation): """ Crop the input image to a random size and aspect ratio. @@ -954,7 +954,7 @@ class RandomResizedCrop(TensorOperation): self.max_attempts) -class CenterCrop(TensorOperation): +class CenterCrop(ImageTensorOperation): """ Crops the input image at the center to the given size. @@ -984,7 +984,7 @@ class CenterCrop(TensorOperation): return cde.CenterCropOperation(self.size) -class RandomColor(TensorOperation): +class RandomColor(ImageTensorOperation): """ Adjust the color of the input image by a fixed or random degree. This operation works only with 3-channel color images. @@ -1008,7 +1008,7 @@ class RandomColor(TensorOperation): return cde.RandomColorOperation(*self.degrees) -class RandomColorAdjust(TensorOperation): +class RandomColorAdjust(ImageTensorOperation): """ Randomly adjust the brightness, contrast, saturation, and hue of the input image. @@ -1060,7 +1060,7 @@ class RandomColorAdjust(TensorOperation): return cde.RandomColorAdjustOperation(self.brightness, self.contrast, self.saturation, self.hue) -class RandomRotation(TensorOperation): +class RandomRotation(ImageTensorOperation): """ Rotate the input image by a random angle. @@ -1116,7 +1116,7 @@ class RandomRotation(TensorOperation): return cde.RandomRotationOperation(degrees, interpolation, expand, center, fill_value) -class Rescale(TensorOperation): +class Rescale(ImageTensorOperation): """ Tensor operation to rescale the input image. @@ -1155,7 +1155,7 @@ class Rescale(TensorOperation): return cde.RescaleOperation(self.rescale, self.shift) -class RandomResize(TensorOperation): +class RandomResize(ImageTensorOperation): """ Tensor operation to resize the input image using a randomly selected interpolation mode. @@ -1187,7 +1187,7 @@ class RandomResize(TensorOperation): return cde.RandomResizeOperation(size) -class RandomResizeWithBBox(TensorOperation): +class RandomResizeWithBBox(ImageTensorOperation): """ Tensor operation to resize the input image using a randomly selected interpolation mode and adjust bounding boxes accordingly. @@ -1220,7 +1220,7 @@ class RandomResizeWithBBox(TensorOperation): return cde.RandomResizeWithBBoxOperation(size) -class HWC2CHW(TensorOperation): +class HWC2CHW(ImageTensorOperation): """ Transpose the input image; shape (H, W, C) to shape (C, H, W). @@ -1253,7 +1253,7 @@ class HWC2CHW(TensorOperation): return cde.HwcToChwOperation() -class RandomCropDecodeResize(TensorOperation): +class RandomCropDecodeResize(ImageTensorOperation): """ Equivalent to RandomResizedCrop, but crops before decodes. @@ -1305,7 +1305,7 @@ class RandomCropDecodeResize(TensorOperation): self.max_attempts) -class Pad(TensorOperation): +class Pad(ImageTensorOperation): """ Pads the image according to padding parameters. @@ -1370,7 +1370,7 @@ class Pad(TensorOperation): return img.as_array() -class UniformAugment(TensorOperation): +class UniformAugment(ImageTensorOperation): """ Tensor operation to perform randomly selected augmentation. 
@@ -1407,7 +1407,7 @@ class UniformAugment(TensorOperation): return cde.UniformAugOperation(transforms, self.num_ops) -class RandomSelectSubpolicy(TensorOperation): +class RandomSelectSubpolicy(ImageTensorOperation): """ Choose a random sub-policy from a list to be applied on the input image. A sub-policy is a list of tuples (op, prob), where op is a TensorOp operation and prob is the probability that this op will be applied. Once @@ -1446,7 +1446,7 @@ class RandomSelectSubpolicy(TensorOperation): return cde.RandomSelectSubpolicyOperation(policy) -class SoftDvppDecodeResizeJpeg(TensorOperation): +class SoftDvppDecodeResizeJpeg(ImageTensorOperation): """ Tensor operation to decode and resize JPEG image using the simulation algorithm of Ascend series chip DVPP module. @@ -1486,7 +1486,7 @@ class SoftDvppDecodeResizeJpeg(TensorOperation): return cde.SoftDvppDecodeResizeJpegOperation(self.size) -class SoftDvppDecodeRandomCropResizeJpeg(TensorOperation): +class SoftDvppDecodeRandomCropResizeJpeg(ImageTensorOperation): """ Tensor operation to decode, random crop and resize JPEG image using the simulation algorithm of Ascend series chip DVPP module. @@ -1531,7 +1531,7 @@ class SoftDvppDecodeRandomCropResizeJpeg(TensorOperation): return cde.SoftDvppDecodeRandomCropResizeJpegOperation(self.size, self.scale, self.ratio, self.max_attempts) -class RandomSolarize(TensorOperation): +class RandomSolarize(ImageTensorOperation): """ Invert all pixel values above a threshold. diff --git a/tests/ut/cpp/dataset/c_api_text_test.cc b/tests/ut/cpp/dataset/c_api_text_test.cc index 9df3cca6b3..c368bcee21 100644 --- a/tests/ut/cpp/dataset/c_api_text_test.cc +++ b/tests/ut/cpp/dataset/c_api_text_test.cc @@ -877,6 +877,229 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail) { EXPECT_EQ(jieba_tokenizer3, nullptr); } +TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) { + // Testing the parameter AddWord of JiebaTokenizer when the freq is not provided (default 0). + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt"; + std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; + std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; + std::shared_ptr ds = TextFile({data_file}); + EXPECT_NE(ds, nullptr); + + // Create jieba_tokenizer operation on ds + std::shared_ptr jieba_tokenizer = + text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp); + EXPECT_NE(jieba_tokenizer, nullptr); + + // Add word with freq not provided (default 0) + jieba_tokenizer->AddWord("男默女泪"); + + // Create Map operation on ds + ds = ds->Map({jieba_tokenizer}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. 
+ std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map> row; + iter->GetNextRow(&row); + + std::vector expected = {"男默女泪", "市", "长江大桥"}; + + uint64_t i = 0; + while (row.size() != 0) { + auto ind = row["text"]; + std::shared_ptr expected_tensor; + Tensor::CreateFromVector(expected, &expected_tensor); + EXPECT_EQ(*ind, *expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 1); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) { + // Testing the parameter AddWord of JiebaTokenizer when the freq is set explicitly to 0. + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord1."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt"; + std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; + std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; + std::shared_ptr ds = TextFile({data_file}); + EXPECT_NE(ds, nullptr); + + // Create jieba_tokenizer operation on ds + std::shared_ptr jieba_tokenizer = + text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp); + EXPECT_NE(jieba_tokenizer, nullptr); + + // Add word with freq is set explicitly to 0 + jieba_tokenizer->AddWord("男默女泪", 0); + + // Create Map operation on ds + ds = ds->Map({jieba_tokenizer}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map> row; + iter->GetNextRow(&row); + + std::vector expected = {"男默女泪", "市", "长江大桥"}; + + uint64_t i = 0; + while (row.size() != 0) { + auto ind = row["text"]; + std::shared_ptr expected_tensor; + Tensor::CreateFromVector(expected, &expected_tensor); + EXPECT_EQ(*ind, *expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 1); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) { + // Testing the parameter AddWord of JiebaTokenizer when the freq is 10. + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord2."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt"; + std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; + std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; + std::shared_ptr ds = TextFile({data_file}); + EXPECT_NE(ds, nullptr); + + // Create jieba_tokenizer operation on ds + std::shared_ptr jieba_tokenizer = + text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp); + EXPECT_NE(jieba_tokenizer, nullptr); + + // Add word with freq 10 + jieba_tokenizer->AddWord("男默女泪", 10); + + // Create Map operation on ds + ds = ds->Map({jieba_tokenizer}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. 
+ std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map> row; + iter->GetNextRow(&row); + + std::vector expected = {"男默女泪", "市", "长江大桥"}; + + uint64_t i = 0; + while (row.size() != 0) { + auto ind = row["text"]; + std::shared_ptr expected_tensor; + Tensor::CreateFromVector(expected, &expected_tensor); + EXPECT_EQ(*ind, *expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 1); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) { + // Testing the parameter AddWord of JiebaTokenizer when the freq is 20000 which affects the result of segmentation. + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord3."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt"; + std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; + std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; + std::shared_ptr ds = TextFile({data_file}); + EXPECT_NE(ds, nullptr); + + // Create jieba_tokenizer operation on ds + std::shared_ptr jieba_tokenizer = + text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp); + EXPECT_NE(jieba_tokenizer, nullptr); + + // Add word with freq 20000 + jieba_tokenizer->AddWord("江大桥", 20000); + + // Create Map operation on ds + ds = ds->Map({jieba_tokenizer}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. + std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map> row; + iter->GetNextRow(&row); + + std::vector expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"}; + + uint64_t i = 0; + while (row.size() != 0) { + auto ind = row["text"]; + std::shared_ptr expected_tensor; + Tensor::CreateFromVector(expected, &expected_tensor); + EXPECT_EQ(*ind, *expected_tensor); + iter->GetNextRow(&row); + i++; + } + + EXPECT_EQ(i, 1); + + // Manually terminate the pipeline + iter->Stop(); +} + +TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) { + // Testing the incorrect parameter of AddWord in JiebaTokenizer. + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWordFail."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; + std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; + std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Testing the parameter word of AddWord is empty + std::shared_ptr jieba_tokenizer = + text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp); + EXPECT_NE(jieba_tokenizer, nullptr); + EXPECT_NE(jieba_tokenizer->AddWord("", 10), Status::OK()); + // Testing the parameter freq of AddWord is negative + std::shared_ptr jieba_tokenizer1 = + text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp); + EXPECT_NE(jieba_tokenizer1, nullptr); + EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK()); +} + TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) { // Testing the parameter of SlidingWindow interface when the axis is 0. 
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess."; diff --git a/tests/ut/python/dataset/test_vocab.py b/tests/ut/python/dataset/test_vocab.py index c8a46db9c4..f9032b3e54 100644 --- a/tests/ut/python/dataset/test_vocab.py +++ b/tests/ut/python/dataset/test_vocab.py @@ -166,7 +166,8 @@ def test_lookup_cast_type(): assert test_config("unk") == np.dtype("int32") # test exception, data_type isn't the correct type assert "tldr is not of type (,)" in test_config("unk", "tldr") - assert "Lookup doesn't support string to string lookup" in test_config("w1", mstype.string) + assert "Lookup does not support a string to string mapping, data_type can only be numeric." in \ + test_config("w1", mstype.string) if __name__ == '__main__':