From 47060631e56d3a241332d59e34245b99b3cf25b0 Mon Sep 17 00:00:00 2001 From: xiefangqi Date: Wed, 8 Jul 2020 16:46:13 +0800 Subject: [PATCH] add offsets feature to tokenizer --- .../ccsrc/dataset/api/python_bindings.cc | 39 +- .../text/kernels/basic_tokenizer_op.cc | 26 +- .../dataset/text/kernels/basic_tokenizer_op.h | 12 +- .../dataset/text/kernels/bert_tokenizer_op.cc | 6 +- .../dataset/text/kernels/bert_tokenizer_op.h | 15 +- .../text/kernels/jieba_tokenizer_op.cc | 48 +- .../dataset/text/kernels/jieba_tokenizer_op.h | 11 +- .../text/kernels/regex_tokenizer_op.cc | 57 ++- .../dataset/text/kernels/regex_tokenizer_op.h | 14 +- .../text/kernels/unicode_char_tokenizer_op.cc | 30 +- .../text/kernels/unicode_char_tokenizer_op.h | 9 +- .../kernels/unicode_script_tokenizer_op.cc | 31 +- .../kernels/unicode_script_tokenizer_op.h | 8 +- .../text/kernels/whitespace_tokenizer_op.cc | 34 +- .../text/kernels/whitespace_tokenizer_op.h | 9 +- .../text/kernels/wordpiece_tokenizer_op.cc | 67 ++- .../text/kernels/wordpiece_tokenizer_op.h | 14 +- mindspore/dataset/text/transforms.py | 187 ++++++- mindspore/dataset/text/validators.py | 138 ++++- .../ut/cpp/dataset/jieba_tokenizer_op_test.cc | 45 +- tests/ut/cpp/dataset/tokenizer_op_test.cc | 347 +++++++------ .../ut/python/dataset/test_basic_tokenizer.py | 83 --- tests/ut/python/dataset/test_nlp_jieop.py | 238 --------- .../dataset/test_text_basic_tokenizer.py | 138 +++++ ...kenizer.py => test_text_bert_tokenizer.py} | 101 +++- .../dataset/test_text_jieba_tokenizer.py | 471 ++++++++++++++++++ .../ut/python/dataset/test_text_tokenizer.py | 380 ++++++++++++++ .../dataset/test_text_wordpiece_tokenizer.py | 160 ++++++ tests/ut/python/dataset/test_tokenizer.py | 233 --------- .../dataset/test_wordpiece_tokenizer.py | 113 ----- 30 files changed, 2067 insertions(+), 997 deletions(-) delete mode 100644 tests/ut/python/dataset/test_basic_tokenizer.py delete mode 100644 tests/ut/python/dataset/test_nlp_jieop.py create mode 100644 tests/ut/python/dataset/test_text_basic_tokenizer.py rename tests/ut/python/dataset/{test_bert_tokenizer.py => test_text_bert_tokenizer.py} (51%) create mode 100644 tests/ut/python/dataset/test_text_jieba_tokenizer.py create mode 100644 tests/ut/python/dataset/test_text_tokenizer.py create mode 100644 tests/ut/python/dataset/test_text_wordpiece_tokenizer.py delete mode 100644 tests/ut/python/dataset/test_tokenizer.py delete mode 100644 tests/ut/python/dataset/test_wordpiece_tokenizer.py diff --git a/mindspore/ccsrc/dataset/api/python_bindings.cc b/mindspore/ccsrc/dataset/api/python_bindings.cc index 0ae64db671..af8ba893be 100644 --- a/mindspore/ccsrc/dataset/api/python_bindings.cc +++ b/mindspore/ccsrc/dataset/api/python_bindings.cc @@ -601,13 +601,14 @@ void bindTensorOps4(py::module *m) { void bindTokenizerOps(py::module *m) { (void)py::class_>(*m, "JiebaTokenizerOp", "") - .def(py::init(), py::arg("hmm_path"), py::arg("mp_path"), - py::arg("mode") = JiebaMode::kMix) + .def(py::init(), py::arg("hmm_path"), + py::arg("mp_path"), py::arg("mode") = JiebaMode::kMix, + py::arg("with_offsets") = JiebaTokenizerOp::kDefWithOffsets) .def("add_word", [](JiebaTokenizerOp &self, const std::string word, int freq) { THROW_IF_ERROR(self.AddWord(word, freq)); }); (void)py::class_>( *m, "UnicodeCharTokenizerOp", "Tokenize a scalar tensor of UTF-8 string to Unicode characters.") - .def(py::init<>()); + .def(py::init(), py::arg("with_offsets") = UnicodeCharTokenizerOp::kDefWithOffsets); (void)py::class_>(*m, "LookupOp", "Tensor operation to LookUp each 
word") .def(py::init, WordIdType>(), py::arg("vocab"), py::arg("unknown")) @@ -619,21 +620,25 @@ void bindTokenizerOps(py::module *m) { py::arg("separator")); (void)py::class_>( *m, "WordpieceTokenizerOp", "Tokenize scalar token or 1-D tokens to subword tokens.") - .def(py::init &, const std::string &, const int &, const std::string &>(), - py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator), - py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken, - py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken)); + .def( + py::init &, const std::string &, const int &, const std::string &, const bool &>(), + py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator), + py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken, + py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken), + py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets); } void bindDependIcuTokenizerOps(py::module *m) { #ifdef ENABLE_ICU4C (void)py::class_>( *m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces.") - .def(py::init<>()); + .def(py::init(), py::arg("with_offsets") = WhitespaceTokenizerOp::kDefWithOffsets); (void)py::class_>( *m, "UnicodeScriptTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.") .def(py::init<>()) - .def(py::init(), py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace); + .def(py::init(), + py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace, + py::arg("with_offsets") = UnicodeScriptTokenizerOp::kDefWithOffsets); (void)py::class_>( *m, "CaseFoldOp", "Apply case fold operation on utf-8 string tensor") .def(py::init<>()); @@ -647,24 +652,28 @@ void bindDependIcuTokenizerOps(py::module *m) { py::arg("replace_all")); (void)py::class_>( *m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by regex expression pattern.") - .def(py::init(), py::arg("delim_pattern"), py::arg("keep_delim_pattern")); + .def(py::init(), py::arg("delim_pattern"), + py::arg("keep_delim_pattern"), py::arg("with_offsets") = RegexTokenizerOp::kDefWithOffsets); (void)py::class_>( *m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.") - .def(py::init(), py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase, + .def(py::init(), + py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase, py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace, py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm, - py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken); + py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken, + py::arg("with_offsets") = BasicTokenizerOp::kDefWithOffsets); (void)py::class_>(*m, "BertTokenizerOp", "Tokenizer used for Bert text process.") - .def(py::init &, const std::string &, const int &, const std::string &, bool, bool, - NormalizeForm, bool>(), + .def(py::init &, const std::string &, const int &, const std::string &, const bool &, + const bool &, const NormalizeForm &, const bool &, const bool &>(), py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator), py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken, py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken), py::arg("lower_case") = 
BasicTokenizerOp::kDefLowerCase, py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace, py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm, - py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken); + py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken, + py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets); #endif } diff --git a/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.cc index 3512a4b2d7..c0217b2083 100644 --- a/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.cc +++ b/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.cc @@ -27,10 +27,12 @@ namespace mindspore { namespace dataset { + const bool BasicTokenizerOp::kDefLowerCase = false; const bool BasicTokenizerOp::kDefKeepWhitespace = false; const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone; const bool BasicTokenizerOp::kDefPreserveUnusedToken = true; +const bool BasicTokenizerOp::kDefWithOffsets = false; const char BasicTokenizerOp::kCommonPattern[] = "[!-/]" "|[:-@]" @@ -47,11 +49,14 @@ const char BasicTokenizerOp::kCommonPattern[] = "|[\\x{2F800}-\\x{2FA1F}]"; const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|\\[unused\\d+\\]|"; const std::unordered_set BasicTokenizerOp::kUnusedWords{"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"}; -BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form, - bool preserve_unused_token) + +BasicTokenizerOp::BasicTokenizerOp(const bool &lower_case, const bool &keep_whitespace, + const NormalizeForm &normalization_form, const bool &preserve_unused_token, + const bool &with_offsets) : lower_case_(lower_case), keep_whitespace_(keep_whitespace), preserve_unused_token_(preserve_unused_token), + with_offsets_(with_offsets), case_fold_(std::make_unique()), nfd_normalize_(std::make_unique(NormalizeForm::kNfd)), normalization_form_(normalization_form), @@ -69,7 +74,7 @@ BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, Normal keep_delim_pattern = kUnusedPattern + keep_delim_pattern; delim_pattern = kUnusedPattern + delim_pattern; } - regex_tokenizer_ = std::make_unique(delim_pattern, keep_delim_pattern); + regex_tokenizer_ = std::make_unique(delim_pattern, keep_delim_pattern, with_offsets_); } Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::string_view &text, @@ -135,9 +140,10 @@ Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::shared_ptr &input, std::shared_ptr *output) { - IO_CHECK(input, output); - if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { +Status BasicTokenizerOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor"); + if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) { RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor"); } std::shared_ptr cur_input; @@ -145,10 +151,10 @@ Status BasicTokenizerOp::Compute(const std::shared_ptr &input, std::shar if (lower_case_) { if (!preserve_unused_token_) { // to lower case - RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor)); + RETURN_IF_NOT_OK(case_fold_->Compute(input[0], &processed_tensor)); } else { // to lower case except words in kUnusedWords - RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input, &processed_tensor)); + 
RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input[0], &processed_tensor)); } cur_input = processed_tensor; // strip accent characters @@ -156,12 +162,12 @@ Status BasicTokenizerOp::Compute(const std::shared_ptr &input, std::shar cur_input = processed_tensor; RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor)); } else { - RETURN_IF_NOT_OK(common_normalize_->Compute(input, &processed_tensor)); + RETURN_IF_NOT_OK(common_normalize_->Compute(input[0], &processed_tensor)); } // strip control characters cur_input = processed_tensor; RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor)); - return regex_tokenizer_->Compute(processed_tensor, output); + return regex_tokenizer_->Compute(TensorRow(0, {std::move(processed_tensor)}), output); } } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.h index 01827a0ba4..258c08c946 100644 --- a/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.h +++ b/mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.h @@ -36,15 +36,18 @@ class BasicTokenizerOp : public TensorOp { static const bool kDefKeepWhitespace; static const NormalizeForm kDefNormalizationForm; static const bool kDefPreserveUnusedToken; - explicit BasicTokenizerOp(bool lower_case = kDefLowerCase, bool keep_whitespace = kDefKeepWhitespace, - NormalizeForm normalization_form = kDefNormalizationForm, - bool preserve_unused_token = kDefPreserveUnusedToken); + static const bool kDefWithOffsets; + + explicit BasicTokenizerOp(const bool &lower_case = kDefLowerCase, const bool &keep_whitespace = kDefKeepWhitespace, + const NormalizeForm &normalization_form = kDefNormalizationForm, + const bool &preserve_unused_token = kDefPreserveUnusedToken, + const bool &with_offsets = kDefWithOffsets); ~BasicTokenizerOp() override = default; void Print(std::ostream &out) const override { out << "BasicTokenizerOp"; } - Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + Status Compute(const TensorRow &input, TensorRow *output) override; protected: Status CaseFoldWithoutUnusedWords(const std::string_view &text, const std::unordered_set &unused_words, @@ -55,6 +58,7 @@ class BasicTokenizerOp : public TensorOp { static const char kCommonPattern[]; static const char kUnusedPattern[]; static const std::unordered_set kUnusedWords; + bool with_offsets_; bool lower_case_; bool keep_whitespace_; NormalizeForm normalization_form_; diff --git a/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.cc index 2b68a5accb..3e7f1251ed 100644 --- a/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.cc +++ b/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.cc @@ -16,9 +16,9 @@ #include "dataset/text/kernels/bert_tokenizer_op.h" namespace mindspore { namespace dataset { -Status BertTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { - IO_CHECK(input, output); - std::shared_ptr basic_tensor; +Status BertTokenizerOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + TensorRow basic_tensor; RETURN_IF_NOT_OK(basic_tokenizer_.Compute(input, &basic_tensor)); RETURN_IF_NOT_OK(wordpiece_tokenizer_.Compute(basic_tensor, output)); return Status::OK(); diff --git a/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.h index 660fdc7ba5..2933c3dc14 
100644 --- a/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.h +++ b/mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.h @@ -32,18 +32,19 @@ class BertTokenizerOp : public TensorOp { const std::string &suffix_indicator = WordpieceTokenizerOp::kDefSuffixIndicator, const int &max_bytes_per_token = WordpieceTokenizerOp::kDefMaxBytesPerToken, const std::string &unknown_token = WordpieceTokenizerOp::kDefUnknownToken, - bool lower_case = BasicTokenizerOp::kDefLowerCase, - bool keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace, - NormalizeForm normalization_form = BasicTokenizerOp::kDefNormalizationForm, - bool preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken) - : wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token), - basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token) {} + const bool &lower_case = BasicTokenizerOp::kDefLowerCase, + const bool &keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace, + const NormalizeForm &normalization_form = BasicTokenizerOp::kDefNormalizationForm, + const bool &preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken, + const bool &with_offsets = WordpieceTokenizerOp::kDefWithOffsets) + : wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets), + basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token, with_offsets) {} ~BertTokenizerOp() override = default; void Print(std::ostream &out) const override { out << "BertTokenizerOp"; } - Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + Status Compute(const TensorRow &input, TensorRow *output) override; private: WordpieceTokenizerOp wordpiece_tokenizer_; diff --git a/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.cc index de1d915fbb..b221e9cafd 100644 --- a/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.cc +++ b/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.cc @@ -23,35 +23,63 @@ namespace mindspore { namespace dataset { -JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::string &dict_path, JiebaMode mode) - : jieba_mode_(mode), hmm_model_path_(hmm_path), mp_dict_path_(dict_path) { +const bool JiebaTokenizerOp::kDefWithOffsets = false; + +JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::string &dict_path, const JiebaMode &mode, + const bool &with_offsets) + : jieba_mode_(mode), hmm_model_path_(hmm_path), mp_dict_path_(dict_path), with_offsets_(with_offsets) { jieba_parser_ = std::make_unique(mp_dict_path_, hmm_model_path_, ""); } -Status JiebaTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { - IO_CHECK(input, output); +Status JiebaTokenizerOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor"); RETURN_UNEXPECTED_IF_NULL(jieba_parser_); - if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { + if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) { RETURN_STATUS_UNEXPECTED("the input tensor should be scalar string tensor"); } std::string_view sentence_v; - RETURN_IF_NOT_OK(input->GetItemAt(&sentence_v, {})); + RETURN_IF_NOT_OK(input[0]->GetItemAt(&sentence_v, {})); std::string sentence{sentence_v}; std::vector words; + std::vector offsets_start, offsets_limit; + std::shared_ptr token_tensor, 
offsets_start_tensor, offsets_limit_tensor; if (sentence == "") { words.push_back(""); } else { + std::vector tmp; if (jieba_mode_ == JiebaMode::kMp) { - jieba_parser_->CutSmall(sentence, words, MAX_WORD_LENGTH); + std::unique_ptr mp_seg = std::make_unique(jieba_parser_->GetDictTrie()); + mp_seg->Cut(sentence, tmp, MAX_WORD_LENGTH); } else if (jieba_mode_ == JiebaMode::kHmm) { - jieba_parser_->CutHMM(sentence, words); + std::unique_ptr hmm_seg = + std::make_unique(jieba_parser_->GetHMMModel()); + hmm_seg->Cut(sentence, tmp); } else { // Mix - jieba_parser_->Cut(sentence, words, true); + std::unique_ptr mix_seg = + std::make_unique(jieba_parser_->GetDictTrie(), jieba_parser_->GetHMMModel()); + mix_seg->Cut(sentence, tmp, true); + } + GetStringsFromWords(tmp, words); + for (auto item : tmp) { + offsets_start.push_back(static_cast(item.offset)); + offsets_limit.push_back(static_cast(item.offset + item.word.length())); } } - *output = std::make_shared(words, TensorShape({(dsize_t)words.size()})); + token_tensor = std::make_shared(words, TensorShape({(dsize_t)words.size()})); + output->push_back(token_tensor); + if (with_offsets_) { + RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible, + TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32), + reinterpret_cast(&offsets_start[0]))); + RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible, + TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32), + reinterpret_cast(&offsets_limit[0]))); + output->push_back(offsets_start_tensor); + output->push_back(offsets_limit_tensor); + } return Status::OK(); } diff --git a/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.h index 41736e4fdb..ca2aeea793 100644 --- a/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.h +++ b/mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.h @@ -30,15 +30,19 @@ enum class JiebaMode { kMix = 0, kMp = 1, kHmm = 2 }; class JiebaTokenizerOp : public TensorOp { public: - // deffault constant for Jieba MPSegment algorithm. + // default constant for Jieba MPSegment algorithm. static constexpr size_t MAX_WORD_LENGTH = 512; + // default const for set whether Jieba output offsets tensor. + static const bool kDefWithOffsets; // Constructor for JiebaTokenizerOp. // @param hmm_path HMM model file. // @param mp_path MP model file. // @mode tokenization mode [Default "MIX"], "MP" model will tokenize with MPSegment algorithm, "HMM" mode will // tokenize with Hiddel Markov Model Segment algorithm, "MIx" model will tokenize with a mix of MPSegment and // HMMSegment algorithm. - JiebaTokenizerOp(const std::string &hmm_path, const std::string &mp_path, JiebaMode mode = JiebaMode::kMix); + // @with_offsets user set this value to choose whether output offset tensor. + JiebaTokenizerOp(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode = JiebaMode::kMix, + const bool &with_offsets = kDefWithOffsets); ~JiebaTokenizerOp() override = default; void Print(std::ostream &out) const override { @@ -46,7 +50,7 @@ class JiebaTokenizerOp : public TensorOp { << mp_dict_path_; } - Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + Status Compute(const TensorRow &input, TensorRow *output) override; // @word the word to be added to the JiebaTokenizer. // @freq [Default 0] the frequency fo the word to be added. 
@@ -58,6 +62,7 @@ class JiebaTokenizerOp : public TensorOp { std::string mp_dict_path_; std::unique_ptr jieba_parser_; JiebaMode jieba_mode_; + bool with_offsets_; }; } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.cc index 34c06f28ea..b15df9af67 100644 --- a/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.cc +++ b/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.cc @@ -22,8 +22,11 @@ namespace mindspore { namespace dataset { -Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8, - icu::UnicodeString *out_unicode) const { + +const bool RegexTokenizerOp::kDefWithOffsets = false; + +Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, const int &start, const int &len, + std::string *out_utf8, icu::UnicodeString *out_unicode) const { CHECK_FAIL_RETURN_UNEXPECTED((out_utf8 != nullptr || out_unicode != nullptr), "Wrong input"); int total_len = input.length(); int end = start + len; @@ -39,7 +42,9 @@ Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int s return Status::OK(); } -Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector *out_tokens) const { +Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector *out_tokens, + std::vector *offsets_start, + std::vector *offsets_limit) const { UErrorCode status = U_ZERO_ERROR; out_tokens->clear(); icu::RegexMatcher token_matcher(delim_pattern_, 0, status); @@ -50,6 +55,7 @@ Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector 0) { std::string token; + uint32_t token_offset = 0; RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, token_len, &token)); + token_offset = token.length(); out_tokens->emplace_back(std::move(token)); + offsets_start->push_back(static_cast(text_start_index)); + offsets_limit->push_back(static_cast(text_start_index + token_offset)); + text_start_index += token_offset; } int delim_len = deli_end_index - deli_start_index; - if (keep_delim_ && delim_len > 0) { + if (delim_len > 0) { icu::UnicodeString delim_str; std::string delim_utf8_str; + uint32_t delim_str_offset = 0; RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, deli_start_index, delim_len, &delim_utf8_str, &delim_str)); delim_matcher.reset(delim_str); - if (delim_matcher.matches(status) && U_SUCCESS(status)) { + delim_str_offset = delim_utf8_str.length(); + if (keep_delim_ && delim_matcher.matches(status) && U_SUCCESS(status)) { out_tokens->emplace_back(std::move(delim_utf8_str)); + offsets_start->push_back(static_cast(text_start_index)); + offsets_limit->push_back(static_cast(text_start_index + delim_str_offset)); } + text_start_index += delim_str_offset; } token_start_index = deli_end_index; } if (token_start_index < utext.length()) { std::string temp; + uint32_t temp_offset = 0; RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, utext.length() - token_start_index, &temp)); + temp_offset = temp.length(); out_tokens->emplace_back(std::move(temp)); + offsets_start->push_back(static_cast(text_start_index)); + offsets_limit->push_back(static_cast(text_start_index + temp_offset)); } return Status::OK(); } -Status RegexTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { - IO_CHECK(input, output); - if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { +Status RegexTokenizerOp::Compute(const TensorRow &input, 
TensorRow *output) { + IO_CHECK_VECTOR(input, output); + CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor"); + if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) { RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor"); } std::string_view text; - RETURN_IF_NOT_OK(input->GetItemAt(&text, {})); std::vector tokens; - RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens)); - *output = std::make_shared(std::move(tokens), TensorShape({(dsize_t)tokens.size()})); + std::vector offsets_start; + std::vector offsets_limit; + std::shared_ptr token_tensor, offsets_start_tensor, offsets_limit_tensor; + RETURN_IF_NOT_OK(input[0]->GetItemAt(&text, {})); + RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens, &offsets_start, &offsets_limit)); + token_tensor = std::make_shared(std::move(tokens), TensorShape({(dsize_t)tokens.size()})); + output->push_back(token_tensor); + if (with_offsets_) { + RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible, + TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32), + reinterpret_cast(&offsets_start[0]))); + RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible, + TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32), + reinterpret_cast(&offsets_limit[0]))); + output->push_back(offsets_start_tensor); + output->push_back(offsets_limit_tensor); + } return Status::OK(); } } // namespace dataset diff --git a/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.h index bcf02a4a11..f351800b46 100644 --- a/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.h +++ b/mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.h @@ -32,25 +32,31 @@ namespace dataset { class RegexTokenizerOp : public TensorOp { public: - RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern) + static const bool kDefWithOffsets; + + RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern, + const bool &with_offsets = kDefWithOffsets) : delim_pattern_(icu::UnicodeString::fromUTF8(delim_pattern)), keep_delim_pattern_(icu::UnicodeString::fromUTF8(keep_delim_pattern)), + with_offsets_(with_offsets), keep_delim_(!keep_delim_pattern.empty()) {} ~RegexTokenizerOp() override = default; void Print(std::ostream &out) const override { out << "RegexTokenizerOp"; } - Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + Status Compute(const TensorRow &input, TensorRow *output) override; protected: - Status GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8, + Status GetUnicodeSubstr(const icu::UnicodeString &input, const int &start, const int &len, std::string *out_utf8, icu::UnicodeString *out_unicode = nullptr) const; - Status GetRegexTokens(const std::string &text, std::vector *out_tokens) const; + Status GetRegexTokens(const std::string &text, std::vector *out_tokens, + std::vector *offsets_start, std::vector *offsets_limit) const; private: const icu::UnicodeString delim_pattern_; const icu::UnicodeString keep_delim_pattern_; + bool with_offsets_; const bool keep_delim_; }; } // namespace dataset diff --git a/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.cc index 063bf21630..d2bd22058b 100644 --- 
a/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.cc +++ b/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.cc @@ -27,26 +27,46 @@ using cppjieba::RuneStrArray; namespace mindspore { namespace dataset { -Status UnicodeCharTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { - IO_CHECK(input, output); - if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { +const bool UnicodeCharTokenizerOp::kDefWithOffsets = false; + +Status UnicodeCharTokenizerOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor"); + if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) { RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor"); } std::string_view str; - RETURN_IF_NOT_OK(input->GetItemAt(&str, {})); + RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {})); RuneStrArray runes; if (!DecodeRunesInString(str.data(), str.size(), runes)) { RETURN_STATUS_UNEXPECTED("Decode utf8 string failed."); } + std::shared_ptr token_tensor, offsets_start_tensor, offsets_limit_tensor; std::vector splits(runes.size()); + std::vector offsets_start, offsets_limit; for (size_t i = 0; i < runes.size(); i++) { + offsets_start.push_back(runes[i].offset); + offsets_limit.push_back(runes[i].offset + runes[i].len); splits[i] = str.substr(runes[i].offset, runes[i].len); } if (splits.empty()) { splits.emplace_back(""); + offsets_start.push_back(0); + offsets_limit.push_back(0); + } + token_tensor = std::make_shared(splits, TensorShape({(dsize_t)splits.size()})); + output->push_back(token_tensor); + if (with_offsets_) { + RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible, + TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32), + reinterpret_cast(&offsets_start[0]))); + RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible, + TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32), + reinterpret_cast(&offsets_limit[0]))); + output->push_back(offsets_start_tensor); + output->push_back(offsets_limit_tensor); } - *output = std::make_shared(splits, TensorShape({(dsize_t)splits.size()})); return Status::OK(); } } // namespace dataset diff --git a/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.h index 01a84eca8b..ab15696c95 100644 --- a/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.h +++ b/mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.h @@ -26,13 +26,18 @@ namespace dataset { class UnicodeCharTokenizerOp : public TensorOp { public: - UnicodeCharTokenizerOp() {} + static const bool kDefWithOffsets; + + explicit UnicodeCharTokenizerOp(const bool &with_offsets = kDefWithOffsets) : with_offsets_(with_offsets) {} ~UnicodeCharTokenizerOp() override = default; void Print(std::ostream &out) const override { out << "UnicodeCharTokenizerOp"; } - Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + Status Compute(const TensorRow &input, TensorRow *output) override; + + private: + bool with_offsets_; }; } // namespace dataset diff --git a/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.cc index 97a4f1333d..0760fea90a 100644 --- a/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.cc +++ 
b/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.cc @@ -32,24 +32,28 @@ namespace mindspore { namespace dataset { const bool UnicodeScriptTokenizerOp::kDefKeepWhitespace = false; +const bool UnicodeScriptTokenizerOp::kDefWithOffsets = false; -Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { - IO_CHECK(input, output); - if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { +Status UnicodeScriptTokenizerOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor"); + if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) { RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor"); } std::string_view str; - RETURN_IF_NOT_OK(input->GetItemAt(&str, {})); + RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {})); RuneStrArray runes; if (!DecodeRunesInString(str.data(), str.size(), runes)) { RETURN_STATUS_UNEXPECTED("Decode utf8 string failed."); } + std::shared_ptr token_tensor, offsets_start_tensor, offsets_limit_tensor; UScriptCode last_script = USCRIPT_INVALID_CODE; icu::ErrorCode status; int start = 0; int len = 0; std::vector splits; + std::vector offsets_start, offsets_limit; bool was_space = false; for (size_t i = 0; i < runes.size(); i++) { @@ -66,6 +70,8 @@ Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr &input, s if (len > 0 && (script != last_script || is_space != was_space)) { // 3) If keep_whitespace_ is false, all the whitespace characters will be discard if (keep_whitespace_ || !was_space) { + offsets_start.push_back(static_cast(start)); + offsets_limit.push_back(static_cast(start + len)); std::string temp(str.substr(start, len)); splits.emplace_back(std::move(temp)); } @@ -79,14 +85,29 @@ Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr &input, s } if (len > 0 && (keep_whitespace_ || !was_space)) { + offsets_start.push_back(static_cast(start)); + offsets_limit.push_back(static_cast(start + len)); std::string temp(str.substr(start, len)); splits.emplace_back(std::move(temp)); } // 4) If the input is empty scalar string, the output will be 1-D empty string. 
if (splits.empty()) { splits.emplace_back(""); + offsets_start.push_back(0); + offsets_limit.push_back(0); + } + token_tensor = std::make_shared(splits, TensorShape({(dsize_t)splits.size()})); + output->push_back(token_tensor); + if (with_offsets_) { + RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible, + TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32), + reinterpret_cast(&offsets_start[0]))); + RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible, + TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32), + reinterpret_cast(&offsets_limit[0]))); + output->push_back(offsets_start_tensor); + output->push_back(offsets_limit_tensor); } - *output = std::make_shared(splits, TensorShape({(dsize_t)splits.size()})); return Status::OK(); } } // namespace dataset diff --git a/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.h index a77b0b3fa3..eaf0a66be1 100644 --- a/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.h +++ b/mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.h @@ -27,17 +27,21 @@ namespace dataset { class UnicodeScriptTokenizerOp : public TensorOp { public: static const bool kDefKeepWhitespace; + static const bool kDefWithOffsets; - explicit UnicodeScriptTokenizerOp(bool keep_whitespace = kDefKeepWhitespace) : keep_whitespace_(keep_whitespace) {} + explicit UnicodeScriptTokenizerOp(const bool &keep_whitespace = kDefKeepWhitespace, + const bool &with_offsets = kDefWithOffsets) + : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {} ~UnicodeScriptTokenizerOp() override = default; void Print(std::ostream &out) const override { out << "UnicodeScriptTokenizerOp"; } - Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + Status Compute(const TensorRow &input, TensorRow *output) override; private: bool keep_whitespace_; // If or not keep whitespace tokens + bool with_offsets_; }; } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.cc index 35f3f8d0e2..16bc2c87a3 100644 --- a/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.cc +++ b/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.cc @@ -30,24 +30,33 @@ using cppjieba::RuneStrArray; namespace mindspore { namespace dataset { -Status WhitespaceTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { - IO_CHECK(input, output); - if (input->Rank() != 0 || input->type() != DataType::DE_STRING) { + +const bool WhitespaceTokenizerOp::kDefWithOffsets = false; + +Status WhitespaceTokenizerOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor"); + if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) { RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor"); } std::string_view str; - RETURN_IF_NOT_OK(input->GetItemAt(&str, {})); + RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {})); RuneStrArray runes; if (!DecodeRunesInString(str.data(), str.size(), runes)) { RETURN_STATUS_UNEXPECTED("Decode utf8 string failed."); } + + std::shared_ptr token_tensor, offsets_start_tensor, offsets_limit_tensor; + std::vector offsets_start, offsets_limit; std::vector splits; int start = 
0; int len = 0; for (size_t i = 0; i < runes.size(); i++) { if (u_isUWhiteSpace(runes[i].rune)) { if (len > 0) { + offsets_start.push_back(static_cast(start)); + offsets_limit.push_back(static_cast(start + len)); std::string temp(str.substr(start, len)); splits.emplace_back(std::move(temp)); len = 0; @@ -60,13 +69,28 @@ Status WhitespaceTokenizerOp::Compute(const std::shared_ptr &input, std: } } if (len > 0) { + offsets_start.push_back(static_cast(start)); + offsets_limit.push_back(static_cast(start + len)); std::string temp(str.substr(start, len)); splits.emplace_back(std::move(temp)); } if (splits.empty()) { splits.emplace_back(""); + offsets_start.push_back(0); + offsets_limit.push_back(0); + } + token_tensor = std::make_shared(splits, TensorShape({(dsize_t)splits.size()})); + output->push_back(token_tensor); + if (with_offsets_) { + RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible, + TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32), + reinterpret_cast(&offsets_start[0]))); + RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible, + TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32), + reinterpret_cast(&offsets_limit[0]))); + output->push_back(offsets_start_tensor); + output->push_back(offsets_limit_tensor); } - *output = std::make_shared(splits, TensorShape({(dsize_t)splits.size()})); return Status::OK(); } } // namespace dataset diff --git a/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.h index 6d0bab0bea..50d695ce5b 100644 --- a/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.h +++ b/mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.h @@ -26,13 +26,18 @@ namespace dataset { class WhitespaceTokenizerOp : public TensorOp { public: - WhitespaceTokenizerOp() {} + static const bool kDefWithOffsets; + + explicit WhitespaceTokenizerOp(const bool &with_offsets = kDefWithOffsets) : with_offsets_(with_offsets) {} ~WhitespaceTokenizerOp() override = default; void Print(std::ostream &out) const override { out << "WhitespaceTokenizerOp"; } - Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + Status Compute(const TensorRow &input, TensorRow *output) override; + + private: + bool with_offsets_; }; } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc index e488c527cd..0cd65cdd7c 100644 --- a/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc +++ b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc @@ -24,13 +24,16 @@ namespace dataset { const char WordpieceTokenizerOp::kDefSuffixIndicator[] = "##"; const int WordpieceTokenizerOp::kDefMaxBytesPerToken = 100; const char WordpieceTokenizerOp::kDefUnknownToken[] = "[UNK]"; +const bool WordpieceTokenizerOp::kDefWithOffsets = false; WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr &vocab, const std::string &suffix_indicator, - const int &max_bytes_per_token, const std::string &unknown_token) + const int &max_bytes_per_token, const std::string &unknown_token, + const bool &with_offsets) : vocab_(vocab), suffix_indicator_(suffix_indicator), max_bytes_per_token_(max_bytes_per_token), - unknown_token_(unknown_token) {} + unknown_token_(unknown_token), + with_offsets_(with_offsets) {} Status WordpieceTokenizerOp::LookupWord(const std::string 
&input_token, const RuneStrArray &runes, const int start, bool *out_found, int *out_end) const { @@ -52,17 +55,22 @@ Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const Ru return Status::OK(); } -Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, std::vector *out_tokens) const { +Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, const uint32_t &basic_start, + std::vector *out_tokens, std::vector *offsets_start, + std::vector *offsets_limit) const { out_tokens->clear(); + offsets_start->push_back(basic_start); if (unknown_token_.empty()) { out_tokens->emplace_back(input_token); + offsets_limit->push_back(basic_start + input_token.length()); } else { out_tokens->emplace_back(unknown_token_); + offsets_limit->push_back(basic_start + input_token.length()); } return Status::OK(); } -Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int start, const int end, +Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int &start, const int &end, std::vector *out_tokens) const { CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && end > start && end <= input_token.size(), "Out of range"); std::string subword = input_token.substr(start, end - start); @@ -73,9 +81,19 @@ Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const in return Status::OK(); } -Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vector *out_tokens) const { +Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, const uint32_t &basic_start, + std::vector *out_tokens, std::vector *offsets_start, + std::vector *offsets_limit) const { if (input_token.size() > max_bytes_per_token_) { - return FoundNoToken(input_token, out_tokens); + offsets_start->push_back(basic_start); + if (!unknown_token_.empty()) { + offsets_limit->push_back(basic_start + unknown_token_.size()); + out_tokens->emplace_back(unknown_token_); + } else { + out_tokens->emplace_back(input_token); + offsets_limit->push_back(basic_start + input_token.size()); + } + return Status::OK(); } RuneStrArray runes; if (!DecodeRunesInString(input_token.data(), input_token.size(), runes)) { @@ -87,29 +105,52 @@ Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vect RETURN_IF_NOT_OK(LookupWord(input_token, runes, start, &found, &end)); if (found) { RETURN_IF_NOT_OK(AddSubword(input_token, start, end, out_tokens)); + offsets_start->push_back(static_cast(basic_start + start)); + offsets_limit->push_back(static_cast(basic_start + end)); start = end; } else { - return FoundNoToken(input_token, out_tokens); + return FoundNoToken(input_token, basic_start, out_tokens, offsets_start, offsets_limit); } } return Status::OK(); } -Status WordpieceTokenizerOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { - IO_CHECK(input, output); - if (input->Rank() > 1 || input->type() != DataType::DE_STRING) { +Status WordpieceTokenizerOp::Compute(const TensorRow &input, TensorRow *output) { + IO_CHECK_VECTOR(input, output); + if (input[0]->Rank() > 1 || input[0]->type() != DataType::DE_STRING) { RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor"); } + dsize_t count = 0; std::vector out_tokens; - for (auto iter = input->begin(); iter != input->end(); iter++) { + std::vector offsets_start, offsets_limit; + std::shared_ptr token_tensor, offsets_start_tensor, offsets_limit_tensor; + for (auto iter = input[0]->begin(); iter != input[0]->end(); iter++) { + uint32_t 
basic_start = 0; std::vector temp_tokens; - RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &temp_tokens)); + if (with_offsets_ && input.size() == 3) { + RETURN_IF_NOT_OK(input[1]->GetItemAt(&basic_start, {count, 0})); + } + RETURN_IF_NOT_OK(GetTokens(std::string(*iter), basic_start, &temp_tokens, &offsets_start, &offsets_limit)); out_tokens.insert(out_tokens.end(), temp_tokens.begin(), temp_tokens.end()); + count++; } if (out_tokens.empty()) { out_tokens.emplace_back(""); + offsets_start.push_back(0); + offsets_limit.push_back(0); + } + token_tensor = std::make_shared(out_tokens, TensorShape({(dsize_t)out_tokens.size()})); + output->push_back(token_tensor); + if (with_offsets_) { + RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible, + TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32), + reinterpret_cast(&offsets_start[0]))); + RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible, + TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32), + reinterpret_cast(&offsets_limit[0]))); + output->push_back(offsets_start_tensor); + output->push_back(offsets_limit_tensor); } - *output = std::make_shared(out_tokens, TensorShape({(dsize_t)out_tokens.size()})); return Status::OK(); } diff --git a/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h index c9a75025c6..4784902b46 100644 --- a/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h +++ b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h @@ -37,27 +37,31 @@ class WordpieceTokenizerOp : public TensorOp { static const char kDefSuffixIndicator[]; static const int kDefMaxBytesPerToken; static const char kDefUnknownToken[]; + static const bool kDefWithOffsets; WordpieceTokenizerOp(const std::shared_ptr &vocab, const std::string &suffix_indicator = kDefSuffixIndicator, const int &max_bytes_per_token = kDefMaxBytesPerToken, - const std::string &unknown_token = kDefUnknownToken); + const std::string &unknown_token = kDefUnknownToken, const bool &with_offsets = kDefWithOffsets); ~WordpieceTokenizerOp() override = default; void Print(std::ostream &out) const override { out << "WordpieceTokenizerOp"; } - Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + Status Compute(const TensorRow &input, TensorRow *output) override; protected: - Status AddSubword(const std::string &input_token, const int start, const int end, + Status AddSubword(const std::string &input_token, const int &start, const int &end, std::vector *out_token) const; - Status FoundNoToken(const std::string &input_token, std::vector *out_tokens) const; + Status FoundNoToken(const std::string &input_token, const uint32_t &basic_start, std::vector *out_tokens, + std::vector *offsets_start, std::vector *offsets_limit) const; Status LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start, bool *out_found, int *out_end) const; - Status GetTokens(const std::string &input_token, std::vector *out_tokens) const; + Status GetTokens(const std::string &input_token, const uint32_t &basic_start, std::vector *out_tokens, + std::vector *offsets_start, std::vector *offsets_limit) const; private: const std::shared_ptr vocab_; const std::string suffix_indicator_; + const bool with_offsets_; const int max_bytes_per_token_; const std::string unknown_token_; }; diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py index 
f829e4ba73..90c54b80db 100644 --- a/mindspore/dataset/text/transforms.py +++ b/mindspore/dataset/text/transforms.py @@ -52,8 +52,9 @@ import mindspore._c_dataengine as cde from .utils import JiebaMode, NormalizeForm, to_str from .validators import check_lookup, check_jieba_add_dict, \ - check_jieba_add_word, check_jieba_init, check_ngram, check_pair_truncate, \ - check_to_number, check_python_tokenizer + check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer,\ + check_wordpiece_tokenizer, check_regex_tokenizer, check_basic_tokenizer, check_ngram, check_pair_truncate,\ + check_to_number, check_bert_tokenizer, check_python_tokenizer from ..core.datatypes import mstype_to_detype @@ -125,15 +126,31 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): - JiebaMode.MP, tokenize with MPSegment algorithm. - JiebaMode.HMM, tokenize with Hiddel Markov Model Segment algorithm. - JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm. + with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). + + Examples: + >>> # If with_offsets=False, default output one column {["text", dtype=str]} + >>> tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=False) + >>> data = data.map(operations=tokenizer_op) + >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], + >>> # ["offsets_limit", dtype=uint32]} + >>> tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) + >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) """ @check_jieba_init - def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX): + def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX, with_offsets=False): + if not isinstance(mode, JiebaMode): + raise TypeError("Wrong input type for mode, should be JiebaMode.") + self.mode = mode self.__check_path__(hmm_path) self.__check_path__(mp_path) + self.with_offsets = with_offsets super().__init__(hmm_path, mp_path, - DE_C_INTER_JIEBA_MODE[mode]) + DE_C_INTER_JIEBA_MODE[mode], + self.with_offsets) @check_jieba_add_word def add_word(self, word, freq=None): @@ -226,8 +243,26 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp): """ Tokenize a scalar tensor of UTF-8 string to Unicode characters. + + Args: + with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). 
+ + Examples: + >>> # If with_offsets=False, default output one column {["text", dtype=str]} + >>> tokenizer_op = text.UnicodeCharTokenizer() + >>> dataset = dataset.map(operations=tokenizer_op) + >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], + >>> # ["offsets_limit", dtype=uint32]} + >>> tokenizer_op = text.UnicodeCharTokenizer(True) + >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) """ + @check_with_offsets + def __init__(self, with_offsets=False): + self.with_offsets = with_offsets + super().__init__(self.with_offsets) + class WordpieceTokenizer(cde.WordpieceTokenizerOp): """ @@ -239,22 +274,58 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp): max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default=100). unknown_token (str, optional): When we can not found the token: if 'unknown_token' is empty string, return the token directly, else return 'unknown_token'(default='[UNK]'). + with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). + + Examples: + >>> # If with_offsets=False, default output one column {["text", dtype=str]} + >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]', + >>> max_bytes_per_token=100, with_offsets=False) + >>> dataset = dataset.map(operations=tokenizer_op) + >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], + >>> # ["offsets_limit", dtype=uint32]} + >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]', + >>> max_bytes_per_token=100, with_offsets=True) + >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) """ - def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]'): + @check_wordpiece_tokenizer + def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, + unknown_token='[UNK]', with_offsets=False): self.vocab = vocab self.suffix_indicator = suffix_indicator self.max_bytes_per_token = max_bytes_per_token self.unknown_token = unknown_token - super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token) + self.with_offsets = with_offsets + super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, + self.unknown_token, self.with_offsets) if platform.system().lower() != 'windows': class WhitespaceTokenizer(cde.WhitespaceTokenizerOp): """ Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces(such as: ' ', '\\\\t', '\\\\r', '\\\\n'). + + Args: + with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). 
+ + Examples: + >>> # If with_offsets=False, default output one column {["text", dtype=str]} + >>> tokenizer_op = text.WhitespaceTokenizer() + >>> dataset = dataset.map(operations=tokenizer_op) + >>> # If with_offsets=True, then output three columns {["token", dtype=str], + >>> # ["offsets_start", dtype=uint32], + >>> # ["offsets_limit", dtype=uint32]} + >>> tokenizer_op = text.WhitespaceTokenizer(True) + >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) """ + @check_with_offsets + def __init__(self, with_offsets=False): + self.with_offsets = with_offsets + super().__init__(self.with_offsets) + class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp): """ @@ -262,11 +333,25 @@ if platform.system().lower() != 'windows': Args: keep_whitespace (bool, optional): If or not emit whitespace tokens (default=False). + with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). + + Examples: + >>> # If with_offsets=False, default output one column {["text", dtype=str]} + >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False) + >>> dataset = dataset.map(operations=tokenizer_op) + >>> # If with_offsets=True, then output three columns {["token", dtype=str], + >>> # ["offsets_start", dtype=uint32], + >>> # ["offsets_limit", dtype=uint32]} + >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True) + >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) """ - def __init__(self, keep_whitespace=False): + @check_unicode_script_tokenizer + def __init__(self, keep_whitespace=False, with_offsets=False): self.keep_whitespace = keep_whitespace - super().__init__(self.keep_whitespace) + self.with_offsets = with_offsets + super().__init__(self.keep_whitespace, self.with_offsets) class CaseFold(cde.CaseFoldOp): @@ -302,6 +387,9 @@ if platform.system().lower() != 'windows': """ def __init__(self, normalize_form=NormalizeForm.NFKC): + if not isinstance(normalize_form, NormalizeForm): + raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.") + self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form] super().__init__(self.normalize_form) @@ -338,12 +426,26 @@ if platform.system().lower() != 'windows': keep_delim_pattern(str, optional): The string matched by 'delim_pattern' can be kept as a token if it can be matched by 'keep_delim_pattern'. And the default value is empty str(''), in this situation, delimiters will not kept as a output token(default=''). + with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). 
+ + Examples: + >>> # If with_offsets=False, default output one column {["text", dtype=str]} + >>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=False) + >>> dataset = dataset.map(operations=tokenizer_op) + >>> # If with_offsets=True, then output three columns {["token", dtype=str], + >>> # ["offsets_start", dtype=uint32], + >>> # ["offsets_limit", dtype=uint32]} + >>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True) + >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) """ - def __init__(self, delim_pattern, keep_delim_pattern=''): + @check_regex_tokenizer + def __init__(self, delim_pattern, keep_delim_pattern='', with_offsets=False): self.delim_pattern = delim_pattern self.keep_delim_pattern = keep_delim_pattern - super().__init__(self.delim_pattern, self.keep_delim_pattern) + self.with_offsets = with_offsets + super().__init__(self.delim_pattern, self.keep_delim_pattern, self.with_offsets) class BasicTokenizer(cde.BasicTokenizerOp): """ @@ -359,16 +461,41 @@ if platform.system().lower() != 'windows': only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE'). preserve_unused_token(bool, optional): If True, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True). + with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). + + Examples: + >>> # If with_offsets=False, default output one column {["text", dtype=str]} + >>> tokenizer_op = text.BasicTokenizer(lower_case=False, + >>> keep_whitespace=False, + >>> normalization_form=NormalizeForm.NONE, + >>> preserve_unused_token=True, + >>> with_offsets=False) + >>> dataset = dataset.map(operations=tokenizer_op) + >>> # If with_offsets=True, then output three columns {["token", dtype=str], + >>> # ["offsets_start", dtype=uint32], + >>> # ["offsets_limit", dtype=uint32]} + >>> tokenizer_op = text.BasicTokenizer(lower_case=False, + >>> keep_whitespace=False, + >>> normalization_form=NormalizeForm.NONE, + >>> preserve_unused_token=True, + >>> with_offsets=True) + >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) """ - def __init__(self, lower_case=False, keep_whitespace=False, - normalization_form=NormalizeForm.NONE, preserve_unused_token=True): + @check_basic_tokenizer + def __init__(self, lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE, + preserve_unused_token=True, with_offsets=False): + if not isinstance(normalization_form, NormalizeForm): + raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.") + self.lower_case = lower_case self.keep_whitespace = keep_whitespace self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] self.preserve_unused_token = preserve_unused_token - super().__init__(self.lower_case, self.keep_whitespace, - self.normalization_form, self.preserve_unused_token) + self.with_offsets = with_offsets + super().__init__(self.lower_case, self.keep_whitespace, self.normalization_form, + self.preserve_unused_token, self.with_offsets) class BertTokenizer(cde.BertTokenizerOp): @@ -389,11 +516,33 @@ if platform.system().lower() != 'windows': only effective when 'lower_case' is False. 
See NormalizeUTF8 for details(default='NONE'). preserve_unused_token(bool, optional): If True, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True). + with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). + + Examples: + >>> # If with_offsets=False, default output one column {["text", dtype=str]} + >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100, + >>> unknown_token='[UNK]', lower_case=False, keep_whitespace=False, + >>> normalization_form=NormalizeForm.NONE, preserve_unused_token=True, + >>> with_offsets=False) + >>> dataset = dataset.map(operations=tokenizer_op) + >>> # If with_offsets=True, then output three columns {["token", dtype=str], + >>> # ["offsets_start", dtype=uint32], + >>> # ["offsets_limit", dtype=uint32]} + >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100, + >>> unknown_token='[UNK]', lower_case=False, keep_whitespace=False, + >>> normalization_form=NormalizeForm.NONE, preserve_unused_token=True, + >>> with_offsets=True) + >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) """ - def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, - unknown_token='[UNK]', lower_case=False, keep_whitespace=False, - normalization_form=NormalizeForm.NONE, preserve_unused_token=True): + @check_bert_tokenizer + def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]', + lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE, + preserve_unused_token=True, with_offsets=False): + if not isinstance(normalization_form, NormalizeForm): + raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.") + + self.vocab = vocab self.suffix_indicator = suffix_indicator self.max_bytes_per_token = max_bytes_per_token @@ -402,8 +551,10 @@ if platform.system().lower() != 'windows': self.keep_whitespace = keep_whitespace self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] self.preserve_unused_token = preserve_unused_token + self.with_offsets = with_offsets super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token, - self.lower_case, self.keep_whitespace, self.normalization_form, self.preserve_unused_token) + self.lower_case, self.keep_whitespace, self.normalization_form, + self.preserve_unused_token, self.with_offsets) class TruncateSequencePair(cde.TruncateSequencePairOp): diff --git a/mindspore/dataset/text/validators.py b/mindspore/dataset/text/validators.py index 39a0c4e632..250e3ff42f 100644 --- a/mindspore/dataset/text/validators.py +++ b/mindspore/dataset/text/validators.py @@ -25,7 +25,6 @@ from mindspore._c_expression import typing from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_uint32, check_positive, \ INT32_MAX, check_value - def check_unique_list_of_words(words, arg_name): """Check that words is a list and each element is a str without any duplication""" @@ -116,11 +115,22 @@ def check_from_dict(method): def check_jieba_init(method): - """Wrapper method to check the parameters of jieba add word.""" + """Wrapper method to check the parameters of jieba init.""" @wraps(method) def new_method(self, *args, **kwargs): - parse_user_args(method, *args, **kwargs) + [hmm_path, mp_path, _, with_offsets], _ = 
parse_user_args(method, *args, **kwargs) + + if hmm_path is None: + raise ValueError("The dict of HMMSegment in cppjieba is not provided.") + if not isinstance(hmm_path, str): + raise TypeError("Wrong input type for hmm_path, should be string.") + if mp_path is None: + raise ValueError("The dict of MPSegment in cppjieba is not provided.") + if not isinstance(mp_path, str): + raise TypeError("Wrong input type for mp_path, should be string.") + if not isinstance(with_offsets, bool): + raise TypeError("Wrong input type for with_offsets, should be boolean.") return method(self, *args, **kwargs) return new_method @@ -152,6 +162,128 @@ def check_jieba_add_dict(method): return new_method +def check_with_offsets(method): + """Wrapper method to check with_offsets when it is the only parameter.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [with_offsets], _ = parse_user_args(method, *args, **kwargs) + if not isinstance(with_offsets, bool): + raise TypeError("Wrong input type for with_offsets, should be boolean.") + return method(self, *args, **kwargs) + + return new_method + + +def check_unicode_script_tokenizer(method): + """Wrapper method to check the parameter of UnicodeScriptTokenizer.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [keep_whitespace, with_offsets], _ = parse_user_args(method, *args, **kwargs) + if not isinstance(keep_whitespace, bool): + raise TypeError("Wrong input type for keep_whitespace, should be boolean.") + if not isinstance(with_offsets, bool): + raise TypeError("Wrong input type for with_offsets, should be boolean.") + return method(self, *args, **kwargs) + + return new_method + + +def check_wordpiece_tokenizer(method): + """Wrapper method to check the parameter of WordpieceTokenizer.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets], _ =\ + parse_user_args(method, *args, **kwargs) + if vocab is None: + raise ValueError("vocab is not provided.") + if not isinstance(vocab, cde.Vocab): + raise TypeError("Wrong input type for vocab, should be Vocab object.") + if not isinstance(suffix_indicator, str): + raise TypeError("Wrong input type for suffix_indicator, should be string.") + if not isinstance(unknown_token, str): + raise TypeError("Wrong input type for unknown_token, should be string.") + if not isinstance(with_offsets, bool): + raise TypeError("Wrong input type for with_offsets, should be boolean.") + check_uint32(max_bytes_per_token) + return method(self, *args, **kwargs) + + return new_method + + +def check_regex_tokenizer(method): + """Wrapper method to check the parameter of RegexTokenizer.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [delim_pattern, keep_delim_pattern, with_offsets], _ = parse_user_args(method, *args, **kwargs) + if delim_pattern is None: + raise ValueError("delim_pattern is not provided.") + if not isinstance(delim_pattern, str): + raise TypeError("Wrong input type for delim_pattern, should be string.") + if not isinstance(keep_delim_pattern, str): + raise TypeError("Wrong input type for keep_delim_pattern, should be string.") + if not isinstance(with_offsets, bool): + raise TypeError("Wrong input type for with_offsets, should be boolean.") + return method(self, *args, **kwargs) + + return new_method + + +def check_basic_tokenizer(method): + """Wrapper method to check the parameter of BasicTokenizer.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [lower_case, keep_whitespace, _, 
preserve_unused, with_offsets], _ =\ + parse_user_args(method, *args, **kwargs) + if not isinstance(lower_case, bool): + raise TypeError("Wrong input type for lower_case, should be boolean.") + if not isinstance(keep_whitespace, bool): + raise TypeError("Wrong input type for keep_whitespace, should be boolean.") + if not isinstance(preserve_unused, bool): + raise TypeError("Wrong input type for preserve_unused_token, should be boolean.") + if not isinstance(with_offsets, bool): + raise TypeError("Wrong input type for with_offsets, should be boolean.") + return method(self, *args, **kwargs) + + return new_method + + +def check_bert_tokenizer(method): + """Wrapper method to check the parameter of BertTokenizer.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case, keep_whitespace, _, + preserve_unused_token, with_offsets], _ = parse_user_args(method, *args, **kwargs) + if vocab is None: + raise ValueError("vocab is not provided.") + if not isinstance(vocab, cde.Vocab): + raise TypeError("Wrong input type for vocab, should be Vocab object.") + if not isinstance(suffix_indicator, str): + raise TypeError("Wrong input type for suffix_indicator, should be string.") + if not isinstance(max_bytes_per_token, int): + raise TypeError("Wrong input type for max_bytes_per_token, should be int.") + check_uint32(max_bytes_per_token) + + if not isinstance(unknown_token, str): + raise TypeError("Wrong input type for unknown_token, should be string.") + if not isinstance(lower_case, bool): + raise TypeError("Wrong input type for lower_case, should be boolean.") + if not isinstance(keep_whitespace, bool): + raise TypeError("Wrong input type for keep_whitespace, should be boolean.") + if not isinstance(preserve_unused_token, bool): + raise TypeError("Wrong input type for preserve_unused_token, should be boolean.") + if not isinstance(with_offsets, bool): + raise TypeError("Wrong input type for with_offsets, should be boolean.") + return method(self, *args, **kwargs) + + return new_method + + + def check_from_dataset(method): """A wrapper that wrap a parameter checker to the original function.""" diff --git a/tests/ut/cpp/dataset/jieba_tokenizer_op_test.cc b/tests/ut/cpp/dataset/jieba_tokenizer_op_test.cc index c5a733f285..849943beb1 100644 --- a/tests/ut/cpp/dataset/jieba_tokenizer_op_test.cc +++ b/tests/ut/cpp/dataset/jieba_tokenizer_op_test.cc @@ -39,21 +39,22 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opFuntions) { std::string dataset_path = datasets_root_path_ + "/jiebadict"; std::string hmm_path = dataset_path + "/hmm_model.utf8"; std::string mp_path = dataset_path + "/jieba.dict.utf8"; - std::shared_ptr output_tensor; + TensorRow input, output; std::unique_ptr op(new JiebaTokenizerOp(hmm_path, mp_path)); std::shared_ptr input_tensor = std::make_shared("今天天气太好了我们一起去外面玩吧"); - Status s = op->Compute(input_tensor, &output_tensor); + input.push_back(input_tensor); + Status s = op->Compute(input, &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output_tensor->Rank(), 1); - EXPECT_EQ(output_tensor->Size(), 7); - CheckEqual(output_tensor, {0}, "今天天气"); - CheckEqual(output_tensor, {1}, "太好了"); - CheckEqual(output_tensor, {2}, "我们"); - CheckEqual(output_tensor, {3}, "一起"); - CheckEqual(output_tensor, {4}, "去"); - CheckEqual(output_tensor, {5}, "外面"); - CheckEqual(output_tensor, {6}, "玩吧"); + EXPECT_EQ(output[0]->Rank(), 1); + EXPECT_EQ(output[0]->Size(), 7); + CheckEqual(output[0], {0}, "今天天气"); + CheckEqual(output[0], {1}, 
"太好了"); + CheckEqual(output[0], {2}, "我们"); + CheckEqual(output[0], {3}, "一起"); + CheckEqual(output[0], {4}, "去"); + CheckEqual(output[0], {5}, "外面"); + CheckEqual(output[0], {6}, "玩吧"); } TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opAdd) { @@ -61,16 +62,17 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opAdd) { std::string dataset_path = datasets_root_path_ + "/jiebadict"; std::string hmm_path = dataset_path + "/hmm_model.utf8"; std::string mp_path = dataset_path + "/jieba.dict.utf8"; - std::shared_ptr output_tensor; + TensorRow input, output; std::unique_ptr op(new JiebaTokenizerOp(hmm_path, mp_path)); op->AddWord("男默女泪"); std::shared_ptr input_tensor = std::make_shared("男默女泪"); - Status s = op->Compute(input_tensor, &output_tensor); + input.push_back(input_tensor); + Status s = op->Compute(input, &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output_tensor->Rank(), 1); - EXPECT_EQ(output_tensor->Size(), 1); - CheckEqual(output_tensor, {0}, "男默女泪"); + EXPECT_EQ(output[0]->Rank(), 1); + EXPECT_EQ(output[0]->Size(), 1); + CheckEqual(output[0], {0}, "男默女泪"); } TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opEmpty) { @@ -78,14 +80,15 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opEmpty) { std::string dataset_path = datasets_root_path_ + "/jiebadict"; std::string hmm_path = dataset_path + "/hmm_model.utf8"; std::string mp_path = dataset_path + "/jieba.dict.utf8"; - std::shared_ptr output_tensor; + TensorRow input, output; std::unique_ptr op(new JiebaTokenizerOp(hmm_path, mp_path)); op->AddWord("男默女泪"); std::shared_ptr input_tensor = std::make_shared(""); - Status s = op->Compute(input_tensor, &output_tensor); + input.push_back(input_tensor); + Status s = op->Compute(input, &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output_tensor->Rank(), 1); - EXPECT_EQ(output_tensor->Size(), 1); - CheckEqual(output_tensor, {0}, ""); + EXPECT_EQ(output[0]->Rank(), 1); + EXPECT_EQ(output[0]->Size(), 1); + CheckEqual(output[0], {0}, ""); } \ No newline at end of file diff --git a/tests/ut/cpp/dataset/tokenizer_op_test.cc b/tests/ut/cpp/dataset/tokenizer_op_test.cc index 8a18f0da0c..afac92aa4b 100644 --- a/tests/ut/cpp/dataset/tokenizer_op_test.cc +++ b/tests/ut/cpp/dataset/tokenizer_op_test.cc @@ -45,227 +45,245 @@ class MindDataTestTokenizerOp : public UT::Common { TEST_F(MindDataTestTokenizerOp, TestUnicodeCharTokenizerOp) { MS_LOG(INFO) << "Doing TestUnicodeCharTokenizerOp."; - std::unique_ptr op(new UnicodeCharTokenizerOp()); + std::unique_ptr op(new UnicodeCharTokenizerOp(true)); std::shared_ptr input = std::make_shared("Hello World!"); - std::shared_ptr output; - Status s = op->Compute(input, &output); + TensorRow output; + Status s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 12); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor1: " << output->ToString(); - CheckEqual(output, {0}, "H"); - CheckEqual(output, {1}, "e"); - CheckEqual(output, {2}, "l"); - CheckEqual(output, {3}, "l"); - CheckEqual(output, {4}, "o"); - CheckEqual(output, {5}, " "); - CheckEqual(output, {6}, "W"); - CheckEqual(output, {7}, "o"); - CheckEqual(output, {8}, "r"); - CheckEqual(output, {9}, "l"); - CheckEqual(output, {10}, "d"); - CheckEqual(output, {11}, "!"); + EXPECT_EQ(output[0]->Size(), 12); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString(); + CheckEqual(output[0], {0}, "H"); + CheckEqual(output[0], {1}, "e"); + CheckEqual(output[0], {2}, "l"); + CheckEqual(output[0], {3}, "l"); + CheckEqual(output[0], 
{4}, "o"); + CheckEqual(output[0], {5}, " "); + CheckEqual(output[0], {6}, "W"); + CheckEqual(output[0], {7}, "o"); + CheckEqual(output[0], {8}, "r"); + CheckEqual(output[0], {9}, "l"); + CheckEqual(output[0], {10}, "d"); + CheckEqual(output[0], {11}, "!"); input = std::make_shared("中国 你好!"); - s = op->Compute(input, &output); + output.clear(); + s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 6); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor2: " << output->ToString(); - CheckEqual(output, {0}, "中"); - CheckEqual(output, {1}, "国"); - CheckEqual(output, {2}, " "); - CheckEqual(output, {3}, "你"); - CheckEqual(output, {4}, "好"); - CheckEqual(output, {5}, "!"); + EXPECT_EQ(output[0]->Size(), 6); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString(); + CheckEqual(output[0], {0}, "中"); + CheckEqual(output[0], {1}, "国"); + CheckEqual(output[0], {2}, " "); + CheckEqual(output[0], {3}, "你"); + CheckEqual(output[0], {4}, "好"); + CheckEqual(output[0], {5}, "!"); input = std::make_shared("中"); - s = op->Compute(input, &output); + output.clear(); + s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 1); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor3: " << output->ToString(); - CheckEqual(output, {0}, "中"); + EXPECT_EQ(output[0]->Size(), 1); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString(); + CheckEqual(output[0], {0}, "中"); input = std::make_shared("H"); - s = op->Compute(input, &output); + output.clear(); + s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 1); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor4: " << output->ToString(); - CheckEqual(output, {0}, "H"); + EXPECT_EQ(output[0]->Size(), 1); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString(); + CheckEqual(output[0], {0}, "H"); input = std::make_shared(" "); - s = op->Compute(input, &output); + output.clear(); + s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 2); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor5: " << output->ToString(); - CheckEqual(output, {0}, " "); - CheckEqual(output, {1}, " "); + EXPECT_EQ(output[0]->Size(), 2); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString(); + CheckEqual(output[0], {0}, " "); + CheckEqual(output[0], {1}, " "); input = std::make_shared(""); - s = op->Compute(input, &output); + output.clear(); + s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 1); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor6: " << output->ToString(); - CheckEqual(output, {0}, ""); + EXPECT_EQ(output[0]->Size(), 1); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString(); + CheckEqual(output[0], {0}, ""); } TEST_F(MindDataTestTokenizerOp, TestWhitespaceTokenizerOp) { MS_LOG(INFO) << "Doing TestWhitespaceTokenizerOp."; - std::unique_ptr op(new WhitespaceTokenizerOp()); + std::unique_ptr op(new WhitespaceTokenizerOp(true)); std::shared_ptr input = std::make_shared("Welcome to China."); - std::shared_ptr output; - Status s = op->Compute(input, &output); + TensorRow output; + Status s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 3); - EXPECT_EQ(output->Rank(), 
1); - MS_LOG(INFO) << "Out tensor1: " << output->ToString(); - CheckEqual(output, {0}, "Welcome"); - CheckEqual(output, {1}, "to"); - CheckEqual(output, {2}, "China."); + EXPECT_EQ(output[0]->Size(), 3); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString(); + CheckEqual(output[0], {0}, "Welcome"); + CheckEqual(output[0], {1}, "to"); + CheckEqual(output[0], {2}, "China."); input = std::make_shared(" hello"); - s = op->Compute(input, &output); + output.clear(); + s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 1); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor2: " << output->ToString(); - CheckEqual(output, {0}, "hello"); + EXPECT_EQ(output[0]->Size(), 1); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString(); + CheckEqual(output[0], {0}, "hello"); input = std::make_shared("hello"); - s = op->Compute(input, &output); + output.clear(); + s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 1); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor3: " << output->ToString(); - CheckEqual(output, {0}, "hello"); + EXPECT_EQ(output[0]->Size(), 1); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString(); + CheckEqual(output[0], {0}, "hello"); input = std::make_shared("hello "); - s = op->Compute(input, &output); + output.clear(); + s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 1); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor4: " << output->ToString(); - CheckEqual(output, {0}, "hello"); + EXPECT_EQ(output[0]->Size(), 1); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString(); + CheckEqual(output[0], {0}, "hello"); input = std::make_shared(" "); - s = op->Compute(input, &output); + output.clear(); + s = op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 1); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor5: " << output->ToString(); - CheckEqual(output, {0}, ""); + EXPECT_EQ(output[0]->Size(), 1); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString(); + CheckEqual(output[0], {0}, ""); } TEST_F(MindDataTestTokenizerOp, TestUnicodeScriptTokenizer) { MS_LOG(INFO) << "Doing TestUnicodeScriptTokenizer."; - std::unique_ptr keep_whitespace_op(new UnicodeScriptTokenizerOp(true)); - std::unique_ptr skip_whitespace_op(new UnicodeScriptTokenizerOp(false)); + std::unique_ptr keep_whitespace_op(new UnicodeScriptTokenizerOp(true, true)); + std::unique_ptr skip_whitespace_op(new UnicodeScriptTokenizerOp(false, true)); std::shared_ptr input = std::make_shared("Welcome to China. 
\n 中国\t北京"); - std::shared_ptr output; - Status s = keep_whitespace_op->Compute(input, &output); + TensorRow output; + Status s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 10); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor1: " << output->ToString(); - CheckEqual(output, {0}, "Welcome"); - CheckEqual(output, {1}, " "); - CheckEqual(output, {2}, "to"); - CheckEqual(output, {3}, " "); - CheckEqual(output, {4}, "China"); - CheckEqual(output, {5}, "."); - CheckEqual(output, {6}, " \n "); - CheckEqual(output, {7}, "中国"); - CheckEqual(output, {8}, "\t"); - CheckEqual(output, {9}, "北京"); - s = skip_whitespace_op->Compute(input, &output); + EXPECT_EQ(output[0]->Size(), 10); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString(); + CheckEqual(output[0], {0}, "Welcome"); + CheckEqual(output[0], {1}, " "); + CheckEqual(output[0], {2}, "to"); + CheckEqual(output[0], {3}, " "); + CheckEqual(output[0], {4}, "China"); + CheckEqual(output[0], {5}, "."); + CheckEqual(output[0], {6}, " \n "); + CheckEqual(output[0], {7}, "中国"); + CheckEqual(output[0], {8}, "\t"); + CheckEqual(output[0], {9}, "北京"); + output.clear(); + s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 6); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor2: " << output->ToString(); - CheckEqual(output, {0}, "Welcome"); - CheckEqual(output, {1}, "to"); - CheckEqual(output, {2}, "China"); - CheckEqual(output, {3}, "."); - CheckEqual(output, {4}, "中国"); - CheckEqual(output, {5}, "北京"); + EXPECT_EQ(output[0]->Size(), 6); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString(); + CheckEqual(output[0], {0}, "Welcome"); + CheckEqual(output[0], {1}, "to"); + CheckEqual(output[0], {2}, "China"); + CheckEqual(output[0], {3}, "."); + CheckEqual(output[0], {4}, "中国"); + CheckEqual(output[0], {5}, "北京"); input = std::make_shared(" Welcome to 中国. 
"); - s = skip_whitespace_op->Compute(input, &output); + output.clear(); + s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 4); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor3: " << output->ToString(); - CheckEqual(output, {0}, "Welcome"); - CheckEqual(output, {1}, "to"); - CheckEqual(output, {2}, "中国"); - CheckEqual(output, {3}, "."); - s = keep_whitespace_op->Compute(input, &output); + EXPECT_EQ(output[0]->Size(), 4); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString(); + CheckEqual(output[0], {0}, "Welcome"); + CheckEqual(output[0], {1}, "to"); + CheckEqual(output[0], {2}, "中国"); + CheckEqual(output[0], {3}, "."); + output.clear(); + s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 8); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor4: " << output->ToString(); - CheckEqual(output, {0}, " "); - CheckEqual(output, {1}, "Welcome"); - CheckEqual(output, {2}, " "); - CheckEqual(output, {3}, "to"); - CheckEqual(output, {4}, " "); - CheckEqual(output, {5}, "中国"); - CheckEqual(output, {6}, "."); - CheckEqual(output, {7}, " "); + EXPECT_EQ(output[0]->Size(), 8); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString(); + CheckEqual(output[0], {0}, " "); + CheckEqual(output[0], {1}, "Welcome"); + CheckEqual(output[0], {2}, " "); + CheckEqual(output[0], {3}, "to"); + CheckEqual(output[0], {4}, " "); + CheckEqual(output[0], {5}, "中国"); + CheckEqual(output[0], {6}, "."); + CheckEqual(output[0], {7}, " "); input = std::make_shared("Hello"); - s = keep_whitespace_op->Compute(input, &output); + output.clear(); + s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 1); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor5: " << output->ToString(); - CheckEqual(output, {0}, "Hello"); + EXPECT_EQ(output[0]->Size(), 1); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString(); + CheckEqual(output[0], {0}, "Hello"); input = std::make_shared("H"); - s = keep_whitespace_op->Compute(input, &output); + output.clear(); + s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 1); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor6: " << output->ToString(); - CheckEqual(output, {0}, "H"); + EXPECT_EQ(output[0]->Size(), 1); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString(); + CheckEqual(output[0], {0}, "H"); input = std::make_shared(""); - s = keep_whitespace_op->Compute(input, &output); + output.clear(); + s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 1); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor7: " << output->ToString(); - CheckEqual(output, {0}, ""); + EXPECT_EQ(output[0]->Size(), 1); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor7: " << output[0]->ToString(); + CheckEqual(output[0], {0}, ""); input = std::make_shared("Hello中国Hello世界"); - s = keep_whitespace_op->Compute(input, &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 4); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor8: " << output->ToString(); - CheckEqual(output, {0}, "Hello"); - CheckEqual(output, {1}, "中国"); - CheckEqual(output, {2}, "Hello"); - 
CheckEqual(output, {3}, "世界"); + output.clear(); + s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); + EXPECT_EQ(output[0]->Size(), 4); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor8: " << output[0]->ToString(); + CheckEqual(output[0], {0}, "Hello"); + CheckEqual(output[0], {1}, "中国"); + CheckEqual(output[0], {2}, "Hello"); + CheckEqual(output[0], {3}, "世界"); input = std::make_shared(" "); - s = keep_whitespace_op->Compute(input, &output); + output.clear(); + s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 1); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor10: " << output->ToString(); - CheckEqual(output, {0}, " "); + EXPECT_EQ(output[0]->Size(), 1); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor10: " << output[0]->ToString(); + CheckEqual(output[0], {0}, " "); input = std::make_shared(" "); - s = skip_whitespace_op->Compute(input, &output); + output.clear(); + s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); - EXPECT_EQ(output->Size(), 1); - EXPECT_EQ(output->Rank(), 1); - MS_LOG(INFO) << "Out tensor11: " << output->ToString(); - CheckEqual(output, {0}, ""); + EXPECT_EQ(output[0]->Size(), 1); + EXPECT_EQ(output[0]->Rank(), 1); + MS_LOG(INFO) << "Out tensor11: " << output[0]->ToString(); + CheckEqual(output[0], {0}, ""); } TEST_F(MindDataTestTokenizerOp, TestCaseFold) { @@ -321,10 +339,10 @@ TEST_F(MindDataTestTokenizerOp, TestRegexReplace) { TEST_F(MindDataTestTokenizerOp, TestRegexTokenizer) { MS_LOG(INFO) << "Doing TestRegexTokenizerOp."; - std::unique_ptr regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", "")); + std::unique_ptr regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", "", true)); std::shared_ptr input = std::make_shared("Welcome to China. \n 中国\t北京"); - std::shared_ptr output; - Status s = regex_tokenizer_op->Compute(input, &output); + TensorRow output; + Status s = regex_tokenizer_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); } @@ -332,9 +350,10 @@ TEST_F(MindDataTestTokenizerOp, TestBasicTokenizer) { MS_LOG(INFO) << "Doing TestBasicTokenizer."; //bool lower_case, bool keep_whitespace, // NormalizeForm normalization_form, bool preserve_unused_token - std::unique_ptr basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false)); + std::unique_ptr basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false, + true)); std::shared_ptr input = std::make_shared("Welcome to China. 中国\t北京"); - std::shared_ptr output; - Status s = basic_tokenizer->Compute(input, &output); + TensorRow output; + Status s = basic_tokenizer->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); } \ No newline at end of file diff --git a/tests/ut/python/dataset/test_basic_tokenizer.py b/tests/ut/python/dataset/test_basic_tokenizer.py deleted file mode 100644 index 45c9f94da4..0000000000 --- a/tests/ut/python/dataset/test_basic_tokenizer.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -""" -Testing BasicTokenizer op in DE -""" -import numpy as np -import mindspore.dataset as ds -from mindspore import log as logger -import mindspore.dataset.text as nlp - -BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt" - -test_paras = [ - dict( - first=1, - last=6, - expected_tokens= - [['Welcome', 'to', 'Beijing', '北', '京', '欢', '迎', '您'], - ['長', '風', '破', '浪', '會', '有', '時', ',', '直', '掛', '雲', '帆', '濟', '滄', '海'], - ['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'], - ['明', '朝', '(', '1368', '—', '1644', '年', ')', '和', '清', '朝', - '(', '1644', '—', '1911', '年', ')', ',', '是', '中', '国', '封', - '建', '王', '朝', '史', '上', '最', '后', '两', '个', '朝', '代'], - ['明', '代', '(', '1368', '-', '1644', ')', 'と', '清', '代', - '(', '1644', '-', '1911', ')', 'は', '、', '中', '国', 'の', '封', - '建', '王', '朝', 'の', '歴', '史', 'における', '最', '後', 'の2つの', '王', '朝', 'でした'], - ['명나라', '(', '1368', '-', '1644', ')', '와', '청나라', '(', '1644', '-', '1911', ')', '는', - '중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']] - ), - dict( - first=7, - last=7, - expected_tokens=[['this', 'is', 'a', 'funky', 'string']], - lower_case=True - ), -] - - -def check_basic_tokenizer(first, last, expected_tokens, lower_case=False, keep_whitespace=False, - normalization_form=nlp.utils.NormalizeForm.NONE, preserve_unused_token=False): - dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False) - if first > 1: - dataset = dataset.skip(first - 1) - if last >= first: - dataset = dataset.take(last - first + 1) - - basic_tokenizer = nlp.BasicTokenizer(lower_case=lower_case, - keep_whitespace=keep_whitespace, - normalization_form=normalization_form, - preserve_unused_token=preserve_unused_token) - - dataset = dataset.map(operations=basic_tokenizer) - count = 0 - for i in dataset.create_dict_iterator(): - text = nlp.to_str(i['text']) - logger.info("Out:", text) - logger.info("Exp:", expected_tokens[count]) - np.testing.assert_array_equal(text, expected_tokens[count]) - count = count + 1 - - -def test_basic_tokenizer(): - """ - Test BasicTokenizer - """ - for paras in test_paras: - check_basic_tokenizer(**paras) - - -if __name__ == '__main__': - test_basic_tokenizer() diff --git a/tests/ut/python/dataset/test_nlp_jieop.py b/tests/ut/python/dataset/test_nlp_jieop.py deleted file mode 100644 index 1ab53205d0..0000000000 --- a/tests/ut/python/dataset/test_nlp_jieop.py +++ /dev/null @@ -1,238 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -import numpy as np -import mindspore.dataset as ds -from mindspore.dataset.text import JiebaTokenizer -from mindspore.dataset.text import JiebaMode, to_str - -DATA_FILE = "../data/dataset/testJiebaDataset/3.txt" -DATA_ALL_FILE = "../data/dataset/testJiebaDataset/*" - -HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8" -MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8" - - -def test_jieba_1(): - """Test jieba tokenizer with MP mode""" - data = ds.TextFileDataset(DATA_FILE) - jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) - data = data.map(input_columns=["text"], - operations=jieba_op, num_parallel_workers=1) - expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] - ret = [] - for i in data.create_dict_iterator(): - ret = to_str(i["text"]) - for index, item in enumerate(ret): - assert item == expect[index] - - -def test_jieba_1_1(): - """Test jieba tokenizer with HMM mode""" - data = ds.TextFileDataset(DATA_FILE) - jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM) - data = data.map(input_columns=["text"], - operations=jieba_op, num_parallel_workers=1) - expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧'] - for i in data.create_dict_iterator(): - ret = to_str(i["text"]) - for index, item in enumerate(ret): - assert item == expect[index] - - -def test_jieba_1_2(): - """Test jieba tokenizer with HMM MIX""" - data = ds.TextFileDataset(DATA_FILE) - jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX) - data = data.map(input_columns=["text"], - operations=jieba_op, num_parallel_workers=1) - expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] - for i in data.create_dict_iterator(): - ret = to_str(i["text"]) - for index, item in enumerate(ret): - assert item == expect[index] - - -def test_jieba_2(): - """Test add_word""" - DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" - data = ds.TextFileDataset(DATA_FILE4) - jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) - jieba_op.add_word("男默女泪") - expect = ['男默女泪', '市', '长江大桥'] - data = data.map(input_columns=["text"], - operations=jieba_op, num_parallel_workers=2) - for i in data.create_dict_iterator(): - ret = to_str(i["text"]) - for index, item in enumerate(ret): - assert item == expect[index] - - -def test_jieba_2_1(): - """Test add_word with freq""" - DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" - data = ds.TextFileDataset(DATA_FILE4) - jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) - jieba_op.add_word("男默女泪", 10) - data = data.map(input_columns=["text"], - operations=jieba_op, num_parallel_workers=2) - expect = ['男默女泪', '市', '长江大桥'] - for i in data.create_dict_iterator(): - ret = to_str(i["text"]) - for index, item in enumerate(ret): - assert item == expect[index] - - -def test_jieba_2_2(): - """Test add_word with invalid None Input""" - jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) - try: - jieba_op.add_word(None) - except ValueError: - pass - - -def test_jieba_2_3(): - """Test add_word with freq, the value of freq affects the result of segmentation""" - DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt" - data = ds.TextFileDataset(DATA_FILE4) - jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) - jieba_op.add_word("江大桥", 20000) - data = data.map(input_columns=["text"], - operations=jieba_op, num_parallel_workers=2) - expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式'] - for i in data.create_dict_iterator(): 
- ret = to_str(i["text"]) - for index, item in enumerate(ret): - assert item == expect[index] - - -def test_jieba_3(): - """Test add_dict with dict""" - DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" - user_dict = { - "男默女泪": 10 - } - data = ds.TextFileDataset(DATA_FILE4) - jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) - jieba_op.add_dict(user_dict) - data = data.map(input_columns=["text"], - operations=jieba_op, num_parallel_workers=1) - expect = ['男默女泪', '市', '长江大桥'] - for i in data.create_dict_iterator(): - ret = to_str(i["text"]) - for index, item in enumerate(ret): - assert item == expect[index] - - -def test_jieba_3_1(): - """Test add_dict with dict""" - DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" - user_dict = { - "男默女泪": 10, - "江大桥": 20000 - } - data = ds.TextFileDataset(DATA_FILE4) - jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) - jieba_op.add_dict(user_dict) - data = data.map(input_columns=["text"], - operations=jieba_op, num_parallel_workers=1) - expect = ['男默女泪', '市长', '江大桥'] - for i in data.create_dict_iterator(): - ret = to_str(i["text"]) - for index, item in enumerate(ret): - assert item == expect[index] - - -def test_jieba_4(): - DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt" - DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt" - - data = ds.TextFileDataset(DATA_FILE4) - jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) - jieba_op.add_dict(DICT_FILE) - data = data.map(input_columns=["text"], - operations=jieba_op, num_parallel_workers=1) - expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] - for i in data.create_dict_iterator(): - ret = to_str(i["text"]) - for index, item in enumerate(ret): - assert item == expect[index] - - -def test_jieba_4_1(): - """Test add dict with invalid file path""" - DICT_FILE = "" - jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) - try: - jieba_op.add_dict(DICT_FILE) - except ValueError: - pass - - -def test_jieba_5(): - """Test add dict with file path""" - DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt" - - data = ds.TextFileDataset(DATA_FILE4) - jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) - jieba_op.add_word("江大桥", 20000) - data = data.map(input_columns=["text"], - operations=jieba_op, num_parallel_workers=1) - expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式'] - for i in data.create_dict_iterator(): - ret = to_str(i["text"]) - for index, item in enumerate(ret): - assert item == expect[index] - - -def gen(): - text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S') - yield (text,) - - -def pytoken_op(input_data): - te = str(to_str(input_data)) - tokens = [] - tokens.append(te[:5].encode("UTF8")) - tokens.append(te[5:10].encode("UTF8")) - tokens.append(te[10:].encode("UTF8")) - return np.array(tokens, dtype='S') - - -def test_jieba_6(): - data = ds.GeneratorDataset(gen, column_names=["text"]) - data = data.map(input_columns=["text"], - operations=pytoken_op, num_parallel_workers=1) - expect = ['今天天气太', '好了我们一', '起去外面玩吧'] - for i in data.create_dict_iterator(): - ret = to_str(i["text"]) - for index, item in enumerate(ret): - assert item == expect[index] - - -if __name__ == "__main__": - test_jieba_1() - test_jieba_1_1() - test_jieba_1_2() - test_jieba_2() - test_jieba_2_1() - test_jieba_2_2() - test_jieba_3() - test_jieba_3_1() - test_jieba_4() - test_jieba_4_1() - test_jieba_5() - test_jieba_5() - test_jieba_6() diff --git a/tests/ut/python/dataset/test_text_basic_tokenizer.py 
b/tests/ut/python/dataset/test_text_basic_tokenizer.py new file mode 100644 index 0000000000..822790fd60 --- /dev/null +++ b/tests/ut/python/dataset/test_text_basic_tokenizer.py @@ -0,0 +1,138 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Testing BasicTokenizer op in DE +""" +import numpy as np +import mindspore.dataset as ds +from mindspore import log as logger +import mindspore.dataset.text as text + +BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt" + +test_paras = [ + dict( + first=1, + last=6, + expected_tokens= + [['Welcome', 'to', 'Beijing', '北', '京', '欢', '迎', '您'], + ['長', '風', '破', '浪', '會', '有', '時', ',', '直', '掛', '雲', '帆', '濟', '滄', '海'], + ['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'], + ['明', '朝', '(', '1368', '—', '1644', '年', ')', '和', '清', '朝', + '(', '1644', '—', '1911', '年', ')', ',', '是', '中', '国', '封', + '建', '王', '朝', '史', '上', '最', '后', '两', '个', '朝', '代'], + ['明', '代', '(', '1368', '-', '1644', ')', 'と', '清', '代', + '(', '1644', '-', '1911', ')', 'は', '、', '中', '国', 'の', '封', + '建', '王', '朝', 'の', '歴', '史', 'における', '最', '後', 'の2つの', '王', '朝', 'でした'], + ['명나라', '(', '1368', '-', '1644', ')', '와', '청나라', '(', '1644', '-', '1911', ')', '는', + '중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']], + expected_offsets_start=[[0, 8, 11, 18, 21, 24, 27, 30], + [0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42], + [0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37], + [0, 3, 6, 9, 13, 16, 20, 23, 26, 29, 32, 35, 38, 42, 45, 49, + 52, 55, 58, 61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100], + [0, 3, 6, 9, 13, 14, 18, 21, 24, 27, 30, 33, 37, 38, 42, 45, 48, 51, + 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 93, 96, 99, 109, 112, 115], + [0, 10, 11, 15, 16, 20, 21, 25, 35, 36, 40, 41, 45, 46, 50, 57, 64, 74, 87, 97, 101]], + expected_offsets_limit=[[7, 10, 18, 21, 24, 27, 30, 33], + [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45], + [4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40], + [3, 6, 9, 13, 16, 20, 23, 26, 29, 32, 35, 38, 42, 45, 49, 52, 55, 58, + 61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 103], + [3, 6, 9, 13, 14, 18, 21, 24, 27, 30, 33, 37, 38, 42, 45, 48, 51, 54, + 57, 60, 63, 66, 69, 72, 75, 78, 81, 93, 96, 99, 109, 112, 115, 124], + [9, 11, 15, 16, 20, 21, 24, 34, 36, 40, 41, 45, 46, 49, 56, 63, 73, 86, 96, 100, 113]] + ), + dict( + first=7, + last=7, + expected_tokens=[['this', 'is', 'a', 'funky', 'string']], + expected_offsets_start=[[0, 5, 8, 10, 16]], + expected_offsets_limit=[[4, 7, 9, 15, 22]], + lower_case=True + ), +] + + +def check_basic_tokenizer_default(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit, + lower_case=False, keep_whitespace=False, + normalization_form=text.utils.NormalizeForm.NONE, preserve_unused_token=False): + dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False) + if first > 1: + dataset = 
dataset.skip(first - 1) + if last >= first: + dataset = dataset.take(last - first + 1) + + basic_tokenizer = text.BasicTokenizer(lower_case=lower_case, + keep_whitespace=keep_whitespace, + normalization_form=normalization_form, + preserve_unused_token=preserve_unused_token) + + dataset = dataset.map(operations=basic_tokenizer) + count = 0 + for i in dataset.create_dict_iterator(): + token = text.to_str(i['text']) + logger.info("Out:", token) + logger.info("Exp:", expected_tokens[count]) + np.testing.assert_array_equal(token, expected_tokens[count]) + count = count + 1 + + +def check_basic_tokenizer_with_offsets(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit, + lower_case=False, keep_whitespace=False, + normalization_form=text.utils.NormalizeForm.NONE, preserve_unused_token=False): + dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False) + if first > 1: + dataset = dataset.skip(first - 1) + if last >= first: + dataset = dataset.take(last - first + 1) + + basic_tokenizer = text.BasicTokenizer(lower_case=lower_case, + keep_whitespace=keep_whitespace, + normalization_form=normalization_form, + preserve_unused_token=preserve_unused_token, + with_offsets=True) + + dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], + columns_order=['token', 'offsets_start', 'offsets_limit'], operations=basic_tokenizer) + count = 0 + for i in dataset.create_dict_iterator(): + token = text.to_str(i['token']) + logger.info("Out:", token) + logger.info("Exp:", expected_tokens[count]) + np.testing.assert_array_equal(token, expected_tokens[count]) + np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) + np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) + count = count + 1 + +def test_basic_tokenizer_with_offsets(): + """ + Test BasicTokenizer + """ + for paras in test_paras: + check_basic_tokenizer_with_offsets(**paras) + + +def test_basic_tokenizer_default(): + """ + Test BasicTokenizer + """ + for paras in test_paras: + check_basic_tokenizer_default(**paras) + + +if __name__ == '__main__': + test_basic_tokenizer_default() + test_basic_tokenizer_with_offsets() diff --git a/tests/ut/python/dataset/test_bert_tokenizer.py b/tests/ut/python/dataset/test_text_bert_tokenizer.py similarity index 51% rename from tests/ut/python/dataset/test_bert_tokenizer.py rename to tests/ut/python/dataset/test_text_bert_tokenizer.py index ba487343a0..b29f94eb32 100644 --- a/tests/ut/python/dataset/test_bert_tokenizer.py +++ b/tests/ut/python/dataset/test_text_bert_tokenizer.py @@ -18,7 +18,7 @@ Testing BertTokenizer op in DE import numpy as np import mindspore.dataset as ds from mindspore import log as logger -import mindspore.dataset.text as nlp +import mindspore.dataset.text as text BERT_TOKENIZER_FILE = "../data/dataset/testTokenizerData/bert_tokenizer.txt" @@ -39,6 +39,14 @@ test_paras = [ ['疑', '是', '地', '上', '霜'], ['举', '头', '望', '明', '月'], ['低', '头', '思', '故', '乡']], + expected_offsets_start=[[0, 3, 6, 9, 12], + [0, 3, 6, 9, 12], + [0, 3, 6, 9, 12], + [0, 3, 6, 9, 12]], + expected_offsets_limit=[[3, 6, 9, 12, 15], + [3, 6, 9, 12, 15], + [3, 6, 9, 12, 15], + [3, 6, 9, 12, 15]], vocab_list=vocab_bert ), # test english text @@ -46,6 +54,8 @@ test_paras = [ first=5, last=5, expect_str=[['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']], + expected_offsets_start=[[0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]], + expected_offsets_limit=[[1, 
4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]], lower_case=True, vocab_list=vocab_bert ), @@ -53,6 +63,8 @@ test_paras = [ first=5, last=5, expect_str=[['I', "am", 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']], + expected_offsets_start=[[0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]], + expected_offsets_limit=[[1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]], lower_case=False, vocab_list=vocab_bert ), @@ -63,7 +75,9 @@ test_paras = [ expect_str=[ ['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'], ['繁', '體', '字']], - normalization_form=nlp.utils.NormalizeForm.NFKC, + expected_offsets_start=[[0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37], [0, 3, 6]], + expected_offsets_limit=[[4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40], [3, 6, 9]], + normalization_form=text.utils.NormalizeForm.NFKC, vocab_list=vocab_bert ), # test preserved tokens @@ -79,6 +93,8 @@ test_paras = [ ['[unused1]'], ['[unused10]'] ], + expected_offsets_start=[[0, 7], [0, 7], [0, 7], [0, 7], [0, 7], [0], [0]], + expected_offsets_limit=[[6, 12], [6, 12], [6, 12], [6, 12], [6, 13], [9], [10]], lower_case=False, vocab_list=vocab_bert, preserve_unused_token=True, @@ -95,6 +111,8 @@ test_paras = [ ['[unused1]'], ['[unused10]'] ], + expected_offsets_start=[[0, 7], [0, 7], [0, 7], [0, 7], [0, 7], [0], [0]], + expected_offsets_limit=[[6, 12], [6, 12], [6, 12], [6, 12], [6, 13], [9], [10]], lower_case=True, vocab_list=vocab_bert, preserve_unused_token=True, @@ -104,6 +122,8 @@ test_paras = [ first=15, last=15, expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']], + expected_offsets_start=[[0, 2, 3, 4, 5, 7, 8, 10, 11, 12]], + expected_offsets_limit=[[2, 3, 4, 5, 7, 8, 10, 11, 12, 14]], preserve_unused_token=True, vocab_list=vocab_bert ), @@ -112,6 +132,8 @@ test_paras = [ first=8, last=8, expect_str=[['[UNK]', ' ', '[CLS]']], + expected_offsets_start=[[0, 6, 7]], + expected_offsets_limit=[[6, 7, 12]], lower_case=False, vocab_list=vocab_bert, preserve_unused_token=True, @@ -121,6 +143,8 @@ test_paras = [ first=8, last=8, expect_str=[['unused', ' ', '[CLS]']], + expected_offsets_start=[[0, 6, 7]], + expected_offsets_limit=[[6, 7, 12]], lower_case=False, vocab_list=vocab_bert, preserve_unused_token=True, @@ -131,6 +155,8 @@ test_paras = [ first=8, last=8, expect_str=[['unused', ' ', '[', 'CLS', ']']], + expected_offsets_start=[[0, 6, 7, 8, 11]], + expected_offsets_limit=[[6, 7, 8, 11, 12]], lower_case=False, vocab_list=vocab_bert, preserve_unused_token=False, @@ -140,20 +166,20 @@ test_paras = [ ] -def check_bert_tokenizer(first, last, expect_str, - vocab_list, - suffix_indicator='##', - max_bytes_per_token=100, unknown_token='[UNK]', - lower_case=False, keep_whitespace=False, - normalization_form=nlp.utils.NormalizeForm.NONE, - preserve_unused_token=False): +def check_bert_tokenizer_default(first, last, expect_str, + expected_offsets_start, expected_offsets_limit, + vocab_list, suffix_indicator='##', + max_bytes_per_token=100, unknown_token='[UNK]', + lower_case=False, keep_whitespace=False, + normalization_form=text.utils.NormalizeForm.NONE, + preserve_unused_token=False): dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False) if first > 1: dataset = dataset.skip(first - 1) if last >= first: dataset = dataset.take(last - first + 1) - vocab = nlp.Vocab.from_list(vocab_list) - tokenizer_op = nlp.BertTokenizer( + vocab = text.Vocab.from_list(vocab_list) + tokenizer_op = text.BertTokenizer( vocab=vocab, suffix_indicator=suffix_indicator, 
max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token, lower_case=lower_case, keep_whitespace=keep_whitespace, @@ -162,20 +188,59 @@ def check_bert_tokenizer(first, last, expect_str, dataset = dataset.map(operations=tokenizer_op) count = 0 for i in dataset.create_dict_iterator(): - text = nlp.to_str(i['text']) - logger.info("Out:", text) + token = text.to_str(i['text']) + logger.info("Out:", token) logger.info("Exp:", expect_str[count]) - np.testing.assert_array_equal(text, expect_str[count]) + np.testing.assert_array_equal(token, expect_str[count]) count = count + 1 -def test_bert_tokenizer(): +def check_bert_tokenizer_with_offsets(first, last, expect_str, + expected_offsets_start, expected_offsets_limit, + vocab_list, suffix_indicator='##', + max_bytes_per_token=100, unknown_token='[UNK]', + lower_case=False, keep_whitespace=False, + normalization_form=text.utils.NormalizeForm.NONE, + preserve_unused_token=False): + dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False) + if first > 1: + dataset = dataset.skip(first - 1) + if last >= first: + dataset = dataset.take(last - first + 1) + vocab = text.Vocab.from_list(vocab_list) + tokenizer_op = text.BertTokenizer( + vocab=vocab, suffix_indicator=suffix_indicator, max_bytes_per_token=max_bytes_per_token, + unknown_token=unknown_token, lower_case=lower_case, keep_whitespace=keep_whitespace, + normalization_form=normalization_form, preserve_unused_token=preserve_unused_token, with_offsets=True) + dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], + columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op) + count = 0 + for i in dataset.create_dict_iterator(): + token = text.to_str(i['token']) + logger.info("Out:", token) + logger.info("Exp:", expect_str[count]) + np.testing.assert_array_equal(token, expect_str[count]) + np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) + np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) + count = count + 1 + + +def test_bert_tokenizer_default(): + """ + Test BertTokenizer when with_offsets=False + """ + for paras in test_paras: + check_bert_tokenizer_default(**paras) + + +def test_bert_tokenizer_with_offsets(): """ - Test WordpieceTokenizer + Test BertTokenizer when with_offsets=True """ for paras in test_paras: - check_bert_tokenizer(**paras) + check_bert_tokenizer_with_offsets(**paras) if __name__ == '__main__': - test_bert_tokenizer() + test_bert_tokenizer_default() + test_bert_tokenizer_with_offsets() diff --git a/tests/ut/python/dataset/test_text_jieba_tokenizer.py b/tests/ut/python/dataset/test_text_jieba_tokenizer.py new file mode 100644 index 0000000000..66665b61e6 --- /dev/null +++ b/tests/ut/python/dataset/test_text_jieba_tokenizer.py @@ -0,0 +1,471 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +import numpy as np +import mindspore.dataset as ds +from mindspore.dataset.text import JiebaTokenizer +from mindspore.dataset.text import JiebaMode, to_str + +DATA_FILE = "../data/dataset/testJiebaDataset/3.txt" +DATA_ALL_FILE = "../data/dataset/testJiebaDataset/*" + +HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8" +MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8" + + +def test_jieba_1(): + """Test jieba tokenizer with MP mode""" + data = ds.TextFileDataset(DATA_FILE) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) + data = data.map(input_columns=["text"], + operations=jieba_op, num_parallel_workers=1) + expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] + ret = [] + for i in data.create_dict_iterator(): + ret = to_str(i["text"]) + for index, item in enumerate(ret): + assert item == expect[index] + + +def test_jieba_1_1(): + """Test jieba tokenizer with HMM mode""" + data = ds.TextFileDataset(DATA_FILE) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM) + data = data.map(input_columns=["text"], + operations=jieba_op, num_parallel_workers=1) + expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧'] + for i in data.create_dict_iterator(): + ret = to_str(i["text"]) + for index, item in enumerate(ret): + assert item == expect[index] + + +def test_jieba_1_2(): + """Test jieba tokenizer with HMM MIX""" + data = ds.TextFileDataset(DATA_FILE) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX) + data = data.map(input_columns=["text"], + operations=jieba_op, num_parallel_workers=1) + expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] + for i in data.create_dict_iterator(): + ret = to_str(i["text"]) + for index, item in enumerate(ret): + assert item == expect[index] + + +def test_jieba_2(): + """Test add_word""" + DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" + data = ds.TextFileDataset(DATA_FILE4) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) + jieba_op.add_word("男默女泪") + expect = ['男默女泪', '市', '长江大桥'] + data = data.map(input_columns=["text"], + operations=jieba_op, num_parallel_workers=2) + for i in data.create_dict_iterator(): + ret = to_str(i["text"]) + for index, item in enumerate(ret): + assert item == expect[index] + + +def test_jieba_2_1(): + """Test add_word with freq""" + DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" + data = ds.TextFileDataset(DATA_FILE4) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) + jieba_op.add_word("男默女泪", 10) + data = data.map(input_columns=["text"], + operations=jieba_op, num_parallel_workers=2) + expect = ['男默女泪', '市', '长江大桥'] + for i in data.create_dict_iterator(): + ret = to_str(i["text"]) + for index, item in enumerate(ret): + assert item == expect[index] + + +def test_jieba_2_2(): + """Test add_word with invalid None Input""" + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) + try: + jieba_op.add_word(None) + except ValueError: + pass + + +def test_jieba_2_3(): + """Test add_word with freq, the value of freq affects the result of segmentation""" + DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt" + data = ds.TextFileDataset(DATA_FILE4) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) + jieba_op.add_word("江大桥", 20000) + data = data.map(input_columns=["text"], + operations=jieba_op, num_parallel_workers=2) + expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式'] + for i in data.create_dict_iterator(): 
+ ret = to_str(i["text"]) + for index, item in enumerate(ret): + assert item == expect[index] + + +def test_jieba_3(): + """Test add_dict with dict""" + DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" + user_dict = { + "男默女泪": 10 + } + data = ds.TextFileDataset(DATA_FILE4) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) + jieba_op.add_dict(user_dict) + data = data.map(input_columns=["text"], + operations=jieba_op, num_parallel_workers=1) + expect = ['男默女泪', '市', '长江大桥'] + for i in data.create_dict_iterator(): + ret = to_str(i["text"]) + for index, item in enumerate(ret): + assert item == expect[index] + + +def test_jieba_3_1(): + """Test add_dict with dict""" + DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" + user_dict = { + "男默女泪": 10, + "江大桥": 20000 + } + data = ds.TextFileDataset(DATA_FILE4) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) + jieba_op.add_dict(user_dict) + data = data.map(input_columns=["text"], + operations=jieba_op, num_parallel_workers=1) + expect = ['男默女泪', '市长', '江大桥'] + for i in data.create_dict_iterator(): + ret = to_str(i["text"]) + for index, item in enumerate(ret): + assert item == expect[index] + + +def test_jieba_4(): + DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt" + DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt" + + data = ds.TextFileDataset(DATA_FILE4) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) + jieba_op.add_dict(DICT_FILE) + data = data.map(input_columns=["text"], + operations=jieba_op, num_parallel_workers=1) + expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] + for i in data.create_dict_iterator(): + ret = to_str(i["text"]) + for index, item in enumerate(ret): + assert item == expect[index] + + +def test_jieba_4_1(): + """Test add dict with invalid file path""" + DICT_FILE = "" + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) + try: + jieba_op.add_dict(DICT_FILE) + except ValueError: + pass + + +def test_jieba_5(): + """Test add dict with file path""" + DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt" + + data = ds.TextFileDataset(DATA_FILE4) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) + jieba_op.add_word("江大桥", 20000) + data = data.map(input_columns=["text"], + operations=jieba_op, num_parallel_workers=1) + expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式'] + for i in data.create_dict_iterator(): + ret = to_str(i["text"]) + for index, item in enumerate(ret): + assert item == expect[index] + + +def test_jieba_with_offsets_1(): + """Test jieba tokenizer with MP mode""" + data = ds.TextFileDataset(DATA_FILE) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) + data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + columns_order=["token", "offsets_start", "offsets_limit"], + operations=jieba_op, num_parallel_workers=1) + expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] + expected_offsets_start = [0, 12, 21, 27, 33, 36, 42] + expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48] + ret = [] + for i in data.create_dict_iterator(): + ret = to_str(i["token"]) + for index, item in enumerate(ret): + assert item == expect[index] + for index, item in enumerate(i["offsets_start"]): + assert item == expected_offsets_start[index] + for index, item in enumerate(i["offsets_limit"]): + assert item == expected_offsets_limit[index] + + +def test_jieba_with_offsets_1_1(): + """Test jieba tokenizer with HMM mode""" + data = 
ds.TextFileDataset(DATA_FILE) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM, with_offsets=True) + data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + columns_order=["token", "offsets_start", "offsets_limit"], + operations=jieba_op, num_parallel_workers=1) + expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧'] + expected_offsets_start = [0, 6, 12, 15, 18, 21, 27, 33, 36, 42, 45] + expected_offsets_limit = [6, 12, 15, 18, 21, 27, 33, 36, 42, 45, 48] + for i in data.create_dict_iterator(): + ret = to_str(i["token"]) + for index, item in enumerate(ret): + assert item == expect[index] + for index, item in enumerate(i["offsets_start"]): + assert item == expected_offsets_start[index] + for index, item in enumerate(i["offsets_limit"]): + assert item == expected_offsets_limit[index] + + +def test_jieba_with_offsets_1_2(): + """Test jieba tokenizer with HMM MIX""" + data = ds.TextFileDataset(DATA_FILE) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX, with_offsets=True) + data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + columns_order=["token", "offsets_start", "offsets_limit"], + operations=jieba_op, num_parallel_workers=1) + expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] + expected_offsets_start = [0, 12, 21, 27, 33, 36, 42] + expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48] + for i in data.create_dict_iterator(): + ret = to_str(i["token"]) + for index, item in enumerate(ret): + assert item == expect[index] + for index, item in enumerate(i["offsets_start"]): + assert item == expected_offsets_start[index] + for index, item in enumerate(i["offsets_limit"]): + assert item == expected_offsets_limit[index] + + +def test_jieba_with_offsets_2(): + """Test add_word""" + DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" + data = ds.TextFileDataset(DATA_FILE4) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) + jieba_op.add_word("男默女泪") + expect = ['男默女泪', '市', '长江大桥'] + data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + columns_order=["token", "offsets_start", "offsets_limit"], + operations=jieba_op, num_parallel_workers=2) + expected_offsets_start = [0, 12, 15] + expected_offsets_limit = [12, 15, 27] + for i in data.create_dict_iterator(): + ret = to_str(i["token"]) + for index, item in enumerate(ret): + assert item == expect[index] + for index, item in enumerate(i["offsets_start"]): + assert item == expected_offsets_start[index] + for index, item in enumerate(i["offsets_limit"]): + assert item == expected_offsets_limit[index] + + +def test_jieba_with_offsets_2_1(): + """Test add_word with freq""" + DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" + data = ds.TextFileDataset(DATA_FILE4) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) + jieba_op.add_word("男默女泪", 10) + data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + columns_order=["token", "offsets_start", "offsets_limit"], + operations=jieba_op, num_parallel_workers=2) + expect = ['男默女泪', '市', '长江大桥'] + expected_offsets_start = [0, 12, 15] + expected_offsets_limit = [12, 15, 27] + for i in data.create_dict_iterator(): + ret = to_str(i["token"]) + for index, item in enumerate(ret): + assert item == expect[index] + for index, item in enumerate(i["offsets_start"]): + assert item == expected_offsets_start[index] 
+ for index, item in enumerate(i["offsets_limit"]): + assert item == expected_offsets_limit[index] + + +def test_jieba_with_offsets_2_2(): + """Test add_word with freq, the value of freq affects the result of segmentation""" + DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt" + data = ds.TextFileDataset(DATA_FILE4) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) + jieba_op.add_word("江大桥", 20000) + data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + columns_order=["token", "offsets_start", "offsets_limit"], + operations=jieba_op, num_parallel_workers=2) + expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式'] + expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51] + expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57] + for i in data.create_dict_iterator(): + ret = to_str(i["token"]) + for index, item in enumerate(ret): + assert item == expect[index] + for index, item in enumerate(i["offsets_start"]): + assert item == expected_offsets_start[index] + for index, item in enumerate(i["offsets_limit"]): + assert item == expected_offsets_limit[index] + + +def test_jieba_with_offsets_3(): + """Test add_dict with dict""" + DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" + user_dict = { + "男默女泪": 10 + } + data = ds.TextFileDataset(DATA_FILE4) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) + jieba_op.add_dict(user_dict) + data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + columns_order=["token", "offsets_start", "offsets_limit"], + operations=jieba_op, num_parallel_workers=1) + expect = ['男默女泪', '市', '长江大桥'] + expected_offsets_start = [0, 12, 15] + expected_offsets_limit = [12, 15, 27] + for i in data.create_dict_iterator(): + ret = to_str(i["token"]) + for index, item in enumerate(ret): + assert item == expect[index] + for index, item in enumerate(i["offsets_start"]): + assert item == expected_offsets_start[index] + for index, item in enumerate(i["offsets_limit"]): + assert item == expected_offsets_limit[index] + + +def test_jieba_with_offsets_3_1(): + """Test add_dict with dict""" + DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" + user_dict = { + "男默女泪": 10, + "江大桥": 20000 + } + data = ds.TextFileDataset(DATA_FILE4) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) + jieba_op.add_dict(user_dict) + data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + columns_order=["token", "offsets_start", "offsets_limit"], + operations=jieba_op, num_parallel_workers=1) + expect = ['男默女泪', '市长', '江大桥'] + expected_offsets_start = [0, 12, 18] + expected_offsets_limit = [12, 18, 27] + for i in data.create_dict_iterator(): + ret = to_str(i["token"]) + for index, item in enumerate(ret): + assert item == expect[index] + for index, item in enumerate(i["offsets_start"]): + assert item == expected_offsets_start[index] + for index, item in enumerate(i["offsets_limit"]): + assert item == expected_offsets_limit[index] + + +def test_jieba_with_offsets_4(): + DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt" + DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt" + + data = ds.TextFileDataset(DATA_FILE4) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) + jieba_op.add_dict(DICT_FILE) + data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + 
columns_order=["token", "offsets_start", "offsets_limit"], + operations=jieba_op, num_parallel_workers=1) + expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] + expected_offsets_start = [0, 12, 21, 27, 33, 36, 42] + expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48] + for i in data.create_dict_iterator(): + ret = to_str(i["token"]) + for index, item in enumerate(ret): + assert item == expect[index] + for index, item in enumerate(i["offsets_start"]): + assert item == expected_offsets_start[index] + for index, item in enumerate(i["offsets_limit"]): + assert item == expected_offsets_limit[index] + + +def test_jieba_with_offsets_5(): + """Test add dict with file path""" + DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt" + + data = ds.TextFileDataset(DATA_FILE4) + jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) + jieba_op.add_word("江大桥", 20000) + data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], + columns_order=["token", "offsets_start", "offsets_limit"], + operations=jieba_op, num_parallel_workers=1) + expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式'] + expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51] + expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57] + for i in data.create_dict_iterator(): + ret = to_str(i["token"]) + for index, item in enumerate(ret): + assert item == expect[index] + for index, item in enumerate(i["offsets_start"]): + assert item == expected_offsets_start[index] + for index, item in enumerate(i["offsets_limit"]): + assert item == expected_offsets_limit[index] + +def gen(): + text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S') + yield (text,) + + +def pytoken_op(input_data): + te = str(to_str(input_data)) + tokens = [] + tokens.append(te[:5].encode("UTF8")) + tokens.append(te[5:10].encode("UTF8")) + tokens.append(te[10:].encode("UTF8")) + return np.array(tokens, dtype='S') + + +def test_jieba_6(): + data = ds.GeneratorDataset(gen, column_names=["text"]) + data = data.map(input_columns=["text"], + operations=pytoken_op, num_parallel_workers=1) + expect = ['今天天气太', '好了我们一', '起去外面玩吧'] + for i in data.create_dict_iterator(): + ret = to_str(i["text"]) + for index, item in enumerate(ret): + assert item == expect[index] + + +if __name__ == "__main__": + test_jieba_1() + test_jieba_1_1() + test_jieba_1_2() + test_jieba_2() + test_jieba_2_1() + test_jieba_2_2() + test_jieba_3() + test_jieba_3_1() + test_jieba_4() + test_jieba_4_1() + test_jieba_5() + test_jieba_5() + test_jieba_6() + test_jieba_with_offsets_1() + test_jieba_with_offsets_1_1() + test_jieba_with_offsets_1_2() + test_jieba_with_offsets_2() + test_jieba_with_offsets_2_1() + test_jieba_with_offsets_2_2() + test_jieba_with_offsets_3() + test_jieba_with_offsets_3_1() + test_jieba_with_offsets_4() + test_jieba_with_offsets_5() diff --git a/tests/ut/python/dataset/test_text_tokenizer.py b/tests/ut/python/dataset/test_text_tokenizer.py new file mode 100644 index 0000000000..2e2b7b741d --- /dev/null +++ b/tests/ut/python/dataset/test_text_tokenizer.py @@ -0,0 +1,380 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Testing UnicodeCharTokenizer op in DE +""" +import numpy as np +import mindspore.dataset as ds +from mindspore import log as logger +import mindspore.dataset.text as text + +DATA_FILE = "../data/dataset/testTokenizerData/1.txt" +NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt" +REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt" +REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt" + + +def split_by_unicode_char(input_strs): + """ + Split utf-8 strings to unicode characters + """ + out = [] + for s in input_strs: + out.append([c for c in s]) + return out + + +def test_unicode_char_tokenizer_default(): + """ + Test UnicodeCharTokenizer + """ + input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", " ") + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + tokenizer = text.UnicodeCharTokenizer() + dataset = dataset.map(operations=tokenizer) + tokens = [] + for i in dataset.create_dict_iterator(): + token = text.to_str(i['text']).tolist() + tokens.append(token) + logger.info("The out tokens is : {}".format(tokens)) + assert split_by_unicode_char(input_strs) == tokens + + +def test_unicode_char_tokenizer_with_offsets(): + """ + Test UnicodeCharTokenizer + """ + input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", " ") + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + tokenizer = text.UnicodeCharTokenizer(with_offsets=True) + dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], + columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer) + tokens = [] + expected_offsets_start = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], + [0, 3, 6, 9, 12, 15], [0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16], [0, 1]] + expected_offsets_limit = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [3, 6, 9, 12, 15, 18], [3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17], [1, 2]] + count = 0 + for i in dataset.create_dict_iterator(): + token = text.to_str(i['token']).tolist() + tokens.append(token) + np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) + np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) + count += 1 + logger.info("The out tokens is : {}".format(tokens)) + assert split_by_unicode_char(input_strs) == tokens + + +def test_whitespace_tokenizer_default(): + """ + Test WhitespaceTokenizer + """ + whitespace_strs = [["Welcome", "to", "Beijing!"], + ["北京欢迎您!"], + ["我喜欢English!"], + [""]] + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + tokenizer = text.WhitespaceTokenizer() + dataset = dataset.map(operations=tokenizer) + tokens = [] + for i in dataset.create_dict_iterator(): + token = text.to_str(i['text']).tolist() + tokens.append(token) + logger.info("The out tokens is : {}".format(tokens)) + assert whitespace_strs == tokens + + +def test_whitespace_tokenizer_with_offsets(): + """ + Test WhitespaceTokenizer + """ + whitespace_strs = [["Welcome", 
"to", "Beijing!"], + ["北京欢迎您!"], + ["我喜欢English!"], + [""]] + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + tokenizer = text.WhitespaceTokenizer(with_offsets=True) + dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], + columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer) + tokens = [] + expected_offsets_start = [[0, 8, 11], [0], [0], [0]] + expected_offsets_limit = [[7, 10, 19], [18], [17], [0]] + count = 0 + for i in dataset.create_dict_iterator(): + token = text.to_str(i['token']).tolist() + tokens.append(token) + np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) + np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) + count += 1 + + logger.info("The out tokens is : {}".format(tokens)) + assert whitespace_strs == tokens + + +def test_unicode_script_tokenizer_default(): + """ + Test UnicodeScriptTokenizer when para keep_whitespace=False + """ + unicode_script_strs = [["Welcome", "to", "Beijing", "!"], + ["北京欢迎您", "!"], + ["我喜欢", "English", "!"], + [""]] + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False) + dataset = dataset.map(operations=tokenizer) + + tokens = [] + for i in dataset.create_dict_iterator(): + token = text.to_str(i['text']).tolist() + tokens.append(token) + logger.info("The out tokens is : {}".format(tokens)) + assert unicode_script_strs == tokens + + +def test_unicode_script_tokenizer_default2(): + """ + Test UnicodeScriptTokenizer when para keep_whitespace=True + """ + unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"], + ["北京欢迎您", "!"], + ["我喜欢", "English", "!"], + [" "]] + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True) + dataset = dataset.map(operations=tokenizer) + tokens = [] + for i in dataset.create_dict_iterator(): + token = text.to_str(i['text']).tolist() + tokens.append(token) + logger.info("The out tokens is :", tokens) + assert unicode_script_strs2 == tokens + + +def test_unicode_script_tokenizer_with_offsets(): + """ + Test UnicodeScriptTokenizer when para keep_whitespace=False and with_offsets=True + """ + unicode_script_strs = [["Welcome", "to", "Beijing", "!"], + ["北京欢迎您", "!"], + ["我喜欢", "English", "!"], + [""]] + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False, with_offsets=True) + dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], + columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer) + tokens = [] + expected_offsets_start = [[0, 8, 11, 18], [0, 15], [0, 9, 16], [0]] + expected_offsets_limit = [[7, 10, 18, 19], [15, 18], [9, 16, 17], [0]] + count = 0 + for i in dataset.create_dict_iterator(): + token = text.to_str(i['token']).tolist() + tokens.append(token) + np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) + np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) + count += 1 + logger.info("The out tokens is : {}".format(tokens)) + assert unicode_script_strs == tokens + + +def test_unicode_script_tokenizer_with_offsets2(): + """ + Test UnicodeScriptTokenizer when para keep_whitespace=True and with_offsets=True + """ + unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"], + ["北京欢迎您", "!"], + ["我喜欢", "English", "!"], + [" "]] 
+ dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True) + dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], + columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer) + tokens = [] + expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]] + expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17], [2]] + count = 0 + for i in dataset.create_dict_iterator(): + token = text.to_str(i['token']).tolist() + tokens.append(token) + np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) + np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) + count += 1 + logger.info("The out tokens is :", tokens) + assert unicode_script_strs2 == tokens + + +def test_case_fold(): + """ + Test CaseFold + """ + expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "] + dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) + op = text.CaseFold() + dataset = dataset.map(operations=op) + + lower_strs = [] + for i in dataset.create_dict_iterator(): + token = text.to_str(i['text']).tolist() + lower_strs.append(token) + assert lower_strs == expect_strs + + +def test_normalize_utf8(): + """ + Test NormalizeUTF8 + """ + + def normalize(normalize_form): + dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False) + normalize = text.NormalizeUTF8(normalize_form=normalize_form) + dataset = dataset.map(operations=normalize) + out_bytes = [] + out_texts = [] + for i in dataset.create_dict_iterator(): + out_bytes.append(i['text']) + out_texts.append(text.to_str(i['text']).tolist()) + logger.info("The out bytes is : ", out_bytes) + logger.info("The out texts is: ", out_texts) + return out_bytes + + expect_normlize_data = [ + # NFC + [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87', + b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'], + # NFKC + [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87', + b'fi', b'25', b'\xe1\xb9\xa9'], + # NFD + [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87', + b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'], + # NFKD + [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87', + b'fi', b'25', b's\xcc\xa3\xcc\x87'] + ] + assert normalize(text.utils.NormalizeForm.NFC) == expect_normlize_data[0] + assert normalize(text.utils.NormalizeForm.NFKC) == expect_normlize_data[1] + assert normalize(text.utils.NormalizeForm.NFD) == expect_normlize_data[2] + assert normalize(text.utils.NormalizeForm.NFKD) == expect_normlize_data[3] + + +def test_regex_replace(): + """ + Test RegexReplace + """ + + def regex_replace(first, last, expect_str, pattern, replace): + dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False) + if first > 1: + dataset = dataset.skip(first - 1) + if last >= first: + dataset = dataset.take(last - first + 1) + replace_op = text.RegexReplace(pattern, replace) + dataset = dataset.map(operations=replace_op) + out_text = [] + for i in dataset.create_dict_iterator(): + token = text.to_str(i['text']).tolist() + out_text.append(token) + logger.info("Out:", out_text) + logger.info("Exp:", expect_str) + assert expect_str == out_text + + regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_') + regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "") + regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "") + regex_replace(7, 8, ['我不想长大', 
'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "") + + +def test_regex_tokenizer_default(): + """ + Test RegexTokenizer + """ + + def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern): + dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False) + if first > 1: + dataset = dataset.skip(first - 1) + if last >= first: + dataset = dataset.take(last - first + 1) + tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern) + dataset = dataset.map(operations=tokenizer_op) + out_text = [] + count = 0 + for i in dataset.create_dict_iterator(): + token = text.to_str(i['text']).tolist() + np.testing.assert_array_equal(token, expect_str[count]) + count += 1 + out_text.append(token) + logger.info("Out:", out_text) + logger.info("Exp:", expect_str) + + regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "") + regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+") + regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}") + regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+") + regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "") + regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "") + + +def test_regex_tokenizer_with_offsets(): + """ + Test RegexTokenizer + """ + + def regex_tokenizer(first, last, expect_str, expected_offsets_start, expected_offsets_limit, delim_pattern, + keep_delim_pattern): + dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False) + if first > 1: + dataset = dataset.skip(first - 1) + if last >= first: + dataset = dataset.take(last - first + 1) + tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True) + dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], + columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op) + out_text = [] + count = 0 + for i in dataset.create_dict_iterator(): + token = text.to_str(i['token']).tolist() + np.testing.assert_array_equal(token, expect_str[count]) + np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) + np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) + count += 1 + out_text.append(token) + logger.info("Out:", out_text) + logger.info("Exp:", expect_str) + + regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], [[0, 8, 11]], [[7, 10, 20]], "\\s+", "") + regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], [[0, 7, 8, 10, 11]], [[7, 8, 10, 11, 20]], + "\\s+", "\\s+") + regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], [[0, 3, 6, 9, 12, 15]], + [[3, 6, 9, 12, 15, 35]], r"\p{Han}", r"\p{Han}") + regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], [[0, 2, 6, 8]], [[2, 6, 8, 13]], + r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+") + regex_tokenizer(3, 3, [['12', '36']], [[0, 6]], [[2, 8]], r"[\p{P}|\p{S}]+", "") + regex_tokenizer(3, 3, [['¥+', '¥=?']], [[2, 8]], [[6, 13]], r"[\p{N}]+", "") + + +if __name__ == '__main__': + test_unicode_char_tokenizer_default() + test_unicode_char_tokenizer_with_offsets() + test_whitespace_tokenizer_default() + test_whitespace_tokenizer_with_offsets() + test_unicode_script_tokenizer_default() + test_unicode_script_tokenizer_default2() + test_unicode_script_tokenizer_with_offsets() + test_unicode_script_tokenizer_with_offsets2() + test_case_fold() + test_normalize_utf8() + test_regex_replace() + test_regex_tokenizer_default() + 
test_regex_tokenizer_with_offsets() diff --git a/tests/ut/python/dataset/test_text_wordpiece_tokenizer.py b/tests/ut/python/dataset/test_text_wordpiece_tokenizer.py new file mode 100644 index 0000000000..8b47ec971e --- /dev/null +++ b/tests/ut/python/dataset/test_text_wordpiece_tokenizer.py @@ -0,0 +1,160 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Testing WordpieceTokenizer op in DE +""" +import numpy as np +import mindspore.dataset as ds +from mindspore import log as logger +import mindspore.dataset.text as text + +WORDPIECE_TOKENIZER_FILE = "../data/dataset/testTokenizerData/wordpiece_tokenizer.txt" + +vocab_english = [ + "book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the" +] + +vocab_chinese = [ + "我", '最', '喜', '欢', '的', '书', '是', '霍', '乱', '时', '期', '爱', '情' +] + +vocab_mix = vocab_chinese + vocab_english + +test_paras = [ + dict( + first=1, + last=10, + expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], + ['era'], ['[UNK]']], + expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0]], + expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4]], + vocab_list=vocab_english + ), + dict( + first=1, + last=10, + expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], + ['era'], ['what']], + expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0]], + expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4]], + vocab_list=vocab_english, + unknown_token="" + ), + dict( + first=1, + last=10, + expect_str=[['my'], ['[UNK]'], ['book'], ['is'], ['love'], ['[UNK]'], ['the'], ['[UNK]'], ['era'], ['[UNK]']], + expected_offsets_start=[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], + expected_offsets_limit=[[2], [5], [4], [2], [4], [5], [3], [5], [3], [4]], + vocab_list=vocab_english, + max_bytes_per_token=4 + ), + dict( + first=11, + last=25, + expect_str=[['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'], + ['[UNK]']], + expected_offsets_start=[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], + expected_offsets_limit=[[3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3]], + vocab_list=vocab_chinese, + ), + dict( + first=25, + last=25, + expect_str=[['您']], + expected_offsets_start=[[0]], + expected_offsets_limit=[[3]], + vocab_list=vocab_chinese, + unknown_token="" + ), + dict( + first=1, + last=25, + expect_str=[ + ['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], ['era'], + ['[UNK]'], + ['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'], + ['[UNK]']], + expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0], + [0], [0], 
[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], + expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4], + [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3]], + vocab_list=vocab_mix, + ), +] + + +def check_wordpiece_tokenizer_default(first, last, expect_str, expected_offsets_start, expected_offsets_limit, + vocab_list, unknown_token='[UNK]', max_bytes_per_token=100): + dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False) + if first > 1: + dataset = dataset.skip(first - 1) + if last >= first: + dataset = dataset.take(last - first + 1) + vocab = text.Vocab.from_list(vocab_list) + tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token, + max_bytes_per_token=max_bytes_per_token) + dataset = dataset.map(operations=tokenizer_op) + count = 0 + for i in dataset.create_dict_iterator(): + token = text.to_str(i['text']) + logger.info("Out:", token) + logger.info("Exp:", expect_str[count]) + np.testing.assert_array_equal(token, expect_str[count]) + count = count + 1 + + +def check_wordpiece_tokenizer_with_offsets(first, last, expect_str, expected_offsets_start, expected_offsets_limit, + vocab_list, unknown_token='[UNK]', max_bytes_per_token=100): + dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False) + if first > 1: + dataset = dataset.skip(first - 1) + if last >= first: + dataset = dataset.take(last - first + 1) + vocab = text.Vocab.from_list(vocab_list) + tokenizer_op = text.WordpieceTokenizer(vocab=vocab, with_offsets=True, unknown_token=unknown_token, + max_bytes_per_token=max_bytes_per_token) + dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], + columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op) + count = 0 + for i in dataset.create_dict_iterator(): + token = text.to_str(i['token']) + logger.info("Out:", token) + logger.info("Exp:", expect_str[count]) + np.testing.assert_array_equal(token, expect_str[count]) + np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) + np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) + count = count + 1 + + +def test_wordpiece_tokenizer_default(): + """ + Test WordpieceTokenizer + """ + for paras in test_paras: + check_wordpiece_tokenizer_default(**paras) + + +def test_wordpiece_tokenizer_with_offsets(): + """ + Test WordpieceTokenizer + """ + for paras in test_paras: + check_wordpiece_tokenizer_with_offsets(**paras) + + +if __name__ == '__main__': + test_wordpiece_tokenizer_default() + test_wordpiece_tokenizer_with_offsets() diff --git a/tests/ut/python/dataset/test_tokenizer.py b/tests/ut/python/dataset/test_tokenizer.py deleted file mode 100644 index 2ec988d8dc..0000000000 --- a/tests/ut/python/dataset/test_tokenizer.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -""" -Testing UnicodeCharTokenizer op in DE -""" -import numpy as np -import mindspore.dataset as ds -from mindspore import log as logger -import mindspore.dataset.text as nlp - -DATA_FILE = "../data/dataset/testTokenizerData/1.txt" -NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt" -REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt" -REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt" - - -def split_by_unicode_char(input_strs): - """ - Split utf-8 strings to unicode characters - """ - out = [] - for s in input_strs: - out.append([c for c in s]) - return out - - -def test_unicode_char_tokenizer(): - """ - Test UnicodeCharTokenizer - """ - input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", " ") - dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) - tokenizer = nlp.UnicodeCharTokenizer() - dataset = dataset.map(operations=tokenizer) - tokens = [] - for i in dataset.create_dict_iterator(): - text = nlp.to_str(i['text']).tolist() - tokens.append(text) - logger.info("The out tokens is : {}".format(tokens)) - assert split_by_unicode_char(input_strs) == tokens - - -def test_whitespace_tokenizer(): - """ - Test WhitespaceTokenizer - """ - whitespace_strs = [["Welcome", "to", "Beijing!"], - ["北京欢迎您!"], - ["我喜欢English!"], - [""]] - dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) - tokenizer = nlp.WhitespaceTokenizer() - dataset = dataset.map(operations=tokenizer) - tokens = [] - for i in dataset.create_dict_iterator(): - text = nlp.to_str(i['text']).tolist() - tokens.append(text) - logger.info("The out tokens is : {}".format(tokens)) - assert whitespace_strs == tokens - - -def test_unicode_script_tokenizer(): - """ - Test UnicodeScriptTokenizer when para keep_whitespace=False - """ - unicode_script_strs = [["Welcome", "to", "Beijing", "!"], - ["北京欢迎您", "!"], - ["我喜欢", "English", "!"], - [""]] - dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) - tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=False) - dataset = dataset.map(operations=tokenizer) - - tokens = [] - for i in dataset.create_dict_iterator(): - text = nlp.to_str(i['text']).tolist() - tokens.append(text) - logger.info("The out tokens is : {}".format(tokens)) - assert unicode_script_strs == tokens - - -def test_unicode_script_tokenizer2(): - """ - Test UnicodeScriptTokenizer when para keep_whitespace=True - """ - unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"], - ["北京欢迎您", "!"], - ["我喜欢", "English", "!"], - [" "]] - dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) - tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=True) - dataset = dataset.map(operations=tokenizer) - tokens = [] - for i in dataset.create_dict_iterator(): - text = nlp.to_str(i['text']).tolist() - tokens.append(text) - logger.info("The out tokens is :", tokens) - assert unicode_script_strs2 == tokens - - -def test_case_fold(): - """ - Test CaseFold - """ - expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "] - dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) - op = nlp.CaseFold() - dataset = dataset.map(operations=op) - - lower_strs = [] - for i in dataset.create_dict_iterator(): - text = nlp.to_str(i['text']).tolist() - lower_strs.append(text) - assert lower_strs == expect_strs - - -def test_normalize_utf8(): - """ - Test NormalizeUTF8 - """ - - def normalize(normalize_form): - dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False) - 
normalize = nlp.NormalizeUTF8(normalize_form=normalize_form) - dataset = dataset.map(operations=normalize) - out_bytes = [] - out_texts = [] - for i in dataset.create_dict_iterator(): - out_bytes.append(i['text']) - out_texts.append(nlp.to_str(i['text']).tolist()) - logger.info("The out bytes is : ", out_bytes) - logger.info("The out texts is: ", out_texts) - return out_bytes - - expect_normlize_data = [ - # NFC - [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87', - b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'], - # NFKC - [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87', - b'fi', b'25', b'\xe1\xb9\xa9'], - # NFD - [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87', - b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'], - # NFKD - [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87', - b'fi', b'25', b's\xcc\xa3\xcc\x87'] - ] - assert normalize(nlp.utils.NormalizeForm.NFC) == expect_normlize_data[0] - assert normalize(nlp.utils.NormalizeForm.NFKC) == expect_normlize_data[1] - assert normalize(nlp.utils.NormalizeForm.NFD) == expect_normlize_data[2] - assert normalize(nlp.utils.NormalizeForm.NFKD) == expect_normlize_data[3] - - -def test_regex_replace(): - """ - Test RegexReplace - """ - - def regex_replace(first, last, expect_str, pattern, replace): - dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False) - if first > 1: - dataset = dataset.skip(first - 1) - if last >= first: - dataset = dataset.take(last - first + 1) - replace_op = nlp.RegexReplace(pattern, replace) - dataset = dataset.map(operations=replace_op) - out_text = [] - for i in dataset.create_dict_iterator(): - text = nlp.to_str(i['text']).tolist() - out_text.append(text) - logger.info("Out:", out_text) - logger.info("Exp:", expect_str) - assert expect_str == out_text - - regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_') - regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "") - regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "") - regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "") - - -def test_regex_tokenizer(): - """ - Test RegexTokenizer - """ - - def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern): - dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False) - if first > 1: - dataset = dataset.skip(first - 1) - if last >= first: - dataset = dataset.take(last - first + 1) - tokenizer_op = nlp.RegexTokenizer(delim_pattern, keep_delim_pattern) - dataset = dataset.map(operations=tokenizer_op) - out_text = [] - count = 0 - for i in dataset.create_dict_iterator(): - text = nlp.to_str(i['text']).tolist() - np.testing.assert_array_equal(text, expect_str[count]) - count += 1 - out_text.append(text) - logger.info("Out:", out_text) - logger.info("Exp:", expect_str) - - regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "") - regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+") - regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}") - regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+") - regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "") - regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "") - - -if __name__ == '__main__': - test_unicode_char_tokenizer() - test_whitespace_tokenizer() - test_unicode_script_tokenizer() - test_unicode_script_tokenizer2() - test_case_fold() - test_normalize_utf8() - test_regex_replace() 
- test_regex_tokenizer() diff --git a/tests/ut/python/dataset/test_wordpiece_tokenizer.py b/tests/ut/python/dataset/test_wordpiece_tokenizer.py deleted file mode 100644 index 7934884740..0000000000 --- a/tests/ut/python/dataset/test_wordpiece_tokenizer.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -""" -Testing WordpieceTokenizer op in DE -""" -import numpy as np -import mindspore.dataset as ds -from mindspore import log as logger -import mindspore.dataset.text as nlp - -WORDPIECE_TOKENIZER_FILE = "../data/dataset/testTokenizerData/wordpiece_tokenizer.txt" - -vocab_english = [ - "book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the" -] - -vocab_chinese = [ - "我", '最', '喜', '欢', '的', '书', '是', '霍', '乱', '时', '期', '爱', '情' -] - -vocab_mix = vocab_chinese + vocab_english - -test_paras = [ - dict( - first=1, - last=10, - expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], - ['era'], ['[UNK]']], - vocab_list=vocab_english - ), - dict( - first=1, - last=10, - expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], - ['era'], ['what']], - vocab_list=vocab_english, - unknown_token="" - ), - dict( - first=1, - last=10, - expect_str=[['my'], ['[UNK]'], ['book'], ['is'], ['love'], ['[UNK]'], ['the'], ['[UNK]'], ['era'], ['[UNK]']], - vocab_list=vocab_english, - max_bytes_per_token=4 - ), - dict( - first=11, - last=25, - expect_str=[['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'], - ['[UNK]']], - vocab_list=vocab_chinese, - ), - dict( - first=25, - last=25, - expect_str=[['您']], - vocab_list=vocab_chinese, - unknown_token="" - ), - dict( - first=1, - last=25, - expect_str=[ - ['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], ['era'], - ['[UNK]'], - ['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'], - ['[UNK]']], - vocab_list=vocab_mix, - ), -] - - -def check_wordpiece_tokenizer(first, last, expect_str, vocab_list, unknown_token='[UNK]', max_bytes_per_token=100): - dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False) - if first > 1: - dataset = dataset.skip(first - 1) - if last >= first: - dataset = dataset.take(last - first + 1) - vocab = nlp.Vocab.from_list(vocab_list) - tokenizer_op = nlp.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token, - max_bytes_per_token=max_bytes_per_token) - dataset = dataset.map(operations=tokenizer_op) - count = 0 - for i in dataset.create_dict_iterator(): - text = nlp.to_str(i['text']) - logger.info("Out:", text) - logger.info("Exp:", expect_str[count]) - np.testing.assert_array_equal(text, expect_str[count]) - count = count + 1 - - -def test_wordpiece_tokenizer(): - """ - Test WordpieceTokenizer - """ - for 
paras in test_paras: - check_wordpiece_tokenizer(**paras) - - -if __name__ == '__main__': - test_wordpiece_tokenizer()