Merge pull request !2941 from xiefangqi/md_add_offsets_feature (tag: v0.6.0-beta)
@@ -601,13 +601,14 @@ void bindTensorOps4(py::module *m) {
void bindTokenizerOps(py::module *m) {
(void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m, "JiebaTokenizerOp", "")
.def(py::init<const std::string, std::string, JiebaMode>(), py::arg("hmm_path"), py::arg("mp_path"),
py::arg("mode") = JiebaMode::kMix)
.def(py::init<const std::string &, const std::string &, const JiebaMode &, const bool &>(), py::arg("hmm_path"),
py::arg("mp_path"), py::arg("mode") = JiebaMode::kMix,
py::arg("with_offsets") = JiebaTokenizerOp::kDefWithOffsets)
.def("add_word",
[](JiebaTokenizerOp &self, const std::string word, int freq) { THROW_IF_ERROR(self.AddWord(word, freq)); });
(void)py::class_<UnicodeCharTokenizerOp, TensorOp, std::shared_ptr<UnicodeCharTokenizerOp>>(
*m, "UnicodeCharTokenizerOp", "Tokenize a scalar tensor of UTF-8 string to Unicode characters.")
.def(py::init<>());
.def(py::init<const bool &>(), py::arg("with_offsets") = UnicodeCharTokenizerOp::kDefWithOffsets);
(void)py::class_<LookupOp, TensorOp, std::shared_ptr<LookupOp>>(*m, "LookupOp",
"Tensor operation to LookUp each word.")
.def(py::init([](std::shared_ptr<Vocab> vocab, const py::object &py_word) {
@@ -632,21 +633,25 @@ void bindTokenizerOps(py::module *m) {
py::arg("separator"));
(void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
*m, "WordpieceTokenizerOp", "Tokenize scalar token or 1-D tokens to subword tokens.")
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &>(),
py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken));
.def(
py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, const bool &>(),
py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets);
}
void bindDependIcuTokenizerOps(py::module *m) {
#ifdef ENABLE_ICU4C
(void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
*m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces.")
.def(py::init<>());
.def(py::init<const bool &>(), py::arg("with_offsets") = WhitespaceTokenizerOp::kDefWithOffsets);
(void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
*m, "UnicodeScriptTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.")
.def(py::init<>())
.def(py::init<bool>(), py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace);
.def(py::init<const bool &, const bool &>(),
py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace,
py::arg("with_offsets") = UnicodeScriptTokenizerOp::kDefWithOffsets);
(void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(
*m, "CaseFoldOp", "Apply case fold operation on utf-8 string tensor")
.def(py::init<>());
@@ -660,24 +665,28 @@ void bindDependIcuTokenizerOps(py::module *m) {
py::arg("replace_all"));
(void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(
*m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by regex expression pattern.")
.def(py::init<const std::string &, const std::string &>(), py::arg("delim_pattern"), py::arg("keep_delim_pattern"));
.def(py::init<const std::string &, const std::string &, const bool &>(), py::arg("delim_pattern"),
py::arg("keep_delim_pattern"), py::arg("with_offsets") = RegexTokenizerOp::kDefWithOffsets);
(void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(
*m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.")
.def(py::init<bool, bool, NormalizeForm, bool>(), py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
.def(py::init<const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>(),
py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken,
py::arg("with_offsets") = BasicTokenizerOp::kDefWithOffsets);
(void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(*m, "BertTokenizerOp",
"Tokenizer used for Bert text process.")
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, bool, bool,
NormalizeForm, bool>(),
.def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, const bool &,
const bool &, const NormalizeForm &, const bool &, const bool &>(),
py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken,
py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets);
#endif
}
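The bindings above keep `with_offsets` optional and defaulted to false, so existing pipelines are unchanged. A minimal usage sketch of the Python API added later in this PR (dataset/file and column names are illustrative, not part of the diff):

```python
import mindspore.dataset as ds
import mindspore.dataset.text as text

data = ds.TextFileDataset("corpus.txt")  # hypothetical one-column ["text"] dataset

# Default behaviour: a single output column, as before this change.
data_tokens = data.map(operations=text.WhitespaceTokenizer())

# with_offsets=True: three output columns ["token", "offsets_start", "offsets_limit"].
tokenizer_op = text.WhitespaceTokenizer(True)
data_offsets = data.map(input_columns=["text"],
                        output_columns=["token", "offsets_start", "offsets_limit"],
                        columns_order=["token", "offsets_start", "offsets_limit"],
                        operations=tokenizer_op)
```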
@@ -27,10 +27,12 @@
namespace mindspore {
namespace dataset {
const bool BasicTokenizerOp::kDefLowerCase = false;
const bool BasicTokenizerOp::kDefKeepWhitespace = false;
const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone;
const bool BasicTokenizerOp::kDefPreserveUnusedToken = true;
const bool BasicTokenizerOp::kDefWithOffsets = false;
const char BasicTokenizerOp::kCommonPattern[] =
"[!-/]"
"|[:-@]"
@@ -47,11 +49,14 @@ const char BasicTokenizerOp::kCommonPattern[] =
"|[\\x{2F800}-\\x{2FA1F}]";
const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|\\[unused\\d+\\]|";
const std::unordered_set<std::string> BasicTokenizerOp::kUnusedWords{"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"};
BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form,
bool preserve_unused_token)
BasicTokenizerOp::BasicTokenizerOp(const bool &lower_case, const bool &keep_whitespace,
const NormalizeForm &normalization_form, const bool &preserve_unused_token,
const bool &with_offsets)
: lower_case_(lower_case),
keep_whitespace_(keep_whitespace),
preserve_unused_token_(preserve_unused_token),
with_offsets_(with_offsets),
case_fold_(std::make_unique<CaseFoldOp>()),
nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
normalization_form_(normalization_form),
@@ -69,7 +74,7 @@ BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, Normal
keep_delim_pattern = kUnusedPattern + keep_delim_pattern;
delim_pattern = kUnusedPattern + delim_pattern;
}
regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern);
regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern, with_offsets_);
}
Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::string_view &text,
@@ -135,9 +140,10 @@ Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::shared_ptr<Tensor
return Status::OK();
}
Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
Status BasicTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::shared_ptr<Tensor> cur_input;
@@ -145,10 +151,10 @@ Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shar
if (lower_case_) {
if (!preserve_unused_token_) {
// to lower case
RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor));
RETURN_IF_NOT_OK(case_fold_->Compute(input[0], &processed_tensor));
} else {
// to lower case except words in kUnusedWords
RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input, &processed_tensor));
RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input[0], &processed_tensor));
}
cur_input = processed_tensor;
// strip accent characters
@@ -156,12 +162,12 @@ Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shar
cur_input = processed_tensor;
RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor));
} else {
RETURN_IF_NOT_OK(common_normalize_->Compute(input, &processed_tensor));
RETURN_IF_NOT_OK(common_normalize_->Compute(input[0], &processed_tensor));
}
// strip control characters
cur_input = processed_tensor;
RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor));
return regex_tokenizer_->Compute(processed_tensor, output);
return regex_tokenizer_->Compute(TensorRow(0, {std::move(processed_tensor)}), output);
}
} // namespace dataset
} // namespace mindspore
@@ -36,15 +36,18 @@ class BasicTokenizerOp : public TensorOp {
static const bool kDefKeepWhitespace;
static const NormalizeForm kDefNormalizationForm;
static const bool kDefPreserveUnusedToken;
explicit BasicTokenizerOp(bool lower_case = kDefLowerCase, bool keep_whitespace = kDefKeepWhitespace,
NormalizeForm normalization_form = kDefNormalizationForm,
bool preserve_unused_token = kDefPreserveUnusedToken);
static const bool kDefWithOffsets;
explicit BasicTokenizerOp(const bool &lower_case = kDefLowerCase, const bool &keep_whitespace = kDefKeepWhitespace,
const NormalizeForm &normalization_form = kDefNormalizationForm,
const bool &preserve_unused_token = kDefPreserveUnusedToken,
const bool &with_offsets = kDefWithOffsets);
~BasicTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "BasicTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
protected:
Status CaseFoldWithoutUnusedWords(const std::string_view &text, const std::unordered_set<std::string> &unused_words,
@@ -55,6 +58,7 @@ class BasicTokenizerOp : public TensorOp {
static const char kCommonPattern[];
static const char kUnusedPattern[];
static const std::unordered_set<std::string> kUnusedWords;
bool with_offsets_;
bool lower_case_;
bool keep_whitespace_;
NormalizeForm normalization_form_;
@@ -16,9 +16,9 @@
#include "dataset/text/kernels/bert_tokenizer_op.h"
namespace mindspore {
namespace dataset {
Status BertTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
std::shared_ptr<Tensor> basic_tensor;
Status BertTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
TensorRow basic_tensor;
RETURN_IF_NOT_OK(basic_tokenizer_.Compute(input, &basic_tensor));
RETURN_IF_NOT_OK(wordpiece_tokenizer_.Compute(basic_tensor, output));
return Status::OK();
@@ -32,18 +32,19 @@ class BertTokenizerOp : public TensorOp {
const std::string &suffix_indicator = WordpieceTokenizerOp::kDefSuffixIndicator,
const int &max_bytes_per_token = WordpieceTokenizerOp::kDefMaxBytesPerToken,
const std::string &unknown_token = WordpieceTokenizerOp::kDefUnknownToken,
bool lower_case = BasicTokenizerOp::kDefLowerCase,
bool keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
NormalizeForm normalization_form = BasicTokenizerOp::kDefNormalizationForm,
bool preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken)
: wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token),
basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token) {}
const bool &lower_case = BasicTokenizerOp::kDefLowerCase,
const bool &keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
const NormalizeForm &normalization_form = BasicTokenizerOp::kDefNormalizationForm,
const bool &preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken,
const bool &with_offsets = WordpieceTokenizerOp::kDefWithOffsets)
: wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets),
basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token, with_offsets) {}
~BertTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "BertTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
private:
WordpieceTokenizerOp wordpiece_tokenizer_;
@@ -23,35 +23,63 @@
namespace mindspore {
namespace dataset {
JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::string &dict_path, JiebaMode mode)
: jieba_mode_(mode), hmm_model_path_(hmm_path), mp_dict_path_(dict_path) {
const bool JiebaTokenizerOp::kDefWithOffsets = false;
JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::string &dict_path, const JiebaMode &mode,
const bool &with_offsets)
: jieba_mode_(mode), hmm_model_path_(hmm_path), mp_dict_path_(dict_path), with_offsets_(with_offsets) {
jieba_parser_ = std::make_unique<cppjieba::Jieba>(mp_dict_path_, hmm_model_path_, "");
}
Status JiebaTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
Status JiebaTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
RETURN_UNEXPECTED_IF_NULL(jieba_parser_);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("the input tensor should be scalar string tensor");
}
std::string_view sentence_v;
RETURN_IF_NOT_OK(input->GetItemAt(&sentence_v, {}));
RETURN_IF_NOT_OK(input[0]->GetItemAt(&sentence_v, {}));
std::string sentence{sentence_v};
std::vector<std::string> words;
std::vector<uint32_t> offsets_start, offsets_limit;
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
if (sentence == "") {
words.push_back("");
} else {
std::vector<cppjieba::Word> tmp;
if (jieba_mode_ == JiebaMode::kMp) {
jieba_parser_->CutSmall(sentence, words, MAX_WORD_LENGTH);
std::unique_ptr<cppjieba::MPSegment> mp_seg = std::make_unique<cppjieba::MPSegment>(jieba_parser_->GetDictTrie());
mp_seg->Cut(sentence, tmp, MAX_WORD_LENGTH);
} else if (jieba_mode_ == JiebaMode::kHmm) {
jieba_parser_->CutHMM(sentence, words);
std::unique_ptr<cppjieba::HMMSegment> hmm_seg =
std::make_unique<cppjieba::HMMSegment>(jieba_parser_->GetHMMModel());
hmm_seg->Cut(sentence, tmp);
} else { // Mix
jieba_parser_->Cut(sentence, words, true);
std::unique_ptr<cppjieba::MixSegment> mix_seg =
std::make_unique<cppjieba::MixSegment>(jieba_parser_->GetDictTrie(), jieba_parser_->GetHMMModel());
mix_seg->Cut(sentence, tmp, true);
}
GetStringsFromWords(tmp, words);
for (auto item : tmp) {
offsets_start.push_back(static_cast<uint32_t>(item.offset));
offsets_limit.push_back(static_cast<uint32_t>(item.offset + item.word.length()));
}
}
*output = std::make_shared<Tensor>(words, TensorShape({(dsize_t)words.size()}));
token_tensor = std::make_shared<Tensor>(words, TensorShape({(dsize_t)words.size()}));
output->push_back(token_tensor);
if (with_offsets_) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_start[0])));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
output->push_back(offsets_start_tensor);
output->push_back(offsets_limit_tensor);
}
return Status::OK();
}
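The offsets pushed above are byte positions into the UTF-8 sentence: `offsets_start[i] = item.offset` and `offsets_limit[i] = item.offset + item.word.length()`. A self-contained sketch of that convention (the segmentation below is assumed for illustration, not produced by cppjieba here):

```python
# Purely illustrative: check that (start, limit) byte offsets slice the
# original UTF-8 sentence back into the tokens.
sentence = "今天天气太好了我们一起去外面玩吧"
tokens = ["今天天气", "太好了", "我们", "一起", "去", "外面", "玩", "吧"]  # assumed MIX-mode cut

raw = sentence.encode("utf-8")
offsets_start, offsets_limit, pos = [], [], 0
for tok in tokens:
    n = len(tok.encode("utf-8"))
    offsets_start.append(pos)
    offsets_limit.append(pos + n)
    pos += n

for tok, s, l in zip(tokens, offsets_start, offsets_limit):
    assert raw[s:l].decode("utf-8") == tok
```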
@@ -30,15 +30,19 @@ enum class JiebaMode { kMix = 0, kMp = 1, kHmm = 2 };
class JiebaTokenizerOp : public TensorOp {
public:
// deffault constant for Jieba MPSegment algorithm.
// default constant for Jieba MPSegment algorithm.
static constexpr size_t MAX_WORD_LENGTH = 512;
// default const for set whether Jieba output offsets tensor.
static const bool kDefWithOffsets;
// Constructor for JiebaTokenizerOp.
// @param hmm_path HMM model file.
// @param mp_path MP model file.
// @mode tokenization mode [Default "MIX"], "MP" model will tokenize with MPSegment algorithm, "HMM" mode will
// tokenize with Hiddel Markov Model Segment algorithm, "MIx" model will tokenize with a mix of MPSegment and
// HMMSegment algorithm.
JiebaTokenizerOp(const std::string &hmm_path, const std::string &mp_path, JiebaMode mode = JiebaMode::kMix);
// @with_offsets user set this value to choose whether output offset tensor.
JiebaTokenizerOp(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode = JiebaMode::kMix,
const bool &with_offsets = kDefWithOffsets);
~JiebaTokenizerOp() override = default;
void Print(std::ostream &out) const override {
@@ -46,7 +50,7 @@ class JiebaTokenizerOp : public TensorOp {
<< mp_dict_path_;
}
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
// @word the word to be added to the JiebaTokenizer.
// @freq [Default 0] the frequency fo the word to be added.
@@ -58,6 +62,7 @@ class JiebaTokenizerOp : public TensorOp {
std::string mp_dict_path_;
std::unique_ptr<cppjieba::Jieba> jieba_parser_;
JiebaMode jieba_mode_;
bool with_offsets_;
};
} // namespace dataset
} // namespace mindspore
@@ -22,8 +22,11 @@
namespace mindspore {
namespace dataset {
Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
icu::UnicodeString *out_unicode) const {
const bool RegexTokenizerOp::kDefWithOffsets = false;
Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, const int &start, const int &len,
std::string *out_utf8, icu::UnicodeString *out_unicode) const {
CHECK_FAIL_RETURN_UNEXPECTED((out_utf8 != nullptr || out_unicode != nullptr), "Wrong input");
int total_len = input.length();
int end = start + len;
@@ -39,7 +42,9 @@ Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int s
return Status::OK();
}
Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const {
Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens,
std::vector<uint32_t> *offsets_start,
std::vector<uint32_t> *offsets_limit) const {
UErrorCode status = U_ZERO_ERROR;
out_tokens->clear();
icu::RegexMatcher token_matcher(delim_pattern_, 0, status);
@@ -50,6 +55,7 @@ Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std
icu::UnicodeString utext(icu::UnicodeString::fromUTF8(text));
token_matcher.reset(utext);
int text_start_index = 0;
int token_start_index = 0;
status = U_ZERO_ERROR;
while (token_matcher.find(status) && U_SUCCESS(status)) {
@@ -62,41 +68,70 @@ Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std
int token_len = deli_start_index - token_start_index;
if (token_len > 0) {
std::string token;
uint32_t token_offset = 0;
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, token_len, &token));
token_offset = token.length();
out_tokens->emplace_back(std::move(token));
offsets_start->push_back(static_cast<uint32_t>(text_start_index));
offsets_limit->push_back(static_cast<uint32_t>(text_start_index + token_offset));
text_start_index += token_offset;
}
int delim_len = deli_end_index - deli_start_index;
if (keep_delim_ && delim_len > 0) {
if (delim_len > 0) {
icu::UnicodeString delim_str;
std::string delim_utf8_str;
uint32_t delim_str_offset = 0;
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, deli_start_index, delim_len, &delim_utf8_str, &delim_str));
delim_matcher.reset(delim_str);
if (delim_matcher.matches(status) && U_SUCCESS(status)) {
delim_str_offset = delim_utf8_str.length();
if (keep_delim_ && delim_matcher.matches(status) && U_SUCCESS(status)) {
out_tokens->emplace_back(std::move(delim_utf8_str));
offsets_start->push_back(static_cast<uint32_t>(text_start_index));
offsets_limit->push_back(static_cast<uint32_t>(text_start_index + delim_str_offset));
}
text_start_index += delim_str_offset;
}
token_start_index = deli_end_index;
}
if (token_start_index < utext.length()) {
std::string temp;
uint32_t temp_offset = 0;
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, utext.length() - token_start_index, &temp));
temp_offset = temp.length();
out_tokens->emplace_back(std::move(temp));
offsets_start->push_back(static_cast<uint32_t>(text_start_index));
offsets_limit->push_back(static_cast<uint32_t>(text_start_index + temp_offset));
}
return Status::OK();
}
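Note the change above: the running byte offset (`text_start_index`) is now advanced past every delimiter match, even when the delimiter is not kept as a token, so the recorded offsets always index into the original string. A simplified sketch of the same bookkeeping using Python's `re` module (byte offsets; ICU pattern semantics are not reproduced):

```python
import re

def regex_tokenize_with_offsets(text, delim_pattern, keep_delim_pattern=""):
    # Byte offsets into the UTF-8 encoding, mirroring offsets_start/offsets_limit.
    raw = text.encode("utf-8")
    tokens, starts, limits = [], [], []
    pos = 0
    for m in re.finditer(delim_pattern.encode("utf-8"), raw):
        if m.start() > pos:  # text between delimiters becomes a token
            tokens.append(raw[pos:m.start()].decode("utf-8"))
            starts.append(pos)
            limits.append(m.start())
        delim = raw[m.start():m.end()]
        # Emit the delimiter as a token only if it matches keep_delim_pattern,
        # but always advance the running offset past it.
        if keep_delim_pattern and re.fullmatch(keep_delim_pattern.encode("utf-8"), delim):
            tokens.append(delim.decode("utf-8"))
            starts.append(m.start())
            limits.append(m.end())
        pos = m.end()
    if pos < len(raw):  # trailing text after the last delimiter
        tokens.append(raw[pos:].decode("utf-8"))
        starts.append(pos)
        limits.append(len(raw))
    return tokens, starts, limits

print(regex_tokenize_with_offsets("Welcome to China!", r"\s+"))
# (['Welcome', 'to', 'China!'], [0, 8, 11], [7, 10, 17])
```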
Status RegexTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
Status RegexTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::string_view text;
RETURN_IF_NOT_OK(input->GetItemAt(&text, {}));
std::vector<std::string> tokens;
RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens));
*output = std::make_shared<Tensor>(std::move(tokens), TensorShape({(dsize_t)tokens.size()}));
std::vector<uint32_t> offsets_start;
std::vector<uint32_t> offsets_limit;
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
RETURN_IF_NOT_OK(input[0]->GetItemAt(&text, {}));
RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens, &offsets_start, &offsets_limit));
token_tensor = std::make_shared<Tensor>(std::move(tokens), TensorShape({(dsize_t)tokens.size()}));
output->push_back(token_tensor);
if (with_offsets_) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_start[0])));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
output->push_back(offsets_start_tensor);
output->push_back(offsets_limit_tensor);
}
return Status::OK();
}
} // namespace dataset
@@ -32,25 +32,31 @@ namespace dataset {
class RegexTokenizerOp : public TensorOp {
public:
RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern)
static const bool kDefWithOffsets;
RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern,
const bool &with_offsets = kDefWithOffsets)
: delim_pattern_(icu::UnicodeString::fromUTF8(delim_pattern)),
keep_delim_pattern_(icu::UnicodeString::fromUTF8(keep_delim_pattern)),
with_offsets_(with_offsets),
keep_delim_(!keep_delim_pattern.empty()) {}
~RegexTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "RegexTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
protected:
Status GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
Status GetUnicodeSubstr(const icu::UnicodeString &input, const int &start, const int &len, std::string *out_utf8,
icu::UnicodeString *out_unicode = nullptr) const;
Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const;
Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens,
std::vector<uint32_t> *offsets_start, std::vector<uint32_t> *offsets_limit) const;
private:
const icu::UnicodeString delim_pattern_;
const icu::UnicodeString keep_delim_pattern_;
bool with_offsets_;
const bool keep_delim_;
};
} // namespace dataset
@@ -27,26 +27,46 @@ using cppjieba::RuneStrArray;
namespace mindspore {
namespace dataset {
Status UnicodeCharTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
const bool UnicodeCharTokenizerOp::kDefWithOffsets = false;
Status UnicodeCharTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::string_view str;
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
RuneStrArray runes;
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
}
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
std::vector<std::string> splits(runes.size());
std::vector<uint32_t> offsets_start, offsets_limit;
for (size_t i = 0; i < runes.size(); i++) {
offsets_start.push_back(runes[i].offset);
offsets_limit.push_back(runes[i].offset + runes[i].len);
splits[i] = str.substr(runes[i].offset, runes[i].len);
}
if (splits.empty()) {
splits.emplace_back("");
offsets_start.push_back(0);
offsets_limit.push_back(0);
}
token_tensor = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
output->push_back(token_tensor);
if (with_offsets_) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_start[0])));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
output->push_back(offsets_start_tensor);
output->push_back(offsets_limit_tensor);
}
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
return Status::OK();
}
} // namespace dataset
@@ -26,13 +26,18 @@ namespace dataset {
class UnicodeCharTokenizerOp : public TensorOp {
public:
UnicodeCharTokenizerOp() {}
static const bool kDefWithOffsets;
explicit UnicodeCharTokenizerOp(const bool &with_offsets = kDefWithOffsets) : with_offsets_(with_offsets) {}
~UnicodeCharTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "UnicodeCharTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
private:
bool with_offsets_;
};
} // namespace dataset
@@ -32,24 +32,28 @@ namespace mindspore {
namespace dataset {
const bool UnicodeScriptTokenizerOp::kDefKeepWhitespace = false;
const bool UnicodeScriptTokenizerOp::kDefWithOffsets = false;
Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
Status UnicodeScriptTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::string_view str;
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
RuneStrArray runes;
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
}
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
UScriptCode last_script = USCRIPT_INVALID_CODE;
icu::ErrorCode status;
int start = 0;
int len = 0;
std::vector<std::string> splits;
std::vector<uint32_t> offsets_start, offsets_limit;
bool was_space = false;
for (size_t i = 0; i < runes.size(); i++) {
@@ -66,6 +70,8 @@ Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, s
if (len > 0 && (script != last_script || is_space != was_space)) {
// 3) If keep_whitespace_ is false, all the whitespace characters will be discard
if (keep_whitespace_ || !was_space) {
offsets_start.push_back(static_cast<uint32_t>(start));
offsets_limit.push_back(static_cast<uint32_t>(start + len));
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
}
@@ -79,14 +85,29 @@ Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, s
}
if (len > 0 && (keep_whitespace_ || !was_space)) {
offsets_start.push_back(static_cast<uint32_t>(start));
offsets_limit.push_back(static_cast<uint32_t>(start + len));
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
}
// 4) If the input is empty scalar string, the output will be 1-D empty string.
if (splits.empty()) {
splits.emplace_back("");
offsets_start.push_back(0);
offsets_limit.push_back(0);
}
token_tensor = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
output->push_back(token_tensor);
if (with_offsets_) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_start[0])));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
output->push_back(offsets_start_tensor);
output->push_back(offsets_limit_tensor);
}
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
return Status::OK();
}
} // namespace dataset
@@ -27,17 +27,21 @@ namespace dataset {
class UnicodeScriptTokenizerOp : public TensorOp {
public:
static const bool kDefKeepWhitespace;
static const bool kDefWithOffsets;
explicit UnicodeScriptTokenizerOp(bool keep_whitespace = kDefKeepWhitespace) : keep_whitespace_(keep_whitespace) {}
explicit UnicodeScriptTokenizerOp(const bool &keep_whitespace = kDefKeepWhitespace,
const bool &with_offsets = kDefWithOffsets)
: keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
~UnicodeScriptTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "UnicodeScriptTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
private:
bool keep_whitespace_;  // If or not keep whitespace tokens
bool with_offsets_;
};
} // namespace dataset
} // namespace mindspore
@@ -30,24 +30,33 @@ using cppjieba::RuneStrArray;
namespace mindspore {
namespace dataset {
Status WhitespaceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
const bool WhitespaceTokenizerOp::kDefWithOffsets = false;
Status WhitespaceTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::string_view str;
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
RuneStrArray runes;
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
}
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
std::vector<uint32_t> offsets_start, offsets_limit;
std::vector<std::string> splits;
int start = 0;
int len = 0;
for (size_t i = 0; i < runes.size(); i++) {
if (u_isUWhiteSpace(runes[i].rune)) {
if (len > 0) {
offsets_start.push_back(static_cast<uint32_t>(start));
offsets_limit.push_back(static_cast<uint32_t>(start + len));
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
len = 0;
@@ -60,13 +69,28 @@ Status WhitespaceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std:
}
}
if (len > 0) {
offsets_start.push_back(static_cast<uint32_t>(start));
offsets_limit.push_back(static_cast<uint32_t>(start + len));
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
}
if (splits.empty()) {
splits.emplace_back("");
offsets_start.push_back(0);
offsets_limit.push_back(0);
}
token_tensor = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
output->push_back(token_tensor);
if (with_offsets_) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_start[0])));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
output->push_back(offsets_start_tensor);
output->push_back(offsets_limit_tensor);
}
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
return Status::OK();
}
} // namespace dataset
@@ -26,13 +26,18 @@ namespace dataset {
class WhitespaceTokenizerOp : public TensorOp {
public:
WhitespaceTokenizerOp() {}
static const bool kDefWithOffsets;
explicit WhitespaceTokenizerOp(const bool &with_offsets = kDefWithOffsets) : with_offsets_(with_offsets) {}
~WhitespaceTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "WhitespaceTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
private:
bool with_offsets_;
};
} // namespace dataset
} // namespace mindspore
@@ -24,13 +24,16 @@ namespace dataset {
const char WordpieceTokenizerOp::kDefSuffixIndicator[] = "##";
const int WordpieceTokenizerOp::kDefMaxBytesPerToken = 100;
const char WordpieceTokenizerOp::kDefUnknownToken[] = "[UNK]";
const bool WordpieceTokenizerOp::kDefWithOffsets = false;
WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
const int &max_bytes_per_token, const std::string &unknown_token)
const int &max_bytes_per_token, const std::string &unknown_token,
const bool &with_offsets)
: vocab_(vocab),
suffix_indicator_(suffix_indicator),
max_bytes_per_token_(max_bytes_per_token),
unknown_token_(unknown_token) {}
unknown_token_(unknown_token),
with_offsets_(with_offsets) {}
Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start,
bool *out_found, int *out_end) const {
@@ -51,17 +54,22 @@ Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const Ru
return Status::OK();
}
Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const {
Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, const uint32_t &basic_start,
std::vector<std::string> *out_tokens, std::vector<uint32_t> *offsets_start,
std::vector<uint32_t> *offsets_limit) const {
out_tokens->clear();
offsets_start->push_back(basic_start);
if (unknown_token_.empty()) {
out_tokens->emplace_back(input_token);
offsets_limit->push_back(basic_start + input_token.length());
} else {
out_tokens->emplace_back(unknown_token_);
offsets_limit->push_back(basic_start + input_token.length());
}
return Status::OK();
}
Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int start, const int end,
Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int &start, const int &end,
std::vector<std::string> *out_tokens) const {
CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && end > start && end <= input_token.size(), "Out of range");
std::string subword = input_token.substr(start, end - start);
@@ -72,9 +80,19 @@ Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const in
return Status::OK();
}
Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vector<std::string> *out_tokens) const {
Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, const uint32_t &basic_start,
std::vector<std::string> *out_tokens, std::vector<uint32_t> *offsets_start,
std::vector<uint32_t> *offsets_limit) const {
if (input_token.size() > max_bytes_per_token_) {
return FoundNoToken(input_token, out_tokens);
offsets_start->push_back(basic_start);
if (!unknown_token_.empty()) {
offsets_limit->push_back(basic_start + unknown_token_.size());
out_tokens->emplace_back(unknown_token_);
} else {
out_tokens->emplace_back(input_token);
offsets_limit->push_back(basic_start + input_token.size());
}
return Status::OK();
}
RuneStrArray runes;
if (!DecodeRunesInString(input_token.data(), input_token.size(), runes)) {
@@ -86,29 +104,52 @@ Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vect
RETURN_IF_NOT_OK(LookupWord(input_token, runes, start, &found, &end));
if (found) {
RETURN_IF_NOT_OK(AddSubword(input_token, start, end, out_tokens));
offsets_start->push_back(static_cast<uint32_t>(basic_start + start));
offsets_limit->push_back(static_cast<uint32_t>(basic_start + end));
start = end;
} else {
return FoundNoToken(input_token, out_tokens);
return FoundNoToken(input_token, basic_start, out_tokens, offsets_start, offsets_limit);
}
}
return Status::OK();
}
Status WordpieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() > 1 || input->type() != DataType::DE_STRING) {
Status WordpieceTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
IO_CHECK_VECTOR(input, output);
if (input[0]->Rank() > 1 || input[0]->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor");
}
dsize_t count = 0;
std::vector<std::string> out_tokens;
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
std::vector<uint32_t> offsets_start, offsets_limit;
std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
for (auto iter = input[0]->begin<std::string_view>(); iter != input[0]->end<std::string_view>(); iter++) {
uint32_t basic_start = 0;
std::vector<std::string> temp_tokens;
RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &temp_tokens));
if (with_offsets_ && input.size() == 3) {
RETURN_IF_NOT_OK(input[1]->GetItemAt<uint32_t>(&basic_start, {count, 0}));
}
RETURN_IF_NOT_OK(GetTokens(std::string(*iter), basic_start, &temp_tokens, &offsets_start, &offsets_limit));
out_tokens.insert(out_tokens.end(), temp_tokens.begin(), temp_tokens.end());
count++;
}
if (out_tokens.empty()) {
out_tokens.emplace_back("");
offsets_start.push_back(0);
offsets_limit.push_back(0);
}
token_tensor = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
output->push_back(token_tensor);
if (with_offsets_) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_start[0])));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
reinterpret_cast<unsigned char *>(&offsets_limit[0])));
output->push_back(offsets_start_tensor);
output->push_back(offsets_limit_tensor);
}
*output = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
return Status::OK();
}
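When the incoming TensorRow already carries offsets (`input.size() == 3`, e.g. when this op runs after BasicTokenizerOp inside BertTokenizerOp), `basic_start` is read from the offsets_start column so that subword offsets are rebased onto the original string. A toy sketch of that rebasing (greedy longest-match wordpiece over a made-up vocab; byte-level, unlike the rune-aware C++ code):

```python
def wordpiece_with_offsets(word, basic_start, vocab, suffix_indicator="##", unknown_token="[UNK]"):
    """Toy greedy longest-match wordpiece; offsets are byte positions rebased by basic_start."""
    raw = word.encode("utf-8")
    tokens, starts, limits = [], [], []
    start = 0
    while start < len(raw):
        end, found = len(raw), False
        while end > start:
            piece = raw[start:end].decode("utf-8", errors="ignore")
            lookup = piece if start == 0 else suffix_indicator + piece
            if lookup in vocab:
                tokens.append(lookup)
                starts.append(basic_start + start)
                limits.append(basic_start + end)
                found = True
                break
            end -= 1
        if not found:  # whole word becomes unknown_token, spanning the original word
            return [unknown_token or word], [basic_start], [basic_start + len(raw)]
        start = end
    return tokens, starts, limits

vocab = {"un", "##affable"}  # illustrative vocab
print(wordpiece_with_offsets("unaffable", 10, vocab))
# (['un', '##affable'], [10, 12], [12, 19])
```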
@@ -37,27 +37,31 @@ class WordpieceTokenizerOp : public TensorOp {
static const char kDefSuffixIndicator[];
static const int kDefMaxBytesPerToken;
static const char kDefUnknownToken[];
static const bool kDefWithOffsets;
WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = kDefSuffixIndicator,
const int &max_bytes_per_token = kDefMaxBytesPerToken,
const std::string &unknown_token = kDefUnknownToken);
const std::string &unknown_token = kDefUnknownToken, const bool &with_offsets = kDefWithOffsets);
~WordpieceTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "WordpieceTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
Status Compute(const TensorRow &input, TensorRow *output) override;
protected:
Status AddSubword(const std::string &input_token, const int start, const int end,
Status AddSubword(const std::string &input_token, const int &start, const int &end,
std::vector<std::string> *out_token) const;
Status FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const;
Status FoundNoToken(const std::string &input_token, const uint32_t &basic_start, std::vector<std::string> *out_tokens,
std::vector<uint32_t> *offsets_start, std::vector<uint32_t> *offsets_limit) const;
Status LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start, bool *out_found,
int *out_end) const;
Status GetTokens(const std::string &input_token, std::vector<std::string> *out_tokens) const;
Status GetTokens(const std::string &input_token, const uint32_t &basic_start, std::vector<std::string> *out_tokens,
std::vector<uint32_t> *offsets_start, std::vector<uint32_t> *offsets_limit) const;
private:
const std::shared_ptr<Vocab> vocab_;
const std::string suffix_indicator_;
const bool with_offsets_;
const int max_bytes_per_token_;
const std::string unknown_token_;
};
| @@ -52,8 +52,9 @@ import mindspore._c_dataengine as cde | |||
| from .utils import JiebaMode, NormalizeForm, to_str | |||
| from .validators import check_lookup, check_jieba_add_dict, \ | |||
| check_jieba_add_word, check_jieba_init, check_ngram, check_pair_truncate, \ | |||
| check_to_number, check_python_tokenizer | |||
| check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer,\ | |||
| check_wordpiece_tokenizer, check_regex_tokenizer, check_basic_tokenizer, check_ngram, check_pair_truncate,\ | |||
| check_to_number, check_bert_tokenizer, check_python_tokenizer | |||
| from ..core.datatypes import mstype_to_detype | |||
| @@ -121,15 +122,31 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): | |||
| - JiebaMode.MP, tokenize with MPSegment algorithm. | |||
| - JiebaMode.HMM, tokenize with Hiddel Markov Model Segment algorithm. | |||
| - JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm. | |||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||
| Examples: | |||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | |||
| >>> tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=False) | |||
| >>> data = data.map(operations=tokenizer_op) | |||
| >>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], | |||
| >>> # ["offsets_limit", dtype=uint32]} | |||
| >>> tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) | |||
| >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) | |||
| """ | |||
| @check_jieba_init | |||
| def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX): | |||
| def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX, with_offsets=False): | |||
| if not isinstance(mode, JiebaMode): | |||
| raise TypeError("Wrong input type for mode, should be JiebaMode.") | |||
| self.mode = mode | |||
| self.__check_path__(hmm_path) | |||
| self.__check_path__(mp_path) | |||
| self.with_offsets = with_offsets | |||
| super().__init__(hmm_path, mp_path, | |||
| DE_C_INTER_JIEBA_MODE[mode]) | |||
| DE_C_INTER_JIEBA_MODE[mode], | |||
| self.with_offsets) | |||
| @check_jieba_add_word | |||
| def add_word(self, word, freq=None): | |||
| @@ -222,8 +239,26 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): | |||
| class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp): | |||
| """ | |||
| Tokenize a scalar tensor of UTF-8 string to Unicode characters. | |||
| Args: | |||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||
| Examples: | |||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | |||
| >>> tokenizer_op = text.UnicodeCharTokenizer() | |||
| >>> dataset = dataset.map(operations=tokenizer_op) | |||
| >>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], | |||
| >>> # ["offsets_limit", dtype=uint32]} | |||
| >>> tokenizer_op = text.UnicodeCharTokenizer(True) | |||
| >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) | |||
| """ | |||
| @check_with_offsets | |||
| def __init__(self, with_offsets=False): | |||
| self.with_offsets = with_offsets | |||
| super().__init__(self.with_offsets) | |||
| class WordpieceTokenizer(cde.WordpieceTokenizerOp): | |||
| """ | |||
| @@ -235,22 +270,58 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp): | |||
| max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default=100). | |||
| unknown_token (str, optional): When we can not found the token: if 'unknown_token' is empty string, | |||
| return the token directly, else return 'unknown_token'(default='[UNK]'). | |||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||
| Examples: | |||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | |||
| >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=['UNK'], | |||
| >>> max_bytes_per_token=100, with_offsets=False) | |||
| >>> dataset = dataset.map(operations=tokenizer_op) | |||
| >>> # If with_offsets=False, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32], | |||
| >>> # ["offsets_limit", dtype=uint32]} | |||
| >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=['UNK'], | |||
| >>> max_bytes_per_token=100, with_offsets=True) | |||
| >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) | |||
| """ | |||
| def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]'): | |||
| @check_wordpiece_tokenizer | |||
| def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, | |||
| unknown_token='[UNK]', with_offsets=False): | |||
| self.vocab = vocab | |||
| self.suffix_indicator = suffix_indicator | |||
| self.max_bytes_per_token = max_bytes_per_token | |||
| self.unknown_token = unknown_token | |||
| super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token) | |||
| self.with_offsets = with_offsets | |||
| super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, | |||
| self.unknown_token, self.with_offsets) | |||
| if platform.system().lower() != 'windows': | |||
| class WhitespaceTokenizer(cde.WhitespaceTokenizerOp): | |||
| """ | |||
| Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces(such as: ' ', '\\\\t', '\\\\r', '\\\\n'). | |||
| Args: | |||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||
| Examples: | |||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | |||
| >>> tokenizer_op = text.WhitespaceTokenizer() | |||
| >>> dataset = dataset.map(operations=tokenizer_op) | |||
| >>> # If with_offsets=False, then output three columns {["token", dtype=str], | |||
| >>> # ["offsets_start", dtype=uint32], | |||
| >>> # ["offsets_limit", dtype=uint32]} | |||
| >>> tokenizer_op = text.WhitespaceTokenizer(True) | |||
| >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) | |||
| """ | |||
| @check_with_offsets | |||
| def __init__(self, with_offsets=False): | |||
| self.with_offsets = with_offsets | |||
| super().__init__(self.with_offsets) | |||
| class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp): | |||
| """ | |||
| @@ -258,11 +329,25 @@ if platform.system().lower() != 'windows': | |||
| Args: | |||
| keep_whitespace (bool, optional): Whether or not to emit whitespace tokens (default=False). | |||
| with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False). | |||
| Examples: | |||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | |||
| >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False) | |||
| >>> dataset = dataset.map(operations=tokenizer_op) | |||
| >>> # If with_offsets=True, then output three columns {["token", dtype=str], | |||
| >>> # ["offsets_start", dtype=uint32], | |||
| >>> # ["offsets_limit", dtype=uint32]} | |||
| >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True) | |||
| >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) | |||
| """ | |||
| def __init__(self, keep_whitespace=False): | |||
| @check_unicode_script_tokenizer | |||
| def __init__(self, keep_whitespace=False, with_offsets=False): | |||
| self.keep_whitespace = keep_whitespace | |||
| super().__init__(self.keep_whitespace) | |||
| self.with_offsets = with_offsets | |||
| super().__init__(self.keep_whitespace, self.with_offsets) | |||
| class CaseFold(cde.CaseFoldOp): | |||
| @@ -298,6 +383,9 @@ if platform.system().lower() != 'windows': | |||
| """ | |||
| def __init__(self, normalize_form=NormalizeForm.NFKC): | |||
| if not isinstance(normalize_form, NormalizeForm): | |||
| raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.") | |||
| self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form] | |||
| super().__init__(self.normalize_form) | |||
| @@ -334,12 +422,26 @@ if platform.system().lower() != 'windows': | |||
| keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token | |||
| if it can also be matched by 'keep_delim_pattern'. The default value is an empty string (''), | |||
| in which case delimiters will not be kept as output tokens (default=''). | |||
| with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False). | |||
| Examples: | |||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | |||
| >>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=False) | |||
| >>> dataset = dataset.map(operations=tokenizer_op) | |||
| >>> # If with_offsets=True, then output three columns {["token", dtype=str], | |||
| >>> # ["offsets_start", dtype=uint32], | |||
| >>> # ["offsets_limit", dtype=uint32]} | |||
| >>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True) | |||
| >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) | |||
| """ | |||
| def __init__(self, delim_pattern, keep_delim_pattern=''): | |||
| @check_regex_tokenizer | |||
| def __init__(self, delim_pattern, keep_delim_pattern='', with_offsets=False): | |||
| self.delim_pattern = delim_pattern | |||
| self.keep_delim_pattern = keep_delim_pattern | |||
| super().__init__(self.delim_pattern, self.keep_delim_pattern) | |||
| self.with_offsets = with_offsets | |||
| super().__init__(self.delim_pattern, self.keep_delim_pattern, self.with_offsets) | |||
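Editor's note: to make the delim_pattern / keep_delim_pattern interaction concrete, a small sketch; the expected splits are illustrative, assuming standard ICU regex behavior, and are not taken from this patch:

    import mindspore.dataset.text as text
    # Drop whitespace delimiters entirely: "Hello  World" -> ['Hello', 'World']
    drop_ws = text.RegexTokenizer(delim_pattern='\\s+', keep_delim_pattern='')
    # Keep the matched whitespace as its own token: "Hello  World" -> ['Hello', '  ', 'World']
    keep_ws = text.RegexTokenizer(delim_pattern='\\s+', keep_delim_pattern='\\s+', with_offsets=True)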
| class BasicTokenizer(cde.BasicTokenizerOp): | |||
| @@ -355,16 +457,41 @@ if platform.system().lower() != 'windows': | |||
| only effective when 'lower_case' is False. See NormalizeUTF8 for details (default='NONE'). | |||
| preserve_unused_token (bool, optional): If True, do not split special tokens like | |||
| '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True). | |||
| with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False). | |||
| Examples: | |||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | |||
| >>> tokenizer_op = text.BasicTokenizer(lower_case=False, | |||
| >>> keep_whitespace=False, | |||
| >>> normalization_form=NormalizeForm.NONE, | |||
| >>> preserve_unused_token=True, | |||
| >>> with_offsets=False) | |||
| >>> dataset = dataset.map(operations=tokenizer_op) | |||
| >>> # If with_offsets=True, then output three columns {["token", dtype=str], | |||
| >>> # ["offsets_start", dtype=uint32], | |||
| >>> # ["offsets_limit", dtype=uint32]} | |||
| >>> tokenizer_op = text.BasicTokenizer(lower_case=False, | |||
| >>> keep_whitespace=False, | |||
| >>> normalization_form=NormalizeForm.NONE, | |||
| >>> preserve_unused_token=True, | |||
| >>> with_offsets=True) | |||
| >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) | |||
| """ | |||
| def __init__(self, lower_case=False, keep_whitespace=False, | |||
| normalization_form=NormalizeForm.NONE, preserve_unused_token=True): | |||
| @check_basic_tokenizer | |||
| def __init__(self, lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE, | |||
| preserve_unused_token=True, with_offsets=False): | |||
| if not isinstance(normalization_form, NormalizeForm): | |||
| raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.") | |||
| self.lower_case = lower_case | |||
| self.keep_whitespace = keep_whitespace | |||
| self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] | |||
| self.preserve_unused_token = preserve_unused_token | |||
| super().__init__(self.lower_case, self.keep_whitespace, | |||
| self.normalization_form, self.preserve_unused_token) | |||
| self.with_offsets = with_offsets | |||
| super().__init__(self.lower_case, self.keep_whitespace, self.normalization_form, | |||
| self.preserve_unused_token, self.with_offsets) | |||
| class BertTokenizer(cde.BertTokenizerOp): | |||
| @@ -385,11 +512,33 @@ if platform.system().lower() != 'windows': | |||
| only effective when 'lower_case' is False. See NormalizeUTF8 for details (default='NONE'). | |||
| preserve_unused_token (bool, optional): If True, do not split special tokens like | |||
| '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True). | |||
| with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False). | |||
| Examples: | |||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | |||
| >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100, | |||
| >>> unknown_token='[UNK]', lower_case=False, keep_whitespace=False, | |||
| >>> normalization_form=NormalizeForm.NONE, preserve_unused_token=True, | |||
| >>> with_offsets=False) | |||
| >>> dataset = dataset.map(operations=tokenizer_op) | |||
| >>> # If with_offsets=True, then output three columns {["token", dtype=str], | |||
| >>> # ["offsets_start", dtype=uint32], | |||
| >>> # ["offsets_limit", dtype=uint32]} | |||
| >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100, | |||
| >>> unknown_token='[UNK]', lower_case=False, keep_whitespace=False, | |||
| >>> normalization_form=NormalizeForm.NONE, preserve_unused_token=True, | |||
| >>> with_offsets=True) | |||
| >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| >>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op) | |||
| """ | |||
| def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, | |||
| unknown_token='[UNK]', lower_case=False, keep_whitespace=False, | |||
| normalization_form=NormalizeForm.NONE, preserve_unused_token=True): | |||
| @check_bert_tokenizer | |||
| def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]', | |||
| lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE, | |||
| preserve_unused_token=True, with_offsets=False): | |||
| if not isinstance(normalization_form, NormalizeForm): | |||
| raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.") | |||
| self.vocab = vocab | |||
| self.suffix_indicator = suffix_indicator | |||
| self.max_bytes_per_token = max_bytes_per_token | |||
| @@ -398,8 +547,10 @@ if platform.system().lower() != 'windows': | |||
| self.keep_whitespace = keep_whitespace | |||
| self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] | |||
| self.preserve_unused_token = preserve_unused_token | |||
| self.with_offsets = with_offsets | |||
| super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token, | |||
| self.lower_case, self.keep_whitespace, self.normalization_form, self.preserve_unused_token) | |||
| self.lower_case, self.keep_whitespace, self.normalization_form, | |||
| self.preserve_unused_token, self.with_offsets) | |||
| class TruncateSequencePair(cde.TruncateSequencePairOp): | |||
| @@ -25,7 +25,6 @@ from mindspore._c_expression import typing | |||
| from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_uint32, \ | |||
| INT32_MAX, check_value | |||
| def check_unique_list_of_words(words, arg_name): | |||
| """Check that words is a list and each element is a str without any duplication""" | |||
| @@ -116,11 +115,22 @@ def check_from_dict(method): | |||
| def check_jieba_init(method): | |||
| """Wrapper method to check the parameters of jieba add word.""" | |||
| """Wrapper method to check the parameters of jieba init.""" | |||
| @wraps(method) | |||
| def new_method(self, *args, **kwargs): | |||
| parse_user_args(method, *args, **kwargs) | |||
| [hmm_path, mp_path, _, with_offsets], _ = parse_user_args(method, *args, **kwargs) | |||
| if hmm_path is None: | |||
| raise ValueError("The dict of HMMSegment in cppjieba is not provided.") | |||
| if not isinstance(hmm_path, str): | |||
| raise TypeError("Wrong input type for hmm_path, should be string.") | |||
| if mp_path is None: | |||
| raise ValueError("The dict of MPSegment in cppjieba is not provided.") | |||
| if not isinstance(mp_path, str): | |||
| raise TypeError("Wrong input type for mp_path, should be string.") | |||
| if not isinstance(with_offsets, bool): | |||
| raise TypeError("Wrong input type for with_offsets, should be boolean.") | |||
| return method(self, *args, **kwargs) | |||
| return new_method | |||
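Editor's note: as an illustration of what the strengthened check rejects, a hypothetical call sketch; the dictionary paths are placeholders and not part of this patch, and the error message matches the one raised above:

    from mindspore.dataset.text import JiebaTokenizer
    HMM_FILE = "/path/to/hmm_model.utf8"   # placeholder path
    MP_FILE = "/path/to/jieba.dict.utf8"   # placeholder path
    # with_offsets must be a bool; a string is rejected by the validator before __init__ runs:
    # TypeError: Wrong input type for with_offsets, should be boolean.
    JiebaTokenizer(HMM_FILE, MP_FILE, with_offsets="yes")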
| @@ -152,6 +162,128 @@ def check_jieba_add_dict(method): | |||
| return new_method | |||
| def check_with_offsets(method): | |||
| """Wrapper method to check if with_offsets is the only one parameter.""" | |||
| @wraps(method) | |||
| def new_method(self, *args, **kwargs): | |||
| [with_offsets], _ = parse_user_args(method, *args, **kwargs) | |||
| if not isinstance(with_offsets, bool): | |||
| raise TypeError("Wrong input type for with_offsets, should be boolean.") | |||
| return method(self, *args, **kwargs) | |||
| return new_method | |||
| def check_unicode_script_tokenizer(method): | |||
| """Wrapper method to check the parameter of UnicodeScriptTokenizer.""" | |||
| @wraps(method) | |||
| def new_method(self, *args, **kwargs): | |||
| [keep_whitespace, with_offsets], _ = parse_user_args(method, *args, **kwargs) | |||
| if not isinstance(keep_whitespace, bool): | |||
| raise TypeError("Wrong input type for keep_whitespace, should be boolean.") | |||
| if not isinstance(with_offsets, bool): | |||
| raise TypeError("Wrong input type for with_offsets, should be boolean.") | |||
| return method(self, *args, **kwargs) | |||
| return new_method | |||
| def check_wordpiece_tokenizer(method): | |||
| """Wrapper method to check the parameter of WordpieceTokenizer.""" | |||
| @wraps(method) | |||
| def new_method(self, *args, **kwargs): | |||
| [vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets], _ =\ | |||
| parse_user_args(method, *args, **kwargs) | |||
| if vocab is None: | |||
| raise ValueError("vocab is not provided.") | |||
| if not isinstance(vocab, cde.Vocab): | |||
| raise TypeError("Wrong input type for vocab, should be Vocab object.") | |||
| if not isinstance(suffix_indicator, str): | |||
| raise TypeError("Wrong input type for suffix_indicator, should be string.") | |||
| if not isinstance(unknown_token, str): | |||
| raise TypeError("Wrong input type for unknown_token, should be string.") | |||
| if not isinstance(with_offsets, bool): | |||
| raise TypeError("Wrong input type for with_offsets, should be boolean.") | |||
| check_uint32(max_bytes_per_token) | |||
| return method(self, *args, **kwargs) | |||
| return new_method | |||
| def check_regex_tokenizer(method): | |||
| """Wrapper method to check the parameter of RegexTokenizer.""" | |||
| @wraps(method) | |||
| def new_method(self, *args, **kwargs): | |||
| [delim_pattern, keep_delim_pattern, with_offsets], _ = parse_user_args(method, *args, **kwargs) | |||
| if delim_pattern is None: | |||
| raise ValueError("delim_pattern is not provided.") | |||
| if not isinstance(delim_pattern, str): | |||
| raise TypeError("Wrong input type for delim_pattern, should be string.") | |||
| if not isinstance(keep_delim_pattern, str): | |||
| raise TypeError("Wrong input type for keep_delim_pattern, should be string.") | |||
| if not isinstance(with_offsets, bool): | |||
| raise TypeError("Wrong input type for with_offsets, should be boolean.") | |||
| return method(self, *args, **kwargs) | |||
| return new_method | |||
| def check_basic_tokenizer(method): | |||
| """Wrapper method to check the parameter of RegexTokenizer.""" | |||
| @wraps(method) | |||
| def new_method(self, *args, **kwargs): | |||
| [lower_case, keep_whitespace, _, preserve_unused, with_offsets], _ =\ | |||
| parse_user_args(method, *args, **kwargs) | |||
| if not isinstance(lower_case, bool): | |||
| raise TypeError("Wrong input type for lower_case, should be boolean.") | |||
| if not isinstance(keep_whitespace, bool): | |||
| raise TypeError("Wrong input type for keep_whitespace, should be boolean.") | |||
| if not isinstance(preserve_unused, bool): | |||
| raise TypeError("Wrong input type for preserve_unused_token, should be boolean.") | |||
| if not isinstance(with_offsets, bool): | |||
| raise TypeError("Wrong input type for with_offsets, should be boolean.") | |||
| return method(self, *args, **kwargs) | |||
| return new_method | |||
| def check_bert_tokenizer(method): | |||
| """Wrapper method to check the parameter of BertTokenizer.""" | |||
| @wraps(method) | |||
| def new_method(self, *args, **kwargs): | |||
| [vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case, keep_whitespace, _, | |||
| preserve_unused_token, with_offsets], _ = parse_user_args(method, *args, **kwargs) | |||
| if vocab is None: | |||
| raise ValueError("vacab is not provided.") | |||
| if not isinstance(vocab, cde.Vocab): | |||
| raise TypeError("Wrong input type for vocab, should be Vocab object.") | |||
| if not isinstance(suffix_indicator, str): | |||
| raise TypeError("Wrong input type for suffix_indicator, should be string.") | |||
| if not isinstance(max_bytes_per_token, int): | |||
| raise TypeError("Wrong input type for max_bytes_per_token, should be int.") | |||
| check_uint32(max_bytes_per_token) | |||
| if not isinstance(unknown_token, str): | |||
| raise TypeError("Wrong input type for unknown_token, should be string.") | |||
| if not isinstance(lower_case, bool): | |||
| raise TypeError("Wrong input type for lower_case, should be boolean.") | |||
| if not isinstance(keep_whitespace, bool): | |||
| raise TypeError("Wrong input type for keep_whitespace, should be boolean.") | |||
| if not isinstance(preserve_unused_token, bool): | |||
| raise TypeError("Wrong input type for preserve_unused_token, should be boolean.") | |||
| if not isinstance(with_offsets, bool): | |||
| raise TypeError("Wrong input type for with_offsets, should be boolean.") | |||
| return method(self, *args, **kwargs) | |||
| return new_method | |||
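Editor's note: the validators above repeat the same isinstance/TypeError pattern for every argument. A possible consolidation, not part of this patch, using the type_check helper already imported from ..core.validator_helpers and assuming its (arg, valid_types, arg_name) signature:

    def check_basic_tokenizer(method):
        """Wrapper method to check the parameters of BasicTokenizer."""
        @wraps(method)
        def new_method(self, *args, **kwargs):
            [lower_case, keep_whitespace, _, preserve_unused, with_offsets], _ = \
                parse_user_args(method, *args, **kwargs)
            # One loop replaces four hand-written isinstance checks.
            for arg, name in [(lower_case, "lower_case"), (keep_whitespace, "keep_whitespace"),
                              (preserve_unused, "preserve_unused_token"), (with_offsets, "with_offsets")]:
                type_check(arg, (bool,), name)
            return method(self, *args, **kwargs)
        return new_method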
| def check_from_dataset(method): | |||
| """A wrapper that wrap a parameter checker to the original function.""" | |||
| @@ -39,21 +39,22 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opFuntions) { | |||
| std::string dataset_path = datasets_root_path_ + "/jiebadict"; | |||
| std::string hmm_path = dataset_path + "/hmm_model.utf8"; | |||
| std::string mp_path = dataset_path + "/jieba.dict.utf8"; | |||
| std::shared_ptr<Tensor> output_tensor; | |||
| TensorRow input, output; | |||
| std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path)); | |||
| std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("今天天气太好了我们一起去外面玩吧"); | |||
| Status s = op->Compute(input_tensor, &output_tensor); | |||
| input.push_back(input_tensor); | |||
| Status s = op->Compute(input, &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output_tensor->Rank(), 1); | |||
| EXPECT_EQ(output_tensor->Size(), 7); | |||
| CheckEqual(output_tensor, {0}, "今天天气"); | |||
| CheckEqual(output_tensor, {1}, "太好了"); | |||
| CheckEqual(output_tensor, {2}, "我们"); | |||
| CheckEqual(output_tensor, {3}, "一起"); | |||
| CheckEqual(output_tensor, {4}, "去"); | |||
| CheckEqual(output_tensor, {5}, "外面"); | |||
| CheckEqual(output_tensor, {6}, "玩吧"); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| EXPECT_EQ(output[0]->Size(), 7); | |||
| CheckEqual(output[0], {0}, "今天天气"); | |||
| CheckEqual(output[0], {1}, "太好了"); | |||
| CheckEqual(output[0], {2}, "我们"); | |||
| CheckEqual(output[0], {3}, "一起"); | |||
| CheckEqual(output[0], {4}, "去"); | |||
| CheckEqual(output[0], {5}, "外面"); | |||
| CheckEqual(output[0], {6}, "玩吧"); | |||
| } | |||
| TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opAdd) { | |||
| @@ -61,16 +62,17 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opAdd) { | |||
| std::string dataset_path = datasets_root_path_ + "/jiebadict"; | |||
| std::string hmm_path = dataset_path + "/hmm_model.utf8"; | |||
| std::string mp_path = dataset_path + "/jieba.dict.utf8"; | |||
| std::shared_ptr<Tensor> output_tensor; | |||
| TensorRow input, output; | |||
| std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path)); | |||
| op->AddWord("男默女泪"); | |||
| std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("男默女泪"); | |||
| Status s = op->Compute(input_tensor, &output_tensor); | |||
| input.push_back(input_tensor); | |||
| Status s = op->Compute(input, &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output_tensor->Rank(), 1); | |||
| EXPECT_EQ(output_tensor->Size(), 1); | |||
| CheckEqual(output_tensor, {0}, "男默女泪"); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| EXPECT_EQ(output[0]->Size(), 1); | |||
| CheckEqual(output[0], {0}, "男默女泪"); | |||
| } | |||
| TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opEmpty) { | |||
| @@ -78,14 +80,15 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opEmpty) { | |||
| std::string dataset_path = datasets_root_path_ + "/jiebadict"; | |||
| std::string hmm_path = dataset_path + "/hmm_model.utf8"; | |||
| std::string mp_path = dataset_path + "/jieba.dict.utf8"; | |||
| std::shared_ptr<Tensor> output_tensor; | |||
| TensorRow input, output; | |||
| std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path)); | |||
| op->AddWord("男默女泪"); | |||
| std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>(""); | |||
| Status s = op->Compute(input_tensor, &output_tensor); | |||
| input.push_back(input_tensor); | |||
| Status s = op->Compute(input, &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output_tensor->Rank(), 1); | |||
| EXPECT_EQ(output_tensor->Size(), 1); | |||
| CheckEqual(output_tensor, {0}, ""); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| EXPECT_EQ(output[0]->Size(), 1); | |||
| CheckEqual(output[0], {0}, ""); | |||
| } | |||
| @@ -45,227 +45,245 @@ class MindDataTestTokenizerOp : public UT::Common { | |||
| TEST_F(MindDataTestTokenizerOp, TestUnicodeCharTokenizerOp) { | |||
| MS_LOG(INFO) << "Doing TestUnicodeCharTokenizerOp."; | |||
| std::unique_ptr<UnicodeCharTokenizerOp> op(new UnicodeCharTokenizerOp()); | |||
| std::unique_ptr<UnicodeCharTokenizerOp> op(new UnicodeCharTokenizerOp(true)); | |||
| std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Hello World!"); | |||
| std::shared_ptr<Tensor> output; | |||
| Status s = op->Compute(input, &output); | |||
| TensorRow output; | |||
| Status s = op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 12); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor1: " << output->ToString(); | |||
| CheckEqual(output, {0}, "H"); | |||
| CheckEqual(output, {1}, "e"); | |||
| CheckEqual(output, {2}, "l"); | |||
| CheckEqual(output, {3}, "l"); | |||
| CheckEqual(output, {4}, "o"); | |||
| CheckEqual(output, {5}, " "); | |||
| CheckEqual(output, {6}, "W"); | |||
| CheckEqual(output, {7}, "o"); | |||
| CheckEqual(output, {8}, "r"); | |||
| CheckEqual(output, {9}, "l"); | |||
| CheckEqual(output, {10}, "d"); | |||
| CheckEqual(output, {11}, "!"); | |||
| EXPECT_EQ(output[0]->Size(), 12); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, "H"); | |||
| CheckEqual(output[0], {1}, "e"); | |||
| CheckEqual(output[0], {2}, "l"); | |||
| CheckEqual(output[0], {3}, "l"); | |||
| CheckEqual(output[0], {4}, "o"); | |||
| CheckEqual(output[0], {5}, " "); | |||
| CheckEqual(output[0], {6}, "W"); | |||
| CheckEqual(output[0], {7}, "o"); | |||
| CheckEqual(output[0], {8}, "r"); | |||
| CheckEqual(output[0], {9}, "l"); | |||
| CheckEqual(output[0], {10}, "d"); | |||
| CheckEqual(output[0], {11}, "!"); | |||
| input = std::make_shared<Tensor>("中国 你好!"); | |||
| s = op->Compute(input, &output); | |||
| output.clear(); | |||
| s = op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 6); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor2: " << output->ToString(); | |||
| CheckEqual(output, {0}, "中"); | |||
| CheckEqual(output, {1}, "国"); | |||
| CheckEqual(output, {2}, " "); | |||
| CheckEqual(output, {3}, "你"); | |||
| CheckEqual(output, {4}, "好"); | |||
| CheckEqual(output, {5}, "!"); | |||
| EXPECT_EQ(output[0]->Size(), 6); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, "中"); | |||
| CheckEqual(output[0], {1}, "国"); | |||
| CheckEqual(output[0], {2}, " "); | |||
| CheckEqual(output[0], {3}, "你"); | |||
| CheckEqual(output[0], {4}, "好"); | |||
| CheckEqual(output[0], {5}, "!"); | |||
| input = std::make_shared<Tensor>("中"); | |||
| s = op->Compute(input, &output); | |||
| output.clear(); | |||
| s = op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 1); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor3: " << output->ToString(); | |||
| CheckEqual(output, {0}, "中"); | |||
| EXPECT_EQ(output[0]->Size(), 1); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, "中"); | |||
| input = std::make_shared<Tensor>("H"); | |||
| s = op->Compute(input, &output); | |||
| output.clear(); | |||
| s = op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 1); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor4: " << output->ToString(); | |||
| CheckEqual(output, {0}, "H"); | |||
| EXPECT_EQ(output[0]->Size(), 1); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, "H"); | |||
| input = std::make_shared<Tensor>(" "); | |||
| s = op->Compute(input, &output); | |||
| output.clear(); | |||
| s = op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 2); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor5: " << output->ToString(); | |||
| CheckEqual(output, {0}, " "); | |||
| CheckEqual(output, {1}, " "); | |||
| EXPECT_EQ(output[0]->Size(), 2); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, " "); | |||
| CheckEqual(output[0], {1}, " "); | |||
| input = std::make_shared<Tensor>(""); | |||
| s = op->Compute(input, &output); | |||
| output.clear(); | |||
| s = op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 1); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor6: " << output->ToString(); | |||
| CheckEqual(output, {0}, ""); | |||
| EXPECT_EQ(output[0]->Size(), 1); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, ""); | |||
| } | |||
| TEST_F(MindDataTestTokenizerOp, TestWhitespaceTokenizerOp) { | |||
| MS_LOG(INFO) << "Doing TestWhitespaceTokenizerOp."; | |||
| std::unique_ptr<WhitespaceTokenizerOp> op(new WhitespaceTokenizerOp()); | |||
| std::unique_ptr<WhitespaceTokenizerOp> op(new WhitespaceTokenizerOp(true)); | |||
| std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China."); | |||
| std::shared_ptr<Tensor> output; | |||
| Status s = op->Compute(input, &output); | |||
| TensorRow output; | |||
| Status s = op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 3); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor1: " << output->ToString(); | |||
| CheckEqual(output, {0}, "Welcome"); | |||
| CheckEqual(output, {1}, "to"); | |||
| CheckEqual(output, {2}, "China."); | |||
| EXPECT_EQ(output[0]->Size(), 3); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, "Welcome"); | |||
| CheckEqual(output[0], {1}, "to"); | |||
| CheckEqual(output[0], {2}, "China."); | |||
| input = std::make_shared<Tensor>(" hello"); | |||
| s = op->Compute(input, &output); | |||
| output.clear(); | |||
| s = op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 1); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor2: " << output->ToString(); | |||
| CheckEqual(output, {0}, "hello"); | |||
| EXPECT_EQ(output[0]->Size(), 1); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, "hello"); | |||
| input = std::make_shared<Tensor>("hello"); | |||
| s = op->Compute(input, &output); | |||
| output.clear(); | |||
| s = op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 1); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor3: " << output->ToString(); | |||
| CheckEqual(output, {0}, "hello"); | |||
| EXPECT_EQ(output[0]->Size(), 1); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, "hello"); | |||
| input = std::make_shared<Tensor>("hello "); | |||
| s = op->Compute(input, &output); | |||
| output.clear(); | |||
| s = op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 1); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor4: " << output->ToString(); | |||
| CheckEqual(output, {0}, "hello"); | |||
| EXPECT_EQ(output[0]->Size(), 1); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, "hello"); | |||
| input = std::make_shared<Tensor>(" "); | |||
| s = op->Compute(input, &output); | |||
| output.clear(); | |||
| s = op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 1); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor5: " << output->ToString(); | |||
| CheckEqual(output, {0}, ""); | |||
| EXPECT_EQ(output[0]->Size(), 1); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, ""); | |||
| } | |||
| TEST_F(MindDataTestTokenizerOp, TestUnicodeScriptTokenizer) { | |||
| MS_LOG(INFO) << "Doing TestUnicodeScriptTokenizer."; | |||
| std::unique_ptr<UnicodeScriptTokenizerOp> keep_whitespace_op(new UnicodeScriptTokenizerOp(true)); | |||
| std::unique_ptr<UnicodeScriptTokenizerOp> skip_whitespace_op(new UnicodeScriptTokenizerOp(false)); | |||
| std::unique_ptr<UnicodeScriptTokenizerOp> keep_whitespace_op(new UnicodeScriptTokenizerOp(true, true)); | |||
| std::unique_ptr<UnicodeScriptTokenizerOp> skip_whitespace_op(new UnicodeScriptTokenizerOp(false, true)); | |||
| std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京"); | |||
| std::shared_ptr<Tensor> output; | |||
| Status s = keep_whitespace_op->Compute(input, &output); | |||
| TensorRow output; | |||
| Status s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 10); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor1: " << output->ToString(); | |||
| CheckEqual(output, {0}, "Welcome"); | |||
| CheckEqual(output, {1}, " "); | |||
| CheckEqual(output, {2}, "to"); | |||
| CheckEqual(output, {3}, " "); | |||
| CheckEqual(output, {4}, "China"); | |||
| CheckEqual(output, {5}, "."); | |||
| CheckEqual(output, {6}, " \n "); | |||
| CheckEqual(output, {7}, "中国"); | |||
| CheckEqual(output, {8}, "\t"); | |||
| CheckEqual(output, {9}, "北京"); | |||
| s = skip_whitespace_op->Compute(input, &output); | |||
| EXPECT_EQ(output[0]->Size(), 10); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, "Welcome"); | |||
| CheckEqual(output[0], {1}, " "); | |||
| CheckEqual(output[0], {2}, "to"); | |||
| CheckEqual(output[0], {3}, " "); | |||
| CheckEqual(output[0], {4}, "China"); | |||
| CheckEqual(output[0], {5}, "."); | |||
| CheckEqual(output[0], {6}, " \n "); | |||
| CheckEqual(output[0], {7}, "中国"); | |||
| CheckEqual(output[0], {8}, "\t"); | |||
| CheckEqual(output[0], {9}, "北京"); | |||
| output.clear(); | |||
| s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 6); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor2: " << output->ToString(); | |||
| CheckEqual(output, {0}, "Welcome"); | |||
| CheckEqual(output, {1}, "to"); | |||
| CheckEqual(output, {2}, "China"); | |||
| CheckEqual(output, {3}, "."); | |||
| CheckEqual(output, {4}, "中国"); | |||
| CheckEqual(output, {5}, "北京"); | |||
| EXPECT_EQ(output[0]->Size(), 6); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, "Welcome"); | |||
| CheckEqual(output[0], {1}, "to"); | |||
| CheckEqual(output[0], {2}, "China"); | |||
| CheckEqual(output[0], {3}, "."); | |||
| CheckEqual(output[0], {4}, "中国"); | |||
| CheckEqual(output[0], {5}, "北京"); | |||
| input = std::make_shared<Tensor>(" Welcome to 中国. "); | |||
| s = skip_whitespace_op->Compute(input, &output); | |||
| output.clear(); | |||
| s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 4); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor3: " << output->ToString(); | |||
| CheckEqual(output, {0}, "Welcome"); | |||
| CheckEqual(output, {1}, "to"); | |||
| CheckEqual(output, {2}, "中国"); | |||
| CheckEqual(output, {3}, "."); | |||
| s = keep_whitespace_op->Compute(input, &output); | |||
| EXPECT_EQ(output[0]->Size(), 4); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, "Welcome"); | |||
| CheckEqual(output[0], {1}, "to"); | |||
| CheckEqual(output[0], {2}, "中国"); | |||
| CheckEqual(output[0], {3}, "."); | |||
| output.clear(); | |||
| s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 8); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor4: " << output->ToString(); | |||
| CheckEqual(output, {0}, " "); | |||
| CheckEqual(output, {1}, "Welcome"); | |||
| CheckEqual(output, {2}, " "); | |||
| CheckEqual(output, {3}, "to"); | |||
| CheckEqual(output, {4}, " "); | |||
| CheckEqual(output, {5}, "中国"); | |||
| CheckEqual(output, {6}, "."); | |||
| CheckEqual(output, {7}, " "); | |||
| EXPECT_EQ(output[0]->Size(), 8); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, " "); | |||
| CheckEqual(output[0], {1}, "Welcome"); | |||
| CheckEqual(output[0], {2}, " "); | |||
| CheckEqual(output[0], {3}, "to"); | |||
| CheckEqual(output[0], {4}, " "); | |||
| CheckEqual(output[0], {5}, "中国"); | |||
| CheckEqual(output[0], {6}, "."); | |||
| CheckEqual(output[0], {7}, " "); | |||
| input = std::make_shared<Tensor>("Hello"); | |||
| s = keep_whitespace_op->Compute(input, &output); | |||
| output.clear(); | |||
| s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 1); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor5: " << output->ToString(); | |||
| CheckEqual(output, {0}, "Hello"); | |||
| EXPECT_EQ(output[0]->Size(), 1); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, "Hello"); | |||
| input = std::make_shared<Tensor>("H"); | |||
| s = keep_whitespace_op->Compute(input, &output); | |||
| output.clear(); | |||
| s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 1); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor6: " << output->ToString(); | |||
| CheckEqual(output, {0}, "H"); | |||
| EXPECT_EQ(output[0]->Size(), 1); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, "H"); | |||
| input = std::make_shared<Tensor>(""); | |||
| s = keep_whitespace_op->Compute(input, &output); | |||
| output.clear(); | |||
| s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 1); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor7: " << output->ToString(); | |||
| CheckEqual(output, {0}, ""); | |||
| EXPECT_EQ(output[0]->Size(), 1); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor7: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, ""); | |||
| input = std::make_shared<Tensor>("Hello中国Hello世界"); | |||
| s = keep_whitespace_op->Compute(input, &output); EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 4); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor8: " << output->ToString(); | |||
| CheckEqual(output, {0}, "Hello"); | |||
| CheckEqual(output, {1}, "中国"); | |||
| CheckEqual(output, {2}, "Hello"); | |||
| CheckEqual(output, {3}, "世界"); | |||
| output.clear(); | |||
| s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output[0]->Size(), 4); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor8: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, "Hello"); | |||
| CheckEqual(output[0], {1}, "中国"); | |||
| CheckEqual(output[0], {2}, "Hello"); | |||
| CheckEqual(output[0], {3}, "世界"); | |||
| input = std::make_shared<Tensor>(" "); | |||
| s = keep_whitespace_op->Compute(input, &output); | |||
| output.clear(); | |||
| s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 1); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor10: " << output->ToString(); | |||
| CheckEqual(output, {0}, " "); | |||
| EXPECT_EQ(output[0]->Size(), 1); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor10: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, " "); | |||
| input = std::make_shared<Tensor>(" "); | |||
| s = skip_whitespace_op->Compute(input, &output); | |||
| output.clear(); | |||
| s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| EXPECT_EQ(output->Size(), 1); | |||
| EXPECT_EQ(output->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor11: " << output->ToString(); | |||
| CheckEqual(output, {0}, ""); | |||
| EXPECT_EQ(output[0]->Size(), 1); | |||
| EXPECT_EQ(output[0]->Rank(), 1); | |||
| MS_LOG(INFO) << "Out tensor11: " << output[0]->ToString(); | |||
| CheckEqual(output[0], {0}, ""); | |||
| } | |||
| TEST_F(MindDataTestTokenizerOp, TestCaseFold) { | |||
| @@ -321,10 +339,10 @@ TEST_F(MindDataTestTokenizerOp, TestRegexReplace) { | |||
| TEST_F(MindDataTestTokenizerOp, TestRegexTokenizer) { | |||
| MS_LOG(INFO) << "Doing TestRegexTokenizerOp."; | |||
| std::unique_ptr<RegexTokenizerOp> regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", "")); | |||
| std::unique_ptr<RegexTokenizerOp> regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", "", true)); | |||
| std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京"); | |||
| std::shared_ptr<Tensor> output; | |||
| Status s = regex_tokenizer_op->Compute(input, &output); | |||
| TensorRow output; | |||
| Status s = regex_tokenizer_op->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| } | |||
| @@ -332,9 +350,10 @@ TEST_F(MindDataTestTokenizerOp, TestBasicTokenizer) { | |||
| MS_LOG(INFO) << "Doing TestBasicTokenizer."; | |||
| //bool lower_case, bool keep_whitespace, | |||
| // NormalizeForm normalization_form, bool preserve_unused_token | |||
| std::unique_ptr<BasicTokenizerOp> basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false)); | |||
| std::unique_ptr<BasicTokenizerOp> basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false, | |||
| true)); | |||
| std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. 中国\t北京"); | |||
| std::shared_ptr<Tensor> output; | |||
| Status s = basic_tokenizer->Compute(input, &output); | |||
| TensorRow output; | |||
| Status s = basic_tokenizer->Compute(TensorRow(0, {input}), &output); | |||
| EXPECT_TRUE(s.IsOk()); | |||
| } | |||
| @@ -1,83 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Testing BasicTokenizer op in DE | |||
| """ | |||
| import numpy as np | |||
| import mindspore.dataset as ds | |||
| from mindspore import log as logger | |||
| import mindspore.dataset.text as nlp | |||
| BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt" | |||
| test_paras = [ | |||
| dict( | |||
| first=1, | |||
| last=6, | |||
| expected_tokens= | |||
| [['Welcome', 'to', 'Beijing', '北', '京', '欢', '迎', '您'], | |||
| ['長', '風', '破', '浪', '會', '有', '時', ',', '直', '掛', '雲', '帆', '濟', '滄', '海'], | |||
| ['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'], | |||
| ['明', '朝', '(', '1368', '—', '1644', '年', ')', '和', '清', '朝', | |||
| '(', '1644', '—', '1911', '年', ')', ',', '是', '中', '国', '封', | |||
| '建', '王', '朝', '史', '上', '最', '后', '两', '个', '朝', '代'], | |||
| ['明', '代', '(', '1368', '-', '1644', ')', 'と', '清', '代', | |||
| '(', '1644', '-', '1911', ')', 'は', '、', '中', '国', 'の', '封', | |||
| '建', '王', '朝', 'の', '歴', '史', 'における', '最', '後', 'の2つの', '王', '朝', 'でした'], | |||
| ['명나라', '(', '1368', '-', '1644', ')', '와', '청나라', '(', '1644', '-', '1911', ')', '는', | |||
| '중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']] | |||
| ), | |||
| dict( | |||
| first=7, | |||
| last=7, | |||
| expected_tokens=[['this', 'is', 'a', 'funky', 'string']], | |||
| lower_case=True | |||
| ), | |||
| ] | |||
| def check_basic_tokenizer(first, last, expected_tokens, lower_case=False, keep_whitespace=False, | |||
| normalization_form=nlp.utils.NormalizeForm.NONE, preserve_unused_token=False): | |||
| dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False) | |||
| if first > 1: | |||
| dataset = dataset.skip(first - 1) | |||
| if last >= first: | |||
| dataset = dataset.take(last - first + 1) | |||
| basic_tokenizer = nlp.BasicTokenizer(lower_case=lower_case, | |||
| keep_whitespace=keep_whitespace, | |||
| normalization_form=normalization_form, | |||
| preserve_unused_token=preserve_unused_token) | |||
| dataset = dataset.map(operations=basic_tokenizer) | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| text = nlp.to_str(i['text']) | |||
| logger.info("Out:", text) | |||
| logger.info("Exp:", expected_tokens[count]) | |||
| np.testing.assert_array_equal(text, expected_tokens[count]) | |||
| count = count + 1 | |||
| def test_basic_tokenizer(): | |||
| """ | |||
| Test BasicTokenizer | |||
| """ | |||
| for paras in test_paras: | |||
| check_basic_tokenizer(**paras) | |||
| if __name__ == '__main__': | |||
| test_basic_tokenizer() | |||
| @@ -1,238 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| import numpy as np | |||
| import mindspore.dataset as ds | |||
| from mindspore.dataset.text import JiebaTokenizer | |||
| from mindspore.dataset.text import JiebaMode, to_str | |||
| DATA_FILE = "../data/dataset/testJiebaDataset/3.txt" | |||
| DATA_ALL_FILE = "../data/dataset/testJiebaDataset/*" | |||
| HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8" | |||
| MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8" | |||
| def test_jieba_1(): | |||
| """Test jieba tokenizer with MP mode""" | |||
| data = ds.TextFileDataset(DATA_FILE) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] | |||
| ret = [] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_1_1(): | |||
| """Test jieba tokenizer with HMM mode""" | |||
| data = ds.TextFileDataset(DATA_FILE) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_1_2(): | |||
| """Test jieba tokenizer with HMM MIX""" | |||
| data = ds.TextFileDataset(DATA_FILE) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_2(): | |||
| """Test add_word""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| jieba_op.add_word("男默女泪") | |||
| expect = ['男默女泪', '市', '长江大桥'] | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=2) | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_2_1(): | |||
| """Test add_word with freq""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| jieba_op.add_word("男默女泪", 10) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=2) | |||
| expect = ['男默女泪', '市', '长江大桥'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_2_2(): | |||
| """Test add_word with invalid None Input""" | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| try: | |||
| jieba_op.add_word(None) | |||
| except ValueError: | |||
| pass | |||
| def test_jieba_2_3(): | |||
| """Test add_word with freq, the value of freq affects the result of segmentation""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| jieba_op.add_word("江大桥", 20000) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=2) | |||
| expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_3(): | |||
| """Test add_dict with dict""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" | |||
| user_dict = { | |||
| "男默女泪": 10 | |||
| } | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| jieba_op.add_dict(user_dict) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['男默女泪', '市', '长江大桥'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_3_1(): | |||
| """Test add_dict with dict""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" | |||
| user_dict = { | |||
| "男默女泪": 10, | |||
| "江大桥": 20000 | |||
| } | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| jieba_op.add_dict(user_dict) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['男默女泪', '市长', '江大桥'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_4(): | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt" | |||
| DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| jieba_op.add_dict(DICT_FILE) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_4_1(): | |||
| """Test add dict with invalid file path""" | |||
| DICT_FILE = "" | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| try: | |||
| jieba_op.add_dict(DICT_FILE) | |||
| except ValueError: | |||
| pass | |||
| def test_jieba_5(): | |||
| """Test add dict with file path""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| jieba_op.add_word("江大桥", 20000) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def gen(): | |||
| text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S') | |||
| yield (text,) | |||
| def pytoken_op(input_data): | |||
| te = str(to_str(input_data)) | |||
| tokens = [] | |||
| tokens.append(te[:5].encode("UTF8")) | |||
| tokens.append(te[5:10].encode("UTF8")) | |||
| tokens.append(te[10:].encode("UTF8")) | |||
| return np.array(tokens, dtype='S') | |||
| def test_jieba_6(): | |||
| data = ds.GeneratorDataset(gen, column_names=["text"]) | |||
| data = data.map(input_columns=["text"], | |||
| operations=pytoken_op, num_parallel_workers=1) | |||
| expect = ['今天天气太', '好了我们一', '起去外面玩吧'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| if __name__ == "__main__": | |||
| test_jieba_1() | |||
| test_jieba_1_1() | |||
| test_jieba_1_2() | |||
| test_jieba_2() | |||
| test_jieba_2_1() | |||
| test_jieba_2_2() | |||
| test_jieba_3() | |||
| test_jieba_3_1() | |||
| test_jieba_4() | |||
| test_jieba_4_1() | |||
| test_jieba_5() | |||
| test_jieba_5() | |||
| test_jieba_6() | |||
| @@ -0,0 +1,138 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Testing BasicTokenizer op in DE | |||
| """ | |||
| import numpy as np | |||
| import mindspore.dataset as ds | |||
| from mindspore import log as logger | |||
| import mindspore.dataset.text as text | |||
| BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt" | |||
| test_paras = [ | |||
| dict( | |||
| first=1, | |||
| last=6, | |||
| expected_tokens= | |||
| [['Welcome', 'to', 'Beijing', '北', '京', '欢', '迎', '您'], | |||
| ['長', '風', '破', '浪', '會', '有', '時', ',', '直', '掛', '雲', '帆', '濟', '滄', '海'], | |||
| ['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'], | |||
| ['明', '朝', '(', '1368', '—', '1644', '年', ')', '和', '清', '朝', | |||
| '(', '1644', '—', '1911', '年', ')', ',', '是', '中', '国', '封', | |||
| '建', '王', '朝', '史', '上', '最', '后', '两', '个', '朝', '代'], | |||
| ['明', '代', '(', '1368', '-', '1644', ')', 'と', '清', '代', | |||
| '(', '1644', '-', '1911', ')', 'は', '、', '中', '国', 'の', '封', | |||
| '建', '王', '朝', 'の', '歴', '史', 'における', '最', '後', 'の2つの', '王', '朝', 'でした'], | |||
| ['명나라', '(', '1368', '-', '1644', ')', '와', '청나라', '(', '1644', '-', '1911', ')', '는', | |||
| '중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']], | |||
| expected_offsets_start=[[0, 8, 11, 18, 21, 24, 27, 30], | |||
| [0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42], | |||
| [0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37], | |||
| [0, 3, 6, 9, 13, 16, 20, 23, 26, 29, 32, 35, 38, 42, 45, 49, | |||
| 52, 55, 58, 61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100], | |||
| [0, 3, 6, 9, 13, 14, 18, 21, 24, 27, 30, 33, 37, 38, 42, 45, 48, 51, | |||
| 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 93, 96, 99, 109, 112, 115], | |||
| [0, 10, 11, 15, 16, 20, 21, 25, 35, 36, 40, 41, 45, 46, 50, 57, 64, 74, 87, 97, 101]], | |||
| expected_offsets_limit=[[7, 10, 18, 21, 24, 27, 30, 33], | |||
| [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45], | |||
| [4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40], | |||
| [3, 6, 9, 13, 16, 20, 23, 26, 29, 32, 35, 38, 42, 45, 49, 52, 55, 58, | |||
| 61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 103], | |||
| [3, 6, 9, 13, 14, 18, 21, 24, 27, 30, 33, 37, 38, 42, 45, 48, 51, 54, | |||
| 57, 60, 63, 66, 69, 72, 75, 78, 81, 93, 96, 99, 109, 112, 115, 124], | |||
| [9, 11, 15, 16, 20, 21, 24, 34, 36, 40, 41, 45, 46, 49, 56, 63, 73, 86, 96, 100, 113]] | |||
| ), | |||
| dict( | |||
| first=7, | |||
| last=7, | |||
| expected_tokens=[['this', 'is', 'a', 'funky', 'string']], | |||
| expected_offsets_start=[[0, 5, 8, 10, 16]], | |||
| expected_offsets_limit=[[4, 7, 9, 15, 22]], | |||
| lower_case=True | |||
| ), | |||
| ] | |||
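Editor's note: the expected_offsets_start / expected_offsets_limit values above are byte offsets into the UTF-8 encoded input line, not character indices, so each CJK character contributes 3 bytes. A quick sanity check of the first test line, assuming it reads "Welcome to Beijing北京欢迎您" (consistent with the offsets above):

    line = "Welcome to Beijing北京欢迎您"
    encoded = line.encode("utf-8")
    assert encoded[0:7] == b"Welcome"               # token 'Welcome' spans bytes [0, 7)
    assert encoded[11:18] == b"Beijing"             # token 'Beijing' spans bytes [11, 18)
    assert encoded[18:21].decode("utf-8") == "北"   # each CJK character occupies 3 bytes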
| def check_basic_tokenizer_default(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit, | |||
| lower_case=False, keep_whitespace=False, | |||
| normalization_form=text.utils.NormalizeForm.NONE, preserve_unused_token=False): | |||
| dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False) | |||
| if first > 1: | |||
| dataset = dataset.skip(first - 1) | |||
| if last >= first: | |||
| dataset = dataset.take(last - first + 1) | |||
| basic_tokenizer = text.BasicTokenizer(lower_case=lower_case, | |||
| keep_whitespace=keep_whitespace, | |||
| normalization_form=normalization_form, | |||
| preserve_unused_token=preserve_unused_token) | |||
| dataset = dataset.map(operations=basic_tokenizer) | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['text']) | |||
| logger.info("Out:", token) | |||
| logger.info("Exp:", expected_tokens[count]) | |||
| np.testing.assert_array_equal(token, expected_tokens[count]) | |||
| count = count + 1 | |||
| def check_basic_tokenizer_with_offsets(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit, | |||
| lower_case=False, keep_whitespace=False, | |||
| normalization_form=text.utils.NormalizeForm.NONE, preserve_unused_token=False): | |||
| dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False) | |||
| if first > 1: | |||
| dataset = dataset.skip(first - 1) | |||
| if last >= first: | |||
| dataset = dataset.take(last - first + 1) | |||
| basic_tokenizer = text.BasicTokenizer(lower_case=lower_case, | |||
| keep_whitespace=keep_whitespace, | |||
| normalization_form=normalization_form, | |||
| preserve_unused_token=preserve_unused_token, | |||
| with_offsets=True) | |||
| dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], | |||
| columns_order=['token', 'offsets_start', 'offsets_limit'], operations=basic_tokenizer) | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['token']) | |||
| logger.info("Out:", token) | |||
| logger.info("Exp:", expected_tokens[count]) | |||
| np.testing.assert_array_equal(token, expected_tokens[count]) | |||
| np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) | |||
| np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) | |||
| count = count + 1 | |||
| def test_basic_tokenizer_with_offsets(): | |||
| """ | |||
Test BasicTokenizer with with_offsets=True
| """ | |||
| for paras in test_paras: | |||
| check_basic_tokenizer_with_offsets(**paras) | |||
| def test_basic_tokenizer_default(): | |||
| """ | |||
Test BasicTokenizer with default parameters (with_offsets=False)
| """ | |||
| for paras in test_paras: | |||
| check_basic_tokenizer_default(**paras) | |||
| if __name__ == '__main__': | |||
| test_basic_tokenizer_default() | |||
| test_basic_tokenizer_with_offsets() | |||
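# A minimal sketch of how the new offset columns can be read, assuming
# offsets_start/offsets_limit are byte positions into the UTF-8 encoded source
# line: slicing the raw bytes with each [start, limit) pair gives back the span
# of original text a token came from (for options such as lower_case the span
# keeps the original casing). The helper name below is illustrative only, not
# part of the change.
def recover_spans_from_offsets(line, starts, limits):
    raw = line.encode('utf-8')
    return [raw[s:l].decode('utf-8') for s, l in zip(starts, limits)]

# e.g. recover_spans_from_offsets("Welcome to Beijing!", [0, 8, 11], [7, 10, 19])
# -> ['Welcome', 'to', 'Beijing!']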
| @@ -18,7 +18,7 @@ Testing BertTokenizer op in DE | |||
| import numpy as np | |||
| import mindspore.dataset as ds | |||
| from mindspore import log as logger | |||
| import mindspore.dataset.text as nlp | |||
| import mindspore.dataset.text as text | |||
| BERT_TOKENIZER_FILE = "../data/dataset/testTokenizerData/bert_tokenizer.txt" | |||
| @@ -39,6 +39,14 @@ test_paras = [ | |||
| ['疑', '是', '地', '上', '霜'], | |||
| ['举', '头', '望', '明', '月'], | |||
| ['低', '头', '思', '故', '乡']], | |||
| expected_offsets_start=[[0, 3, 6, 9, 12], | |||
| [0, 3, 6, 9, 12], | |||
| [0, 3, 6, 9, 12], | |||
| [0, 3, 6, 9, 12]], | |||
| expected_offsets_limit=[[3, 6, 9, 12, 15], | |||
| [3, 6, 9, 12, 15], | |||
| [3, 6, 9, 12, 15], | |||
| [3, 6, 9, 12, 15]], | |||
| vocab_list=vocab_bert | |||
| ), | |||
| # test english text | |||
| @@ -46,6 +54,8 @@ test_paras = [ | |||
| first=5, | |||
| last=5, | |||
| expect_str=[['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']], | |||
| expected_offsets_start=[[0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]], | |||
| expected_offsets_limit=[[1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]], | |||
| lower_case=True, | |||
| vocab_list=vocab_bert | |||
| ), | |||
| @@ -53,6 +63,8 @@ test_paras = [ | |||
| first=5, | |||
| last=5, | |||
| expect_str=[['I', "am", 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']], | |||
| expected_offsets_start=[[0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]], | |||
| expected_offsets_limit=[[1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]], | |||
| lower_case=False, | |||
| vocab_list=vocab_bert | |||
| ), | |||
| @@ -63,7 +75,9 @@ test_paras = [ | |||
| expect_str=[ | |||
| ['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'], | |||
| ['繁', '體', '字']], | |||
| normalization_form=nlp.utils.NormalizeForm.NFKC, | |||
| expected_offsets_start=[[0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37], [0, 3, 6]], | |||
| expected_offsets_limit=[[4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40], [3, 6, 9]], | |||
| normalization_form=text.utils.NormalizeForm.NFKC, | |||
| vocab_list=vocab_bert | |||
| ), | |||
| # test preserved tokens | |||
| @@ -79,6 +93,8 @@ test_paras = [ | |||
| ['[unused1]'], | |||
| ['[unused10]'] | |||
| ], | |||
| expected_offsets_start=[[0, 7], [0, 7], [0, 7], [0, 7], [0, 7], [0], [0]], | |||
| expected_offsets_limit=[[6, 12], [6, 12], [6, 12], [6, 12], [6, 13], [9], [10]], | |||
| lower_case=False, | |||
| vocab_list=vocab_bert, | |||
| preserve_unused_token=True, | |||
| @@ -95,6 +111,8 @@ test_paras = [ | |||
| ['[unused1]'], | |||
| ['[unused10]'] | |||
| ], | |||
| expected_offsets_start=[[0, 7], [0, 7], [0, 7], [0, 7], [0, 7], [0], [0]], | |||
| expected_offsets_limit=[[6, 12], [6, 12], [6, 12], [6, 12], [6, 13], [9], [10]], | |||
| lower_case=True, | |||
| vocab_list=vocab_bert, | |||
| preserve_unused_token=True, | |||
| @@ -104,6 +122,8 @@ test_paras = [ | |||
| first=15, | |||
| last=15, | |||
| expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']], | |||
| expected_offsets_start=[[0, 2, 3, 4, 5, 7, 8, 10, 11, 12]], | |||
| expected_offsets_limit=[[2, 3, 4, 5, 7, 8, 10, 11, 12, 14]], | |||
| preserve_unused_token=True, | |||
| vocab_list=vocab_bert | |||
| ), | |||
| @@ -112,6 +132,8 @@ test_paras = [ | |||
| first=8, | |||
| last=8, | |||
| expect_str=[['[UNK]', ' ', '[CLS]']], | |||
| expected_offsets_start=[[0, 6, 7]], | |||
| expected_offsets_limit=[[6, 7, 12]], | |||
| lower_case=False, | |||
| vocab_list=vocab_bert, | |||
| preserve_unused_token=True, | |||
| @@ -121,6 +143,8 @@ test_paras = [ | |||
| first=8, | |||
| last=8, | |||
| expect_str=[['unused', ' ', '[CLS]']], | |||
| expected_offsets_start=[[0, 6, 7]], | |||
| expected_offsets_limit=[[6, 7, 12]], | |||
| lower_case=False, | |||
| vocab_list=vocab_bert, | |||
| preserve_unused_token=True, | |||
| @@ -131,6 +155,8 @@ test_paras = [ | |||
| first=8, | |||
| last=8, | |||
| expect_str=[['unused', ' ', '[', 'CLS', ']']], | |||
| expected_offsets_start=[[0, 6, 7, 8, 11]], | |||
| expected_offsets_limit=[[6, 7, 8, 11, 12]], | |||
| lower_case=False, | |||
| vocab_list=vocab_bert, | |||
| preserve_unused_token=False, | |||
| @@ -140,20 +166,20 @@ test_paras = [ | |||
| ] | |||
| def check_bert_tokenizer(first, last, expect_str, | |||
| vocab_list, | |||
| suffix_indicator='##', | |||
| max_bytes_per_token=100, unknown_token='[UNK]', | |||
| lower_case=False, keep_whitespace=False, | |||
| normalization_form=nlp.utils.NormalizeForm.NONE, | |||
| preserve_unused_token=False): | |||
| def check_bert_tokenizer_default(first, last, expect_str, | |||
| expected_offsets_start, expected_offsets_limit, | |||
| vocab_list, suffix_indicator='##', | |||
| max_bytes_per_token=100, unknown_token='[UNK]', | |||
| lower_case=False, keep_whitespace=False, | |||
| normalization_form=text.utils.NormalizeForm.NONE, | |||
| preserve_unused_token=False): | |||
| dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False) | |||
| if first > 1: | |||
| dataset = dataset.skip(first - 1) | |||
| if last >= first: | |||
| dataset = dataset.take(last - first + 1) | |||
| vocab = nlp.Vocab.from_list(vocab_list) | |||
| tokenizer_op = nlp.BertTokenizer( | |||
| vocab = text.Vocab.from_list(vocab_list) | |||
| tokenizer_op = text.BertTokenizer( | |||
| vocab=vocab, suffix_indicator=suffix_indicator, | |||
| max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token, | |||
| lower_case=lower_case, keep_whitespace=keep_whitespace, | |||
| @@ -162,20 +188,59 @@ def check_bert_tokenizer(first, last, expect_str, | |||
| dataset = dataset.map(operations=tokenizer_op) | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| text = nlp.to_str(i['text']) | |||
| logger.info("Out:", text) | |||
| token = text.to_str(i['text']) | |||
| logger.info("Out:", token) | |||
| logger.info("Exp:", expect_str[count]) | |||
| np.testing.assert_array_equal(text, expect_str[count]) | |||
| np.testing.assert_array_equal(token, expect_str[count]) | |||
| count = count + 1 | |||
| def test_bert_tokenizer(): | |||
| def check_bert_tokenizer_with_offsets(first, last, expect_str, | |||
| expected_offsets_start, expected_offsets_limit, | |||
| vocab_list, suffix_indicator='##', | |||
| max_bytes_per_token=100, unknown_token='[UNK]', | |||
| lower_case=False, keep_whitespace=False, | |||
| normalization_form=text.utils.NormalizeForm.NONE, | |||
| preserve_unused_token=False): | |||
| dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False) | |||
| if first > 1: | |||
| dataset = dataset.skip(first - 1) | |||
| if last >= first: | |||
| dataset = dataset.take(last - first + 1) | |||
| vocab = text.Vocab.from_list(vocab_list) | |||
| tokenizer_op = text.BertTokenizer( | |||
| vocab=vocab, suffix_indicator=suffix_indicator, max_bytes_per_token=max_bytes_per_token, | |||
| unknown_token=unknown_token, lower_case=lower_case, keep_whitespace=keep_whitespace, | |||
| normalization_form=normalization_form, preserve_unused_token=preserve_unused_token, with_offsets=True) | |||
| dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], | |||
| columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op) | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['token']) | |||
| logger.info("Out:", token) | |||
| logger.info("Exp:", expect_str[count]) | |||
| np.testing.assert_array_equal(token, expect_str[count]) | |||
| np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) | |||
| np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) | |||
| count = count + 1 | |||
| def test_bert_tokenizer_default(): | |||
| """ | |||
Test BertTokenizer when with_offsets=False
| """ | |||
| for paras in test_paras: | |||
| check_bert_tokenizer_default(**paras) | |||
| def test_bert_tokenizer_with_offsets(): | |||
| """ | |||
| Test WordpieceTokenizer | |||
Test BertTokenizer when with_offsets=True
| """ | |||
| for paras in test_paras: | |||
| check_bert_tokenizer(**paras) | |||
| check_bert_tokenizer_with_offsets(**paras) | |||
| if __name__ == '__main__': | |||
| test_bert_tokenizer() | |||
| test_bert_tokenizer_default() | |||
| test_bert_tokenizer_with_offsets() | |||
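# Derivation sketch for the Chinese test vectors above: each CJK character in the
# poem occupies 3 bytes in UTF-8, so a line tokenized into consecutive
# single-character tokens advances the byte offset by 3 per token. Assuming the
# tokens are adjacent in the source line (no separators), the expected tables can
# be reproduced like this (helper name is illustrative only):
def byte_offsets(tokens):
    starts, limits, pos = [], [], 0
    for tok in tokens:
        starts.append(pos)
        pos += len(tok.encode('utf-8'))
        limits.append(pos)
    return starts, limits

# byte_offsets(['疑', '是', '地', '上', '霜']) -> ([0, 3, 6, 9, 12], [3, 6, 9, 12, 15])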
| @@ -0,0 +1,471 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| import numpy as np | |||
| import mindspore.dataset as ds | |||
| from mindspore.dataset.text import JiebaTokenizer | |||
| from mindspore.dataset.text import JiebaMode, to_str | |||
| DATA_FILE = "../data/dataset/testJiebaDataset/3.txt" | |||
| DATA_ALL_FILE = "../data/dataset/testJiebaDataset/*" | |||
| HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8" | |||
| MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8" | |||
| def test_jieba_1(): | |||
| """Test jieba tokenizer with MP mode""" | |||
| data = ds.TextFileDataset(DATA_FILE) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] | |||
| ret = [] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_1_1(): | |||
| """Test jieba tokenizer with HMM mode""" | |||
| data = ds.TextFileDataset(DATA_FILE) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_1_2(): | |||
| """Test jieba tokenizer with HMM MIX""" | |||
| data = ds.TextFileDataset(DATA_FILE) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_2(): | |||
| """Test add_word""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| jieba_op.add_word("男默女泪") | |||
| expect = ['男默女泪', '市', '长江大桥'] | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=2) | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_2_1(): | |||
| """Test add_word with freq""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| jieba_op.add_word("男默女泪", 10) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=2) | |||
| expect = ['男默女泪', '市', '长江大桥'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_2_2(): | |||
| """Test add_word with invalid None Input""" | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| try: | |||
| jieba_op.add_word(None) | |||
| except ValueError: | |||
| pass | |||
| def test_jieba_2_3(): | |||
| """Test add_word with freq, the value of freq affects the result of segmentation""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| jieba_op.add_word("江大桥", 20000) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=2) | |||
| expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_3(): | |||
| """Test add_dict with dict""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" | |||
| user_dict = { | |||
| "男默女泪": 10 | |||
| } | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| jieba_op.add_dict(user_dict) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['男默女泪', '市', '长江大桥'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_3_1(): | |||
| """Test add_dict with dict""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" | |||
| user_dict = { | |||
| "男默女泪": 10, | |||
| "江大桥": 20000 | |||
| } | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| jieba_op.add_dict(user_dict) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['男默女泪', '市长', '江大桥'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
def test_jieba_4():
"""Test add_dict with a user dict file path"""
| DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt" | |||
| DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| jieba_op.add_dict(DICT_FILE) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_4_1(): | |||
| """Test add dict with invalid file path""" | |||
| DICT_FILE = "" | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| try: | |||
| jieba_op.add_dict(DICT_FILE) | |||
| except ValueError: | |||
| pass | |||
| def test_jieba_5(): | |||
| """Test add dict with file path""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP) | |||
| jieba_op.add_word("江大桥", 20000) | |||
| data = data.map(input_columns=["text"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| def test_jieba_with_offsets_1(): | |||
| """Test jieba tokenizer with MP mode""" | |||
| data = ds.TextFileDataset(DATA_FILE) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) | |||
| data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| columns_order=["token", "offsets_start", "offsets_limit"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] | |||
| expected_offsets_start = [0, 12, 21, 27, 33, 36, 42] | |||
| expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48] | |||
| ret = [] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["token"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| for index, item in enumerate(i["offsets_start"]): | |||
| assert item == expected_offsets_start[index] | |||
| for index, item in enumerate(i["offsets_limit"]): | |||
| assert item == expected_offsets_limit[index] | |||
| def test_jieba_with_offsets_1_1(): | |||
| """Test jieba tokenizer with HMM mode""" | |||
| data = ds.TextFileDataset(DATA_FILE) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM, with_offsets=True) | |||
| data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| columns_order=["token", "offsets_start", "offsets_limit"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧'] | |||
| expected_offsets_start = [0, 6, 12, 15, 18, 21, 27, 33, 36, 42, 45] | |||
| expected_offsets_limit = [6, 12, 15, 18, 21, 27, 33, 36, 42, 45, 48] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["token"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| for index, item in enumerate(i["offsets_start"]): | |||
| assert item == expected_offsets_start[index] | |||
| for index, item in enumerate(i["offsets_limit"]): | |||
| assert item == expected_offsets_limit[index] | |||
| def test_jieba_with_offsets_1_2(): | |||
| """Test jieba tokenizer with HMM MIX""" | |||
| data = ds.TextFileDataset(DATA_FILE) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX, with_offsets=True) | |||
| data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| columns_order=["token", "offsets_start", "offsets_limit"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] | |||
| expected_offsets_start = [0, 12, 21, 27, 33, 36, 42] | |||
| expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["token"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| for index, item in enumerate(i["offsets_start"]): | |||
| assert item == expected_offsets_start[index] | |||
| for index, item in enumerate(i["offsets_limit"]): | |||
| assert item == expected_offsets_limit[index] | |||
| def test_jieba_with_offsets_2(): | |||
| """Test add_word""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) | |||
| jieba_op.add_word("男默女泪") | |||
| expect = ['男默女泪', '市', '长江大桥'] | |||
| data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| columns_order=["token", "offsets_start", "offsets_limit"], | |||
| operations=jieba_op, num_parallel_workers=2) | |||
| expected_offsets_start = [0, 12, 15] | |||
| expected_offsets_limit = [12, 15, 27] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["token"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| for index, item in enumerate(i["offsets_start"]): | |||
| assert item == expected_offsets_start[index] | |||
| for index, item in enumerate(i["offsets_limit"]): | |||
| assert item == expected_offsets_limit[index] | |||
| def test_jieba_with_offsets_2_1(): | |||
| """Test add_word with freq""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) | |||
| jieba_op.add_word("男默女泪", 10) | |||
| data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| columns_order=["token", "offsets_start", "offsets_limit"], | |||
| operations=jieba_op, num_parallel_workers=2) | |||
| expect = ['男默女泪', '市', '长江大桥'] | |||
| expected_offsets_start = [0, 12, 15] | |||
| expected_offsets_limit = [12, 15, 27] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["token"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| for index, item in enumerate(i["offsets_start"]): | |||
| assert item == expected_offsets_start[index] | |||
| for index, item in enumerate(i["offsets_limit"]): | |||
| assert item == expected_offsets_limit[index] | |||
| def test_jieba_with_offsets_2_2(): | |||
| """Test add_word with freq, the value of freq affects the result of segmentation""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) | |||
| jieba_op.add_word("江大桥", 20000) | |||
| data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| columns_order=["token", "offsets_start", "offsets_limit"], | |||
| operations=jieba_op, num_parallel_workers=2) | |||
| expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式'] | |||
| expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51] | |||
| expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["token"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| for index, item in enumerate(i["offsets_start"]): | |||
| assert item == expected_offsets_start[index] | |||
| for index, item in enumerate(i["offsets_limit"]): | |||
| assert item == expected_offsets_limit[index] | |||
| def test_jieba_with_offsets_3(): | |||
| """Test add_dict with dict""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" | |||
| user_dict = { | |||
| "男默女泪": 10 | |||
| } | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) | |||
| jieba_op.add_dict(user_dict) | |||
| data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| columns_order=["token", "offsets_start", "offsets_limit"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['男默女泪', '市', '长江大桥'] | |||
| expected_offsets_start = [0, 12, 15] | |||
| expected_offsets_limit = [12, 15, 27] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["token"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| for index, item in enumerate(i["offsets_start"]): | |||
| assert item == expected_offsets_start[index] | |||
| for index, item in enumerate(i["offsets_limit"]): | |||
| assert item == expected_offsets_limit[index] | |||
| def test_jieba_with_offsets_3_1(): | |||
| """Test add_dict with dict""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt" | |||
| user_dict = { | |||
| "男默女泪": 10, | |||
| "江大桥": 20000 | |||
| } | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) | |||
| jieba_op.add_dict(user_dict) | |||
| data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| columns_order=["token", "offsets_start", "offsets_limit"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['男默女泪', '市长', '江大桥'] | |||
| expected_offsets_start = [0, 12, 18] | |||
| expected_offsets_limit = [12, 18, 27] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["token"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| for index, item in enumerate(i["offsets_start"]): | |||
| assert item == expected_offsets_start[index] | |||
| for index, item in enumerate(i["offsets_limit"]): | |||
| assert item == expected_offsets_limit[index] | |||
def test_jieba_with_offsets_4():
"""Test add_dict with a user dict file path and with_offsets=True"""
| DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt" | |||
| DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) | |||
| jieba_op.add_dict(DICT_FILE) | |||
| data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| columns_order=["token", "offsets_start", "offsets_limit"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'] | |||
| expected_offsets_start = [0, 12, 21, 27, 33, 36, 42] | |||
| expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["token"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| for index, item in enumerate(i["offsets_start"]): | |||
| assert item == expected_offsets_start[index] | |||
| for index, item in enumerate(i["offsets_limit"]): | |||
| assert item == expected_offsets_limit[index] | |||
| def test_jieba_with_offsets_5(): | |||
| """Test add dict with file path""" | |||
| DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt" | |||
| data = ds.TextFileDataset(DATA_FILE4) | |||
| jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True) | |||
| jieba_op.add_word("江大桥", 20000) | |||
| data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"], | |||
| columns_order=["token", "offsets_start", "offsets_limit"], | |||
| operations=jieba_op, num_parallel_workers=1) | |||
| expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式'] | |||
| expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51] | |||
| expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["token"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| for index, item in enumerate(i["offsets_start"]): | |||
| assert item == expected_offsets_start[index] | |||
| for index, item in enumerate(i["offsets_limit"]): | |||
| assert item == expected_offsets_limit[index] | |||
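# The with_offsets tests above all repeat the same three comparison loops; a small
# helper along these lines (a sketch, not part of the change) would keep each test
# body to a single call per row:
def assert_tokens_and_offsets(row, expect, expected_starts, expected_limits):
    ret = to_str(row["token"])
    for index, item in enumerate(ret):
        assert item == expect[index]
    np.testing.assert_array_equal(row["offsets_start"], expected_starts)
    np.testing.assert_array_equal(row["offsets_limit"], expected_limits)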
| def gen(): | |||
| text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S') | |||
| yield (text,) | |||
| def pytoken_op(input_data): | |||
| te = str(to_str(input_data)) | |||
| tokens = [] | |||
| tokens.append(te[:5].encode("UTF8")) | |||
| tokens.append(te[5:10].encode("UTF8")) | |||
| tokens.append(te[10:].encode("UTF8")) | |||
| return np.array(tokens, dtype='S') | |||
| def test_jieba_6(): | |||
| data = ds.GeneratorDataset(gen, column_names=["text"]) | |||
| data = data.map(input_columns=["text"], | |||
| operations=pytoken_op, num_parallel_workers=1) | |||
| expect = ['今天天气太', '好了我们一', '起去外面玩吧'] | |||
| for i in data.create_dict_iterator(): | |||
| ret = to_str(i["text"]) | |||
| for index, item in enumerate(ret): | |||
| assert item == expect[index] | |||
| if __name__ == "__main__": | |||
| test_jieba_1() | |||
| test_jieba_1_1() | |||
| test_jieba_1_2() | |||
| test_jieba_2() | |||
| test_jieba_2_1() | |||
| test_jieba_2_2() | |||
test_jieba_2_3()
test_jieba_3()
test_jieba_3_1()
test_jieba_4()
test_jieba_4_1()
test_jieba_5()
| test_jieba_6() | |||
| test_jieba_with_offsets_1() | |||
| test_jieba_with_offsets_1_1() | |||
| test_jieba_with_offsets_1_2() | |||
| test_jieba_with_offsets_2() | |||
| test_jieba_with_offsets_2_1() | |||
| test_jieba_with_offsets_2_2() | |||
| test_jieba_with_offsets_3() | |||
| test_jieba_with_offsets_3_1() | |||
| test_jieba_with_offsets_4() | |||
| test_jieba_with_offsets_5() | |||
| @@ -0,0 +1,380 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
Testing tokenizer ops (UnicodeCharTokenizer, WhitespaceTokenizer, UnicodeScriptTokenizer, CaseFold, NormalizeUTF8, RegexReplace and RegexTokenizer) in DE
| """ | |||
| import numpy as np | |||
| import mindspore.dataset as ds | |||
| from mindspore import log as logger | |||
| import mindspore.dataset.text as text | |||
| DATA_FILE = "../data/dataset/testTokenizerData/1.txt" | |||
| NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt" | |||
| REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt" | |||
| REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt" | |||
| def split_by_unicode_char(input_strs): | |||
| """ | |||
| Split utf-8 strings to unicode characters | |||
| """ | |||
| out = [] | |||
| for s in input_strs: | |||
| out.append([c for c in s]) | |||
| return out | |||
| def test_unicode_char_tokenizer_default(): | |||
| """ | |||
Test UnicodeCharTokenizer with default parameters (with_offsets=False)
| """ | |||
| input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", " ") | |||
| dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) | |||
| tokenizer = text.UnicodeCharTokenizer() | |||
| dataset = dataset.map(operations=tokenizer) | |||
| tokens = [] | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['text']).tolist() | |||
| tokens.append(token) | |||
| logger.info("The out tokens is : {}".format(tokens)) | |||
| assert split_by_unicode_char(input_strs) == tokens | |||
| def test_unicode_char_tokenizer_with_offsets(): | |||
| """ | |||
Test UnicodeCharTokenizer with with_offsets=True
| """ | |||
| input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", " ") | |||
| dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) | |||
| tokenizer = text.UnicodeCharTokenizer(with_offsets=True) | |||
| dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], | |||
| columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer) | |||
| tokens = [] | |||
| expected_offsets_start = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], | |||
| [0, 3, 6, 9, 12, 15], [0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16], [0, 1]] | |||
| expected_offsets_limit = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], | |||
| [3, 6, 9, 12, 15, 18], [3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17], [1, 2]] | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['token']).tolist() | |||
| tokens.append(token) | |||
| np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) | |||
| np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) | |||
| count += 1 | |||
| logger.info("The out tokens is : {}".format(tokens)) | |||
| assert split_by_unicode_char(input_strs) == tokens | |||
| def test_whitespace_tokenizer_default(): | |||
| """ | |||
Test WhitespaceTokenizer with default parameters (with_offsets=False)
| """ | |||
| whitespace_strs = [["Welcome", "to", "Beijing!"], | |||
| ["北京欢迎您!"], | |||
| ["我喜欢English!"], | |||
| [""]] | |||
| dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) | |||
| tokenizer = text.WhitespaceTokenizer() | |||
| dataset = dataset.map(operations=tokenizer) | |||
| tokens = [] | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['text']).tolist() | |||
| tokens.append(token) | |||
| logger.info("The out tokens is : {}".format(tokens)) | |||
| assert whitespace_strs == tokens | |||
| def test_whitespace_tokenizer_with_offsets(): | |||
| """ | |||
Test WhitespaceTokenizer with with_offsets=True
| """ | |||
| whitespace_strs = [["Welcome", "to", "Beijing!"], | |||
| ["北京欢迎您!"], | |||
| ["我喜欢English!"], | |||
| [""]] | |||
| dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) | |||
| tokenizer = text.WhitespaceTokenizer(with_offsets=True) | |||
| dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], | |||
| columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer) | |||
| tokens = [] | |||
| expected_offsets_start = [[0, 8, 11], [0], [0], [0]] | |||
| expected_offsets_limit = [[7, 10, 19], [18], [17], [0]] | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['token']).tolist() | |||
| tokens.append(token) | |||
| np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) | |||
| np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) | |||
| count += 1 | |||
| logger.info("The out tokens is : {}".format(tokens)) | |||
| assert whitespace_strs == tokens | |||
| def test_unicode_script_tokenizer_default(): | |||
| """ | |||
| Test UnicodeScriptTokenizer when para keep_whitespace=False | |||
| """ | |||
| unicode_script_strs = [["Welcome", "to", "Beijing", "!"], | |||
| ["北京欢迎您", "!"], | |||
| ["我喜欢", "English", "!"], | |||
| [""]] | |||
| dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) | |||
| tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False) | |||
| dataset = dataset.map(operations=tokenizer) | |||
| tokens = [] | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['text']).tolist() | |||
| tokens.append(token) | |||
| logger.info("The out tokens is : {}".format(tokens)) | |||
| assert unicode_script_strs == tokens | |||
| def test_unicode_script_tokenizer_default2(): | |||
| """ | |||
| Test UnicodeScriptTokenizer when para keep_whitespace=True | |||
| """ | |||
| unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"], | |||
| ["北京欢迎您", "!"], | |||
| ["我喜欢", "English", "!"], | |||
| [" "]] | |||
| dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) | |||
| tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True) | |||
| dataset = dataset.map(operations=tokenizer) | |||
| tokens = [] | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['text']).tolist() | |||
| tokens.append(token) | |||
| logger.info("The out tokens is :", tokens) | |||
| assert unicode_script_strs2 == tokens | |||
| def test_unicode_script_tokenizer_with_offsets(): | |||
| """ | |||
| Test UnicodeScriptTokenizer when para keep_whitespace=False and with_offsets=True | |||
| """ | |||
| unicode_script_strs = [["Welcome", "to", "Beijing", "!"], | |||
| ["北京欢迎您", "!"], | |||
| ["我喜欢", "English", "!"], | |||
| [""]] | |||
| dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) | |||
| tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False, with_offsets=True) | |||
| dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], | |||
| columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer) | |||
| tokens = [] | |||
| expected_offsets_start = [[0, 8, 11, 18], [0, 15], [0, 9, 16], [0]] | |||
| expected_offsets_limit = [[7, 10, 18, 19], [15, 18], [9, 16, 17], [0]] | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['token']).tolist() | |||
| tokens.append(token) | |||
| np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) | |||
| np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) | |||
| count += 1 | |||
| logger.info("The out tokens is : {}".format(tokens)) | |||
| assert unicode_script_strs == tokens | |||
| def test_unicode_script_tokenizer_with_offsets2(): | |||
| """ | |||
| Test UnicodeScriptTokenizer when para keep_whitespace=True and with_offsets=True | |||
| """ | |||
| unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"], | |||
| ["北京欢迎您", "!"], | |||
| ["我喜欢", "English", "!"], | |||
| [" "]] | |||
| dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) | |||
| tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True) | |||
| dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], | |||
| columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer) | |||
| tokens = [] | |||
| expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]] | |||
| expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17], [2]] | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['token']).tolist() | |||
| tokens.append(token) | |||
| np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) | |||
| np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) | |||
| count += 1 | |||
| logger.info("The out tokens is :", tokens) | |||
| assert unicode_script_strs2 == tokens | |||
| def test_case_fold(): | |||
| """ | |||
| Test CaseFold | |||
| """ | |||
| expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "] | |||
| dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) | |||
| op = text.CaseFold() | |||
| dataset = dataset.map(operations=op) | |||
| lower_strs = [] | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['text']).tolist() | |||
| lower_strs.append(token) | |||
| assert lower_strs == expect_strs | |||
| def test_normalize_utf8(): | |||
| """ | |||
| Test NormalizeUTF8 | |||
| """ | |||
| def normalize(normalize_form): | |||
| dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False) | |||
| normalize = text.NormalizeUTF8(normalize_form=normalize_form) | |||
| dataset = dataset.map(operations=normalize) | |||
| out_bytes = [] | |||
| out_texts = [] | |||
| for i in dataset.create_dict_iterator(): | |||
| out_bytes.append(i['text']) | |||
| out_texts.append(text.to_str(i['text']).tolist()) | |||
| logger.info("The out bytes is : ", out_bytes) | |||
| logger.info("The out texts is: ", out_texts) | |||
| return out_bytes | |||
expect_normalize_data = [
| # NFC | |||
| [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87', | |||
| b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'], | |||
| # NFKC | |||
| [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87', | |||
| b'fi', b'25', b'\xe1\xb9\xa9'], | |||
| # NFD | |||
| [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87', | |||
| b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'], | |||
| # NFKD | |||
| [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87', | |||
| b'fi', b'25', b's\xcc\xa3\xcc\x87'] | |||
| ] | |||
assert normalize(text.utils.NormalizeForm.NFC) == expect_normalize_data[0]
assert normalize(text.utils.NormalizeForm.NFKC) == expect_normalize_data[1]
assert normalize(text.utils.NormalizeForm.NFD) == expect_normalize_data[2]
assert normalize(text.utils.NormalizeForm.NFKD) == expect_normalize_data[3]
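# Cross-check sketch for the NormalizeUTF8 expectations: Python's standard
# unicodedata module implements the same normalization forms, so the opaque byte
# strings above can be reproduced from the readable characters they encode
# (e.g. the ligature 'ﬁ' and the superscript in '2⁵'). Illustrative only.
import unicodedata

def normalize_bytes(s, form):  # form is one of 'NFC', 'NFKC', 'NFD', 'NFKD'
    return unicodedata.normalize(form, s).encode('utf-8')

# normalize_bytes('ﬁ', 'NFKC') == b'fi' and normalize_bytes('2⁵', 'NFKC') == b'25'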
| def test_regex_replace(): | |||
| """ | |||
| Test RegexReplace | |||
| """ | |||
| def regex_replace(first, last, expect_str, pattern, replace): | |||
| dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False) | |||
| if first > 1: | |||
| dataset = dataset.skip(first - 1) | |||
| if last >= first: | |||
| dataset = dataset.take(last - first + 1) | |||
| replace_op = text.RegexReplace(pattern, replace) | |||
| dataset = dataset.map(operations=replace_op) | |||
| out_text = [] | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['text']).tolist() | |||
| out_text.append(token) | |||
| logger.info("Out:", out_text) | |||
| logger.info("Exp:", expect_str) | |||
| assert expect_str == out_text | |||
| regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_') | |||
| regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "") | |||
| regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "") | |||
| regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "") | |||
| def test_regex_tokenizer_default(): | |||
| """ | |||
| Test RegexTokenizer | |||
| """ | |||
| def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern): | |||
| dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False) | |||
| if first > 1: | |||
| dataset = dataset.skip(first - 1) | |||
| if last >= first: | |||
| dataset = dataset.take(last - first + 1) | |||
| tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern) | |||
| dataset = dataset.map(operations=tokenizer_op) | |||
| out_text = [] | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['text']).tolist() | |||
| np.testing.assert_array_equal(token, expect_str[count]) | |||
| count += 1 | |||
| out_text.append(token) | |||
| logger.info("Out:", out_text) | |||
| logger.info("Exp:", expect_str) | |||
| regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "") | |||
| regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+") | |||
| regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}") | |||
| regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+") | |||
| regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "") | |||
| regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "") | |||
| def test_regex_tokenizer_with_offsets(): | |||
| """ | |||
| Test RegexTokenizer | |||
| """ | |||
| def regex_tokenizer(first, last, expect_str, expected_offsets_start, expected_offsets_limit, delim_pattern, | |||
| keep_delim_pattern): | |||
| dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False) | |||
| if first > 1: | |||
| dataset = dataset.skip(first - 1) | |||
| if last >= first: | |||
| dataset = dataset.take(last - first + 1) | |||
| tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True) | |||
| dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], | |||
| columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op) | |||
| out_text = [] | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['token']).tolist() | |||
| np.testing.assert_array_equal(token, expect_str[count]) | |||
| np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) | |||
| np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) | |||
| count += 1 | |||
| out_text.append(token) | |||
| logger.info("Out:", out_text) | |||
| logger.info("Exp:", expect_str) | |||
| regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], [[0, 8, 11]], [[7, 10, 20]], "\\s+", "") | |||
| regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], [[0, 7, 8, 10, 11]], [[7, 8, 10, 11, 20]], | |||
| "\\s+", "\\s+") | |||
| regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], [[0, 3, 6, 9, 12, 15]], | |||
| [[3, 6, 9, 12, 15, 35]], r"\p{Han}", r"\p{Han}") | |||
| regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], [[0, 2, 6, 8]], [[2, 6, 8, 13]], | |||
| r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+") | |||
| regex_tokenizer(3, 3, [['12', '36']], [[0, 6]], [[2, 8]], r"[\p{P}|\p{S}]+", "") | |||
| regex_tokenizer(3, 3, [['¥+', '¥=?']], [[2, 8]], [[6, 13]], r"[\p{N}]+", "") | |||
| if __name__ == '__main__': | |||
| test_unicode_char_tokenizer_default() | |||
| test_unicode_char_tokenizer_with_offsets() | |||
| test_whitespace_tokenizer_default() | |||
| test_whitespace_tokenizer_with_offsets() | |||
| test_unicode_script_tokenizer_default() | |||
| test_unicode_script_tokenizer_default2() | |||
| test_unicode_script_tokenizer_with_offsets() | |||
| test_unicode_script_tokenizer_with_offsets2() | |||
| test_case_fold() | |||
| test_normalize_utf8() | |||
| test_regex_replace() | |||
| test_regex_tokenizer_default() | |||
| test_regex_tokenizer_with_offsets() | |||
| @@ -0,0 +1,160 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Testing WordpieceTokenizer op in DE | |||
| """ | |||
| import numpy as np | |||
| import mindspore.dataset as ds | |||
| from mindspore import log as logger | |||
| import mindspore.dataset.text as text | |||
| WORDPIECE_TOKENIZER_FILE = "../data/dataset/testTokenizerData/wordpiece_tokenizer.txt" | |||
| vocab_english = [ | |||
| "book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the" | |||
| ] | |||
| vocab_chinese = [ | |||
| "我", '最', '喜', '欢', '的', '书', '是', '霍', '乱', '时', '期', '爱', '情' | |||
| ] | |||
| vocab_mix = vocab_chinese + vocab_english | |||
| test_paras = [ | |||
| dict( | |||
| first=1, | |||
| last=10, | |||
| expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], | |||
| ['era'], ['[UNK]']], | |||
| expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0]], | |||
| expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4]], | |||
| vocab_list=vocab_english | |||
| ), | |||
| dict( | |||
| first=1, | |||
| last=10, | |||
| expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], | |||
| ['era'], ['what']], | |||
| expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0]], | |||
| expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4]], | |||
| vocab_list=vocab_english, | |||
| unknown_token="" | |||
| ), | |||
| dict( | |||
| first=1, | |||
| last=10, | |||
| expect_str=[['my'], ['[UNK]'], ['book'], ['is'], ['love'], ['[UNK]'], ['the'], ['[UNK]'], ['era'], ['[UNK]']], | |||
| expected_offsets_start=[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], | |||
| expected_offsets_limit=[[2], [5], [4], [2], [4], [5], [3], [5], [3], [4]], | |||
| vocab_list=vocab_english, | |||
| max_bytes_per_token=4 | |||
| ), | |||
| dict( | |||
| first=11, | |||
| last=25, | |||
| expect_str=[['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'], | |||
| ['[UNK]']], | |||
| expected_offsets_start=[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], | |||
| expected_offsets_limit=[[3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3]], | |||
| vocab_list=vocab_chinese, | |||
| ), | |||
| dict( | |||
| first=25, | |||
| last=25, | |||
| expect_str=[['您']], | |||
| expected_offsets_start=[[0]], | |||
| expected_offsets_limit=[[3]], | |||
| vocab_list=vocab_chinese, | |||
| unknown_token="" | |||
| ), | |||
| dict( | |||
| first=1, | |||
| last=25, | |||
| expect_str=[ | |||
| ['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], ['era'], | |||
| ['[UNK]'], | |||
| ['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'], | |||
| ['[UNK]']], | |||
| expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0], | |||
| [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], | |||
| expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4], | |||
| [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3]], | |||
| vocab_list=vocab_mix, | |||
| ), | |||
| ] | |||
| def check_wordpiece_tokenizer_default(first, last, expect_str, expected_offsets_start, expected_offsets_limit, | |||
| vocab_list, unknown_token='[UNK]', max_bytes_per_token=100): | |||
| dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False) | |||
| if first > 1: | |||
| dataset = dataset.skip(first - 1) | |||
| if last >= first: | |||
| dataset = dataset.take(last - first + 1) | |||
| vocab = text.Vocab.from_list(vocab_list) | |||
| tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token, | |||
| max_bytes_per_token=max_bytes_per_token) | |||
| dataset = dataset.map(operations=tokenizer_op) | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['text']) | |||
| logger.info("Out:", token) | |||
| logger.info("Exp:", expect_str[count]) | |||
| np.testing.assert_array_equal(token, expect_str[count]) | |||
| count = count + 1 | |||
| def check_wordpiece_tokenizer_with_offsets(first, last, expect_str, expected_offsets_start, expected_offsets_limit, | |||
| vocab_list, unknown_token='[UNK]', max_bytes_per_token=100): | |||
| dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False) | |||
| if first > 1: | |||
| dataset = dataset.skip(first - 1) | |||
| if last >= first: | |||
| dataset = dataset.take(last - first + 1) | |||
| vocab = text.Vocab.from_list(vocab_list) | |||
| tokenizer_op = text.WordpieceTokenizer(vocab=vocab, with_offsets=True, unknown_token=unknown_token, | |||
| max_bytes_per_token=max_bytes_per_token) | |||
| dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'], | |||
| columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op) | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| token = text.to_str(i['token']) | |||
| logger.info("Out:", token) | |||
| logger.info("Exp:", expect_str[count]) | |||
| np.testing.assert_array_equal(token, expect_str[count]) | |||
| np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count]) | |||
| np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count]) | |||
| count = count + 1 | |||
| def test_wordpiece_tokenizer_default(): | |||
| """ | |||
Test WordpieceTokenizer when with_offsets=False
| """ | |||
| for paras in test_paras: | |||
| check_wordpiece_tokenizer_default(**paras) | |||
| def test_wordpiece_tokenizer_with_offsets(): | |||
| """ | |||
Test WordpieceTokenizer when with_offsets=True
| """ | |||
| for paras in test_paras: | |||
| check_wordpiece_tokenizer_with_offsets(**paras) | |||
| if __name__ == '__main__': | |||
| test_wordpiece_tokenizer_default() | |||
| test_wordpiece_tokenizer_with_offsets() | |||
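# Reading note for the wordpiece offset tables above: the '##' suffix indicator
# marks a continuation subword and is not part of the original text, so each
# subword's span is measured on the piece with the indicator stripped (byte
# offsets within the source word). A sketch of that derivation for in-vocabulary
# words; it does not cover the '[UNK]' rows, whose span covers the whole unknown
# word. Helper name is illustrative only.
def wordpiece_spans(subwords, suffix_indicator='##'):
    spans, pos = [], 0
    for sw in subwords:
        piece = sw[len(suffix_indicator):] if sw.startswith(suffix_indicator) else sw
        start = pos
        pos += len(piece.encode('utf-8'))
        spans.append((start, pos))
    return spans

# wordpiece_spans(['favor', '##ite']) -> [(0, 5), (5, 8)], matching the first test case above.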
| @@ -1,233 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Testing UnicodeCharTokenizer op in DE | |||
| """ | |||
| import numpy as np | |||
| import mindspore.dataset as ds | |||
| from mindspore import log as logger | |||
| import mindspore.dataset.text as nlp | |||
| DATA_FILE = "../data/dataset/testTokenizerData/1.txt" | |||
| NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt" | |||
| REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt" | |||
| REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt" | |||
| def split_by_unicode_char(input_strs): | |||
| """ | |||
| Split utf-8 strings to unicode characters | |||
| """ | |||
| out = [] | |||
| for s in input_strs: | |||
| out.append([c for c in s]) | |||
| return out | |||
| def test_unicode_char_tokenizer(): | |||
| """ | |||
| Test UnicodeCharTokenizer | |||
| """ | |||
| input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", " ") | |||
| dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) | |||
| tokenizer = nlp.UnicodeCharTokenizer() | |||
| dataset = dataset.map(operations=tokenizer) | |||
| tokens = [] | |||
| for i in dataset.create_dict_iterator(): | |||
| text = nlp.to_str(i['text']).tolist() | |||
| tokens.append(text) | |||
| logger.info("The out tokens is : {}".format(tokens)) | |||
| assert split_by_unicode_char(input_strs) == tokens | |||
| def test_whitespace_tokenizer(): | |||
| """ | |||
| Test WhitespaceTokenizer | |||
| """ | |||
| whitespace_strs = [["Welcome", "to", "Beijing!"], | |||
| ["北京欢迎您!"], | |||
| ["我喜欢English!"], | |||
| [""]] | |||
| dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) | |||
| tokenizer = nlp.WhitespaceTokenizer() | |||
| dataset = dataset.map(operations=tokenizer) | |||
| tokens = [] | |||
| for i in dataset.create_dict_iterator(): | |||
| text = nlp.to_str(i['text']).tolist() | |||
| tokens.append(text) | |||
| logger.info("The out tokens is : {}".format(tokens)) | |||
| assert whitespace_strs == tokens | |||
| def test_unicode_script_tokenizer(): | |||
| """ | |||
| Test UnicodeScriptTokenizer when para keep_whitespace=False | |||
| """ | |||
| unicode_script_strs = [["Welcome", "to", "Beijing", "!"], | |||
| ["北京欢迎您", "!"], | |||
| ["我喜欢", "English", "!"], | |||
| [""]] | |||
| dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) | |||
| tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=False) | |||
| dataset = dataset.map(operations=tokenizer) | |||
| tokens = [] | |||
| for i in dataset.create_dict_iterator(): | |||
| text = nlp.to_str(i['text']).tolist() | |||
| tokens.append(text) | |||
| logger.info("The out tokens is : {}".format(tokens)) | |||
| assert unicode_script_strs == tokens | |||
| def test_unicode_script_tokenizer2(): | |||
| """ | |||
| Test UnicodeScriptTokenizer when para keep_whitespace=True | |||
| """ | |||
| unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"], | |||
| ["北京欢迎您", "!"], | |||
| ["我喜欢", "English", "!"], | |||
| [" "]] | |||
| dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) | |||
| tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=True) | |||
| dataset = dataset.map(operations=tokenizer) | |||
| tokens = [] | |||
| for i in dataset.create_dict_iterator(): | |||
| text = nlp.to_str(i['text']).tolist() | |||
| tokens.append(text) | |||
| logger.info("The out tokens is :", tokens) | |||
| assert unicode_script_strs2 == tokens | |||
| def test_case_fold(): | |||
| """ | |||
| Test CaseFold | |||
| """ | |||
| expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "] | |||
| dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) | |||
| op = nlp.CaseFold() | |||
| dataset = dataset.map(operations=op) | |||
| lower_strs = [] | |||
| for i in dataset.create_dict_iterator(): | |||
| text = nlp.to_str(i['text']).tolist() | |||
| lower_strs.append(text) | |||
| assert lower_strs == expect_strs | |||
| def test_normalize_utf8(): | |||
| """ | |||
| Test NormalizeUTF8 | |||
| """ | |||
| def normalize(normalize_form): | |||
| dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False) | |||
| normalize = nlp.NormalizeUTF8(normalize_form=normalize_form) | |||
| dataset = dataset.map(operations=normalize) | |||
| out_bytes = [] | |||
| out_texts = [] | |||
| for i in dataset.create_dict_iterator(): | |||
| out_bytes.append(i['text']) | |||
| out_texts.append(nlp.to_str(i['text']).tolist()) | |||
| logger.info("The out bytes is : ", out_bytes) | |||
| logger.info("The out texts is: ", out_texts) | |||
| return out_bytes | |||
| expect_normalize_data = [ | |||
| # NFC | |||
| [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87', | |||
| b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'], | |||
| # NFKC | |||
| [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87', | |||
| b'fi', b'25', b'\xe1\xb9\xa9'], | |||
| # NFD | |||
| [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87', | |||
| b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'], | |||
| # NFKD | |||
| [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87', | |||
| b'fi', b'25', b's\xcc\xa3\xcc\x87'] | |||
| ] | |||
| assert normalize(nlp.utils.NormalizeForm.NFC) == expect_normalize_data[0] | |||
| assert normalize(nlp.utils.NormalizeForm.NFKC) == expect_normalize_data[1] | |||
| assert normalize(nlp.utils.NormalizeForm.NFD) == expect_normalize_data[2] | |||
| assert normalize(nlp.utils.NormalizeForm.NFKD) == expect_normalize_data[3] | |||
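| # Note added for clarity (not in the original test): the four expected rows above | |||
| # follow the standard Unicode normalization forms. For the first input character | |||
| # (s with dot below and dot above), NFC/NFKC compose it to the single code point | |||
| # U+1E69 (b'\xe1\xb9\xa9'), while NFD/NFKD decompose it into 's' plus the combining | |||
| # dots (b's\xcc\xa3\xcc\x87'). The compatibility forms (NFKC/NFKD) additionally fold | |||
| # characters such as the U+FB01 'fi' ligature to 'fi' and the superscript five to '5'. | |||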
| def test_regex_replace(): | |||
| """ | |||
| Test RegexReplace | |||
| """ | |||
| def regex_replace(first, last, expect_str, pattern, replace): | |||
| dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False) | |||
| if first > 1: | |||
| dataset = dataset.skip(first - 1) | |||
| if last >= first: | |||
| dataset = dataset.take(last - first + 1) | |||
| replace_op = nlp.RegexReplace(pattern, replace) | |||
| dataset = dataset.map(operations=replace_op) | |||
| out_text = [] | |||
| for i in dataset.create_dict_iterator(): | |||
| text = nlp.to_str(i['text']).tolist() | |||
| out_text.append(text) | |||
| logger.info("Out:", out_text) | |||
| logger.info("Exp:", expect_str) | |||
| assert expect_str == out_text | |||
| regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_') | |||
| regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "") | |||
| regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "") | |||
| regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "") | |||
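| # Note added for clarity (not in the original test): first/last select a 1-based, | |||
| # inclusive range of lines from the fixture file via skip()/take(), so e.g. | |||
| # regex_replace(3, 5, ...) above only checks lines 3..5 of regex_replace.txt. | |||
| # test_regex_tokenizer below reuses the same line-selection idiom. | |||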
| def test_regex_tokenizer(): | |||
| """ | |||
| Test RegexTokenizer | |||
| """ | |||
| def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern): | |||
| dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False) | |||
| if first > 1: | |||
| dataset = dataset.skip(first - 1) | |||
| if last >= first: | |||
| dataset = dataset.take(last - first + 1) | |||
| tokenizer_op = nlp.RegexTokenizer(delim_pattern, keep_delim_pattern) | |||
| dataset = dataset.map(operations=tokenizer_op) | |||
| out_text = [] | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| text = nlp.to_str(i['text']).tolist() | |||
| np.testing.assert_array_equal(text, expect_str[count]) | |||
| count += 1 | |||
| out_text.append(text) | |||
| logger.info("Out:", out_text) | |||
| logger.info("Exp:", expect_str) | |||
| regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "") | |||
| regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+") | |||
| regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}") | |||
| regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+") | |||
| regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "") | |||
| regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "") | |||
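| # Note added for clarity (not in the original tests): the patterns above use ICU | |||
| # Unicode property classes: \p{Ll} lowercase letters, \p{Cc}/\p{Cf} control and | |||
| # format characters, \p{Han} CJK ideographs, \p{P} punctuation, \p{S} symbols, | |||
| # \p{N} numbers. For RegexTokenizer, delim_pattern is what the string is split on, | |||
| # while keep_delim_pattern decides which matched delimiters are kept as tokens of | |||
| # their own (an empty string drops all delimiters). | |||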
| if __name__ == '__main__': | |||
| test_unicode_char_tokenizer() | |||
| test_whitespace_tokenizer() | |||
| test_unicode_script_tokenizer() | |||
| test_unicode_script_tokenizer2() | |||
| test_case_fold() | |||
| test_normalize_utf8() | |||
| test_regex_replace() | |||
| test_regex_tokenizer() | |||
| @@ -1,113 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Testing WordpieceTokenizer op in DE | |||
| """ | |||
| import numpy as np | |||
| import mindspore.dataset as ds | |||
| from mindspore import log as logger | |||
| import mindspore.dataset.text as nlp | |||
| WORDPIECE_TOKENIZER_FILE = "../data/dataset/testTokenizerData/wordpiece_tokenizer.txt" | |||
| vocab_english = [ | |||
| "book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the" | |||
| ] | |||
| vocab_chinese = [ | |||
| "我", '最', '喜', '欢', '的', '书', '是', '霍', '乱', '时', '期', '爱', '情' | |||
| ] | |||
| vocab_mix = vocab_chinese + vocab_english | |||
| test_paras = [ | |||
| dict( | |||
| first=1, | |||
| last=10, | |||
| expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], | |||
| ['era'], ['[UNK]']], | |||
| vocab_list=vocab_english | |||
| ), | |||
| dict( | |||
| first=1, | |||
| last=10, | |||
| expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], | |||
| ['era'], ['what']], | |||
| vocab_list=vocab_english, | |||
| unknown_token="" | |||
| ), | |||
| dict( | |||
| first=1, | |||
| last=10, | |||
| expect_str=[['my'], ['[UNK]'], ['book'], ['is'], ['love'], ['[UNK]'], ['the'], ['[UNK]'], ['era'], ['[UNK]']], | |||
| vocab_list=vocab_english, | |||
| max_bytes_per_token=4 | |||
| ), | |||
| dict( | |||
| first=11, | |||
| last=25, | |||
| expect_str=[['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'], | |||
| ['[UNK]']], | |||
| vocab_list=vocab_chinese, | |||
| ), | |||
| dict( | |||
| first=25, | |||
| last=25, | |||
| expect_str=[['您']], | |||
| vocab_list=vocab_chinese, | |||
| unknown_token="" | |||
| ), | |||
| dict( | |||
| first=1, | |||
| last=25, | |||
| expect_str=[ | |||
| ['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], ['era'], | |||
| ['[UNK]'], | |||
| ['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'], | |||
| ['[UNK]']], | |||
| vocab_list=vocab_mix, | |||
| ), | |||
| ] | |||
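| # Hedged reference sketch (added for illustration, not part of the original test): | |||
| # a plain-Python greedy longest-match WordPiece, useful for reasoning about the | |||
| # expected outputs in test_paras above. Edge-case handling may differ from the real | |||
| # WordpieceTokenizerOp; this is only an approximation. | |||
| def wordpiece_reference(token, vocab, unknown_token='[UNK]', max_bytes_per_token=100): | |||
|     # overly long inputs map to unknown_token (or pass through unchanged if it is '') | |||
|     if len(token.encode('utf-8')) > max_bytes_per_token: | |||
|         return [unknown_token] if unknown_token else [token] | |||
|     pieces, start = [], 0 | |||
|     while start < len(token): | |||
|         end = len(token) | |||
|         while end > start: | |||
|             piece = token[start:end] if start == 0 else '##' + token[start:end] | |||
|             if piece in vocab: | |||
|                 pieces.append(piece)  # longest matching piece wins | |||
|                 break | |||
|             end -= 1 | |||
|         else: | |||
|             # no sub-piece matched at this position | |||
|             return [unknown_token] if unknown_token else [token] | |||
|         start = end | |||
|     return pieces | |||
| # e.g. wordpiece_reference('favorite', vocab_english) == ['favor', '##ite'] | |||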
| def check_wordpiece_tokenizer(first, last, expect_str, vocab_list, unknown_token='[UNK]', max_bytes_per_token=100): | |||
| dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False) | |||
| if first > 1: | |||
| dataset = dataset.skip(first - 1) | |||
| if last >= first: | |||
| dataset = dataset.take(last - first + 1) | |||
| vocab = nlp.Vocab.from_list(vocab_list) | |||
| tokenizer_op = nlp.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token, | |||
| max_bytes_per_token=max_bytes_per_token) | |||
| dataset = dataset.map(operations=tokenizer_op) | |||
| count = 0 | |||
| for i in dataset.create_dict_iterator(): | |||
| text = nlp.to_str(i['text']) | |||
| logger.info("Out:", text) | |||
| logger.info("Exp:", expect_str[count]) | |||
| np.testing.assert_array_equal(text, expect_str[count]) | |||
| count = count + 1 | |||
| def test_wordpiece_tokenizer(): | |||
| """ | |||
| Test WordpieceTokenizer | |||
| """ | |||
| for paras in test_paras: | |||
| check_wordpiece_tokenizer(**paras) | |||
| if __name__ == '__main__': | |||
| test_wordpiece_tokenizer() | |||