/** * Copyright 2020-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include "minddata/dataset/include/text.h" #include "minddata/dataset/text/ir/kernels/text_ir.h" namespace mindspore { namespace dataset { // Transform operations for text. namespace text { // FUNCTIONS TO CREATE TEXT OPERATIONS // (In alphabetical order) #ifndef _WIN32 // BasicTokenizer BasicTokenizer::BasicTokenizer(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token, bool with_offsets) : lower_case_(lower_case), keep_whitespace_(keep_whitespace), normalize_form_(normalize_form), preserve_unused_token_(preserve_unused_token), with_offsets_(with_offsets) {} std::shared_ptr BasicTokenizer::Parse() { return std::make_shared(lower_case_, keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_); } // BertTokenizer BertTokenizer::BertTokenizer(const std::shared_ptr &vocab, const std::string &suffix_indicator, int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token, bool with_offsets) : vocab_(vocab), suffix_indicator_(suffix_indicator), max_bytes_per_token_(max_bytes_per_token), unknown_token_(unknown_token), lower_case_(lower_case), keep_whitespace_(keep_whitespace), normalize_form_(normalize_form), preserve_unused_token_(preserve_unused_token), with_offsets_(with_offsets) {} std::shared_ptr BertTokenizer::Parse() { return std::make_shared(vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, lower_case_, keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_); } // CaseFold CaseFold::CaseFold() {} std::shared_ptr CaseFold::Parse() { return std::make_shared(); } #endif // JiebaTokenizer JiebaTokenizer::JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode, bool with_offsets) : hmm_path_(hmm_path), mp_path_(mp_path), mode_(mode), with_offsets_(with_offsets) {} std::shared_ptr JiebaTokenizer::Parse() { std::shared_ptr jieba_tokenizer = std::make_shared(hmm_path_, mp_path_, mode_, with_offsets_); for (auto &word : words_list_) { Status rc = jieba_tokenizer->AddWord(word.first, word.second); if (rc.IsError()) { MS_LOG(ERROR) << rc; return {}; } } return jieba_tokenizer; } Status JiebaTokenizer::AddWord(const std::string &word, int64_t freq) { if (word.empty()) { std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided."; MS_LOG(ERROR) << err_msg; RETURN_STATUS_SYNTAX_ERROR(err_msg); } if (freq < 0) { std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0."; MS_LOG(ERROR) << err_msg; RETURN_STATUS_SYNTAX_ERROR(err_msg); } words_list_.emplace_back(word, freq); return Status::OK(); } // Lookup Lookup::Lookup(const std::shared_ptr &vocab, const std::optional &unknown_token, const std::string &data_type) : vocab_(vocab), unknown_token_(unknown_token), data_type_(data_type) {} std::shared_ptr Lookup::Parse() { return std::make_shared(vocab_, unknown_token_, data_type_); } // Ngram Ngram::Ngram(const std::vector &ngrams, const std::pair &left_pad, const std::pair &right_pad, const std::string &separator) : ngrams_(ngrams), left_pad_(left_pad), right_pad_(right_pad), separator_(separator) {} std::shared_ptr Ngram::Parse() { return std::make_shared(ngrams_, left_pad_, right_pad_, separator_); } #ifndef _WIN32 // NormalizeUTF8 NormalizeUTF8::NormalizeUTF8(NormalizeForm normalize_form) : normalize_form_(normalize_form) {} std::shared_ptr NormalizeUTF8::Parse() { return std::make_shared(normalize_form_); } // RegexReplace RegexReplace::RegexReplace(std::string pattern, std::string replace, bool replace_all) : pattern_(pattern), replace_(replace), replace_all_(replace_all) {} std::shared_ptr RegexReplace::Parse() { return std::make_shared(pattern_, replace_, replace_all_); } // RegexTokenizer RegexTokenizer::RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern, bool with_offsets) : delim_pattern_(delim_pattern), keep_delim_pattern_(keep_delim_pattern), with_offsets_(with_offsets) {} std::shared_ptr RegexTokenizer::Parse() { return std::make_shared(delim_pattern_, keep_delim_pattern_, with_offsets_); } #endif // SentencePieceTokenizer SentencePieceTokenizer::SentencePieceTokenizer(const std::shared_ptr &vocab, SPieceTokenizerOutType out_type) : vocab_(vocab), out_type_(out_type) {} SentencePieceTokenizer::SentencePieceTokenizer(const std::string &vocab_path, SPieceTokenizerOutType out_type) : vocab_path_(vocab_path), out_type_(out_type) {} std::shared_ptr SentencePieceTokenizer::Parse() { if (vocab_ != nullptr) { return std::make_shared(vocab_, out_type_); } else { return std::make_shared(vocab_path_, out_type_); } } // SlidingWindow SlidingWindow::SlidingWindow(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {} std::shared_ptr SlidingWindow::Parse() { return std::make_shared(width_, axis_); } // ToNumber ToNumber::ToNumber(const std::string &data_type) : data_type_(data_type) {} std::shared_ptr ToNumber::Parse() { return std::make_shared(data_type_); } // TruncateSequencePair TruncateSequencePair::TruncateSequencePair(int32_t max_length) : max_length_(max_length) {} std::shared_ptr TruncateSequencePair::Parse() { return std::make_shared(max_length_); } // UnicodeCharTokenizer UnicodeCharTokenizer::UnicodeCharTokenizer(bool with_offsets) : with_offsets_(with_offsets) {} std::shared_ptr UnicodeCharTokenizer::Parse() { return std::make_shared(with_offsets_); } #ifndef _WIN32 // UnicodeScriptTokenizer UnicodeScriptTokenizer::UnicodeScriptTokenizer(bool keep_whitespace, bool with_offsets) : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {} std::shared_ptr UnicodeScriptTokenizer::Parse() { return std::make_shared(keep_whitespace_, with_offsets_); } // WhitespaceTokenizer WhitespaceTokenizer::WhitespaceTokenizer(bool with_offsets) : with_offsets_(with_offsets) {} std::shared_ptr WhitespaceTokenizer::Parse() { return std::make_shared(with_offsets_); } #endif } // namespace text } // namespace dataset } // namespace mindspore