/** * Copyright 2020-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include "minddata/dataset/include/text.h" namespace mindspore { namespace dataset { // Transform operations for text. namespace text { // FUNCTIONS TO CREATE TEXT OPERATIONS // (In alphabetical order) #ifndef _WIN32 std::shared_ptr BasicTokenizer(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token, bool with_offsets) { auto op = std::make_shared(lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets); return op->ValidateParams() ? op : nullptr; } std::shared_ptr BertTokenizer(const std::shared_ptr &vocab, const std::string &suffix_indicator, int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token, bool with_offsets) { auto op = std::make_shared(vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets); return op->ValidateParams() ? op : nullptr; } std::shared_ptr CaseFold() { auto op = std::make_shared(); return op->ValidateParams() ? op : nullptr; } #endif std::shared_ptr JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode, bool with_offsets) { auto op = std::make_shared(hmm_path, mp_path, mode, with_offsets); return op->ValidateParams() ? op : nullptr; } std::shared_ptr Lookup(const std::shared_ptr &vocab, const std::optional &unknown_token, const std::string &data_type) { auto op = std::make_shared(vocab, unknown_token, data_type); return op->ValidateParams() ? op : nullptr; } std::shared_ptr Ngram(const std::vector &ngrams, const std::pair &left_pad, const std::pair &right_pad, const std::string &separator) { auto op = std::make_shared(ngrams, left_pad, right_pad, separator); return op->ValidateParams() ? op : nullptr; } #ifndef _WIN32 std::shared_ptr NormalizeUTF8(NormalizeForm normalize_form) { auto op = std::make_shared(normalize_form); return op->ValidateParams() ? op : nullptr; } std::shared_ptr RegexReplace(std::string pattern, std::string replace, bool replace_all) { auto op = std::make_shared(pattern, replace, replace_all); return op->ValidateParams() ? op : nullptr; } std::shared_ptr RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern, bool with_offsets) { auto op = std::make_shared(delim_pattern, keep_delim_pattern, with_offsets); return op->ValidateParams() ? op : nullptr; } #endif std::shared_ptr SentencePieceTokenizer( const std::shared_ptr &vocab, SPieceTokenizerOutType out_type) { auto op = std::make_shared(vocab, out_type); return op->ValidateParams() ? op : nullptr; } std::shared_ptr SentencePieceTokenizer(const std::string &vocab_path, SPieceTokenizerOutType out_type) { auto op = std::make_shared(vocab_path, out_type); return op->ValidateParams() ? op : nullptr; } std::shared_ptr SlidingWindow(const int32_t width, const int32_t axis) { auto op = std::make_shared(width, axis); return op->ValidateParams() ? op : nullptr; } std::shared_ptr ToNumber(const std::string &data_type) { auto op = std::make_shared(data_type); return op->ValidateParams() ? op : nullptr; } std::shared_ptr TruncateSequencePair(int32_t max_length) { auto op = std::make_shared(max_length); return op->ValidateParams() ? op : nullptr; } std::shared_ptr UnicodeCharTokenizer(bool with_offsets) { auto op = std::make_shared(with_offsets); return op->ValidateParams() ? op : nullptr; } #ifndef _WIN32 std::shared_ptr UnicodeScriptTokenizer(bool keep_whitespace, bool with_offsets) { auto op = std::make_shared(keep_whitespace, with_offsets); return op->ValidateParams() ? op : nullptr; } std::shared_ptr WhitespaceTokenizer(bool with_offsets) { auto op = std::make_shared(with_offsets); return op->ValidateParams() ? op : nullptr; } #endif } // namespace text } // namespace dataset } // namespace mindspore