From: @tina_mengting_zhang
Reviewed-by:
Signed-off-by:
Tag: tags/v1.2.0-rc1
| @@ -92,11 +92,14 @@ add_dependencies(engine core) | |||
| add_dependencies(callback core) | |||
| add_dependencies(text core) | |||
| add_dependencies(text-kernels core) | |||
| add_dependencies(text-ir core) | |||
| add_dependencies(text-ir-kernels core) | |||
| add_dependencies(cpp-API core) | |||
| add_dependencies(engine-ir-datasetops core) | |||
| add_dependencies(engine-ir-datasetops-source core) | |||
| add_dependencies(engine-ir-cache core) | |||
| add_dependencies(kernels-ir core) | |||
| add_dependencies(kernels-ir-data core) | |||
| add_dependencies(kernels-ir-vision core) | |||
| if(ENABLE_ACL) | |||
| @@ -146,7 +149,10 @@ set(submodules | |||
| $<TARGET_OBJECTS:engine> | |||
| $<TARGET_OBJECTS:text> | |||
| $<TARGET_OBJECTS:text-kernels> | |||
| $<TARGET_OBJECTS:text-ir> | |||
| $<TARGET_OBJECTS:text-ir-kernels> | |||
| $<TARGET_OBJECTS:kernels-ir> | |||
| $<TARGET_OBJECTS:kernels-ir-data> | |||
| $<TARGET_OBJECTS:kernels-ir-vision> | |||
| ) | |||
| @@ -17,9 +17,9 @@ | |||
| #include "minddata/dataset/api/python/pybind_register.h" | |||
| #include "minddata/dataset/core/global_context.h" | |||
| #include "minddata/dataset/include/transforms.h" | |||
| #include "minddata/dataset/kernels/py_func_op.h" | |||
| #include "minddata/dataset/kernels/ir/data/transforms_ir.h" | |||
| #include "minddata/dataset/kernels/ir/vision/vision_ir.h" | |||
| namespace mindspore { | |||
| @@ -18,7 +18,7 @@ | |||
| #include "pybind11/stl.h" | |||
| #include "pybind11/stl_bind.h" | |||
| #include "minddata/dataset/api/python/pybind_register.h" | |||
| #include "minddata/dataset/include/text.h" | |||
| #include "minddata/dataset/text/ir/kernels/text_ir.h" | |||
| #include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h" | |||
| #include "minddata/dataset/text/sentence_piece_vocab.h" | |||
| #include "minddata/dataset/text/vocab.h" | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -30,10 +30,10 @@ | |||
| #include "pybind11/stl_bind.h" | |||
| #include "minddata/dataset/include/datasets.h" | |||
| #include "minddata/dataset/include/samplers.h" | |||
| #include "minddata/dataset/include/transforms.h" | |||
| #include "minddata/dataset/api/python/pybind_register.h" | |||
| #include "minddata/dataset/engine/ir/cache/pre_built_dataset_cache.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/source/csv_node.h" | |||
| #include "minddata/dataset/kernels/ir/data/transforms_ir.h" | |||
| #include "minddata/dataset/kernels/py_func_op.h" | |||
| namespace py = pybind11; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -17,30 +17,6 @@ | |||
| #include <unistd.h> | |||
| #include "minddata/dataset/include/text.h" | |||
| #ifndef _WIN32 | |||
| #include "minddata/dataset/text/kernels/basic_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/bert_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/case_fold_op.h" | |||
| #endif | |||
| #include "minddata/dataset/text/kernels/jieba_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/lookup_op.h" | |||
| #include "minddata/dataset/text/kernels/ngram_op.h" | |||
| #ifndef _WIN32 | |||
| #include "minddata/dataset/text/kernels/normalize_utf8_op.h" | |||
| #include "minddata/dataset/text/kernels/regex_replace_op.h" | |||
| #include "minddata/dataset/text/kernels/regex_tokenizer_op.h" | |||
| #endif | |||
| #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/sliding_window_op.h" | |||
| #include "minddata/dataset/text/kernels/to_number_op.h" | |||
| #include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h" | |||
| #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h" | |||
| #ifndef _WIN32 | |||
| #include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" | |||
| #endif | |||
| #include "minddata/dataset/core/data_type.h" | |||
| #include "minddata/dataset/util/path.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -174,426 +150,6 @@ std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offs | |||
| return op->ValidateParams() ? op : nullptr; | |||
| } | |||
| #endif | |||
| /* ####################################### Validator Functions ############################################ */ | |||
| // Helper function to validate a tokenizer dictionary/model file path parameter | |||
| Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file) { | |||
| if (tokenizer_file.empty()) { | |||
| std::string err_msg = tokenizer_name + ": tokenizer_file is not specified."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| Path file(tokenizer_file); | |||
| if (!file.Exists()) { | |||
| std::string err_msg = tokenizer_name + ": tokenizer_file: [" + tokenizer_file + "] is an invalid file path."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (access(tokenizer_file.c_str(), R_OK) == -1) { | |||
| std::string err_msg = tokenizer_name + ": No access to specified tokenizer path: " + tokenizer_file; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
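For context, the validator above enforces three things: the path is non-empty, it exists, and the current user can read it. A minimal standalone mirror of that logic using C++17 `<filesystem>` looks like this (a sketch, not MindSpore code; `ValidatePathReadable` is a hypothetical name):

```cpp
// Standalone mirror of ValidateTokenizerDirParam's checks; illustration only.
#include <filesystem>
#include <iostream>
#include <string>
#include <unistd.h>  // access(), R_OK (POSIX)

bool ValidatePathReadable(const std::string &name, const std::string &path) {
  if (path.empty()) {
    std::cerr << name << ": path is not specified.\n";
    return false;
  }
  if (!std::filesystem::exists(path)) {
    std::cerr << name << ": [" << path << "] does not exist.\n";
    return false;
  }
  if (access(path.c_str(), R_OK) == -1) {  // same readability probe as above
    std::cerr << name << ": no read access to " << path << "\n";
    return false;
  }
  return true;
}
```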
| // Helper functions to help validate data type passed by user | |||
| bool IsTypeNumeric(const std::string &data_type) { | |||
| if (data_type == "int8" || data_type == "uint8" || data_type == "int16" || data_type == "uint16" || | |||
| data_type == "int32" || data_type == "uint32" || data_type == "int64" || data_type == "uint64" || | |||
| data_type == "float16" || data_type == "float32" || data_type == "float64") | |||
| return true; | |||
| return false; | |||
| } | |||
| bool IsTypeBoolean(const std::string &data_type) { return data_type == "bool"; } | |||
| bool IsTypeString(const std::string &data_type) { return data_type == "string"; } | |||
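Equivalently, the numeric predicate can be written as a set lookup. A self-contained sketch (illustration only, not the patch's code) also shows how the predicates combine into the acceptance rule ToNumber uses below, namely numeric and not boolean:

```cpp
// Illustration: the same type predicates as a set lookup.
#include <iostream>
#include <set>
#include <string>

static bool IsTypeNumeric(const std::string &t) {
  static const std::set<std::string> kNumeric = {
      "int8",  "uint8",  "int16",   "uint16",  "int32", "uint32",
      "int64", "uint64", "float16", "float32", "float64"};
  return kNumeric.count(t) > 0;
}
static bool IsTypeBoolean(const std::string &t) { return t == "bool"; }

int main() {
  for (const std::string t : {"int32", "bool", "string"}) {
    std::cout << t << " -> " << (IsTypeNumeric(t) && !IsTypeBoolean(t)) << "\n";
  }
  // prints: int32 -> 1, bool -> 0, string -> 0
}
```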
| /* ####################################### Derived TensorOperation classes ################################# */ | |||
| // (In alphabetical order) | |||
| #ifndef _WIN32 | |||
| // BasicTokenizerOperation | |||
| BasicTokenizerOperation::BasicTokenizerOperation(bool lower_case, bool keep_whitespace, | |||
| const NormalizeForm normalize_form, bool preserve_unused_token, | |||
| bool with_offsets) | |||
| : lower_case_(lower_case), | |||
| keep_whitespace_(keep_whitespace), | |||
| normalize_form_(normalize_form), | |||
| preserve_unused_token_(preserve_unused_token), | |||
| with_offsets_(with_offsets) {} | |||
| Status BasicTokenizerOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> BasicTokenizerOperation::Build() { | |||
| std::shared_ptr<BasicTokenizerOp> tensor_op = std::make_shared<BasicTokenizerOp>( | |||
| lower_case_, keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_); | |||
| return tensor_op; | |||
| } | |||
| // BertTokenizerOperation | |||
| BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator, | |||
| int32_t max_bytes_per_token, const std::string &unknown_token, | |||
| bool lower_case, bool keep_whitespace, | |||
| const NormalizeForm normalize_form, bool preserve_unused_token, | |||
| bool with_offsets) | |||
| : vocab_(vocab), | |||
| suffix_indicator_(suffix_indicator), | |||
| max_bytes_per_token_(max_bytes_per_token), | |||
| unknown_token_(unknown_token), | |||
| lower_case_(lower_case), | |||
| keep_whitespace_(keep_whitespace), | |||
| normalize_form_(normalize_form), | |||
| preserve_unused_token_(preserve_unused_token), | |||
| with_offsets_(with_offsets) {} | |||
| BertTokenizerOperation::~BertTokenizerOperation() = default; | |||
| Status BertTokenizerOperation::ValidateParams() { | |||
| if (vocab_ == nullptr) { | |||
| std::string err_msg = "BertTokenizer: vocab object type is incorrect or null."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (max_bytes_per_token_ < 0) { | |||
| std::string err_msg = "BertTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " + | |||
| std::to_string(max_bytes_per_token_); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> BertTokenizerOperation::Build() { | |||
| std::shared_ptr<BertTokenizerOp> tensor_op = | |||
| std::make_shared<BertTokenizerOp>(vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, lower_case_, | |||
| keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_); | |||
| return tensor_op; | |||
| } | |||
| // CaseFoldOperation | |||
| Status CaseFoldOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> CaseFoldOperation::Build() { | |||
| std::shared_ptr<CaseFoldOp> tensor_op = std::make_shared<CaseFoldOp>(); | |||
| return tensor_op; | |||
| } | |||
| #endif | |||
| // JiebaTokenizerOperation | |||
| JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, | |||
| const JiebaMode &mode, bool with_offsets) | |||
| : hmm_path_(hmm_path), mp_path_(mp_path), mode_(mode), with_offsets_(with_offsets) {} | |||
| Status JiebaTokenizerOperation::ValidateParams() { | |||
| if (hmm_path_.empty()) { | |||
| std::string err_msg = "JiebaTokenizer: The dict of HMMSegment in cppjieba is not provided."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (mp_path_.empty()) { | |||
| std::string err_msg = "JiebaTokenizer: The dict of MPSegment in cppjieba is not provided."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_)); | |||
| RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", mp_path_)); | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() { | |||
| std::shared_ptr<JiebaTokenizerOp> tensor_op = | |||
| std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_); | |||
| for (auto &word : words_list_) { | |||
| Status rc = tensor_op->AddWord(word.first, word.second); | |||
| if (rc.IsError()) { | |||
| MS_LOG(ERROR) << rc; | |||
| return {}; | |||
| } | |||
| } | |||
| return tensor_op; | |||
| } | |||
| Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) { | |||
| if (word.empty()) { | |||
| std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (freq < 0) { | |||
| std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| words_list_.emplace_back(word, freq); | |||
| return Status::OK(); | |||
| } | |||
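Putting `AddWord` and `Build` together, a typical call sequence looks like the sketch below (in-tree assumption: compiled inside `mindspore::dataset` with the headers from this patch; the dictionary paths are placeholders and `JiebaMode::kMix` is taken from the public API):

```cpp
// Sketch: configure a JiebaTokenizerOperation, queue a custom word,
// then validate and build the underlying TensorOp.
std::shared_ptr<TensorOp> BuildJieba() {
  auto op = std::make_shared<text::JiebaTokenizerOperation>(
      "/path/to/hmm_model.utf8", "/path/to/jieba.dict.utf8",
      JiebaMode::kMix, /*with_offsets=*/false);
  // Words are queued in words_list_ and applied to the TensorOp in Build().
  if (op->AddWord("new_word", /*freq=*/0).IsError()) return nullptr;
  return op->ValidateParams().IsOk() ? op->Build() : nullptr;
}
```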
| // LookupOperation | |||
| LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token, | |||
| const std::string &data_type) | |||
| : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {} | |||
| LookupOperation::~LookupOperation() = default; | |||
| Status LookupOperation::ValidateParams() { | |||
| if (vocab_ == nullptr) { | |||
| std::string err_msg = "Lookup: vocab object type is incorrect or null."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (unknown_token_ != std::nullopt) { | |||
| default_id_ = vocab_->Lookup(*unknown_token_); | |||
| if (default_id_ == Vocab::kNoTokenExists) { | |||
| std::string err_msg = "Lookup: \"" + *unknown_token_ + "\" doesn't exist in vocab."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| } | |||
| if (!IsTypeNumeric(data_type_)) { | |||
| std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> LookupOperation::Build() { | |||
| std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, DataType(data_type_)); | |||
| return tensor_op; | |||
| } | |||
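A usage sketch for Lookup under the same in-tree assumption (the `Vocab` would be built elsewhere; `"<unk>"` is a placeholder token):

```cpp
// Sketch: Lookup with an out-of-vocabulary fallback token; ids emitted as int32.
std::shared_ptr<Vocab> vocab = /* built from a word list elsewhere */ nullptr;
auto lookup = std::make_shared<text::LookupOperation>(
    vocab, std::optional<std::string>("<unk>"), "int32");
// ValidateParams() resolves "<unk>" to default_id_ and rejects non-numeric
// data_type; only then is Build() safe to call.
std::shared_ptr<TensorOp> op =
    lookup->ValidateParams().IsOk() ? lookup->Build() : nullptr;
```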
| // NgramOperation | |||
| NgramOperation::NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad, | |||
| const std::pair<std::string, int32_t> &right_pad, const std::string &separator) | |||
| : ngrams_(ngrams), left_pad_(left_pad), right_pad_(right_pad), separator_(separator) {} | |||
| Status NgramOperation::ValidateParams() { | |||
| if (ngrams_.empty()) { | |||
| std::string err_msg = "Ngram : Container cannot be empty."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } else { | |||
| for (size_t i = 0; i < ngrams_.size(); ++i) { | |||
| if (ngrams_[i] <= 0) { | |||
| std::string err_msg = | |||
| "Ngram : The value of ngrams vector must be greater than 0: " + std::to_string(ngrams_[i]); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| } | |||
| } | |||
| if (left_pad_.second < 0) { | |||
| std::string err_msg = | |||
| "Ngram : The second parameter pad_width in left_pad vector must be greater than or equal to 0: " + | |||
| std::to_string(left_pad_.second); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (right_pad_.second < 0) { | |||
| std::string err_msg = | |||
| "Ngram : The second parameter pad_width in right_pad vector must be greater than or equal to 0: " + | |||
| std::to_string(right_pad_.second); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> NgramOperation::Build() { | |||
| int32_t l_len = left_pad_.second; | |||
| int32_t r_len = right_pad_.second; | |||
| std::string l_pad = left_pad_.first; | |||
| std::string r_pad = right_pad_.first; | |||
| std::shared_ptr<NgramOp> tensor_op = std::make_shared<NgramOp>(ngrams_, l_len, r_len, l_pad, r_pad, separator_); | |||
| return tensor_op; | |||
| } | |||
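For example, to emit 2-grams and 3-grams with one `"_"` pad token on each side, joined by a single space (a sketch under the same in-tree assumptions):

```cpp
// Sketch: construct, validate, and build an Ngram operation.
auto ngram = std::make_shared<text::NgramOperation>(
    std::vector<int32_t>{2, 3},
    std::make_pair(std::string("_"), 1),  // left_pad: {token, pad_width}
    std::make_pair(std::string("_"), 1),  // right_pad
    " ");                                 // separator between joined tokens
std::shared_ptr<TensorOp> op =
    ngram->ValidateParams().IsOk() ? ngram->Build() : nullptr;
```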
| #ifndef _WIN32 | |||
| // NormalizeUTF8Operation | |||
| NormalizeUTF8Operation::NormalizeUTF8Operation(NormalizeForm normalize_form) : normalize_form_(normalize_form) {} | |||
| Status NormalizeUTF8Operation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> NormalizeUTF8Operation::Build() { | |||
| std::shared_ptr<NormalizeUTF8Op> tensor_op = std::make_shared<NormalizeUTF8Op>(normalize_form_); | |||
| return tensor_op; | |||
| } | |||
| // RegexReplaceOperation | |||
| RegexReplaceOperation::RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all) | |||
| : pattern_(pattern), replace_(replace), replace_all_(replace_all) {} | |||
| Status RegexReplaceOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> RegexReplaceOperation::Build() { | |||
| std::shared_ptr<RegexReplaceOp> tensor_op = std::make_shared<RegexReplaceOp>(pattern_, replace_, replace_all_); | |||
| return tensor_op; | |||
| } | |||
| // RegexTokenizerOperation | |||
| RegexTokenizerOperation::RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, | |||
| bool with_offsets) | |||
| : delim_pattern_(delim_pattern), keep_delim_pattern_(keep_delim_pattern), with_offsets_(with_offsets) {} | |||
| Status RegexTokenizerOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> RegexTokenizerOperation::Build() { | |||
| std::shared_ptr<RegexTokenizerOp> tensor_op = | |||
| std::make_shared<RegexTokenizerOp>(delim_pattern_, keep_delim_pattern_, with_offsets_); | |||
| return tensor_op; | |||
| } | |||
| #endif | |||
| // SentencePieceTokenizerOperation | |||
| SentencePieceTokenizerOperation::~SentencePieceTokenizerOperation() = default; | |||
| SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, | |||
| SPieceTokenizerOutType out_type) | |||
| : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {} | |||
| SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::string &vocab_path, | |||
| SPieceTokenizerOutType out_type) | |||
| : vocab_(nullptr), vocab_path_(vocab_path), load_type_(SPieceTokenizerLoadType::kFile), out_type_(out_type) {} | |||
| Status SentencePieceTokenizerOperation::ValidateParams() { | |||
| if (load_type_ == SPieceTokenizerLoadType::kModel) { | |||
| if (vocab_ == nullptr) { | |||
| std::string err_msg = "SentencePieceTokenizer: vocab object type is incorrect or null."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| } else { | |||
| Path vocab_file(vocab_path_); | |||
| if (!vocab_file.Exists() || vocab_file.IsDirectory()) { | |||
| std::string err_msg = "SentencePieceTokenizer : vocab file: [" + vocab_path_ + "] is invalid or does not exist."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (access(vocab_file.toString().c_str(), R_OK) == -1) { | |||
| std::string err_msg = "SentencePieceTokenizer : no access to specified dataset file: " + vocab_path_; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() { | |||
| std::shared_ptr<SentencePieceTokenizerOp> tensor_op; | |||
| if (load_type_ == SPieceTokenizerLoadType::kModel) { | |||
| tensor_op = std::make_shared<SentencePieceTokenizerOp>(vocab_, load_type_, out_type_); | |||
| } else { | |||
| Path vocab_file(vocab_path_); | |||
| std::string model_path = vocab_file.ParentPath(); | |||
| std::string model_filename = vocab_file.Basename(); | |||
| tensor_op = std::make_shared<SentencePieceTokenizerOp>(model_path, model_filename, load_type_, out_type_); | |||
| } | |||
| return tensor_op; | |||
| } | |||
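The two constructors correspond to the two load types validated above; a sketch (the enum spellings `SPieceTokenizerOutType::kString` / `kInt` are assumed from the public API):

```cpp
// Sketch: kFile loads a serialized model from disk; kModel takes an
// in-memory SentencePieceVocab.
auto from_file = std::make_shared<text::SentencePieceTokenizerOperation>(
    "/path/to/sentencepiece.model", SPieceTokenizerOutType::kString);  // kFile

std::shared_ptr<SentencePieceVocab> vocab = nullptr;  // would be built elsewhere
auto from_vocab = std::make_shared<text::SentencePieceTokenizerOperation>(
    vocab, SPieceTokenizerOutType::kInt);             // kModel
// from_vocab->ValidateParams() fails while vocab is null, per the check above.
```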
| // SlidingWindowOperation | |||
| SlidingWindowOperation::SlidingWindowOperation(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {} | |||
| Status SlidingWindowOperation::ValidateParams() { | |||
| if (width_ < 1) { | |||
| std::string err_msg = | |||
| "SlidingWindow : The parameter width must be greater than or equal to 1: " + std::to_string(width_); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> SlidingWindowOperation::Build() { | |||
| std::shared_ptr<SlidingWindowOp> tensor_op = std::make_shared<SlidingWindowOp>(static_cast<uint32_t>(width_), axis_); | |||
| return tensor_op; | |||
| } | |||
| // ToNumberOperation | |||
| ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {} | |||
| Status ToNumberOperation::ValidateParams() { | |||
| if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) { | |||
| std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> ToNumberOperation::Build() { | |||
| std::shared_ptr<ToNumberOp> tensor_op = std::make_shared<ToNumberOp>(data_type_); | |||
| return tensor_op; | |||
| } | |||
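| // TruncateSequencePairOperation | |||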
| TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {} | |||
| Status TruncateSequencePairOperation::ValidateParams() { | |||
| if (max_length_ < 0) { | |||
| std::string err_msg = "TruncateSequencePair : The parameter max_length must be greater than or equal to 0: " + | |||
| std::to_string(max_length_); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> TruncateSequencePairOperation::Build() { | |||
| std::shared_ptr<TruncateSequencePairOp> tensor_op = std::make_shared<TruncateSequencePairOp>(max_length_); | |||
| return tensor_op; | |||
| } | |||
| // UnicodeCharTokenizerOperation | |||
| UnicodeCharTokenizerOperation::UnicodeCharTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {} | |||
| Status UnicodeCharTokenizerOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() { | |||
| std::shared_ptr<UnicodeCharTokenizerOp> tensor_op = std::make_shared<UnicodeCharTokenizerOp>(with_offsets_); | |||
| return tensor_op; | |||
| } | |||
| #ifndef _WIN32 | |||
| // UnicodeScriptTokenizerOperation | |||
| UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets) | |||
| : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {} | |||
| Status UnicodeScriptTokenizerOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> UnicodeScriptTokenizerOperation::Build() { | |||
| std::shared_ptr<UnicodeScriptTokenizerOp> tensor_op = | |||
| std::make_shared<UnicodeScriptTokenizerOp>(keep_whitespace_, with_offsets_); | |||
| return tensor_op; | |||
| } | |||
| // WhitespaceTokenizerOperation | |||
| WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {} | |||
| Status WhitespaceTokenizerOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> WhitespaceTokenizerOperation::Build() { | |||
| std::shared_ptr<WhitespaceTokenizerOp> tensor_op = std::make_shared<WhitespaceTokenizerOp>(with_offsets_); | |||
| return tensor_op; | |||
| } | |||
| #endif | |||
| } // namespace text | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -15,18 +15,6 @@ | |||
| */ | |||
| #include "minddata/dataset/include/transforms.h" | |||
| #include "minddata/dataset/kernels/ir/validators.h" | |||
| // Kernel data headers (in alphabetical order) | |||
| #include "minddata/dataset/kernels/data/compose_op.h" | |||
| #include "minddata/dataset/kernels/data/duplicate_op.h" | |||
| #include "minddata/dataset/kernels/data/one_hot_op.h" | |||
| #include "minddata/dataset/kernels/data/random_apply_op.h" | |||
| #include "minddata/dataset/kernels/data/random_choice_op.h" | |||
| #include "minddata/dataset/kernels/data/type_cast_op.h" | |||
| #ifndef ENABLE_ANDROID | |||
| #include "minddata/dataset/kernels/data/unique_op.h" | |||
| #endif | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -88,122 +76,6 @@ std::shared_ptr<UniqueOperation> Unique() { | |||
| return op->ValidateParams() ? op : nullptr; | |||
| } | |||
| #endif | |||
| /* ####################################### Validator Functions ############################################ */ | |||
| /* ####################################### Derived TensorOperation classes ################################# */ | |||
| // (In alphabetical order) | |||
| // ComposeOperation | |||
| ComposeOperation::ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms) | |||
| : transforms_(transforms) {} | |||
| Status ComposeOperation::ValidateParams() { | |||
| RETURN_IF_NOT_OK(ValidateVectorTransforms("Compose", transforms_)); | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> ComposeOperation::Build() { | |||
| std::vector<std::shared_ptr<TensorOp>> tensor_ops; | |||
| (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops), | |||
| [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); }); | |||
| return std::make_shared<ComposeOp>(tensor_ops); | |||
| } | |||
| // DuplicateOperation | |||
| Status DuplicateOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> DuplicateOperation::Build() { return std::make_shared<DuplicateOp>(); } | |||
| // OneHotOperation | |||
| OneHotOperation::OneHotOperation(int32_t num_classes) : num_classes_(num_classes) {} | |||
| Status OneHotOperation::ValidateParams() { | |||
| if (num_classes_ <= 0) { | |||
| std::string err_msg = "OneHot: Number of classes must be greater than 0, but got: " + std::to_string(num_classes_); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> OneHotOperation::Build() { return std::make_shared<OneHotOp>(num_classes_); } | |||
| // PreBuiltOperation | |||
| PreBuiltOperation::PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op) : op_(tensor_op) {} | |||
| Status PreBuiltOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> PreBuiltOperation::Build() { return op_; } | |||
| std::string PreBuiltOperation::Name() const { return op_ ? op_->Name() : kPreBuiltOperation; } | |||
| Status PreBuiltOperation::to_json(nlohmann::json *out_json) { | |||
| RETURN_IF_NOT_OK(op_->to_json(out_json)); | |||
| return Status::OK(); | |||
| } | |||
| // RandomApplyOperation | |||
| RandomApplyOperation::RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob) | |||
| : TensorOperation(true), transforms_(transforms), prob_(prob) {} | |||
| Status RandomApplyOperation::ValidateParams() { | |||
| RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomApply", transforms_)); | |||
| RETURN_IF_NOT_OK(ValidateProbability("RandomApply", prob_)); | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> RandomApplyOperation::Build() { | |||
| std::vector<std::shared_ptr<TensorOp>> tensor_ops; | |||
| (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops), | |||
| [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); }); | |||
| return std::make_shared<RandomApplyOp>(prob_, tensor_ops); | |||
| } | |||
| // RandomChoiceOperation | |||
| RandomChoiceOperation::RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms) | |||
| : TensorOperation(true), transforms_(transforms) {} | |||
| Status RandomChoiceOperation::ValidateParams() { | |||
| RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomChoice", transforms_)); | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> RandomChoiceOperation::Build() { | |||
| std::vector<std::shared_ptr<TensorOp>> tensor_ops; | |||
| (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops), | |||
| [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); }); | |||
| return std::make_shared<RandomChoiceOp>(tensor_ops); | |||
| } | |||
| // TypeCastOperation | |||
| TypeCastOperation::TypeCastOperation(std::string data_type) : data_type_(data_type) {} | |||
| Status TypeCastOperation::ValidateParams() { | |||
| std::vector<std::string> predefine_type = {"bool", "int8", "uint8", "int16", "uint16", "int32", "uint32", | |||
| "int64", "uint64", "float16", "float32", "float64", "string"}; | |||
| auto itr = std::find(predefine_type.begin(), predefine_type.end(), data_type_); | |||
| if (itr == predefine_type.end()) { | |||
| std::string err_msg = "TypeCast: Invalid data type: " + data_type_; | |||
| MS_LOG(ERROR) << "TypeCast: Only supports data type bool, int8, uint8, int16, uint16, int32, uint32, " | |||
| << "int64, uint64, float16, float32, float64, string, but got: " << data_type_; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> TypeCastOperation::Build() { return std::make_shared<TypeCastOp>(data_type_); } | |||
| #ifndef ENABLE_ANDROID | |||
| // UniqueOperation | |||
| Status UniqueOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> UniqueOperation::Build() { return std::make_shared<UniqueOp>(); } | |||
| #endif | |||
| } // namespace transforms | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -21,9 +21,9 @@ | |||
| #include "minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.h" | |||
| #include "minddata/dataset/engine/ir/datasetops/map_node.h" | |||
| #include "minddata/dataset/include/transforms.h" | |||
| #include "minddata/dataset/kernels/image/random_crop_and_resize_op.h" | |||
| #include "minddata/dataset/kernels/image/random_crop_decode_resize_op.h" | |||
| #include "minddata/dataset/kernels/ir/data/transforms_ir.h" | |||
| #include "minddata/dataset/kernels/ir/vision/vision_ir.h" | |||
| namespace mindspore { | |||
| @@ -27,6 +27,9 @@ | |||
| #include "minddata/dataset/include/constants.h" | |||
| #include "minddata/dataset/include/transforms.h" | |||
| // FIXME - This internal IR header will be removed when external API classes are provided | |||
| #include "minddata/dataset/text/ir/kernels/text_ir.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -36,24 +39,6 @@ class SentencePieceVocab; | |||
| // Transform operations for text | |||
| namespace text { | |||
| // Char arrays storing name of corresponding classes (in alphabetical order) | |||
| constexpr char kBasicTokenizerOperation[] = "BasicTokenizer"; | |||
| constexpr char kBertTokenizerOperation[] = "BertTokenizer"; | |||
| constexpr char kCaseFoldOperation[] = "CaseFold"; | |||
| constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer"; | |||
| constexpr char kLookupOperation[] = "Lookup"; | |||
| constexpr char kNgramOperation[] = "Ngram"; | |||
| constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8"; | |||
| constexpr char kRegexReplaceOperation[] = "RegexReplace"; | |||
| constexpr char kRegexTokenizerOperation[] = "RegexTokenizer"; | |||
| constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer"; | |||
| constexpr char kSlidingWindowOperation[] = "SlidingWindow"; | |||
| constexpr char kToNumberOperation[] = "ToNumber"; | |||
| constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair"; | |||
| constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer"; | |||
| constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer"; | |||
| constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer"; | |||
| // Text Op classes (in alphabetical order) | |||
| #ifndef _WIN32 | |||
| class BasicTokenizerOperation; | |||
| @@ -255,309 +240,6 @@ std::shared_ptr<UnicodeScriptTokenizerOperation> UnicodeScriptTokenizer(bool kee | |||
| /// \return Shared pointer to the current TensorOperation. | |||
| std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets = false); | |||
| #endif | |||
| /* ####################################### Derived TensorOperation classes ################################# */ | |||
| #ifndef _WIN32 | |||
| class BasicTokenizerOperation : public TensorOperation { | |||
| public: | |||
| BasicTokenizerOperation(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, | |||
| bool preserve_unused_token, bool with_offsets); | |||
| ~BasicTokenizerOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kBasicTokenizerOperation; } | |||
| private: | |||
| bool lower_case_; | |||
| bool keep_whitespace_; | |||
| NormalizeForm normalize_form_; | |||
| bool preserve_unused_token_; | |||
| bool with_offsets_; | |||
| }; | |||
| class BertTokenizerOperation : public TensorOperation { | |||
| public: | |||
| BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator, | |||
| int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case, | |||
| bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token, | |||
| bool with_offsets); | |||
| ~BertTokenizerOperation(); | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kBertTokenizerOperation; } | |||
| private: | |||
| std::shared_ptr<Vocab> vocab_; | |||
| std::string suffix_indicator_; | |||
| int32_t max_bytes_per_token_; | |||
| std::string unknown_token_; | |||
| bool lower_case_; | |||
| bool keep_whitespace_; | |||
| NormalizeForm normalize_form_; | |||
| bool preserve_unused_token_; | |||
| bool with_offsets_; | |||
| }; | |||
| class CaseFoldOperation : public TensorOperation { | |||
| public: | |||
| CaseFoldOperation() = default; | |||
| ~CaseFoldOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kCaseFoldOperation; } | |||
| }; | |||
| #endif | |||
| class JiebaTokenizerOperation : public TensorOperation { | |||
| public: | |||
| explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode, | |||
| bool with_offsets); | |||
| ~JiebaTokenizerOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kJiebaTokenizerOperation; } | |||
| Status AddWord(const std::string &word, int64_t freq = 0); | |||
| private: | |||
| std::string hmm_path_; | |||
| std::string mp_path_; | |||
| JiebaMode mode_; | |||
| bool with_offsets_; | |||
| std::vector<std::pair<std::string, int64_t>> words_list_; | |||
| }; | |||
| class LookupOperation : public TensorOperation { | |||
| public: | |||
| explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token, | |||
| const std::string &data_type); | |||
| ~LookupOperation(); | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kLookupOperation; } | |||
| private: | |||
| std::shared_ptr<Vocab> vocab_; | |||
| std::optional<std::string> unknown_token_; | |||
| int32_t default_id_; | |||
| std::string data_type_; | |||
| }; | |||
| class NgramOperation : public TensorOperation { | |||
| public: | |||
| explicit NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad, | |||
| const std::pair<std::string, int32_t> &right_pad, const std::string &separator); | |||
| ~NgramOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kNgramOperation; } | |||
| private: | |||
| std::vector<int32_t> ngrams_; | |||
| std::pair<std::string, int32_t> left_pad_; | |||
| std::pair<std::string, int32_t> right_pad_; | |||
| std::string separator_; | |||
| }; | |||
| #ifndef _WIN32 | |||
| class NormalizeUTF8Operation : public TensorOperation { | |||
| public: | |||
| explicit NormalizeUTF8Operation(NormalizeForm normalize_form); | |||
| ~NormalizeUTF8Operation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kNormalizeUTF8Operation; } | |||
| private: | |||
| NormalizeForm normalize_form_; | |||
| }; | |||
| class RegexReplaceOperation : public TensorOperation { | |||
| public: | |||
| RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all); | |||
| ~RegexReplaceOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kRegexReplaceOperation; } | |||
| private: | |||
| std::string pattern_; | |||
| std::string replace_; | |||
| bool replace_all_; | |||
| }; | |||
| class RegexTokenizerOperation : public TensorOperation { | |||
| public: | |||
| explicit RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, bool with_offsets); | |||
| ~RegexTokenizerOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kRegexTokenizerOperation; } | |||
| private: | |||
| std::string delim_pattern_; | |||
| std::string keep_delim_pattern_; | |||
| bool with_offsets_; | |||
| }; | |||
| #endif | |||
| class SentencePieceTokenizerOperation : public TensorOperation { | |||
| public: | |||
| SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type); | |||
| SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type); | |||
| ~SentencePieceTokenizerOperation(); | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kSentencepieceTokenizerOperation; } | |||
| private: | |||
| std::shared_ptr<SentencePieceVocab> vocab_; | |||
| std::string vocab_path_; | |||
| SPieceTokenizerLoadType load_type_; | |||
| SPieceTokenizerOutType out_type_; | |||
| }; | |||
| class SlidingWindowOperation : public TensorOperation { | |||
| public: | |||
| explicit SlidingWindowOperation(const int32_t width, const int32_t axis); | |||
| ~SlidingWindowOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kSlidingWindowOperation; } | |||
| private: | |||
| int32_t width_; | |||
| int32_t axis_; | |||
| }; | |||
| class ToNumberOperation : public TensorOperation { | |||
| public: | |||
| explicit ToNumberOperation(std::string data_type); | |||
| ~ToNumberOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kToNumberOperation; } | |||
| private: | |||
| std::string data_type_; | |||
| }; | |||
| class TruncateSequencePairOperation : public TensorOperation { | |||
| public: | |||
| explicit TruncateSequencePairOperation(int32_t max_length); | |||
| ~TruncateSequencePairOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kTruncateSequencePairOperation; } | |||
| private: | |||
| int32_t max_length_; | |||
| }; | |||
| class UnicodeCharTokenizerOperation : public TensorOperation { | |||
| public: | |||
| explicit UnicodeCharTokenizerOperation(bool with_offsets); | |||
| ~UnicodeCharTokenizerOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kUnicodeCharTokenizerOperation; } | |||
| private: | |||
| bool with_offsets_; | |||
| }; | |||
| #ifndef _WIN32 | |||
| class UnicodeScriptTokenizerOperation : public TensorOperation { | |||
| public: | |||
| explicit UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets); | |||
| ~UnicodeScriptTokenizerOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kUnicodeScriptTokenizerOperation; } | |||
| private: | |||
| bool keep_whitespace_; | |||
| bool with_offsets_; | |||
| }; | |||
| class WhitespaceTokenizerOperation : public TensorOperation { | |||
| public: | |||
| explicit WhitespaceTokenizerOperation(bool with_offsets); | |||
| ~WhitespaceTokenizerOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kWhitespaceTokenizerOperation; } | |||
| private: | |||
| bool with_offsets_; | |||
| }; | |||
| #endif | |||
| } // namespace text | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -25,40 +25,12 @@ | |||
| #include "include/api/status.h" | |||
| #include "minddata/dataset/include/constants.h" | |||
| // (TEMPORARY) will be removed when Tensor op ir moved down | |||
| #include "minddata/dataset/kernels/ir/tensor_operation.h" | |||
| #ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_ | |||
| #define INCLUDE_NLOHMANN_JSON_FWD_HPP_ | |||
| namespace nlohmann { | |||
| template <typename T = void, typename SFINAE = void> | |||
| struct adl_serializer; | |||
| template <template <typename U, typename V, typename... Args> class ObjectType = std::map, | |||
| template <typename U, typename... Args> class ArrayType = std::vector, class StringType = std::string, | |||
| class BooleanType = bool, class NumberIntegerType = std::int64_t, class NumberUnsignedType = std::uint64_t, | |||
| class NumberFloatType = double, template <typename U> class AllocatorType = std::allocator, | |||
| template <typename T, typename SFINAE = void> class JSONSerializer = adl_serializer> | |||
| class basic_json; | |||
| template <typename BasicJsonType> | |||
| class json_pointer; | |||
| using json = basic_json<>; | |||
| } // namespace nlohmann | |||
| #endif // INCLUDE_NLOHMANN_JSON_FWD_HPP_ | |||
| // FIXME - This internal IR header will be removed when external API classes are provided | |||
| #include "minddata/dataset/kernels/ir/data/transforms_ir.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| // Char arrays storing name of corresponding classes (in alphabetical order) | |||
| constexpr char kComposeOperation[] = "Compose"; | |||
| constexpr char kDuplicateOperation[] = "Duplicate"; | |||
| constexpr char kOneHotOperation[] = "OneHot"; | |||
| constexpr char kPreBuiltOperation[] = "PreBuilt"; | |||
| constexpr char kRandomApplyOperation[] = "RandomApply"; | |||
| constexpr char kRandomChoiceOperation[] = "RandomChoice"; | |||
| constexpr char kRandomSelectSubpolicyOperation[] = "RandomSelectSubpolicy"; | |||
| constexpr char kTypeCastOperation[] = "TypeCast"; | |||
| constexpr char kUniqueOperation[] = "Unique"; | |||
| // Transform operations for performing data transformation. | |||
| namespace transforms { | |||
| @@ -119,134 +91,6 @@ std::shared_ptr<TypeCastOperation> TypeCast(std::string data_type); | |||
| /// \return Shared pointer to the current TensorOperation. | |||
| std::shared_ptr<UniqueOperation> Unique(); | |||
| #endif | |||
| /* ####################################### Derived TensorOperation classes ################################# */ | |||
| class ComposeOperation : public TensorOperation { | |||
| public: | |||
| explicit ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms); | |||
| ~ComposeOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kComposeOperation; } | |||
| private: | |||
| std::vector<std::shared_ptr<TensorOperation>> transforms_; | |||
| }; | |||
| class DuplicateOperation : public TensorOperation { | |||
| public: | |||
| DuplicateOperation() = default; | |||
| ~DuplicateOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kDuplicateOperation; } | |||
| }; | |||
| class OneHotOperation : public TensorOperation { | |||
| public: | |||
| explicit OneHotOperation(int32_t num_classes); | |||
| ~OneHotOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kOneHotOperation; } | |||
| private: | |||
| int32_t num_classes_; | |||
| }; | |||
| class PreBuiltOperation : public TensorOperation { | |||
| public: | |||
| explicit PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op); | |||
| ~PreBuiltOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override; | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| std::shared_ptr<TensorOp> op_; | |||
| }; | |||
| class RandomApplyOperation : public TensorOperation { | |||
| public: | |||
| explicit RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob); | |||
| ~RandomApplyOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kRandomApplyOperation; } | |||
| private: | |||
| std::vector<std::shared_ptr<TensorOperation>> transforms_; | |||
| double prob_; | |||
| }; | |||
| class RandomChoiceOperation : public TensorOperation { | |||
| public: | |||
| explicit RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms); | |||
| ~RandomChoiceOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kRandomChoiceOperation; } | |||
| private: | |||
| std::vector<std::shared_ptr<TensorOperation>> transforms_; | |||
| }; | |||
| class TypeCastOperation : public TensorOperation { | |||
| public: | |||
| explicit TypeCastOperation(std::string data_type); | |||
| ~TypeCastOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kTypeCastOperation; } | |||
| private: | |||
| std::string data_type_; | |||
| }; | |||
| #ifndef ENABLE_ANDROID | |||
| class UniqueOperation : public TensorOperation { | |||
| public: | |||
| UniqueOperation() = default; | |||
| ~UniqueOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kUniqueOperation; } | |||
| }; | |||
| #endif | |||
| } // namespace transforms | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -1,3 +1,4 @@ | |||
| add_subdirectory(data) | |||
| add_subdirectory(vision) | |||
| file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | |||
| set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) | |||
| @@ -0,0 +1,8 @@ | |||
| file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | |||
| set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) | |||
| set(DATASET_KERNELS_IR_DATA_SRC_FILES | |||
| transforms_ir.cc | |||
| ) | |||
| add_library(kernels-ir-data OBJECT ${DATASET_KERNELS_IR_DATA_SRC_FILES}) | |||
| @@ -0,0 +1,155 @@ | |||
| /** | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <algorithm> | |||
| #include "minddata/dataset/kernels/ir/data/transforms_ir.h" | |||
| // Kernel data headers (in alphabetical order) | |||
| #include "minddata/dataset/kernels/data/compose_op.h" | |||
| #include "minddata/dataset/kernels/data/duplicate_op.h" | |||
| #include "minddata/dataset/kernels/data/one_hot_op.h" | |||
| #include "minddata/dataset/kernels/data/random_apply_op.h" | |||
| #include "minddata/dataset/kernels/data/random_choice_op.h" | |||
| #include "minddata/dataset/kernels/data/type_cast_op.h" | |||
| #ifndef ENABLE_ANDROID | |||
| #include "minddata/dataset/kernels/data/unique_op.h" | |||
| #endif | |||
| #include "minddata/dataset/kernels/ir/validators.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| // Transform operations for data. | |||
| namespace transforms { | |||
| /* ####################################### Derived TensorOperation classes ################################# */ | |||
| // (In alphabetical order) | |||
| // ComposeOperation | |||
| ComposeOperation::ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms) | |||
| : transforms_(transforms) {} | |||
| Status ComposeOperation::ValidateParams() { | |||
| RETURN_IF_NOT_OK(ValidateVectorTransforms("Compose", transforms_)); | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> ComposeOperation::Build() { | |||
| std::vector<std::shared_ptr<TensorOp>> tensor_ops; | |||
| (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops), | |||
| [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); }); | |||
| return std::make_shared<ComposeOp>(tensor_ops); | |||
| } | |||
| // DuplicateOperation | |||
| Status DuplicateOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> DuplicateOperation::Build() { return std::make_shared<DuplicateOp>(); } | |||
| // OneHotOperation | |||
| OneHotOperation::OneHotOperation(int32_t num_classes) : num_classes_(num_classes) {} | |||
| Status OneHotOperation::ValidateParams() { | |||
| if (num_classes_ <= 0) { | |||
| std::string err_msg = "OneHot: Number of classes must be greater than 0, but got: " + std::to_string(num_classes_); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> OneHotOperation::Build() { return std::make_shared<OneHotOp>(num_classes_); } | |||
| // PreBuiltOperation | |||
| PreBuiltOperation::PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op) : op_(tensor_op) {} | |||
| Status PreBuiltOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> PreBuiltOperation::Build() { return op_; } | |||
| std::string PreBuiltOperation::Name() const { return op_ ? op_->Name() : kPreBuiltOperation; } | |||
| Status PreBuiltOperation::to_json(nlohmann::json *out_json) { | |||
| RETURN_IF_NOT_OK(op_->to_json(out_json)); | |||
| return Status::OK(); | |||
| } | |||
| // RandomApplyOperation | |||
| RandomApplyOperation::RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob) | |||
| : TensorOperation(true), transforms_(transforms), prob_(prob) {} | |||
| Status RandomApplyOperation::ValidateParams() { | |||
| RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomApply", transforms_)); | |||
| RETURN_IF_NOT_OK(ValidateProbability("RandomApply", prob_)); | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> RandomApplyOperation::Build() { | |||
| std::vector<std::shared_ptr<TensorOp>> tensor_ops; | |||
| (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops), | |||
| [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); }); | |||
| return std::make_shared<RandomApplyOp>(prob_, tensor_ops); | |||
| } | |||
| // RandomChoiceOperation | |||
| RandomChoiceOperation::RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms) | |||
| : TensorOperation(true), transforms_(transforms) {} | |||
| Status RandomChoiceOperation::ValidateParams() { | |||
| RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomChoice", transforms_)); | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> RandomChoiceOperation::Build() { | |||
| std::vector<std::shared_ptr<TensorOp>> tensor_ops; | |||
| (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops), | |||
| [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); }); | |||
| return std::make_shared<RandomChoiceOp>(tensor_ops); | |||
| } | |||
| // TypeCastOperation | |||
| TypeCastOperation::TypeCastOperation(std::string data_type) : data_type_(data_type) {} | |||
| Status TypeCastOperation::ValidateParams() { | |||
| std::vector<std::string> predefine_type = {"bool", "int8", "uint8", "int16", "uint16", "int32", "uint32", | |||
| "int64", "uint64", "float16", "float32", "float64", "string"}; | |||
| auto itr = std::find(predefine_type.begin(), predefine_type.end(), data_type_); | |||
| if (itr == predefine_type.end()) { | |||
| std::string err_msg = "TypeCast: Invalid data type: " + data_type_; | |||
| MS_LOG(ERROR) << "TypeCast: Only supports data type bool, int8, uint8, int16, uint16, int32, uint32, " | |||
| << "int64, uint64, float16, float32, float64, string, but got: " << data_type_; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> TypeCastOperation::Build() { return std::make_shared<TypeCastOp>(data_type_); } | |||
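A short usage sketch tying the container operations above together (in-tree assumption: `mindspore::dataset::transforms`):

```cpp
// Sketch: apply OneHot then TypeCast with probability 0.8.
std::vector<std::shared_ptr<TensorOperation>> ops = {
    std::make_shared<transforms::OneHotOperation>(10),
    std::make_shared<transforms::TypeCastOperation>("float32")};
auto random_apply = std::make_shared<transforms::RandomApplyOperation>(ops, 0.8);
std::shared_ptr<TensorOp> built =
    random_apply->ValidateParams().IsOk() ? random_apply->Build() : nullptr;
```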
| #ifndef ENABLE_ANDROID | |||
| // UniqueOperation | |||
| Status UniqueOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> UniqueOperation::Build() { return std::make_shared<UniqueOp>(); } | |||
| #endif | |||
| } // namespace transforms | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,172 @@ | |||
| /** | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IR_DATA_TRANSFORMS_IR_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IR_DATA_TRANSFORMS_IR_H_ | |||
| #include <map> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "minddata/dataset/kernels/ir/tensor_operation.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| // Char arrays storing name of corresponding classes (in alphabetical order) | |||
| constexpr char kComposeOperation[] = "Compose"; | |||
| constexpr char kDuplicateOperation[] = "Duplicate"; | |||
| constexpr char kOneHotOperation[] = "OneHot"; | |||
| constexpr char kPreBuiltOperation[] = "PreBuilt"; | |||
| constexpr char kRandomApplyOperation[] = "RandomApply"; | |||
| constexpr char kRandomChoiceOperation[] = "RandomChoice"; | |||
| constexpr char kTypeCastOperation[] = "TypeCast"; | |||
| constexpr char kUniqueOperation[] = "Unique"; | |||
| // Transform operations for performing data transformation. | |||
| namespace transforms { | |||
| /* ####################################### Derived TensorOperation classes ################################# */ | |||
| class ComposeOperation : public TensorOperation { | |||
| public: | |||
| explicit ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms); | |||
| ~ComposeOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kComposeOperation; } | |||
| private: | |||
| std::vector<std::shared_ptr<TensorOperation>> transforms_; | |||
| }; | |||
| class DuplicateOperation : public TensorOperation { | |||
| public: | |||
| DuplicateOperation() = default; | |||
| ~DuplicateOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kDuplicateOperation; } | |||
| }; | |||
| class OneHotOperation : public TensorOperation { | |||
| public: | |||
| explicit OneHotOperation(int32_t num_classes); | |||
| ~OneHotOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kOneHotOperation; } | |||
| private: | |||
| int32_t num_classes_; | |||
| }; | |||
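| // Wraps an already-constructed TensorOp so it can be carried through the IR unchanged (e.g. a | |||
| // prebuilt or Python-function op), which is also why it overrides to_json below. | |||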
| class PreBuiltOperation : public TensorOperation { | |||
| public: | |||
| explicit PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op); | |||
| ~PreBuiltOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override; | |||
| Status to_json(nlohmann::json *out_json) override; | |||
| private: | |||
| std::shared_ptr<TensorOp> op_; | |||
| }; | |||
| class RandomApplyOperation : public TensorOperation { | |||
| public: | |||
| explicit RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob); | |||
| ~RandomApplyOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kRandomApplyOperation; } | |||
| private: | |||
| std::vector<std::shared_ptr<TensorOperation>> transforms_; | |||
| double prob_; | |||
| }; | |||
| class RandomChoiceOperation : public TensorOperation { | |||
| public: | |||
| explicit RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms); | |||
| ~RandomChoiceOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kRandomChoiceOperation; } | |||
| private: | |||
| std::vector<std::shared_ptr<TensorOperation>> transforms_; | |||
| }; | |||
| class TypeCastOperation : public TensorOperation { | |||
| public: | |||
| explicit TypeCastOperation(std::string data_type); | |||
| ~TypeCastOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kTypeCastOperation; } | |||
| private: | |||
| std::string data_type_; | |||
| }; | |||
| #ifndef ENABLE_ANDROID | |||
| class UniqueOperation : public TensorOperation { | |||
| public: | |||
| UniqueOperation() = default; | |||
| ~UniqueOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kUniqueOperation; } | |||
| }; | |||
| #endif | |||
| } // namespace transforms | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IR_DATA_TRANSFORMS_IR_H_ | |||
| @@ -1,3 +1,4 @@ | |||
| add_subdirectory(ir) | |||
| add_subdirectory(kernels) | |||
| file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | |||
| @@ -0,0 +1,6 @@ | |||
| add_subdirectory(kernels) | |||
| file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | |||
| set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) | |||
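| # text-ir holds the non-kernel IR sources (currently just validators.cc); the kernel IR nodes are | |||
| # built as the separate text-ir-kernels target from the kernels subdirectory above. | |||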
| add_library(text-ir OBJECT validators.cc) | |||
| @@ -0,0 +1,8 @@ | |||
| file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | |||
| set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) | |||
| set(DATASET_TEXT_IR_KERNELS_SRC_FILES | |||
| text_ir.cc | |||
| ) | |||
| add_library(text-ir-kernels OBJECT ${DATASET_TEXT_IR_KERNELS_SRC_FILES}) | |||
| @@ -0,0 +1,436 @@ | |||
| /** | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <unistd.h> | |||
| #include "minddata/dataset/text/ir/kernels/text_ir.h" | |||
| #ifndef _WIN32 | |||
| #include "minddata/dataset/text/kernels/basic_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/bert_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/case_fold_op.h" | |||
| #endif | |||
| #include "minddata/dataset/text/kernels/jieba_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/lookup_op.h" | |||
| #include "minddata/dataset/text/kernels/ngram_op.h" | |||
| #ifndef _WIN32 | |||
| #include "minddata/dataset/text/kernels/normalize_utf8_op.h" | |||
| #include "minddata/dataset/text/kernels/regex_replace_op.h" | |||
| #include "minddata/dataset/text/kernels/regex_tokenizer_op.h" | |||
| #endif | |||
| #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/sliding_window_op.h" | |||
| #include "minddata/dataset/text/kernels/to_number_op.h" | |||
| #include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h" | |||
| #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h" | |||
| #ifndef _WIN32 | |||
| #include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" | |||
| #endif | |||
| #include "minddata/dataset/core/data_type.h" | |||
| #include "minddata/dataset/util/path.h" | |||
| #include "minddata/dataset/text/ir/validators.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| // Transform operations for text. | |||
| namespace text { | |||
| /* ####################################### Derived TensorOperation classes ################################# */ | |||
| // (In alphabetical order) | |||
| #ifndef _WIN32 | |||
| // BasicTokenizerOperation | |||
| BasicTokenizerOperation::BasicTokenizerOperation(bool lower_case, bool keep_whitespace, | |||
| const NormalizeForm normalize_form, bool preserve_unused_token, | |||
| bool with_offsets) | |||
| : lower_case_(lower_case), | |||
| keep_whitespace_(keep_whitespace), | |||
| normalize_form_(normalize_form), | |||
| preserve_unused_token_(preserve_unused_token), | |||
| with_offsets_(with_offsets) {} | |||
| Status BasicTokenizerOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> BasicTokenizerOperation::Build() { | |||
| std::shared_ptr<BasicTokenizerOp> tensor_op = std::make_shared<BasicTokenizerOp>( | |||
| lower_case_, keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_); | |||
| return tensor_op; | |||
| } | |||
| // BertTokenizerOperation | |||
| BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator, | |||
| int32_t max_bytes_per_token, const std::string &unknown_token, | |||
| bool lower_case, bool keep_whitespace, | |||
| const NormalizeForm normalize_form, bool preserve_unused_token, | |||
| bool with_offsets) | |||
| : vocab_(vocab), | |||
| suffix_indicator_(suffix_indicator), | |||
| max_bytes_per_token_(max_bytes_per_token), | |||
| unknown_token_(unknown_token), | |||
| lower_case_(lower_case), | |||
| keep_whitespace_(keep_whitespace), | |||
| normalize_form_(normalize_form), | |||
| preserve_unused_token_(preserve_unused_token), | |||
| with_offsets_(with_offsets) {} | |||
| BertTokenizerOperation::~BertTokenizerOperation() = default; | |||
| Status BertTokenizerOperation::ValidateParams() { | |||
| if (vocab_ == nullptr) { | |||
| std::string err_msg = "BertTokenizer: vocab object type is incorrect or null."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (max_bytes_per_token_ < 0) { | |||
| std::string err_msg = "BertTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " + | |||
| std::to_string(max_bytes_per_token_); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> BertTokenizerOperation::Build() { | |||
| std::shared_ptr<BertTokenizerOp> tensor_op = | |||
| std::make_shared<BertTokenizerOp>(vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, lower_case_, | |||
| keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_); | |||
| return tensor_op; | |||
| } | |||
| // CaseFoldOperation | |||
| Status CaseFoldOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> CaseFoldOperation::Build() { | |||
| std::shared_ptr<CaseFoldOp> tensor_op = std::make_shared<CaseFoldOp>(); | |||
| return tensor_op; | |||
| } | |||
| #endif | |||
| // JiebaTokenizerOperation | |||
| JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, | |||
| const JiebaMode &mode, bool with_offsets) | |||
| : hmm_path_(hmm_path), mp_path_(mp_path), mode_(mode), with_offsets_(with_offsets) {} | |||
| Status JiebaTokenizerOperation::ValidateParams() { | |||
| if (hmm_path_.empty()) { | |||
| std::string err_msg = "JiebaTokenizer: The dict of HMMSegment in cppjieba is not provided."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (mp_path_.empty()) { | |||
| std::string err_msg = "JiebaTokenizer: The dict of MPSegment in cppjieba is not provided."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_)); | |||
| RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", mp_path_)); | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() { | |||
| std::shared_ptr<JiebaTokenizerOp> tensor_op = | |||
| std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_); | |||
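| // Replay any words queued via AddWord() on the freshly built op; a failure aborts the build and | |||
| // yields a null TensorOp. | |||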
| for (auto &word : words_list_) { | |||
| Status rc = tensor_op->AddWord(word.first, word.second); | |||
| if (rc.IsError()) { | |||
| MS_LOG(ERROR) << rc; | |||
| return {}; | |||
| } | |||
| } | |||
| return tensor_op; | |||
| } | |||
| Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) { | |||
| if (word.empty()) { | |||
| std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (freq < 0) { | |||
| std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| words_list_.emplace_back(word, freq); | |||
| return Status::OK(); | |||
| } | |||
| // LookupOperation | |||
| LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token, | |||
| const std::string &data_type) | |||
| : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {} | |||
| LookupOperation::~LookupOperation() = default; | |||
| Status LookupOperation::ValidateParams() { | |||
| if (vocab_ == nullptr) { | |||
| std::string err_msg = "Lookup: vocab object type is incorrect or null."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
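| // Resolve unknown_token to its id at validation time so Build() can pass LookupOp a concrete default_id. | |||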
| if (unknown_token_.has_value()) { | |||
| default_id_ = vocab_->Lookup(*unknown_token_); | |||
| if (default_id_ == Vocab::kNoTokenExists) { | |||
| std::string err_msg = "Lookup: \"" + *unknown_token_ + "\" doesn't exist in vocab."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| } | |||
| if (!IsTypeNumeric(data_type_)) { | |||
| std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> LookupOperation::Build() { | |||
| std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, DataType(data_type_)); | |||
| return tensor_op; | |||
| } | |||
| // NgramOperation | |||
| NgramOperation::NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad, | |||
| const std::pair<std::string, int32_t> &right_pad, const std::string &separator) | |||
| : ngrams_(ngrams), left_pad_(left_pad), right_pad_(right_pad), separator_(separator) {} | |||
| Status NgramOperation::ValidateParams() { | |||
| if (ngrams_.empty()) { | |||
| std::string err_msg = "Ngram: The ngrams vector cannot be empty."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| for (size_t i = 0; i < ngrams_.size(); ++i) { | |||
| if (ngrams_[i] <= 0) { | |||
| std::string err_msg = | |||
| "Ngram: Every value in the ngrams vector must be greater than 0, got: " + std::to_string(ngrams_[i]); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| } | |||
| if (left_pad_.second < 0) { | |||
| std::string err_msg = | |||
| "Ngram: The second member (pad_width) of the left_pad pair must be greater than or equal to 0, got: " + | |||
| std::to_string(left_pad_.second); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (right_pad_.second < 0) { | |||
| std::string err_msg = | |||
| "Ngram: The second member (pad_width) of the right_pad pair must be greater than or equal to 0, got: " + | |||
| std::to_string(right_pad_.second); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> NgramOperation::Build() { | |||
| int32_t l_len = left_pad_.second; | |||
| int32_t r_len = right_pad_.second; | |||
| std::string l_pad = left_pad_.first; | |||
| std::string r_pad = right_pad_.first; | |||
| std::shared_ptr<NgramOp> tensor_op = std::make_shared<NgramOp>(ngrams_, l_len, r_len, l_pad, r_pad, separator_); | |||
| return tensor_op; | |||
| } | |||
| #ifndef _WIN32 | |||
| // NormalizeUTF8Operation | |||
| NormalizeUTF8Operation::NormalizeUTF8Operation(NormalizeForm normalize_form) : normalize_form_(normalize_form) {} | |||
| Status NormalizeUTF8Operation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> NormalizeUTF8Operation::Build() { | |||
| std::shared_ptr<NormalizeUTF8Op> tensor_op = std::make_shared<NormalizeUTF8Op>(normalize_form_); | |||
| return tensor_op; | |||
| } | |||
| // RegexReplaceOperation | |||
| RegexReplaceOperation::RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all) | |||
| : pattern_(pattern), replace_(replace), replace_all_(replace_all) {} | |||
| Status RegexReplaceOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> RegexReplaceOperation::Build() { | |||
| std::shared_ptr<RegexReplaceOp> tensor_op = std::make_shared<RegexReplaceOp>(pattern_, replace_, replace_all_); | |||
| return tensor_op; | |||
| } | |||
| // RegexTokenizerOperation | |||
| RegexTokenizerOperation::RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, | |||
| bool with_offsets) | |||
| : delim_pattern_(delim_pattern), keep_delim_pattern_(keep_delim_pattern), with_offsets_(with_offsets) {} | |||
| Status RegexTokenizerOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> RegexTokenizerOperation::Build() { | |||
| std::shared_ptr<RegexTokenizerOp> tensor_op = | |||
| std::make_shared<RegexTokenizerOp>(delim_pattern_, keep_delim_pattern_, with_offsets_); | |||
| return tensor_op; | |||
| } | |||
| #endif | |||
| // SentencePieceTokenizerOperation | |||
| SentencePieceTokenizerOperation::~SentencePieceTokenizerOperation() = default; | |||
| SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, | |||
| SPieceTokenizerOutType out_type) | |||
| : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {} | |||
| SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::string &vocab_path, | |||
| SPieceTokenizerOutType out_type) | |||
| : vocab_(nullptr), vocab_path_(vocab_path), load_type_(SPieceTokenizerLoadType::kFile), out_type_(out_type) {} | |||
| Status SentencePieceTokenizerOperation::ValidateParams() { | |||
| if (load_type_ == SPieceTokenizerLoadType::kModel) { | |||
| if (vocab_ == nullptr) { | |||
| std::string err_msg = "SentencePieceTokenizer: vocab object type is incorrect or null."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| } else { | |||
| Path vocab_file(vocab_path_); | |||
| if (!vocab_file.Exists() || vocab_file.IsDirectory()) { | |||
| std::string err_msg = "SentencePieceTokenizer : vocab file: [" + vocab_path_ + "] is invalid or does not exist."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (access(vocab_file.toString().c_str(), R_OK) == -1) { | |||
| std::string err_msg = "SentencePieceTokenizer : no access to specified dataset file: " + vocab_path_; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() { | |||
| std::shared_ptr<SentencePieceTokenizerOp> tensor_op; | |||
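| // kModel builds from the in-memory vocab object; kFile splits vocab_path_ into a directory and filename. | |||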
| if (load_type_ == SPieceTokenizerLoadType::kModel) { | |||
| tensor_op = std::make_shared<SentencePieceTokenizerOp>(vocab_, load_type_, out_type_); | |||
| } else { | |||
| Path vocab_file(vocab_path_); | |||
| std::string model_path = vocab_file.ParentPath(); | |||
| std::string model_filename = vocab_file.Basename(); | |||
| tensor_op = std::make_shared<SentencePieceTokenizerOp>(model_path, model_filename, load_type_, out_type_); | |||
| } | |||
| return tensor_op; | |||
| } | |||
| // SlidingWindowOperation | |||
| SlidingWindowOperation::SlidingWindowOperation(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {} | |||
| Status SlidingWindowOperation::ValidateParams() { | |||
| if (width_ < 1) { | |||
| std::string err_msg = | |||
| "SlidingWindow : The parameter width must be greater than or equal to 1: " + std::to_string(width_); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> SlidingWindowOperation::Build() { | |||
| std::shared_ptr<SlidingWindowOp> tensor_op = std::make_shared<SlidingWindowOp>(static_cast<uint32_t>(width_), axis_); | |||
| return tensor_op; | |||
| } | |||
| // ToNumberOperation | |||
| ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {} | |||
| Status ToNumberOperation::ValidateParams() { | |||
| if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) { | |||
| std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> ToNumberOperation::Build() { | |||
| std::shared_ptr<ToNumberOp> tensor_op = std::make_shared<ToNumberOp>(data_type_); | |||
| return tensor_op; | |||
| } | |||
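| // TruncateSequencePairOperation | |||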
| TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {} | |||
| Status TruncateSequencePairOperation::ValidateParams() { | |||
| if (max_length_ < 0) { | |||
| std::string err_msg = "TruncateSequencePair : The parameter max_length must be greater than or equal to 0: " + | |||
| std::to_string(max_length_); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> TruncateSequencePairOperation::Build() { | |||
| std::shared_ptr<TruncateSequencePairOp> tensor_op = std::make_shared<TruncateSequencePairOp>(max_length_); | |||
| return tensor_op; | |||
| } | |||
| // UnicodeCharTokenizerOperation | |||
| UnicodeCharTokenizerOperation::UnicodeCharTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {} | |||
| Status UnicodeCharTokenizerOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() { | |||
| std::shared_ptr<UnicodeCharTokenizerOp> tensor_op = std::make_shared<UnicodeCharTokenizerOp>(with_offsets_); | |||
| return tensor_op; | |||
| } | |||
| #ifndef _WIN32 | |||
| // UnicodeScriptTokenizerOperation | |||
| UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets) | |||
| : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {} | |||
| Status UnicodeScriptTokenizerOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> UnicodeScriptTokenizerOperation::Build() { | |||
| std::shared_ptr<UnicodeScriptTokenizerOp> tensor_op = | |||
| std::make_shared<UnicodeScriptTokenizerOp>(keep_whitespace_, with_offsets_); | |||
| return tensor_op; | |||
| } | |||
| // WhitespaceTokenizerOperation | |||
| WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {} | |||
| Status WhitespaceTokenizerOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> WhitespaceTokenizerOperation::Build() { | |||
| std::shared_ptr<WhitespaceTokenizerOp> tensor_op = std::make_shared<WhitespaceTokenizerOp>(with_offsets_); | |||
| return tensor_op; | |||
| } | |||
| #endif | |||
| } // namespace text | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,360 @@ | |||
| /** | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_KERNELS_TEXT_IR_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_KERNELS_TEXT_IR_H_ | |||
| #include <memory> | |||
| #include <optional> | |||
| #include <string> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "minddata/dataset/kernels/ir/tensor_operation.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| class Vocab; | |||
| class SentencePieceVocab; | |||
| // Transform operations for text | |||
| namespace text { | |||
| // Char arrays storing the names of the corresponding classes (in alphabetical order) | |||
| constexpr char kBasicTokenizerOperation[] = "BasicTokenizer"; | |||
| constexpr char kBertTokenizerOperation[] = "BertTokenizer"; | |||
| constexpr char kCaseFoldOperation[] = "CaseFold"; | |||
| constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer"; | |||
| constexpr char kLookupOperation[] = "Lookup"; | |||
| constexpr char kNgramOperation[] = "Ngram"; | |||
| constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8"; | |||
| constexpr char kRegexReplaceOperation[] = "RegexReplace"; | |||
| constexpr char kRegexTokenizerOperation[] = "RegexTokenizer"; | |||
| constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer"; | |||
| constexpr char kSlidingWindowOperation[] = "SlidingWindow"; | |||
| constexpr char kToNumberOperation[] = "ToNumber"; | |||
| constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair"; | |||
| constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer"; | |||
| constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer"; | |||
| constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer"; | |||
| /* ####################################### Derived TensorOperation classes ################################# */ | |||
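| // (In alphabetical order. Classes inside the _WIN32 guards are not available on Windows builds.) | |||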
| #ifndef _WIN32 | |||
| class BasicTokenizerOperation : public TensorOperation { | |||
| public: | |||
| BasicTokenizerOperation(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, | |||
| bool preserve_unused_token, bool with_offsets); | |||
| ~BasicTokenizerOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kBasicTokenizerOperation; } | |||
| private: | |||
| bool lower_case_; | |||
| bool keep_whitespace_; | |||
| NormalizeForm normalize_form_; | |||
| bool preserve_unused_token_; | |||
| bool with_offsets_; | |||
| }; | |||
| class BertTokenizerOperation : public TensorOperation { | |||
| public: | |||
| BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator, | |||
| int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case, | |||
| bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token, | |||
| bool with_offsets); | |||
| ~BertTokenizerOperation(); | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kBertTokenizerOperation; } | |||
| private: | |||
| std::shared_ptr<Vocab> vocab_; | |||
| std::string suffix_indicator_; | |||
| int32_t max_bytes_per_token_; | |||
| std::string unknown_token_; | |||
| bool lower_case_; | |||
| bool keep_whitespace_; | |||
| NormalizeForm normalize_form_; | |||
| bool preserve_unused_token_; | |||
| bool with_offsets_; | |||
| }; | |||
| class CaseFoldOperation : public TensorOperation { | |||
| public: | |||
| CaseFoldOperation() = default; | |||
| ~CaseFoldOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kCaseFoldOperation; } | |||
| }; | |||
| #endif | |||
| class JiebaTokenizerOperation : public TensorOperation { | |||
| public: | |||
| explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode, | |||
| bool with_offsets); | |||
| ~JiebaTokenizerOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kJiebaTokenizerOperation; } | |||
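| // Queue a custom word (with optional frequency) to be added to the tokenizer when Build() runs. | |||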
| Status AddWord(const std::string &word, int64_t freq = 0); | |||
| private: | |||
| std::string hmm_path_; | |||
| std::string mp_path_; | |||
| JiebaMode mode_; | |||
| bool with_offsets_; | |||
| std::vector<std::pair<std::string, int64_t>> words_list_; | |||
| }; | |||
| class LookupOperation : public TensorOperation { | |||
| public: | |||
| explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token, | |||
| const std::string &data_type); | |||
| ~LookupOperation(); | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kLookupOperation; } | |||
| private: | |||
| std::shared_ptr<Vocab> vocab_; | |||
| std::optional<std::string> unknown_token_; | |||
| int32_t default_id_; | |||
| std::string data_type_; | |||
| }; | |||
| class NgramOperation : public TensorOperation { | |||
| public: | |||
| explicit NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad, | |||
| const std::pair<std::string, int32_t> &right_pad, const std::string &separator); | |||
| ~NgramOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kNgramOperation; } | |||
| private: | |||
| std::vector<int32_t> ngrams_; | |||
| std::pair<std::string, int32_t> left_pad_; | |||
| std::pair<std::string, int32_t> right_pad_; | |||
| std::string separator_; | |||
| }; | |||
| #ifndef _WIN32 | |||
| class NormalizeUTF8Operation : public TensorOperation { | |||
| public: | |||
| explicit NormalizeUTF8Operation(NormalizeForm normalize_form); | |||
| ~NormalizeUTF8Operation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kNormalizeUTF8Operation; } | |||
| private: | |||
| NormalizeForm normalize_form_; | |||
| }; | |||
| class RegexReplaceOperation : public TensorOperation { | |||
| public: | |||
| RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all); | |||
| ~RegexReplaceOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kRegexReplaceOperation; } | |||
| private: | |||
| std::string pattern_; | |||
| std::string replace_; | |||
| bool replace_all_; | |||
| }; | |||
| class RegexTokenizerOperation : public TensorOperation { | |||
| public: | |||
| explicit RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, bool with_offsets); | |||
| ~RegexTokenizerOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kRegexTokenizerOperation; } | |||
| private: | |||
| std::string delim_pattern_; | |||
| std::string keep_delim_pattern_; | |||
| bool with_offsets_; | |||
| }; | |||
| #endif | |||
| class SentencePieceTokenizerOperation : public TensorOperation { | |||
| public: | |||
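| // Constructible either from an in-memory SentencePieceVocab or from a model file path on disk. | |||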
| SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type); | |||
| SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type); | |||
| ~SentencePieceTokenizerOperation(); | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kSentencepieceTokenizerOperation; } | |||
| private: | |||
| std::shared_ptr<SentencePieceVocab> vocab_; | |||
| std::string vocab_path_; | |||
| SPieceTokenizerLoadType load_type_; | |||
| SPieceTokenizerOutType out_type_; | |||
| }; | |||
| class SlidingWindowOperation : public TensorOperation { | |||
| public: | |||
| explicit SlidingWindowOperation(const int32_t width, const int32_t axis); | |||
| ~SlidingWindowOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kSlidingWindowOperation; } | |||
| private: | |||
| int32_t width_; | |||
| int32_t axis_; | |||
| }; | |||
| class ToNumberOperation : public TensorOperation { | |||
| public: | |||
| explicit ToNumberOperation(std::string data_type); | |||
| ~ToNumberOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kToNumberOperation; } | |||
| private: | |||
| std::string data_type_; | |||
| }; | |||
| class TruncateSequencePairOperation : public TensorOperation { | |||
| public: | |||
| explicit TruncateSequencePairOperation(int32_t max_length); | |||
| ~TruncateSequencePairOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kTruncateSequencePairOperation; } | |||
| private: | |||
| int32_t max_length_; | |||
| }; | |||
| class UnicodeCharTokenizerOperation : public TensorOperation { | |||
| public: | |||
| explicit UnicodeCharTokenizerOperation(bool with_offsets); | |||
| ~UnicodeCharTokenizerOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kUnicodeCharTokenizerOperation; } | |||
| private: | |||
| bool with_offsets_; | |||
| }; | |||
| #ifndef _WIN32 | |||
| class UnicodeScriptTokenizerOperation : public TensorOperation { | |||
| public: | |||
| explicit UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets); | |||
| ~UnicodeScriptTokenizerOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kUnicodeScriptTokenizerOperation; } | |||
| private: | |||
| bool keep_whitespace_; | |||
| bool with_offsets_; | |||
| }; | |||
| class WhitespaceTokenizerOperation : public TensorOperation { | |||
| public: | |||
| explicit WhitespaceTokenizerOperation(bool with_offsets); | |||
| ~WhitespaceTokenizerOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kWhitespaceTokenizerOperation; } | |||
| private: | |||
| bool with_offsets_; | |||
| }; | |||
| #endif | |||
| } // namespace text | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_KERNELS_TEXT_IR_H_ | |||
| @@ -0,0 +1,60 @@ | |||
| /** | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "minddata/dataset/text/ir/validators.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| /* ####################################### Validator Functions ############################################ */ | |||
| // Helper function to validate tokenizer directory parameter | |||
| Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file) { | |||
| if (tokenizer_file.empty()) { | |||
| std::string err_msg = tokenizer_name + ": tokenizer_file is not specified."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| Path file(tokenizer_file); | |||
| if (!file.Exists()) { | |||
| std::string err_msg = tokenizer_name + ": tokenizer_file: [" + tokenizer_file + "] is an invalid directory path."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (access(tokenizer_file.c_str(), R_OK) == -1) { | |||
| std::string err_msg = tokenizer_name + ": No access to specified tokenizer path: " + tokenizer_file; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
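| // Typical call site, as in JiebaTokenizerOperation::ValidateParams earlier in this change: | |||
| //   RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_)); | |||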
| // Helper functions to validate the data type passed by the user | |||
| bool IsTypeNumeric(const std::string &data_type) { | |||
| return data_type == "int8" || data_type == "uint8" || data_type == "int16" || data_type == "uint16" || | |||
| data_type == "int32" || data_type == "uint32" || data_type == "int64" || data_type == "uint64" || | |||
| data_type == "float16" || data_type == "float32" || data_type == "float64"; | |||
| } | |||
| bool IsTypeBoolean(const std::string &data_type) { return data_type == "bool"; } | |||
| bool IsTypeString(const std::string &data_type) { return data_type == "string"; } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,41 @@ | |||
| /** | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_VALIDATORS_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_VALIDATORS_H_ | |||
| #include <string> | |||
| #include "minddata/dataset/core/tensor.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| // Helper function to validate a tokenizer dictionary/model file parameter | |||
| Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file); | |||
| // Helper function to validate data type passed by user | |||
| bool IsTypeNumeric(const std::string &data_type); | |||
| // Helper function to validate data type is boolean | |||
| bool IsTypeBoolean(const std::string &data_type); | |||
| // Helper function to validate data type is string | |||
| bool IsTypeString(const std::string &data_type); | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_VALIDATORS_H_ | |||
| @@ -202,6 +202,7 @@ if(BUILD_MINDDATA STREQUAL "full") | |||
| ${MINDDATA_DIR}/kernels/data/type_cast_op.cc | |||
| ${MINDDATA_DIR}/kernels/image/exif_utils.cc | |||
| ${MINDDATA_DIR}/kernels/ir/validators.cc | |||
| ${MINDDATA_DIR}/kernels/ir/data/transforms_ir.cc | |||
| ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc | |||
| ${MINDDATA_DIR}/callback/callback_manager.cc | |||
| ${MINDDATA_DIR}/util/task_manager.cc | |||
| @@ -281,6 +282,7 @@ elseif(BUILD_MINDDATA STREQUAL "wrapper") | |||
| ${MINDDATA_DIR}/kernels/data/data_utils.cc | |||
| ${MINDDATA_DIR}/kernels/image/exif_utils.cc | |||
| ${MINDDATA_DIR}/kernels/ir/validators.cc | |||
| ${MINDDATA_DIR}/kernels/ir/data/transforms_ir.cc | |||
| ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/wrapper/MDToDApi.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/wrapper/album_op_android.cc | |||
| @@ -393,6 +395,7 @@ elseif(BUILD_MINDDATA STREQUAL "lite") | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/../src/common/log_adapter.cc | |||
| ${CORE_DIR}/utils/ms_utils.cc | |||
| ${MINDDATA_DIR}/kernels/ir/validators.cc | |||
| ${MINDDATA_DIR}/kernels/ir/data/transforms_ir.cc | |||
| ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc | |||
| ) | |||