From: @tina_mengting_zhang
Tag: tags/v1.2.0-rc1
@@ -92,11 +92,14 @@ add_dependencies(engine core)
 add_dependencies(callback core)
 add_dependencies(text core)
 add_dependencies(text-kernels core)
+add_dependencies(text-ir core)
+add_dependencies(text-ir-kernels core)
 add_dependencies(cpp-API core)
 add_dependencies(engine-ir-datasetops core)
 add_dependencies(engine-ir-datasetops-source core)
 add_dependencies(engine-ir-cache core)
 add_dependencies(kernels-ir core)
+add_dependencies(kernels-ir-data core)
 add_dependencies(kernels-ir-vision core)

 if(ENABLE_ACL)
@@ -146,7 +149,10 @@ set(submodules
   $<TARGET_OBJECTS:engine>
   $<TARGET_OBJECTS:text>
   $<TARGET_OBJECTS:text-kernels>
+  $<TARGET_OBJECTS:text-ir>
+  $<TARGET_OBJECTS:text-ir-kernels>
   $<TARGET_OBJECTS:kernels-ir>
+  $<TARGET_OBJECTS:kernels-ir-data>
   $<TARGET_OBJECTS:kernels-ir-vision>
 )
@@ -17,9 +17,9 @@
 #include "minddata/dataset/api/python/pybind_register.h"
 #include "minddata/dataset/core/global_context.h"
-#include "minddata/dataset/include/transforms.h"
 #include "minddata/dataset/kernels/py_func_op.h"
+#include "minddata/dataset/kernels/ir/data/transforms_ir.h"
 #include "minddata/dataset/kernels/ir/vision/vision_ir.h"

 namespace mindspore {
@@ -18,7 +18,7 @@
 #include "pybind11/stl.h"
 #include "pybind11/stl_bind.h"
 #include "minddata/dataset/api/python/pybind_register.h"
-#include "minddata/dataset/include/text.h"
+#include "minddata/dataset/text/ir/kernels/text_ir.h"
 #include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h"
 #include "minddata/dataset/text/sentence_piece_vocab.h"
 #include "minddata/dataset/text/vocab.h"
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,10 +30,10 @@
 #include "pybind11/stl_bind.h"
 #include "minddata/dataset/include/datasets.h"
 #include "minddata/dataset/include/samplers.h"
-#include "minddata/dataset/include/transforms.h"
 #include "minddata/dataset/api/python/pybind_register.h"
 #include "minddata/dataset/engine/ir/cache/pre_built_dataset_cache.h"
 #include "minddata/dataset/engine/ir/datasetops/source/csv_node.h"
+#include "minddata/dataset/kernels/ir/data/transforms_ir.h"
 #include "minddata/dataset/kernels/py_func_op.h"

 namespace py = pybind11;
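
Review note: with this change the Python bindings include the internal IR headers (transforms_ir.h, text_ir.h) rather than the public API headers. A minimal sketch of how such an IR class can be exposed through pybind11; the bare PYBIND11_MODULE and the module name are illustrative assumptions, since the real code registers through MindSpore's PYBIND_REGISTER machinery:

    #include "pybind11/pybind11.h"
    #include "minddata/dataset/kernels/ir/data/transforms_ir.h"

    namespace py = pybind11;
    using mindspore::dataset::transforms::OneHotOperation;

    // Illustrative module only: binds the IR class and surfaces parameter
    // validation to Python as a bool (Status::IsError() is used the same way
    // in JiebaTokenizerOperation::Build() below).
    PYBIND11_MODULE(_md_ir_sketch, m) {
      py::class_<OneHotOperation, std::shared_ptr<OneHotOperation>>(m, "OneHotOperation")
        .def(py::init<int32_t>())
        .def("validate", [](OneHotOperation &op) { return !op.ValidateParams().IsError(); });
    }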
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,30 +17,6 @@
 #include <unistd.h>
 #include "minddata/dataset/include/text.h"
-#ifndef _WIN32
-#include "minddata/dataset/text/kernels/basic_tokenizer_op.h"
-#include "minddata/dataset/text/kernels/bert_tokenizer_op.h"
-#include "minddata/dataset/text/kernels/case_fold_op.h"
-#endif
-#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
-#include "minddata/dataset/text/kernels/lookup_op.h"
-#include "minddata/dataset/text/kernels/ngram_op.h"
-#ifndef _WIN32
-#include "minddata/dataset/text/kernels/normalize_utf8_op.h"
-#include "minddata/dataset/text/kernels/regex_replace_op.h"
-#include "minddata/dataset/text/kernels/regex_tokenizer_op.h"
-#endif
-#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
-#include "minddata/dataset/text/kernels/sliding_window_op.h"
-#include "minddata/dataset/text/kernels/to_number_op.h"
-#include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h"
-#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
-#ifndef _WIN32
-#include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h"
-#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
-#endif
-#include "minddata/dataset/core/data_type.h"
-#include "minddata/dataset/util/path.h"

 namespace mindspore {
 namespace dataset {
@@ -174,426 +150,6 @@ std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offs
   return op->ValidateParams() ? op : nullptr;
 }
 #endif
-
-/* ####################################### Validator Functions ############################################ */
-
-// Helper function to validate tokenizer directory parameter
-Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file) {
-  if (tokenizer_file.empty()) {
-    std::string err_msg = tokenizer_name + ": tokenizer_file is not specified.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  Path file(tokenizer_file);
-  if (!file.Exists()) {
-    std::string err_msg = tokenizer_name + ": tokenizer_file: [" + tokenizer_file + "] is an invalid directory path.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  if (access(tokenizer_file.c_str(), R_OK) == -1) {
-    std::string err_msg = tokenizer_name + ": No access to specified tokenizer path: " + tokenizer_file;
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  return Status::OK();
-}
-
-// Helper functions to help validate data type passed by user
-bool IsTypeNumeric(const std::string &data_type) {
-  if (data_type == "int8" || data_type == "uint8" || data_type == "int16" || data_type == "uint16" ||
-      data_type == "int32" || data_type == "uint32" || data_type == "int64" || data_type == "uint64" ||
-      data_type == "float16" || data_type == "float32" || data_type == "float64")
-    return true;
-  return false;
-}
-
-bool IsTypeBoolean(const std::string &data_type) { return data_type == "bool"; }
-
-bool IsTypeString(const std::string &data_type) { return data_type == "string"; }
-
-/* ####################################### Derived TensorOperation classes ################################# */
-
-// (In alphabetical order)
-
-#ifndef _WIN32
-// BasicTokenizerOperation
-BasicTokenizerOperation::BasicTokenizerOperation(bool lower_case, bool keep_whitespace,
-                                                 const NormalizeForm normalize_form, bool preserve_unused_token,
-                                                 bool with_offsets)
-    : lower_case_(lower_case),
-      keep_whitespace_(keep_whitespace),
-      normalize_form_(normalize_form),
-      preserve_unused_token_(preserve_unused_token),
-      with_offsets_(with_offsets) {}
-
-Status BasicTokenizerOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> BasicTokenizerOperation::Build() {
-  std::shared_ptr<BasicTokenizerOp> tensor_op = std::make_shared<BasicTokenizerOp>(
-    lower_case_, keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_);
-  return tensor_op;
-}
-
-// BertTokenizerOperation
-BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
-                                               int32_t max_bytes_per_token, const std::string &unknown_token,
-                                               bool lower_case, bool keep_whitespace,
-                                               const NormalizeForm normalize_form, bool preserve_unused_token,
-                                               bool with_offsets)
-    : vocab_(vocab),
-      suffix_indicator_(suffix_indicator),
-      max_bytes_per_token_(max_bytes_per_token),
-      unknown_token_(unknown_token),
-      lower_case_(lower_case),
-      keep_whitespace_(keep_whitespace),
-      normalize_form_(normalize_form),
-      preserve_unused_token_(preserve_unused_token),
-      with_offsets_(with_offsets) {}
-
-BertTokenizerOperation::~BertTokenizerOperation() = default;
-
-Status BertTokenizerOperation::ValidateParams() {
-  if (vocab_ == nullptr) {
-    std::string err_msg = "BertTokenizer: vocab object type is incorrect or null.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  if (max_bytes_per_token_ < 0) {
-    std::string err_msg = "BertTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " +
-                          std::to_string(max_bytes_per_token_);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> BertTokenizerOperation::Build() {
-  std::shared_ptr<BertTokenizerOp> tensor_op =
-    std::make_shared<BertTokenizerOp>(vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, lower_case_,
-                                      keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_);
-  return tensor_op;
-}
-
-// CaseFoldOperation
-Status CaseFoldOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> CaseFoldOperation::Build() {
-  std::shared_ptr<CaseFoldOp> tensor_op = std::make_shared<CaseFoldOp>();
-  return tensor_op;
-}
-#endif
-
-// JiebaTokenizerOperation
-JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path,
-                                                 const JiebaMode &mode, bool with_offsets)
-    : hmm_path_(hmm_path), mp_path_(mp_path), mode_(mode), with_offsets_(with_offsets) {}
-
-Status JiebaTokenizerOperation::ValidateParams() {
-  if (hmm_path_.empty()) {
-    std::string err_msg = "JiebaTokenizer: The dict of HMMSegment in cppjieba is not provided.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  if (mp_path_.empty()) {
-    std::string err_msg = "JiebaTokenizer: The dict of MPSegment in cppjieba is not provided.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_));
-  RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", mp_path_));
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() {
-  std::shared_ptr<JiebaTokenizerOp> tensor_op =
-    std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_);
-  for (auto &word : words_list_) {
-    Status rc = tensor_op->AddWord(word.first, word.second);
-    if (rc.IsError()) {
-      MS_LOG(ERROR) << rc;
-      return {};
-    }
-  }
-  return tensor_op;
-}
-
-Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) {
-  if (word.empty()) {
-    std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  if (freq < 0) {
-    std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  words_list_.emplace_back(word, freq);
-  return Status::OK();
-}
-
-// LookupOperation
-LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
-                                 const std::string &data_type)
-    : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}
-
-LookupOperation::~LookupOperation() = default;
-
-Status LookupOperation::ValidateParams() {
-  if (vocab_ == nullptr) {
-    std::string err_msg = "Lookup: vocab object type is incorrect or null.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  if (unknown_token_ != std::nullopt) {
-    default_id_ = vocab_->Lookup(*unknown_token_);
-    if (default_id_ == Vocab::kNoTokenExists) {
-      std::string err_msg = "Lookup: \"" + *unknown_token_ + "\" doesn't exist in vocab.";
-      MS_LOG(ERROR) << err_msg;
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-  }
-
-  if (!IsTypeNumeric(data_type_)) {
-    std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> LookupOperation::Build() {
-  std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, DataType(data_type_));
-  return tensor_op;
-}
-
-// NgramOperation
-NgramOperation::NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
-                               const std::pair<std::string, int32_t> &right_pad, const std::string &separator)
-    : ngrams_(ngrams), left_pad_(left_pad), right_pad_(right_pad), separator_(separator) {}
-
-Status NgramOperation::ValidateParams() {
-  if (ngrams_.size() == 0) {
-    std::string err_msg = "Ngram : Container cannot be empty.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  } else {
-    for (int32_t i = 0; i < ngrams_.size(); ++i) {
-      if (ngrams_[i] <= 0) {
-        std::string err_msg =
-          "Ngram : The value of ngrams vector must be greater than 0: " + std::to_string(ngrams_[i]);
-        MS_LOG(ERROR) << err_msg;
-        RETURN_STATUS_SYNTAX_ERROR(err_msg);
-      }
-    }
-  }
-
-  if (left_pad_.second < 0) {
-    std::string err_msg =
-      "Ngram : The second parameter pad_width in left_pad vector must be greater than or equal to 0: " +
-      std::to_string(left_pad_.second);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  if (right_pad_.second < 0) {
-    std::string err_msg =
-      "Ngram : The second parameter pad_width in right_pad vector must be greater than or equal to 0: " +
-      std::to_string(right_pad_.second);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> NgramOperation::Build() {
-  int32_t l_len = left_pad_.second;
-  int32_t r_len = right_pad_.second;
-  std::string l_pad = left_pad_.first;
-  std::string r_pad = right_pad_.first;
-  std::shared_ptr<NgramOp> tensor_op = std::make_shared<NgramOp>(ngrams_, l_len, r_len, l_pad, r_pad, separator_);
-  return tensor_op;
-}
-
-#ifndef _WIN32
-// NormalizeUTF8Operation
-NormalizeUTF8Operation::NormalizeUTF8Operation(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}
-
-Status NormalizeUTF8Operation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> NormalizeUTF8Operation::Build() {
-  std::shared_ptr<NormalizeUTF8Op> tensor_op = std::make_shared<NormalizeUTF8Op>(normalize_form_);
-  return tensor_op;
-}
-
-// RegexReplaceOperation
-RegexReplaceOperation::RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all)
-    : pattern_(pattern), replace_(replace), replace_all_(replace_all) {}
-
-Status RegexReplaceOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> RegexReplaceOperation::Build() {
-  std::shared_ptr<RegexReplaceOp> tensor_op = std::make_shared<RegexReplaceOp>(pattern_, replace_, replace_all_);
-  return tensor_op;
-}
-
-// RegexTokenizerOperation
-RegexTokenizerOperation::RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern,
-                                                 bool with_offsets)
-    : delim_pattern_(delim_pattern), keep_delim_pattern_(keep_delim_pattern), with_offsets_(with_offsets) {}
-
-Status RegexTokenizerOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> RegexTokenizerOperation::Build() {
-  std::shared_ptr<RegexTokenizerOp> tensor_op =
-    std::make_shared<RegexTokenizerOp>(delim_pattern_, keep_delim_pattern_, with_offsets_);
-  return tensor_op;
-}
-#endif
-
-// SentencePieceTokenizerOperation
-SentencePieceTokenizerOperation::~SentencePieceTokenizerOperation() = default;
-
-SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
-                                                                 SPieceTokenizerOutType out_type)
-    : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {}
-
-SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::string &vocab_path,
-                                                                 SPieceTokenizerOutType out_type)
-    : vocab_(nullptr), vocab_path_(vocab_path), load_type_(SPieceTokenizerLoadType::kFile), out_type_(out_type) {}
-
-Status SentencePieceTokenizerOperation::ValidateParams() {
-  if (load_type_ == SPieceTokenizerLoadType::kModel) {
-    if (vocab_ == nullptr) {
-      std::string err_msg = "SentencePieceTokenizer: vocab object type is incorrect or null.";
-      MS_LOG(ERROR) << err_msg;
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-  } else {
-    Path vocab_file(vocab_path_);
-    if (!vocab_file.Exists() || vocab_file.IsDirectory()) {
-      std::string err_msg = "SentencePieceTokenizer : vocab file: [" + vocab_path_ + "] is invalid or does not exist.";
-      MS_LOG(ERROR) << err_msg;
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-    if (access(vocab_file.toString().c_str(), R_OK) == -1) {
-      std::string err_msg = "SentencePieceTokenizer : no access to specified dataset file: " + vocab_path_;
-      MS_LOG(ERROR) << err_msg;
-      RETURN_STATUS_SYNTAX_ERROR(err_msg);
-    }
-  }
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() {
-  std::shared_ptr<SentencePieceTokenizerOp> tensor_op;
-  if (load_type_ == SPieceTokenizerLoadType::kModel) {
-    tensor_op = std::make_shared<SentencePieceTokenizerOp>(vocab_, load_type_, out_type_);
-  } else {
-    Path vocab_file(vocab_path_);
-    std::string model_path = vocab_file.ParentPath();
-    std::string model_filename = vocab_file.Basename();
-    tensor_op = std::make_shared<SentencePieceTokenizerOp>(model_path, model_filename, load_type_, out_type_);
-  }
-  return tensor_op;
-}
-
-// SlidingWindowOperation
-SlidingWindowOperation::SlidingWindowOperation(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {}
-
-Status SlidingWindowOperation::ValidateParams() {
-  if (width_ < 1) {
-    std::string err_msg =
-      "SlidingWindow : The parameter width must be greater than or equal to 1: " + std::to_string(width_);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
-  std::shared_ptr<SlidingWindowOp> tensor_op = std::make_shared<SlidingWindowOp>(static_cast<uint32_t>(width_), axis_);
-  return tensor_op;
-}
-
-// ToNumberOperation
-ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {}
-
-Status ToNumberOperation::ValidateParams() {
-  if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) {
-    std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_;
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> ToNumberOperation::Build() {
-  std::shared_ptr<ToNumberOp> tensor_op = std::make_shared<ToNumberOp>(data_type_);
-  return tensor_op;
-}
-
-TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {}
-
-Status TruncateSequencePairOperation::ValidateParams() {
-  if (max_length_ < 0) {
-    std::string err_msg = "TruncateSequencePair : The parameter max_length must be greater than or equal to 0: " +
-                          std::to_string(max_length_);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> TruncateSequencePairOperation::Build() {
-  std::shared_ptr<TruncateSequencePairOp> tensor_op = std::make_shared<TruncateSequencePairOp>(max_length_);
-  return tensor_op;
-}
-
-// UnicodeCharTokenizerOperation
-UnicodeCharTokenizerOperation::UnicodeCharTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}
-
-Status UnicodeCharTokenizerOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() {
-  std::shared_ptr<UnicodeCharTokenizerOp> tensor_op = std::make_shared<UnicodeCharTokenizerOp>(with_offsets_);
-  return tensor_op;
-}
-
-#ifndef _WIN32
-// UnicodeScriptTokenizerOperation
-UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets)
-    : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
-
-Status UnicodeScriptTokenizerOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> UnicodeScriptTokenizerOperation::Build() {
-  std::shared_ptr<UnicodeScriptTokenizerOp> tensor_op =
-    std::make_shared<UnicodeScriptTokenizerOp>(keep_whitespace_, with_offsets_);
-  return tensor_op;
-}
-
-// WhitespaceTokenizerOperation
-WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {}
-
-Status WhitespaceTokenizerOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> WhitespaceTokenizerOperation::Build() {
-  std::shared_ptr<WhitespaceTokenizerOp> tensor_op = std::make_shared<WhitespaceTokenizerOp>(with_offsets_);
-  return tensor_op;
-}
-#endif
 } // namespace text
 } // namespace dataset
 } // namespace mindspore
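
Review note: the factory functions kept in this file still follow the validate-then-return idiom shown above (`return op->ValidateParams() ? op : nullptr;`), so C++ callers must null-check the result. A minimal usage sketch under that assumption:

    #include "minddata/dataset/include/text.h"

    namespace ds = mindspore::dataset;

    // Factories run ValidateParams() internally and return nullptr on bad input.
    void TokenizerSketch() {
    #ifndef _WIN32
      std::shared_ptr<ds::text::WhitespaceTokenizerOperation> tokenizer = ds::text::WhitespaceTokenizer();
      if (tokenizer == nullptr) {
        // Parameter validation failed; nothing to add to the pipeline.
      }
    #endif
    }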
@@ -15,18 +15,6 @@
  */
 #include "minddata/dataset/include/transforms.h"
-#include "minddata/dataset/kernels/ir/validators.h"
-
-// Kernel data headers (in alphabetical order)
-#include "minddata/dataset/kernels/data/compose_op.h"
-#include "minddata/dataset/kernels/data/duplicate_op.h"
-#include "minddata/dataset/kernels/data/one_hot_op.h"
-#include "minddata/dataset/kernels/data/random_apply_op.h"
-#include "minddata/dataset/kernels/data/random_choice_op.h"
-#include "minddata/dataset/kernels/data/type_cast_op.h"
-#ifndef ENABLE_ANDROID
-#include "minddata/dataset/kernels/data/unique_op.h"
-#endif

 namespace mindspore {
 namespace dataset {
@@ -88,122 +76,6 @@ std::shared_ptr<UniqueOperation> Unique() {
   return op->ValidateParams() ? op : nullptr;
 }
 #endif
-
-/* ####################################### Validator Functions ############################################ */
-
-/* ####################################### Derived TensorOperation classes ################################# */
-
-// (In alphabetical order)
-
-// ComposeOperation
-ComposeOperation::ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms)
-    : transforms_(transforms) {}
-
-Status ComposeOperation::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateVectorTransforms("Compose", transforms_));
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> ComposeOperation::Build() {
-  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
-  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
-                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
-  return std::make_shared<ComposeOp>(tensor_ops);
-}
-
-// DuplicateOperation
-Status DuplicateOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> DuplicateOperation::Build() { return std::make_shared<DuplicateOp>(); }
-
-// OneHotOperation
-OneHotOperation::OneHotOperation(int32_t num_classes) : num_classes_(num_classes) {}
-
-Status OneHotOperation::ValidateParams() {
-  if (num_classes_ <= 0) {
-    std::string err_msg = "OneHot: Number of classes must be greater than 0, but got: " + std::to_string(num_classes_);
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> OneHotOperation::Build() { return std::make_shared<OneHotOp>(num_classes_); }
-
-// PreBuiltOperation
-PreBuiltOperation::PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op) : op_(tensor_op) {}
-
-Status PreBuiltOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> PreBuiltOperation::Build() { return op_; }
-
-std::string PreBuiltOperation::Name() const { return op_ ? op_->Name() : kPreBuiltOperation; }
-
-Status PreBuiltOperation::to_json(nlohmann::json *out_json) {
-  RETURN_IF_NOT_OK(op_->to_json(out_json));
-  return Status::OK();
-}
-
-// RandomApplyOperation
-RandomApplyOperation::RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob)
-    : TensorOperation(true), transforms_(transforms), prob_(prob) {}
-
-Status RandomApplyOperation::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomApply", transforms_));
-  RETURN_IF_NOT_OK(ValidateProbability("RandomApply", prob_));
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> RandomApplyOperation::Build() {
-  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
-  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
-                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
-  return std::make_shared<RandomApplyOp>(prob_, tensor_ops);
-}
-
-// RandomChoiceOperation
-RandomChoiceOperation::RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms)
-    : TensorOperation(true), transforms_(transforms) {}
-
-Status RandomChoiceOperation::ValidateParams() {
-  RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomChoice", transforms_));
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> RandomChoiceOperation::Build() {
-  std::vector<std::shared_ptr<TensorOp>> tensor_ops;
-  (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
-                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
-  return std::make_shared<RandomChoiceOp>(tensor_ops);
-}
-
-// TypeCastOperation
-TypeCastOperation::TypeCastOperation(std::string data_type) : data_type_(data_type) {}
-
-Status TypeCastOperation::ValidateParams() {
-  std::vector<std::string> predefine_type = {"bool", "int8", "uint8", "int16", "uint16", "int32", "uint32",
-                                             "int64", "uint64", "float16", "float32", "float64", "string"};
-  auto itr = std::find(predefine_type.begin(), predefine_type.end(), data_type_);
-  if (itr == predefine_type.end()) {
-    std::string err_msg = "TypeCast: Invalid data type: " + data_type_;
-    MS_LOG(ERROR) << "TypeCast: Only supports data type bool, int8, uint8, int16, uint16, int32, uint32, "
-                  << "int64, uint64, float16, float32, float64, string, but got: " << data_type_;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-  return Status::OK();
-}
-
-std::shared_ptr<TensorOp> TypeCastOperation::Build() { return std::make_shared<TypeCastOp>(data_type_); }
-
-#ifndef ENABLE_ANDROID
-// UniqueOperation
-Status UniqueOperation::ValidateParams() { return Status::OK(); }
-
-std::shared_ptr<TensorOp> UniqueOperation::Build() { return std::make_shared<UniqueOp>(); }
-#endif
 } // namespace transforms
 } // namespace dataset
 } // namespace mindspore
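
Review note: the data-transform factories behave the same way; TypeCast's validation (see TypeCastOperation::ValidateParams above) rejects any type name outside the predefined list. A short usage sketch:

    #include "minddata/dataset/include/transforms.h"

    namespace ds = mindspore::dataset;

    void TypeCastSketch() {
      // Accepted: one of the predefined names, e.g. "int32".
      auto ok = ds::transforms::TypeCast("int32");    // non-null
      // Rejected: logged via MS_LOG(ERROR) and returned as nullptr.
      auto bad = ds::transforms::TypeCast("int128");  // nullptr
      if (bad == nullptr) {
        // Handle the invalid transform configuration here.
      }
    }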
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,9 +21,9 @@
 #include "minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.h"
 #include "minddata/dataset/engine/ir/datasetops/map_node.h"
-#include "minddata/dataset/include/transforms.h"
 #include "minddata/dataset/kernels/image/random_crop_and_resize_op.h"
 #include "minddata/dataset/kernels/image/random_crop_decode_resize_op.h"
+#include "minddata/dataset/kernels/ir/data/transforms_ir.h"
 #include "minddata/dataset/kernels/ir/vision/vision_ir.h"

 namespace mindspore {
@@ -27,6 +27,9 @@
 #include "minddata/dataset/include/constants.h"
 #include "minddata/dataset/include/transforms.h"
+
+// FIXME - This internal IR header will be removed when external API classes are provided
+#include "minddata/dataset/text/ir/kernels/text_ir.h"

 namespace mindspore {
 namespace dataset {
@@ -36,24 +39,6 @@ class SentencePieceVocab;
 // Transform operations for text
 namespace text {
-// Char arrays storing name of corresponding classes (in alphabetical order)
-constexpr char kBasicTokenizerOperation[] = "BasicTokenizer";
-constexpr char kBertTokenizerOperation[] = "BertTokenizer";
-constexpr char kCaseFoldOperation[] = "CaseFold";
-constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer";
-constexpr char kLookupOperation[] = "Lookup";
-constexpr char kNgramOperation[] = "Ngram";
-constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8";
-constexpr char kRegexReplaceOperation[] = "RegexReplace";
-constexpr char kRegexTokenizerOperation[] = "RegexTokenizer";
-constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer";
-constexpr char kSlidingWindowOperation[] = "SlidingWindow";
-constexpr char kToNumberOperation[] = "ToNumber";
-constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair";
-constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
-constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer";
-constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer";
 // Text Op classes (in alphabetical order)
 #ifndef _WIN32
 class BasicTokenizerOperation;
@@ -255,309 +240,6 @@ std::shared_ptr<UnicodeScriptTokenizerOperation> UnicodeScriptTokenizer(bool kee
 /// \return Shared pointer to the current TensorOperation.
 std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets = false);
 #endif
-
-/* ####################################### Derived TensorOperation classes ################################# */
-
-#ifndef _WIN32
-class BasicTokenizerOperation : public TensorOperation {
- public:
-  BasicTokenizerOperation(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
-                          bool preserve_unused_token, bool with_offsets);
-
-  ~BasicTokenizerOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kBasicTokenizerOperation; }
-
- private:
-  bool lower_case_;
-  bool keep_whitespace_;
-  NormalizeForm normalize_form_;
-  bool preserve_unused_token_;
-  bool with_offsets_;
-};
-
-class BertTokenizerOperation : public TensorOperation {
- public:
-  BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
-                         int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case,
-                         bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
-                         bool with_offsets);
-
-  ~BertTokenizerOperation();
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kBertTokenizerOperation; }
-
- private:
-  std::shared_ptr<Vocab> vocab_;
-  std::string suffix_indicator_;
-  int32_t max_bytes_per_token_;
-  std::string unknown_token_;
-  bool lower_case_;
-  bool keep_whitespace_;
-  NormalizeForm normalize_form_;
-  bool preserve_unused_token_;
-  bool with_offsets_;
-};
-
-class CaseFoldOperation : public TensorOperation {
- public:
-  CaseFoldOperation() = default;
-
-  ~CaseFoldOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kCaseFoldOperation; }
-};
-#endif
-
-class JiebaTokenizerOperation : public TensorOperation {
- public:
-  explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode,
-                                   bool with_offsets);
-
-  ~JiebaTokenizerOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kJiebaTokenizerOperation; }
-
-  Status AddWord(const std::string &word, int64_t freq = 0);
-
- private:
-  std::string hmm_path_;
-  std::string mp_path_;
-  JiebaMode mode_;
-  bool with_offsets_;
-  std::vector<std::pair<std::string, int64_t>> words_list_;
-};
-
-class LookupOperation : public TensorOperation {
- public:
-  explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
-                           const std::string &data_type);
-
-  ~LookupOperation();
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kLookupOperation; }
-
- private:
-  std::shared_ptr<Vocab> vocab_;
-  std::optional<std::string> unknown_token_;
-  int32_t default_id_;
-  std::string data_type_;
-};
-
-class NgramOperation : public TensorOperation {
- public:
-  explicit NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
-                          const std::pair<std::string, int32_t> &right_pad, const std::string &separator);
-
-  ~NgramOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kNgramOperation; }
-
- private:
-  std::vector<int32_t> ngrams_;
-  std::pair<std::string, int32_t> left_pad_;
-  std::pair<std::string, int32_t> right_pad_;
-  std::string separator_;
-};
-
-#ifndef _WIN32
-class NormalizeUTF8Operation : public TensorOperation {
- public:
-  explicit NormalizeUTF8Operation(NormalizeForm normalize_form);
-
-  ~NormalizeUTF8Operation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kNormalizeUTF8Operation; }
-
- private:
-  NormalizeForm normalize_form_;
-};
-
-class RegexReplaceOperation : public TensorOperation {
- public:
-  RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all);
-
-  ~RegexReplaceOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kRegexReplaceOperation; }
-
- private:
-  std::string pattern_;
-  std::string replace_;
-  bool replace_all_;
-};
-
-class RegexTokenizerOperation : public TensorOperation {
- public:
-  explicit RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, bool with_offsets);
-
-  ~RegexTokenizerOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kRegexTokenizerOperation; }
-
- private:
-  std::string delim_pattern_;
-  std::string keep_delim_pattern_;
-  bool with_offsets_;
-};
-#endif
-
-class SentencePieceTokenizerOperation : public TensorOperation {
- public:
-  SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type);
-
-  SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type);
-
-  ~SentencePieceTokenizerOperation();
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kSentencepieceTokenizerOperation; }
-
- private:
-  std::shared_ptr<SentencePieceVocab> vocab_;
-  std::string vocab_path_;
-  SPieceTokenizerLoadType load_type_;
-  SPieceTokenizerOutType out_type_;
-};
-
-class SlidingWindowOperation : public TensorOperation {
- public:
-  explicit SlidingWindowOperation(const int32_t width, const int32_t axis);
-
-  ~SlidingWindowOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kSlidingWindowOperation; }
-
- private:
-  int32_t width_;
-  int32_t axis_;
-};
-
-class ToNumberOperation : public TensorOperation {
- public:
-  explicit ToNumberOperation(std::string data_type);
-
-  ~ToNumberOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kToNumberOperation; }
-
- private:
-  std::string data_type_;
-};
-
-class TruncateSequencePairOperation : public TensorOperation {
- public:
-  explicit TruncateSequencePairOperation(int32_t max_length);
-
-  ~TruncateSequencePairOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kTruncateSequencePairOperation; }
-
- private:
-  int32_t max_length_;
-};
-
-class UnicodeCharTokenizerOperation : public TensorOperation {
- public:
-  explicit UnicodeCharTokenizerOperation(bool with_offsets);
-
-  ~UnicodeCharTokenizerOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kUnicodeCharTokenizerOperation; }
-
- private:
-  bool with_offsets_;
-};
-
-#ifndef _WIN32
-class UnicodeScriptTokenizerOperation : public TensorOperation {
- public:
-  explicit UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets);
-
-  ~UnicodeScriptTokenizerOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kUnicodeScriptTokenizerOperation; }
-
- private:
-  bool keep_whitespace_;
-  bool with_offsets_;
-};
-
-class WhitespaceTokenizerOperation : public TensorOperation {
- public:
-  explicit WhitespaceTokenizerOperation(bool with_offsets);
-
-  ~WhitespaceTokenizerOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kWhitespaceTokenizerOperation; }
-
- private:
-  bool with_offsets_;
-};
-#endif
 } // namespace text
 } // namespace dataset
 } // namespace mindspore
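
Review note: every IR class in this header follows the same contract: a constructor that only stores parameters, ValidateParams() for all checking, Build() to produce the runtime TensorOp, and Name() backed by a constexpr name string. A sketch of what a new operation would add, with the caveat that ReverseCharOperation and its name constant are hypothetical and do not exist in this PR:

    // Hypothetical example following the pattern above; the name constant would
    // move to text_ir.h together with the others in this change.
    constexpr char kReverseCharOperation[] = "ReverseChar";

    class ReverseCharOperation : public TensorOperation {
     public:
      explicit ReverseCharOperation(bool with_offsets);

      ~ReverseCharOperation() = default;

      std::shared_ptr<TensorOp> Build() override;

      Status ValidateParams() override;

      std::string Name() const override { return kReverseCharOperation; }

     private:
      bool with_offsets_;
    };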
@@ -25,40 +25,12 @@
 #include "include/api/status.h"
 #include "minddata/dataset/include/constants.h"
-// (TEMPORARY) will be removed when Tensor op ir moved down
-#include "minddata/dataset/kernels/ir/tensor_operation.h"
-
-#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_
-#define INCLUDE_NLOHMANN_JSON_FWD_HPP_
-namespace nlohmann {
-template <typename T = void, typename SFINAE = void>
-struct adl_serializer;
-
-template <template <typename U, typename V, typename... Args> class ObjectType = std::map,
-          template <typename U, typename... Args> class ArrayType = std::vector, class StringType = std::string,
-          class BooleanType = bool, class NumberIntegerType = std::int64_t, class NumberUnsignedType = std::uint64_t,
-          class NumberFloatType = double, template <typename U> class AllocatorType = std::allocator,
-          template <typename T, typename SFINAE = void> class JSONSerializer = adl_serializer>
-class basic_json;
-
-template <typename BasicJsonType>
-class json_pointer;
-
-using json = basic_json<>;
-}  // namespace nlohmann
-#endif  // INCLUDE_NLOHMANN_JSON_FWD_HPP_
+
+// FIXME - This internal IR header will be removed when external API classes are provided
+#include "minddata/dataset/kernels/ir/data/transforms_ir.h"

 namespace mindspore {
 namespace dataset {
-// Char arrays storing name of corresponding classes (in alphabetical order)
-constexpr char kComposeOperation[] = "Compose";
-constexpr char kDuplicateOperation[] = "Duplicate";
-constexpr char kOneHotOperation[] = "OneHot";
-constexpr char kPreBuiltOperation[] = "PreBuilt";
-constexpr char kRandomApplyOperation[] = "RandomApply";
-constexpr char kRandomChoiceOperation[] = "RandomChoice";
-constexpr char kRandomSelectSubpolicyOperation[] = "RandomSelectSubpolicy";
-constexpr char kTypeCastOperation[] = "TypeCast";
-constexpr char kUniqueOperation[] = "Unique";
 // Transform operations for performing data transformation.
 namespace transforms {
@@ -119,134 +91,6 @@ std::shared_ptr<TypeCastOperation> TypeCast(std::string data_type);
 /// \return Shared pointer to the current TensorOperation.
 std::shared_ptr<UniqueOperation> Unique();
 #endif
-
-/* ####################################### Derived TensorOperation classes ################################# */
-
-class ComposeOperation : public TensorOperation {
- public:
-  explicit ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms);
-
-  ~ComposeOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kComposeOperation; }
-
- private:
-  std::vector<std::shared_ptr<TensorOperation>> transforms_;
-};
-
-class DuplicateOperation : public TensorOperation {
- public:
-  DuplicateOperation() = default;
-
-  ~DuplicateOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kDuplicateOperation; }
-};
-
-class OneHotOperation : public TensorOperation {
- public:
-  explicit OneHotOperation(int32_t num_classes_);
-
-  ~OneHotOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kOneHotOperation; }
-
- private:
-  float num_classes_;
-};
-
-class PreBuiltOperation : public TensorOperation {
- public:
-  explicit PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op);
-
-  ~PreBuiltOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override;
-
-  Status to_json(nlohmann::json *out_json) override;
-
- private:
-  std::shared_ptr<TensorOp> op_;
-};
-
-class RandomApplyOperation : public TensorOperation {
- public:
-  explicit RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob);
-
-  ~RandomApplyOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kRandomApplyOperation; }
-
- private:
-  std::vector<std::shared_ptr<TensorOperation>> transforms_;
-  double prob_;
-};
-
-class RandomChoiceOperation : public TensorOperation {
- public:
-  explicit RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms);
-
-  ~RandomChoiceOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kRandomChoiceOperation; }
-
- private:
-  std::vector<std::shared_ptr<TensorOperation>> transforms_;
-};
-
-class TypeCastOperation : public TensorOperation {
- public:
-  explicit TypeCastOperation(std::string data_type);
-
-  ~TypeCastOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kTypeCastOperation; }
-
- private:
-  std::string data_type_;
-};
-
-#ifndef ENABLE_ANDROID
-class UniqueOperation : public TensorOperation {
- public:
-  UniqueOperation() = default;
-
-  ~UniqueOperation() = default;
-
-  std::shared_ptr<TensorOp> Build() override;
-
-  Status ValidateParams() override;
-
-  std::string Name() const override { return kUniqueOperation; }
-};
-#endif
 } // namespace transforms
 } // namespace dataset
 } // namespace mindspore
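
Review note: the nlohmann forward-declaration block travels with the IR classes because only the to_json declaration needs the json type; translation units that implement serialization include the full header. A minimal sketch of the implementation side, mirroring PreBuiltOperation::to_json; this serializer and its field are illustrative assumptions, since the PR only shows PreBuiltOperation overriding to_json:

    #include <nlohmann/json.hpp>  // the full header is needed only in the .cc file

    // Hypothetical serializer for an IR op; PreBuiltOperation::to_json above
    // instead forwards to the wrapped TensorOp.
    Status OneHotOperation::to_json(nlohmann::json *out_json) {
      nlohmann::json args;
      args["num_classes"] = num_classes_;  // illustrative field name
      *out_json = args;
      return Status::OK();
    }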
@@ -1,3 +1,4 @@
+add_subdirectory(data)
 add_subdirectory(vision)
 file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
@@ -0,0 +1,8 @@
+file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
+set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
+
+set(DATASET_KERNELS_IR_DATA_SRC_FILES
+  transforms_ir.cc
+)
+
+add_library(kernels-ir-data OBJECT ${DATASET_KERNELS_IR_DATA_SRC_FILES})
| @@ -0,0 +1,155 @@ | |||||
| /** | |||||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include <algorithm> | |||||
| #include "minddata/dataset/kernels/ir/data/transforms_ir.h" | |||||
| // Kernel data headers (in alphabetical order) | |||||
| #include "minddata/dataset/kernels/data/compose_op.h" | |||||
| #include "minddata/dataset/kernels/data/duplicate_op.h" | |||||
| #include "minddata/dataset/kernels/data/one_hot_op.h" | |||||
| #include "minddata/dataset/kernels/data/random_apply_op.h" | |||||
| #include "minddata/dataset/kernels/data/random_choice_op.h" | |||||
| #include "minddata/dataset/kernels/data/type_cast_op.h" | |||||
| #ifndef ENABLE_ANDROID | |||||
| #include "minddata/dataset/kernels/data/unique_op.h" | |||||
| #endif | |||||
| #include "minddata/dataset/kernels/ir/validators.h" | |||||
| namespace mindspore { | |||||
| namespace dataset { | |||||
| // Transform operations for data. | |||||
| namespace transforms { | |||||
| /* ####################################### Derived TensorOperation classes ################################# */ | |||||
| // (In alphabetical order) | |||||
| // ComposeOperation | |||||
| ComposeOperation::ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms) | |||||
| : transforms_(transforms) {} | |||||
| Status ComposeOperation::ValidateParams() { | |||||
| RETURN_IF_NOT_OK(ValidateVectorTransforms("Compose", transforms_)); | |||||
| return Status::OK(); | |||||
| } | |||||
| std::shared_ptr<TensorOp> ComposeOperation::Build() { | |||||
| std::vector<std::shared_ptr<TensorOp>> tensor_ops; | |||||
| (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops), | |||||
| [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); }); | |||||
| return std::make_shared<ComposeOp>(tensor_ops); | |||||
| } | |||||
| // DuplicateOperation | |||||
| Status DuplicateOperation::ValidateParams() { return Status::OK(); } | |||||
| std::shared_ptr<TensorOp> DuplicateOperation::Build() { return std::make_shared<DuplicateOp>(); } | |||||
| // OneHotOperation | |||||
| OneHotOperation::OneHotOperation(int32_t num_classes) : num_classes_(num_classes) {} | |||||
| Status OneHotOperation::ValidateParams() { | |||||
| if (num_classes_ <= 0) { | |||||
| std::string err_msg = "OneHot: Number of classes must be greater than 0, but got: " + std::to_string(num_classes_); | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| return Status::OK(); | |||||
| } | |||||
| std::shared_ptr<TensorOp> OneHotOperation::Build() { return std::make_shared<OneHotOp>(num_classes_); } | |||||
| // PreBuiltOperation | |||||
| PreBuiltOperation::PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op) : op_(tensor_op) {} | |||||
| Status PreBuiltOperation::ValidateParams() { return Status::OK(); } | |||||
| std::shared_ptr<TensorOp> PreBuiltOperation::Build() { return op_; } | |||||
| std::string PreBuiltOperation::Name() const { return op_ ? op_->Name() : kPreBuiltOperation; } | |||||
| Status PreBuiltOperation::to_json(nlohmann::json *out_json) { | |||||
| RETURN_IF_NOT_OK(op_->to_json(out_json)); | |||||
| return Status::OK(); | |||||
| } | |||||
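PreBuiltOperation is the adapter for kernels that already exist as TensorOp instances (for example, ops wrapped from Python callables). A minimal sketch using names from this file (WrapExistingOpExample is hypothetical):

std::shared_ptr<TensorOperation> WrapExistingOpExample() {
  std::shared_ptr<TensorOp> raw_op = std::make_shared<OneHotOp>(5);  // any prebuilt kernel
  // Name() forwards to the wrapped op; Build() returns it unchanged.
  return std::make_shared<PreBuiltOperation>(raw_op);
}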
| // RandomApplyOperation | |||||
| RandomApplyOperation::RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob) | |||||
| : TensorOperation(true), transforms_(transforms), prob_(prob) {} | |||||
| Status RandomApplyOperation::ValidateParams() { | |||||
| RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomApply", transforms_)); | |||||
| RETURN_IF_NOT_OK(ValidateProbability("RandomApply", prob_)); | |||||
| return Status::OK(); | |||||
| } | |||||
| std::shared_ptr<TensorOp> RandomApplyOperation::Build() { | |||||
| std::vector<std::shared_ptr<TensorOp>> tensor_ops; | |||||
| (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops), | |||||
| [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); }); | |||||
| return std::make_shared<RandomApplyOp>(prob_, tensor_ops); | |||||
| } | |||||
| // RandomChoiceOperation | |||||
| RandomChoiceOperation::RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms) | |||||
| : TensorOperation(true), transforms_(transforms) {} | |||||
| Status RandomChoiceOperation::ValidateParams() { | |||||
| RETURN_IF_NOT_OK(ValidateVectorTransforms("RandomChoice", transforms_)); | |||||
| return Status::OK(); | |||||
| } | |||||
| std::shared_ptr<TensorOp> RandomChoiceOperation::Build() { | |||||
| std::vector<std::shared_ptr<TensorOp>> tensor_ops; | |||||
| (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops), | |||||
| [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); }); | |||||
| return std::make_shared<RandomChoiceOp>(tensor_ops); | |||||
| } | |||||
| // TypeCastOperation | |||||
| TypeCastOperation::TypeCastOperation(std::string data_type) : data_type_(data_type) {} | |||||
| Status TypeCastOperation::ValidateParams() { | |||||
| std::vector<std::string> predefine_type = {"bool", "int8", "uint8", "int16", "uint16", "int32", "uint32", | |||||
| "int64", "uint64", "float16", "float32", "float64", "string"}; | |||||
| auto itr = std::find(predefine_type.begin(), predefine_type.end(), data_type_); | |||||
| if (itr == predefine_type.end()) { | |||||
| std::string err_msg = "TypeCast: Invalid data type: " + data_type_; | |||||
| MS_LOG(ERROR) << "TypeCast: Only supports data type bool, int8, uint8, int16, uint16, int32, uint32, " | |||||
| << "int64, uint64, float16, float32, float64, string, but got: " << data_type_; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| return Status::OK(); | |||||
| } | |||||
| std::shared_ptr<TensorOp> TypeCastOperation::Build() { return std::make_shared<TypeCastOp>(data_type_); } | |||||
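To make the whitelist above concrete, a hedged sketch of how validation behaves (TypeCastExample is hypothetical):

Status TypeCastExample() {
  TypeCastOperation ok_cast("int32");
  RETURN_IF_NOT_OK(ok_cast.ValidateParams());  // passes: "int32" is in predefine_type
  TypeCastOperation bad_cast("complex64");
  return bad_cast.ValidateParams();  // fails: logged and returned as a syntax error
}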
| #ifndef ENABLE_ANDROID | |||||
| // UniqueOperation | |||||
| Status UniqueOperation::ValidateParams() { return Status::OK(); } | |||||
| std::shared_ptr<TensorOp> UniqueOperation::Build() { return std::make_shared<UniqueOp>(); } | |||||
| #endif | |||||
| } // namespace transforms | |||||
| } // namespace dataset | |||||
| } // namespace mindspore | |||||
| @@ -0,0 +1,172 @@ | |||||
| /** | |||||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IR_DATA_TRANSFORMS_IR_H_ | |||||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IR_DATA_TRANSFORMS_IR_H_ | |||||
| #include <map> | |||||
| #include <memory> | |||||
| #include <string> | |||||
| #include <vector> | |||||
| #include "minddata/dataset/kernels/ir/tensor_operation.h" | |||||
| namespace mindspore { | |||||
| namespace dataset { | |||||
| // Char arrays storing the names of the corresponding classes (in alphabetical order) | |||||
| constexpr char kComposeOperation[] = "Compose"; | |||||
| constexpr char kDuplicateOperation[] = "Duplicate"; | |||||
| constexpr char kOneHotOperation[] = "OneHot"; | |||||
| constexpr char kPreBuiltOperation[] = "PreBuilt"; | |||||
| constexpr char kRandomApplyOperation[] = "RandomApply"; | |||||
| constexpr char kRandomChoiceOperation[] = "RandomChoice"; | |||||
| constexpr char kTypeCastOperation[] = "TypeCast"; | |||||
| constexpr char kUniqueOperation[] = "Unique"; | |||||
| // Transform operations for data. | |||||
| namespace transforms { | |||||
| /* ####################################### Derived TensorOperation classes ################################# */ | |||||
| class ComposeOperation : public TensorOperation { | |||||
| public: | |||||
| explicit ComposeOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms); | |||||
| ~ComposeOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kComposeOperation; } | |||||
| private: | |||||
| std::vector<std::shared_ptr<TensorOperation>> transforms_; | |||||
| }; | |||||
| class DuplicateOperation : public TensorOperation { | |||||
| public: | |||||
| DuplicateOperation() = default; | |||||
| ~DuplicateOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kDuplicateOperation; } | |||||
| }; | |||||
| class OneHotOperation : public TensorOperation { | |||||
| public: | |||||
| explicit OneHotOperation(int32_t num_classes); | |||||
| ~OneHotOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kOneHotOperation; } | |||||
| private: | |||||
| int32_t num_classes_; | |||||
| }; | |||||
| class PreBuiltOperation : public TensorOperation { | |||||
| public: | |||||
| explicit PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op); | |||||
| ~PreBuiltOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override; | |||||
| Status to_json(nlohmann::json *out_json) override; | |||||
| private: | |||||
| std::shared_ptr<TensorOp> op_; | |||||
| }; | |||||
| class RandomApplyOperation : public TensorOperation { | |||||
| public: | |||||
| explicit RandomApplyOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms, double prob); | |||||
| ~RandomApplyOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kRandomApplyOperation; } | |||||
| private: | |||||
| std::vector<std::shared_ptr<TensorOperation>> transforms_; | |||||
| double prob_; | |||||
| }; | |||||
| class RandomChoiceOperation : public TensorOperation { | |||||
| public: | |||||
| explicit RandomChoiceOperation(const std::vector<std::shared_ptr<TensorOperation>> &transforms); | |||||
| ~RandomChoiceOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kRandomChoiceOperation; } | |||||
| private: | |||||
| std::vector<std::shared_ptr<TensorOperation>> transforms_; | |||||
| }; | |||||
| class TypeCastOperation : public TensorOperation { | |||||
| public: | |||||
| explicit TypeCastOperation(std::string data_type); | |||||
| ~TypeCastOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kTypeCastOperation; } | |||||
| private: | |||||
| std::string data_type_; | |||||
| }; | |||||
| #ifndef ENABLE_ANDROID | |||||
| class UniqueOperation : public TensorOperation { | |||||
| public: | |||||
| UniqueOperation() = default; | |||||
| ~UniqueOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kUniqueOperation; } | |||||
| }; | |||||
| #endif | |||||
| } // namespace transforms | |||||
| } // namespace dataset | |||||
| } // namespace mindspore | |||||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IR_DATA_TRANSFORMS_IR_H_ | |||||
| @@ -1,3 +1,4 @@ | |||||
| add_subdirectory(ir) | |||||
| add_subdirectory(kernels) | add_subdirectory(kernels) | ||||
| file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | ||||
| @@ -0,0 +1,6 @@ | |||||
| add_subdirectory(kernels) | |||||
| file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | |||||
| set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) | |||||
| add_library(text-ir OBJECT validators.cc) | |||||
| @@ -0,0 +1,8 @@ | |||||
| file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") | |||||
| set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) | |||||
| set(DATASET_TEXT_IR_KERNELS_SRC_FILES | |||||
| text_ir.cc | |||||
| ) | |||||
| add_library(text-ir-kernels OBJECT ${DATASET_TEXT_IR_KERNELS_SRC_FILES}) | |||||
| @@ -0,0 +1,436 @@ | |||||
| /** | |||||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include <unistd.h> | |||||
| #include "minddata/dataset/text/ir/kernels/text_ir.h" | |||||
| #ifndef _WIN32 | |||||
| #include "minddata/dataset/text/kernels/basic_tokenizer_op.h" | |||||
| #include "minddata/dataset/text/kernels/bert_tokenizer_op.h" | |||||
| #include "minddata/dataset/text/kernels/case_fold_op.h" | |||||
| #endif | |||||
| #include "minddata/dataset/text/kernels/jieba_tokenizer_op.h" | |||||
| #include "minddata/dataset/text/kernels/lookup_op.h" | |||||
| #include "minddata/dataset/text/kernels/ngram_op.h" | |||||
| #ifndef _WIN32 | |||||
| #include "minddata/dataset/text/kernels/normalize_utf8_op.h" | |||||
| #include "minddata/dataset/text/kernels/regex_replace_op.h" | |||||
| #include "minddata/dataset/text/kernels/regex_tokenizer_op.h" | |||||
| #endif | |||||
| #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h" | |||||
| #include "minddata/dataset/text/kernels/sliding_window_op.h" | |||||
| #include "minddata/dataset/text/kernels/to_number_op.h" | |||||
| #include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h" | |||||
| #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h" | |||||
| #ifndef _WIN32 | |||||
| #include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h" | |||||
| #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" | |||||
| #endif | |||||
| #include "minddata/dataset/core/data_type.h" | |||||
| #include "minddata/dataset/util/path.h" | |||||
| #include "minddata/dataset/text/ir/validators.h" | |||||
| namespace mindspore { | |||||
| namespace dataset { | |||||
| // Transform operations for text. | |||||
| namespace text { | |||||
| /* ####################################### Derived TensorOperation classes ################################# */ | |||||
| // (In alphabetical order) | |||||
| #ifndef _WIN32 | |||||
| // BasicTokenizerOperation | |||||
| BasicTokenizerOperation::BasicTokenizerOperation(bool lower_case, bool keep_whitespace, | |||||
| const NormalizeForm normalize_form, bool preserve_unused_token, | |||||
| bool with_offsets) | |||||
| : lower_case_(lower_case), | |||||
| keep_whitespace_(keep_whitespace), | |||||
| normalize_form_(normalize_form), | |||||
| preserve_unused_token_(preserve_unused_token), | |||||
| with_offsets_(with_offsets) {} | |||||
| Status BasicTokenizerOperation::ValidateParams() { return Status::OK(); } | |||||
| std::shared_ptr<TensorOp> BasicTokenizerOperation::Build() { | |||||
| std::shared_ptr<BasicTokenizerOp> tensor_op = std::make_shared<BasicTokenizerOp>( | |||||
| lower_case_, keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_); | |||||
| return tensor_op; | |||||
| } | |||||
| // BertTokenizerOperation | |||||
| BertTokenizerOperation::BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator, | |||||
| int32_t max_bytes_per_token, const std::string &unknown_token, | |||||
| bool lower_case, bool keep_whitespace, | |||||
| const NormalizeForm normalize_form, bool preserve_unused_token, | |||||
| bool with_offsets) | |||||
| : vocab_(vocab), | |||||
| suffix_indicator_(suffix_indicator), | |||||
| max_bytes_per_token_(max_bytes_per_token), | |||||
| unknown_token_(unknown_token), | |||||
| lower_case_(lower_case), | |||||
| keep_whitespace_(keep_whitespace), | |||||
| normalize_form_(normalize_form), | |||||
| preserve_unused_token_(preserve_unused_token), | |||||
| with_offsets_(with_offsets) {} | |||||
| BertTokenizerOperation::~BertTokenizerOperation() = default; | |||||
| Status BertTokenizerOperation::ValidateParams() { | |||||
| if (vocab_ == nullptr) { | |||||
| std::string err_msg = "BertTokenizer: vocab object type is incorrect or null."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| if (max_bytes_per_token_ < 0) { | |||||
| std::string err_msg = "BertTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " + | |||||
| std::to_string(max_bytes_per_token_); | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| return Status::OK(); | |||||
| } | |||||
| std::shared_ptr<TensorOp> BertTokenizerOperation::Build() { | |||||
| std::shared_ptr<BertTokenizerOp> tensor_op = | |||||
| std::make_shared<BertTokenizerOp>(vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, lower_case_, | |||||
| keep_whitespace_, normalize_form_, preserve_unused_token_, with_offsets_); | |||||
| return tensor_op; | |||||
| } | |||||
| // CaseFoldOperation | |||||
| Status CaseFoldOperation::ValidateParams() { return Status::OK(); } | |||||
| std::shared_ptr<TensorOp> CaseFoldOperation::Build() { | |||||
| std::shared_ptr<CaseFoldOp> tensor_op = std::make_shared<CaseFoldOp>(); | |||||
| return tensor_op; | |||||
| } | |||||
| #endif | |||||
| // JiebaTokenizerOperation | |||||
| JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, | |||||
| const JiebaMode &mode, bool with_offsets) | |||||
| : hmm_path_(hmm_path), mp_path_(mp_path), mode_(mode), with_offsets_(with_offsets) {} | |||||
| Status JiebaTokenizerOperation::ValidateParams() { | |||||
| if (hmm_path_.empty()) { | |||||
| std::string err_msg = "JiebaTokenizer: The dict of HMMSegment in cppjieba is not provided."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| if (mp_path_.empty()) { | |||||
| std::string err_msg = "JiebaTokenizer: The dict of MPSegment in cppjieba is not provided."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_)); | |||||
| RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", mp_path_)); | |||||
| return Status::OK(); | |||||
| } | |||||
| std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() { | |||||
| std::shared_ptr<JiebaTokenizerOp> tensor_op = | |||||
| std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_); | |||||
| for (auto &word : words_list_) { | |||||
| Status rc = tensor_op->AddWord(word.first, word.second); | |||||
| if (rc.IsError()) { | |||||
| MS_LOG(ERROR) << rc; | |||||
| return {}; | |||||
| } | |||||
| } | |||||
| return tensor_op; | |||||
| } | |||||
| Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) { | |||||
| if (word.empty()) { | |||||
| std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| if (freq < 0) { | |||||
| std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| words_list_.emplace_back(word, freq); | |||||
| return Status::OK(); | |||||
| } | |||||
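A usage sketch tying AddWord to Build (the dictionary paths are placeholders and JiebaExample is hypothetical):

std::shared_ptr<TensorOp> JiebaExample() {
  JiebaTokenizerOperation jieba("/path/to/hmm_model.utf8", "/path/to/jieba.dict.utf8",
                                JiebaMode::kMix, false);
  if (jieba.AddWord("MindSpore", 3).IsError()) return nullptr;  // queued in words_list_
  // Build() replays every queued word onto the underlying JiebaTokenizerOp.
  return jieba.ValidateParams().IsOk() ? jieba.Build() : nullptr;
}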
| // LookupOperation | |||||
| LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token, | |||||
| const std::string &data_type) | |||||
| : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {} | |||||
| LookupOperation::~LookupOperation() = default; | |||||
| Status LookupOperation::ValidateParams() { | |||||
| if (vocab_ == nullptr) { | |||||
| std::string err_msg = "Lookup: vocab object type is incorrect or null."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| if (unknown_token_.has_value()) { | |||||
| default_id_ = vocab_->Lookup(*unknown_token_); | |||||
| if (default_id_ == Vocab::kNoTokenExists) { | |||||
| std::string err_msg = "Lookup: \"" + *unknown_token_ + "\" doesn't exist in vocab."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| } | |||||
| if (!IsTypeNumeric(data_type_)) { | |||||
| std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| return Status::OK(); | |||||
| } | |||||
| std::shared_ptr<TensorOp> LookupOperation::Build() { | |||||
| std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, DataType(data_type_)); | |||||
| return tensor_op; | |||||
| } | |||||
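Note the ordering dependency above: default_id_ is only resolved inside ValidateParams(), so Build() must not be called before it. A sketch under that assumption (LookupExample and the "<unk>" token are hypothetical):

Status LookupExample(const std::shared_ptr<Vocab> &vocab, std::shared_ptr<TensorOp> *out) {
  LookupOperation lookup(vocab, std::optional<std::string>("<unk>"), "int32");
  RETURN_IF_NOT_OK(lookup.ValidateParams());  // fails if "<unk>" is absent from vocab
  *out = lookup.Build();  // LookupOp receives the resolved default_id_
  return Status::OK();
}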
| // NgramOperation | |||||
| NgramOperation::NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad, | |||||
| const std::pair<std::string, int32_t> &right_pad, const std::string &separator) | |||||
| : ngrams_(ngrams), left_pad_(left_pad), right_pad_(right_pad), separator_(separator) {} | |||||
| Status NgramOperation::ValidateParams() { | |||||
| if (ngrams_.empty()) { | |||||
| std::string err_msg = "Ngram: The ngrams vector cannot be empty."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } else { | |||||
| for (size_t i = 0; i < ngrams_.size(); ++i) { | |||||
| if (ngrams_[i] <= 0) { | |||||
| std::string err_msg = | |||||
| "Ngram : The value of ngrams vector must be greater than 0: " + std::to_string(ngrams_[i]); | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| } | |||||
| } | |||||
| if (left_pad_.second < 0) { | |||||
| std::string err_msg = | |||||
| "Ngram : The second parameter pad_width in left_pad vector must be greater than or equal to 0: " + | |||||
| std::to_string(left_pad_.second); | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| if (right_pad_.second < 0) { | |||||
| std::string err_msg = | |||||
| "Ngram : The second parameter pad_width in right_pad vector must be greater than or equal to 0: " + | |||||
| std::to_string(right_pad_.second); | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| return Status::OK(); | |||||
| } | |||||
| std::shared_ptr<TensorOp> NgramOperation::Build() { | |||||
| int32_t l_len = left_pad_.second; | |||||
| int32_t r_len = right_pad_.second; | |||||
| std::string l_pad = left_pad_.first; | |||||
| std::string r_pad = right_pad_.first; | |||||
| std::shared_ptr<NgramOp> tensor_op = std::make_shared<NgramOp>(ngrams_, l_len, r_len, l_pad, r_pad, separator_); | |||||
| return tensor_op; | |||||
| } | |||||
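A small parameter sketch for the pad pairs (NgramExample is hypothetical):

Status NgramExample() {
  // Bigrams and trigrams, one "_" pad token on each side, tokens joined by a space.
  NgramOperation ngram({2, 3}, {"_", 1}, {"_", 1}, " ");
  return ngram.ValidateParams();  // OK: every n > 0, both pad widths non-negative
}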
| #ifndef _WIN32 | |||||
| // NormalizeUTF8Operation | |||||
| NormalizeUTF8Operation::NormalizeUTF8Operation(NormalizeForm normalize_form) : normalize_form_(normalize_form) {} | |||||
| Status NormalizeUTF8Operation::ValidateParams() { return Status::OK(); } | |||||
| std::shared_ptr<TensorOp> NormalizeUTF8Operation::Build() { | |||||
| std::shared_ptr<NormalizeUTF8Op> tensor_op = std::make_shared<NormalizeUTF8Op>(normalize_form_); | |||||
| return tensor_op; | |||||
| } | |||||
| // RegexReplaceOperation | |||||
| RegexReplaceOperation::RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all) | |||||
| : pattern_(pattern), replace_(replace), replace_all_(replace_all) {} | |||||
| Status RegexReplaceOperation::ValidateParams() { return Status::OK(); } | |||||
| std::shared_ptr<TensorOp> RegexReplaceOperation::Build() { | |||||
| std::shared_ptr<RegexReplaceOp> tensor_op = std::make_shared<RegexReplaceOp>(pattern_, replace_, replace_all_); | |||||
| return tensor_op; | |||||
| } | |||||
| // RegexTokenizerOperation | |||||
| RegexTokenizerOperation::RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, | |||||
| bool with_offsets) | |||||
| : delim_pattern_(delim_pattern), keep_delim_pattern_(keep_delim_pattern), with_offsets_(with_offsets) {} | |||||
| Status RegexTokenizerOperation::ValidateParams() { return Status::OK(); } | |||||
| std::shared_ptr<TensorOp> RegexTokenizerOperation::Build() { | |||||
| std::shared_ptr<RegexTokenizerOp> tensor_op = | |||||
| std::make_shared<RegexTokenizerOp>(delim_pattern_, keep_delim_pattern_, with_offsets_); | |||||
| return tensor_op; | |||||
| } | |||||
| #endif | |||||
| // SentencePieceTokenizerOperation | |||||
| SentencePieceTokenizerOperation::~SentencePieceTokenizerOperation() = default; | |||||
| SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, | |||||
| SPieceTokenizerOutType out_type) | |||||
| : vocab_(vocab), vocab_path_(std::string()), load_type_(SPieceTokenizerLoadType::kModel), out_type_(out_type) {} | |||||
| SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::string &vocab_path, | |||||
| SPieceTokenizerOutType out_type) | |||||
| : vocab_(nullptr), vocab_path_(vocab_path), load_type_(SPieceTokenizerLoadType::kFile), out_type_(out_type) {} | |||||
| Status SentencePieceTokenizerOperation::ValidateParams() { | |||||
| if (load_type_ == SPieceTokenizerLoadType::kModel) { | |||||
| if (vocab_ == nullptr) { | |||||
| std::string err_msg = "SentencePieceTokenizer: vocab object type is incorrect or null."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| } else { | |||||
| Path vocab_file(vocab_path_); | |||||
| if (!vocab_file.Exists() || vocab_file.IsDirectory()) { | |||||
| std::string err_msg = "SentencePieceTokenizer : vocab file: [" + vocab_path_ + "] is invalid or does not exist."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| if (access(vocab_file.toString().c_str(), R_OK) == -1) { | |||||
| std::string err_msg = "SentencePieceTokenizer : no access to specified dataset file: " + vocab_path_; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| } | |||||
| return Status::OK(); | |||||
| } | |||||
| std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() { | |||||
| std::shared_ptr<SentencePieceTokenizerOp> tensor_op; | |||||
| if (load_type_ == SPieceTokenizerLoadType::kModel) { | |||||
| tensor_op = std::make_shared<SentencePieceTokenizerOp>(vocab_, load_type_, out_type_); | |||||
| } else { | |||||
| Path vocab_file(vocab_path_); | |||||
| std::string model_path = vocab_file.ParentPath(); | |||||
| std::string model_filename = vocab_file.Basename(); | |||||
| tensor_op = std::make_shared<SentencePieceTokenizerOp>(model_path, model_filename, load_type_, out_type_); | |||||
| } | |||||
| return tensor_op; | |||||
| } | |||||
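The two constructor overloads map onto the two load paths in Build(); a sketch with placeholder inputs (both helper names are hypothetical):

std::shared_ptr<TensorOperation> FromModelExample(const std::shared_ptr<SentencePieceVocab> &v) {
  return std::make_shared<SentencePieceTokenizerOperation>(v, SPieceTokenizerOutType::kInt);  // kModel path
}
std::shared_ptr<TensorOperation> FromFileExample() {
  return std::make_shared<SentencePieceTokenizerOperation>(std::string("/path/to/spm.model"),
                                                           SPieceTokenizerOutType::kString);  // kFile path
}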
| // SlidingWindowOperation | |||||
| SlidingWindowOperation::SlidingWindowOperation(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {} | |||||
| Status SlidingWindowOperation::ValidateParams() { | |||||
| if (width_ < 1) { | |||||
| std::string err_msg = | |||||
| "SlidingWindow : The parameter width must be greater than or equal to 1: " + std::to_string(width_); | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| return Status::OK(); | |||||
| } | |||||
| std::shared_ptr<TensorOp> SlidingWindowOperation::Build() { | |||||
| std::shared_ptr<SlidingWindowOp> tensor_op = std::make_shared<SlidingWindowOp>(static_cast<uint32_t>(width_), axis_); | |||||
| return tensor_op; | |||||
| } | |||||
| // ToNumberOperation | |||||
| ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {} | |||||
| Status ToNumberOperation::ValidateParams() { | |||||
| if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) { | |||||
| std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| return Status::OK(); | |||||
| } | |||||
| std::shared_ptr<TensorOp> ToNumberOperation::Build() { | |||||
| std::shared_ptr<ToNumberOp> tensor_op = std::make_shared<ToNumberOp>(data_type_); | |||||
| return tensor_op; | |||||
| } | |||||
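ToNumber shares TypeCast's type-name strings but additionally excludes booleans; a hedged sketch (ToNumberExample is hypothetical):

Status ToNumberExample() {
  ToNumberOperation to_f32("float32");
  RETURN_IF_NOT_OK(to_f32.ValidateParams());  // numeric and non-boolean: accepted
  ToNumberOperation to_bool("bool");
  return to_bool.ValidateParams();  // rejected by the IsTypeBoolean check above
}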
| // TruncateSequencePairOperation | |||||
| TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {} | |||||
| Status TruncateSequencePairOperation::ValidateParams() { | |||||
| if (max_length_ < 0) { | |||||
| std::string err_msg = "TruncateSequencePair : The parameter max_length must be greater than or equal to 0: " + | |||||
| std::to_string(max_length_); | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| return Status::OK(); | |||||
| } | |||||
| std::shared_ptr<TensorOp> TruncateSequencePairOperation::Build() { | |||||
| std::shared_ptr<TruncateSequencePairOp> tensor_op = std::make_shared<TruncateSequencePairOp>(max_length_); | |||||
| return tensor_op; | |||||
| } | |||||
| // UnicodeCharTokenizerOperation | |||||
| UnicodeCharTokenizerOperation::UnicodeCharTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {} | |||||
| Status UnicodeCharTokenizerOperation::ValidateParams() { return Status::OK(); } | |||||
| std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() { | |||||
| std::shared_ptr<UnicodeCharTokenizerOp> tensor_op = std::make_shared<UnicodeCharTokenizerOp>(with_offsets_); | |||||
| return tensor_op; | |||||
| } | |||||
| #ifndef _WIN32 | |||||
| // UnicodeScriptTokenizerOperation | |||||
| UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets) | |||||
| : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {} | |||||
| Status UnicodeScriptTokenizerOperation::ValidateParams() { return Status::OK(); } | |||||
| std::shared_ptr<TensorOp> UnicodeScriptTokenizerOperation::Build() { | |||||
| std::shared_ptr<UnicodeScriptTokenizerOp> tensor_op = | |||||
| std::make_shared<UnicodeScriptTokenizerOp>(keep_whitespace_, with_offsets_); | |||||
| return tensor_op; | |||||
| } | |||||
| // WhitespaceTokenizerOperation | |||||
| WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets) : with_offsets_(with_offsets) {} | |||||
| Status WhitespaceTokenizerOperation::ValidateParams() { return Status::OK(); } | |||||
| std::shared_ptr<TensorOp> WhitespaceTokenizerOperation::Build() { | |||||
| std::shared_ptr<WhitespaceTokenizerOp> tensor_op = std::make_shared<WhitespaceTokenizerOp>(with_offsets_); | |||||
| return tensor_op; | |||||
| } | |||||
| #endif | |||||
| } // namespace text | |||||
| } // namespace dataset | |||||
| } // namespace mindspore | |||||
| @@ -0,0 +1,360 @@ | |||||
| /** | |||||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_KERNELS_TEXT_IR_H_ | |||||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_KERNELS_TEXT_IR_H_ | |||||
| #include <memory> | |||||
| #include <optional> | |||||
| #include <string> | |||||
| #include <utility> | |||||
| #include <vector> | |||||
| #include "minddata/dataset/kernels/ir/tensor_operation.h" | |||||
| namespace mindspore { | |||||
| namespace dataset { | |||||
| class Vocab; | |||||
| class SentencePieceVocab; | |||||
| // Transform operations for text. | |||||
| namespace text { | |||||
| // Char arrays storing the names of the corresponding classes (in alphabetical order) | |||||
| constexpr char kBasicTokenizerOperation[] = "BasicTokenizer"; | |||||
| constexpr char kBertTokenizerOperation[] = "BertTokenizer"; | |||||
| constexpr char kCaseFoldOperation[] = "CaseFold"; | |||||
| constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer"; | |||||
| constexpr char kLookupOperation[] = "Lookup"; | |||||
| constexpr char kNgramOperation[] = "Ngram"; | |||||
| constexpr char kNormalizeUTF8Operation[] = "NormalizeUTF8"; | |||||
| constexpr char kRegexReplaceOperation[] = "RegexReplace"; | |||||
| constexpr char kRegexTokenizerOperation[] = "RegexTokenizer"; | |||||
| constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer"; | |||||
| constexpr char kSlidingWindowOperation[] = "SlidingWindow"; | |||||
| constexpr char kToNumberOperation[] = "ToNumber"; | |||||
| constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair"; | |||||
| constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer"; | |||||
| constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer"; | |||||
| constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer"; | |||||
| /* ####################################### Derived TensorOperation classes ################################# */ | |||||
| #ifndef _WIN32 | |||||
| class BasicTokenizerOperation : public TensorOperation { | |||||
| public: | |||||
| BasicTokenizerOperation(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, | |||||
| bool preserve_unused_token, bool with_offsets); | |||||
| ~BasicTokenizerOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kBasicTokenizerOperation; } | |||||
| private: | |||||
| bool lower_case_; | |||||
| bool keep_whitespace_; | |||||
| NormalizeForm normalize_form_; | |||||
| bool preserve_unused_token_; | |||||
| bool with_offsets_; | |||||
| }; | |||||
| class BertTokenizerOperation : public TensorOperation { | |||||
| public: | |||||
| BertTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator, | |||||
| int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case, | |||||
| bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token, | |||||
| bool with_offsets); | |||||
| ~BertTokenizerOperation(); | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kBertTokenizerOperation; } | |||||
| private: | |||||
| std::shared_ptr<Vocab> vocab_; | |||||
| std::string suffix_indicator_; | |||||
| int32_t max_bytes_per_token_; | |||||
| std::string unknown_token_; | |||||
| bool lower_case_; | |||||
| bool keep_whitespace_; | |||||
| NormalizeForm normalize_form_; | |||||
| bool preserve_unused_token_; | |||||
| bool with_offsets_; | |||||
| }; | |||||
| class CaseFoldOperation : public TensorOperation { | |||||
| public: | |||||
| CaseFoldOperation() = default; | |||||
| ~CaseFoldOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kCaseFoldOperation; } | |||||
| }; | |||||
| #endif | |||||
| class JiebaTokenizerOperation : public TensorOperation { | |||||
| public: | |||||
| explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode, | |||||
| bool with_offsets); | |||||
| ~JiebaTokenizerOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kJiebaTokenizerOperation; } | |||||
| Status AddWord(const std::string &word, int64_t freq = 0); | |||||
| private: | |||||
| std::string hmm_path_; | |||||
| std::string mp_path_; | |||||
| JiebaMode mode_; | |||||
| bool with_offsets_; | |||||
| std::vector<std::pair<std::string, int64_t>> words_list_; | |||||
| }; | |||||
| class LookupOperation : public TensorOperation { | |||||
| public: | |||||
| explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token, | |||||
| const std::string &data_type); | |||||
| ~LookupOperation(); | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kLookupOperation; } | |||||
| private: | |||||
| std::shared_ptr<Vocab> vocab_; | |||||
| std::optional<std::string> unknown_token_; | |||||
| int32_t default_id_; | |||||
| std::string data_type_; | |||||
| }; | |||||
| class NgramOperation : public TensorOperation { | |||||
| public: | |||||
| explicit NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad, | |||||
| const std::pair<std::string, int32_t> &right_pad, const std::string &separator); | |||||
| ~NgramOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kNgramOperation; } | |||||
| private: | |||||
| std::vector<int32_t> ngrams_; | |||||
| std::pair<std::string, int32_t> left_pad_; | |||||
| std::pair<std::string, int32_t> right_pad_; | |||||
| std::string separator_; | |||||
| }; | |||||
| #ifndef _WIN32 | |||||
| class NormalizeUTF8Operation : public TensorOperation { | |||||
| public: | |||||
| explicit NormalizeUTF8Operation(NormalizeForm normalize_form); | |||||
| ~NormalizeUTF8Operation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kNormalizeUTF8Operation; } | |||||
| private: | |||||
| NormalizeForm normalize_form_; | |||||
| }; | |||||
| class RegexReplaceOperation : public TensorOperation { | |||||
| public: | |||||
| RegexReplaceOperation(std::string pattern, std::string replace, bool replace_all); | |||||
| ~RegexReplaceOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kRegexReplaceOperation; } | |||||
| private: | |||||
| std::string pattern_; | |||||
| std::string replace_; | |||||
| bool replace_all_; | |||||
| }; | |||||
| class RegexTokenizerOperation : public TensorOperation { | |||||
| public: | |||||
| explicit RegexTokenizerOperation(std::string delim_pattern, std::string keep_delim_pattern, bool with_offsets); | |||||
| ~RegexTokenizerOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kRegexTokenizerOperation; } | |||||
| private: | |||||
| std::string delim_pattern_; | |||||
| std::string keep_delim_pattern_; | |||||
| bool with_offsets_; | |||||
| }; | |||||
| #endif | |||||
| class SentencePieceTokenizerOperation : public TensorOperation { | |||||
| public: | |||||
| SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type); | |||||
| SentencePieceTokenizerOperation(const std::string &vocab_path, SPieceTokenizerOutType out_type); | |||||
| ~SentencePieceTokenizerOperation(); | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kSentencepieceTokenizerOperation; } | |||||
| private: | |||||
| std::shared_ptr<SentencePieceVocab> vocab_; | |||||
| std::string vocab_path_; | |||||
| SPieceTokenizerLoadType load_type_; | |||||
| SPieceTokenizerOutType out_type_; | |||||
| }; | |||||
| class SlidingWindowOperation : public TensorOperation { | |||||
| public: | |||||
| explicit SlidingWindowOperation(const int32_t width, const int32_t axis); | |||||
| ~SlidingWindowOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kSlidingWindowOperation; } | |||||
| private: | |||||
| int32_t width_; | |||||
| int32_t axis_; | |||||
| }; | |||||
| class ToNumberOperation : public TensorOperation { | |||||
| public: | |||||
| explicit ToNumberOperation(std::string data_type); | |||||
| ~ToNumberOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kToNumberOperation; } | |||||
| private: | |||||
| std::string data_type_; | |||||
| }; | |||||
| class TruncateSequencePairOperation : public TensorOperation { | |||||
| public: | |||||
| explicit TruncateSequencePairOperation(int32_t max_length); | |||||
| ~TruncateSequencePairOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kTruncateSequencePairOperation; } | |||||
| private: | |||||
| int32_t max_length_; | |||||
| }; | |||||
| class UnicodeCharTokenizerOperation : public TensorOperation { | |||||
| public: | |||||
| explicit UnicodeCharTokenizerOperation(bool with_offsets); | |||||
| ~UnicodeCharTokenizerOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kUnicodeCharTokenizerOperation; } | |||||
| private: | |||||
| bool with_offsets_; | |||||
| }; | |||||
| #ifndef _WIN32 | |||||
| class UnicodeScriptTokenizerOperation : public TensorOperation { | |||||
| public: | |||||
| explicit UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets); | |||||
| ~UnicodeScriptTokenizerOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kUnicodeScriptTokenizerOperation; } | |||||
| private: | |||||
| bool keep_whitespace_; | |||||
| bool with_offsets_; | |||||
| }; | |||||
| class WhitespaceTokenizerOperation : public TensorOperation { | |||||
| public: | |||||
| explicit WhitespaceTokenizerOperation(bool with_offsets); | |||||
| ~WhitespaceTokenizerOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kWhitespaceTokenizerOperation; } | |||||
| private: | |||||
| bool with_offsets_; | |||||
| }; | |||||
| #endif | |||||
| } // namespace text | |||||
| } // namespace dataset | |||||
| } // namespace mindspore | |||||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_KERNELS_TEXT_IR_H_ | |||||
| @@ -0,0 +1,60 @@ | |||||
| /** | |||||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "minddata/dataset/text/ir/validators.h" | |||||
| namespace mindspore { | |||||
| namespace dataset { | |||||
| /* ####################################### Validator Functions ############################################ */ | |||||
| // Helper function to validate the tokenizer file parameter (the path must exist and be readable) | |||||
| Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file) { | |||||
| if (tokenizer_file.empty()) { | |||||
| std::string err_msg = tokenizer_name + ": tokenizer_file is not specified."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| Path file(tokenizer_file); | |||||
| if (!file.Exists()) { | |||||
| std::string err_msg = tokenizer_name + ": tokenizer_file: [" + tokenizer_file + "] is an invalid directory path."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| if (access(tokenizer_file.c_str(), R_OK) == -1) { | |||||
| std::string err_msg = tokenizer_name + ": No access to specified tokenizer path: " + tokenizer_file; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| return Status::OK(); | |||||
| } | |||||
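The tokenizer IR nodes above call this validator before building their kernels; a usage sketch with a placeholder path (TokenizerPathExample is hypothetical):

Status TokenizerPathExample() {
  // Fails fast on an empty path, a nonexistent file, or a file without read access.
  return ValidateTokenizerDirParam("JiebaTokenizer", "/path/to/jieba.dict.utf8");
}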
| // Helper functions to validate the data type passed by the user | |||||
| bool IsTypeNumeric(const std::string &data_type) { | |||||
| return data_type == "int8" || data_type == "uint8" || data_type == "int16" || data_type == "uint16" || | |||||
| data_type == "int32" || data_type == "uint32" || data_type == "int64" || data_type == "uint64" || | |||||
| data_type == "float16" || data_type == "float32" || data_type == "float64"; | |||||
| } | |||||
| bool IsTypeBoolean(const std::string &data_type) { return data_type == "bool"; } | |||||
| bool IsTypeString(const std::string &data_type) { return data_type == "string"; } | |||||
| } // namespace dataset | |||||
| } // namespace mindspore | |||||
| @@ -0,0 +1,41 @@ | |||||
| /** | |||||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_VALIDATORS_H_ | |||||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_VALIDATORS_H_ | |||||
| #include <string> | |||||
| #include "minddata/dataset/core/tensor.h" | |||||
| #include "minddata/dataset/util/status.h" | |||||
| namespace mindspore { | |||||
| namespace dataset { | |||||
| // Helper function to validate the tokenizer file parameter | |||||
| Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file); | |||||
| // Helper function to check whether the data type passed by the user is numeric | |||||
| bool IsTypeNumeric(const std::string &data_type); | |||||
| // Helper function to check whether the data type is boolean | |||||
| bool IsTypeBoolean(const std::string &data_type); | |||||
| // Helper function to check whether the data type is string | |||||
| bool IsTypeString(const std::string &data_type); | |||||
| } // namespace dataset | |||||
| } // namespace mindspore | |||||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_IR_VALIDATORS_H_ | |||||
| @@ -202,6 +202,7 @@ if(BUILD_MINDDATA STREQUAL "full") | |||||
| ${MINDDATA_DIR}/kernels/data/type_cast_op.cc | ${MINDDATA_DIR}/kernels/data/type_cast_op.cc | ||||
| ${MINDDATA_DIR}/kernels/image/exif_utils.cc | ${MINDDATA_DIR}/kernels/image/exif_utils.cc | ||||
| ${MINDDATA_DIR}/kernels/ir/validators.cc | ${MINDDATA_DIR}/kernels/ir/validators.cc | ||||
| ${MINDDATA_DIR}/kernels/ir/data/transforms_ir.cc | |||||
| ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc | ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc | ||||
| ${MINDDATA_DIR}/callback/callback_manager.cc | ${MINDDATA_DIR}/callback/callback_manager.cc | ||||
| ${MINDDATA_DIR}/util/task_manager.cc | ${MINDDATA_DIR}/util/task_manager.cc | ||||
| @@ -281,6 +282,7 @@ elseif(BUILD_MINDDATA STREQUAL "wrapper") | |||||
| ${MINDDATA_DIR}/kernels/data/data_utils.cc | ${MINDDATA_DIR}/kernels/data/data_utils.cc | ||||
| ${MINDDATA_DIR}/kernels/image/exif_utils.cc | ${MINDDATA_DIR}/kernels/image/exif_utils.cc | ||||
| ${MINDDATA_DIR}/kernels/ir/validators.cc | ${MINDDATA_DIR}/kernels/ir/validators.cc | ||||
| ${MINDDATA_DIR}/kernels/ir/data/transforms_ir.cc | |||||
| ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc | ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc | ||||
| ${CMAKE_CURRENT_SOURCE_DIR}/wrapper/MDToDApi.cc | ${CMAKE_CURRENT_SOURCE_DIR}/wrapper/MDToDApi.cc | ||||
| ${CMAKE_CURRENT_SOURCE_DIR}/wrapper/album_op_android.cc | ${CMAKE_CURRENT_SOURCE_DIR}/wrapper/album_op_android.cc | ||||
| @@ -393,6 +395,7 @@ elseif(BUILD_MINDDATA STREQUAL "lite") | |||||
| ${CMAKE_CURRENT_SOURCE_DIR}/../src/common/log_adapter.cc | ${CMAKE_CURRENT_SOURCE_DIR}/../src/common/log_adapter.cc | ||||
| ${CORE_DIR}/utils/ms_utils.cc | ${CORE_DIR}/utils/ms_utils.cc | ||||
| ${MINDDATA_DIR}/kernels/ir/validators.cc | ${MINDDATA_DIR}/kernels/ir/validators.cc | ||||
| ${MINDDATA_DIR}/kernels/ir/data/transforms_ir.cc | |||||
| ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc | ${MINDDATA_DIR}/kernels/ir/vision/vision_ir.cc | ||||
| ) | ) | ||||