| @@ -18,7 +18,7 @@ if(ENABLE_PYTHON) | |||||
| python/bindings/dataset/kernels/ir/bindings.cc | python/bindings/dataset/kernels/ir/bindings.cc | ||||
| python/bindings/dataset/kernels/ir/image/bindings.cc | python/bindings/dataset/kernels/ir/image/bindings.cc | ||||
| python/bindings/dataset/text/bindings.cc | python/bindings/dataset/text/bindings.cc | ||||
| python/bindings/dataset/text/kernels/bindings.cc | |||||
| python/bindings/dataset/text/kernels/ir/bindings.cc | |||||
| python/bindings/mindrecord/include/bindings.cc | python/bindings/mindrecord/include/bindings.cc | ||||
| python/pybind_conversion.cc | python/pybind_conversion.cc | ||||
| python/pybind_register.cc | python/pybind_register.cc | ||||
| @@ -1,205 +0,0 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "pybind11/pybind11.h" | |||||
| #include "pybind11/stl.h" | |||||
| #include "pybind11/stl_bind.h" | |||||
| #include "minddata/dataset/api/python/pybind_register.h" | |||||
| #include "minddata/dataset/text/kernels/jieba_tokenizer_op.h" | |||||
| #include "minddata/dataset/text/kernels/lookup_op.h" | |||||
| #include "minddata/dataset/text/kernels/ngram_op.h" | |||||
| #include "minddata/dataset/text/kernels/sliding_window_op.h" | |||||
| #include "minddata/dataset/text/kernels/to_number_op.h" | |||||
| #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h" | |||||
| #include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h" | |||||
| #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h" | |||||
| #include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h" | |||||
| #ifdef ENABLE_ICU4C | |||||
| #include "minddata/dataset/text/kernels/basic_tokenizer_op.h" | |||||
| #include "minddata/dataset/text/kernels/bert_tokenizer_op.h" | |||||
| #include "minddata/dataset/text/kernels/case_fold_op.h" | |||||
| #include "minddata/dataset/text/kernels/normalize_utf8_op.h" | |||||
| #include "minddata/dataset/text/kernels/regex_replace_op.h" | |||||
| #include "minddata/dataset/text/kernels/regex_tokenizer_op.h" | |||||
| #include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h" | |||||
| #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" | |||||
| #endif | |||||
| namespace mindspore { | |||||
| namespace dataset { | |||||
| #ifdef ENABLE_ICU4C | |||||
| PYBIND_REGISTER(BasicTokenizerOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(*m, | |||||
| "BasicTokenizerOp") | |||||
| .def(py::init<const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>()); | |||||
| })); | |||||
| PYBIND_REGISTER(WhitespaceTokenizerOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>( | |||||
| *m, "WhitespaceTokenizerOp") | |||||
| .def(py::init<const bool &>()); | |||||
| })); | |||||
| PYBIND_REGISTER(UnicodeScriptTokenizerOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>( | |||||
| *m, "UnicodeScriptTokenizerOp") | |||||
| .def(py::init<>()) | |||||
| .def(py::init<const bool &, const bool &>()); | |||||
| })); | |||||
| PYBIND_REGISTER( | |||||
| CaseFoldOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(*m, "CaseFoldOp").def(py::init<>()); | |||||
| })); | |||||
| PYBIND_REGISTER(NormalizeUTF8Op, 1, ([](const py::module *m) { | |||||
| (void)py::class_<NormalizeUTF8Op, TensorOp, std::shared_ptr<NormalizeUTF8Op>>(*m, "NormalizeUTF8Op") | |||||
| .def(py::init<>()) | |||||
| .def(py::init<NormalizeForm>()); | |||||
| })); | |||||
| PYBIND_REGISTER(RegexReplaceOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<RegexReplaceOp, TensorOp, std::shared_ptr<RegexReplaceOp>>(*m, "RegexReplaceOp") | |||||
| .def(py::init<const std::string &, const std::string &, bool>()); | |||||
| })); | |||||
| PYBIND_REGISTER(RegexTokenizerOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(*m, | |||||
| "RegexTokenizerOp") | |||||
| .def(py::init<const std::string &, const std::string &, const bool &>()); | |||||
| })); | |||||
| PYBIND_REGISTER(BertTokenizerOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(*m, "BertTokenizerOp") | |||||
| .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, | |||||
| const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>()); | |||||
| })); | |||||
| PYBIND_REGISTER(NormalizeForm, 0, ([](const py::module *m) { | |||||
| (void)py::enum_<NormalizeForm>(*m, "NormalizeForm", py::arithmetic()) | |||||
| .value("DE_NORMALIZE_NONE", NormalizeForm::kNone) | |||||
| .value("DE_NORMALIZE_NFC", NormalizeForm::kNfc) | |||||
| .value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc) | |||||
| .value("DE_NORMALIZE_NFD", NormalizeForm::kNfd) | |||||
| .value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd) | |||||
| .export_values(); | |||||
| })); | |||||
| #endif | |||||
| PYBIND_REGISTER(JiebaTokenizerOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m, | |||||
| "JiebaTokenizerOp") | |||||
| .def(py::init<const std::string &, const std::string &, const JiebaMode &, const bool &>()) | |||||
| .def("add_word", [](JiebaTokenizerOp &self, const std::string word, int freq) { | |||||
| THROW_IF_ERROR(self.AddWord(word, freq)); | |||||
| }); | |||||
| })); | |||||
| PYBIND_REGISTER(UnicodeCharTokenizerOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<UnicodeCharTokenizerOp, TensorOp, std::shared_ptr<UnicodeCharTokenizerOp>>( | |||||
| *m, "UnicodeCharTokenizerOp") | |||||
| .def(py::init<const bool &>()); | |||||
| })); | |||||
| PYBIND_REGISTER(LookupOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<LookupOp, TensorOp, std::shared_ptr<LookupOp>>(*m, "LookupOp") | |||||
| .def(py::init([](std::shared_ptr<Vocab> vocab, const py::object &py_word, | |||||
| const DataType &data_type) { | |||||
| if (vocab == nullptr) { | |||||
| THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, "vocab object type is incorrect or null.")); | |||||
| } | |||||
| if (py_word.is_none()) { | |||||
| return std::make_shared<LookupOp>(vocab, Vocab::kNoTokenExists, data_type); | |||||
| } | |||||
| std::string word = py::reinterpret_borrow<py::str>(py_word); | |||||
| WordIdType default_id = vocab->Lookup(word); | |||||
| if (default_id == Vocab::kNoTokenExists) { | |||||
| THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, | |||||
| "default unknown token: " + word + " doesn't exist in vocab.")); | |||||
| } | |||||
| return std::make_shared<LookupOp>(vocab, default_id, data_type); | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER(NgramOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<NgramOp, TensorOp, std::shared_ptr<NgramOp>>(*m, "NgramOp") | |||||
| .def(py::init<const std::vector<int32_t> &, int32_t, int32_t, const std::string &, | |||||
| const std::string &, const std::string &>()); | |||||
| })); | |||||
| PYBIND_REGISTER(WordpieceTokenizerOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>( | |||||
| *m, "WordpieceTokenizerOp") | |||||
| .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, | |||||
| const bool &>()); | |||||
| })); | |||||
| PYBIND_REGISTER(SlidingWindowOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<SlidingWindowOp, TensorOp, std::shared_ptr<SlidingWindowOp>>(*m, "SlidingWindowOp") | |||||
| .def(py::init<uint32_t, int32_t>()); | |||||
| })); | |||||
| PYBIND_REGISTER( | |||||
| SentencePieceTokenizerOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<SentencePieceTokenizerOp, TensorOp, std::shared_ptr<SentencePieceTokenizerOp>>( | |||||
| *m, "SentencePieceTokenizerOp") | |||||
| .def( | |||||
| py::init<std::shared_ptr<SentencePieceVocab> &, const SPieceTokenizerLoadType, const SPieceTokenizerOutType>()) | |||||
| .def(py::init<const std::string &, const std::string &, const SPieceTokenizerLoadType, | |||||
| const SPieceTokenizerOutType>()); | |||||
| })); | |||||
| PYBIND_REGISTER(ToNumberOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<ToNumberOp, TensorOp, std::shared_ptr<ToNumberOp>>(*m, "ToNumberOp") | |||||
| .def(py::init<DataType>()) | |||||
| .def(py::init<std::string>()); | |||||
| })); | |||||
| PYBIND_REGISTER(TruncateSequencePairOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<TruncateSequencePairOp, TensorOp, std::shared_ptr<TruncateSequencePairOp>>( | |||||
| *m, "TruncateSequencePairOp") | |||||
| .def(py::init<int64_t>()); | |||||
| })); | |||||
| PYBIND_REGISTER(JiebaMode, 0, ([](const py::module *m) { | |||||
| (void)py::enum_<JiebaMode>(*m, "JiebaMode", py::arithmetic()) | |||||
| .value("DE_JIEBA_MIX", JiebaMode::kMix) | |||||
| .value("DE_JIEBA_MP", JiebaMode::kMp) | |||||
| .value("DE_JIEBA_HMM", JiebaMode::kHmm) | |||||
| .export_values(); | |||||
| })); | |||||
| PYBIND_REGISTER(SPieceTokenizerOutType, 0, ([](const py::module *m) { | |||||
| (void)py::enum_<SPieceTokenizerOutType>(*m, "SPieceTokenizerOutType", py::arithmetic()) | |||||
| .value("DE_SPIECE_TOKENIZER_OUTTYPE_KString", SPieceTokenizerOutType::kString) | |||||
| .value("DE_SPIECE_TOKENIZER_OUTTYPE_KINT", SPieceTokenizerOutType::kInt) | |||||
| .export_values(); | |||||
| })); | |||||
| PYBIND_REGISTER(SPieceTokenizerLoadType, 0, ([](const py::module *m) { | |||||
| (void)py::enum_<SPieceTokenizerLoadType>(*m, "SPieceTokenizerLoadType", py::arithmetic()) | |||||
| .value("DE_SPIECE_TOKENIZER_LOAD_KFILE", SPieceTokenizerLoadType::kFile) | |||||
| .value("DE_SPIECE_TOKENIZER_LOAD_KMODEL", SPieceTokenizerLoadType::kModel) | |||||
| .export_values(); | |||||
| })); | |||||
| } // namespace dataset | |||||
| } // namespace mindspore | |||||
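For reference, the deleted file above bound the executable kernel classes (TensorOp) directly into the Python extension, so the Python wrappers constructed a ready-to-run C++ kernel in `__init__`. A minimal sketch of that old surface, assuming the extension is imported under its usual alias (`import mindspore._c_dataengine as cde`) and using only constructors visible in the removed PYBIND_REGISTER blocks:

```python
# Sketch of the pre-refactor (eager) binding surface -- not the current API.
import mindspore._c_dataengine as cde  # assumed extension module alias

to_number = cde.ToNumberOp("int32")   # matches .def(py::init<std::string>()) above
sliding = cde.SlidingWindowOp(3, 0)   # matches .def(py::init<uint32_t, int32_t>()) above
# ICU-guarded ops such as cde.CaseFoldOp() only exist in builds with ENABLE_ICU4C.
# Each object is already a TensorOp, so the old Python wrappers simply subclassed it
# (e.g. class ToNumber(cde.ToNumberOp)) and called super().__init__(...) eagerly.
```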
| @@ -0,0 +1,267 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "pybind11/pybind11.h" | |||||
| #include "pybind11/stl.h" | |||||
| #include "pybind11/stl_bind.h" | |||||
| #include "minddata/dataset/api/python/pybind_register.h" | |||||
| #include "minddata/dataset/include/text.h" | |||||
| #include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h" | |||||
| #include "minddata/dataset/text/sentence_piece_vocab.h" | |||||
| #include "minddata/dataset/text/vocab.h" | |||||
| namespace mindspore { | |||||
| namespace dataset { | |||||
| #ifdef ENABLE_ICU4C | |||||
| PYBIND_REGISTER( | |||||
| BasicTokenizerOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::BasicTokenizerOperation, TensorOperation, std::shared_ptr<text::BasicTokenizerOperation>>( | |||||
| *m, "BasicTokenizerOperation") | |||||
| .def(py::init([](bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, | |||||
| bool preserve_unused_token, bool with_offsets) { | |||||
| auto basic_tokenizer = std::make_shared<text::BasicTokenizerOperation>( | |||||
| lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets); | |||||
| THROW_IF_ERROR(basic_tokenizer->ValidateParams()); | |||||
| return basic_tokenizer; | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER( | |||||
| BertTokenizerOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::BertTokenizerOperation, TensorOperation, std::shared_ptr<text::BertTokenizerOperation>>( | |||||
| *m, "BertTokenizerOperation") | |||||
| .def(py::init([](const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator, | |||||
| int32_t max_bytes_per_token, const std::string &unknown_token, bool lower_case, | |||||
| bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token, | |||||
| bool with_offsets) { | |||||
| auto bert_tokenizer = std::make_shared<text::BertTokenizerOperation>( | |||||
| vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case, keep_whitespace, normalize_form, | |||||
| preserve_unused_token, with_offsets); | |||||
| THROW_IF_ERROR(bert_tokenizer->ValidateParams()); | |||||
| return bert_tokenizer; | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER(CaseFoldOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::CaseFoldOperation, TensorOperation, std::shared_ptr<text::CaseFoldOperation>>( | |||||
| *m, "CaseFoldOperation") | |||||
| .def(py::init([]() { | |||||
| auto case_fold = std::make_shared<text::CaseFoldOperation>(); | |||||
| THROW_IF_ERROR(case_fold->ValidateParams()); | |||||
| return case_fold; | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER( | |||||
| NormalizeUTF8Operation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::NormalizeUTF8Operation, TensorOperation, std::shared_ptr<text::NormalizeUTF8Operation>>( | |||||
| *m, "NormalizeUTF8Operation") | |||||
| .def(py::init([](NormalizeForm normalize_form) { | |||||
| auto normalize_utf8 = std::make_shared<text::NormalizeUTF8Operation>(normalize_form); | |||||
| THROW_IF_ERROR(normalize_utf8->ValidateParams()); | |||||
| return normalize_utf8; | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER( | |||||
| RegexReplaceOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::RegexReplaceOperation, TensorOperation, std::shared_ptr<text::RegexReplaceOperation>>( | |||||
| *m, "RegexReplaceOperation") | |||||
| .def(py::init([](std::string pattern, std::string replace, bool replace_all) { | |||||
| auto regex_replace = std::make_shared<text::RegexReplaceOperation>(pattern, replace, replace_all); | |||||
| THROW_IF_ERROR(regex_replace->ValidateParams()); | |||||
| return regex_replace; | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER( | |||||
| RegexTokenizerOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::RegexTokenizerOperation, TensorOperation, std::shared_ptr<text::RegexTokenizerOperation>>( | |||||
| *m, "RegexTokenizerOperation") | |||||
| .def( | |||||
| py::init([](const std::string &delim_pattern, const std::string &keep_delim_pattern, const bool &with_offsets) { | |||||
| auto regex_tokenizer = | |||||
| std::make_shared<text::RegexTokenizerOperation>(delim_pattern, keep_delim_pattern, with_offsets); | |||||
| THROW_IF_ERROR(regex_tokenizer->ValidateParams()); | |||||
| return regex_tokenizer; | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER(UnicodeScriptTokenizerOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::UnicodeScriptTokenizerOperation, TensorOperation, | |||||
| std::shared_ptr<text::UnicodeScriptTokenizerOperation>>( | |||||
| *m, "UnicodeScriptTokenizerOperation") | |||||
| .def(py::init([](bool keep_whitespace, bool with_offsets) { | |||||
| auto unicode_script_tokenizer = | |||||
| std::make_shared<text::UnicodeScriptTokenizerOperation>(keep_whitespace, with_offsets); | |||||
| THROW_IF_ERROR(unicode_script_tokenizer->ValidateParams()); | |||||
| return unicode_script_tokenizer; | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER(WhitespaceTokenizerOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::WhitespaceTokenizerOperation, TensorOperation, | |||||
| std::shared_ptr<text::WhitespaceTokenizerOperation>>(*m, | |||||
| "WhitespaceTokenizerOperation") | |||||
| .def(py::init([](bool with_offsets) { | |||||
| auto whitespace_tokenizer = std::make_shared<text::WhitespaceTokenizerOperation>(with_offsets); | |||||
| THROW_IF_ERROR(whitespace_tokenizer->ValidateParams()); | |||||
| return whitespace_tokenizer; | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER(NormalizeForm, 0, ([](const py::module *m) { | |||||
| (void)py::enum_<NormalizeForm>(*m, "NormalizeForm", py::arithmetic()) | |||||
| .value("DE_NORMALIZE_NONE", NormalizeForm::kNone) | |||||
| .value("DE_NORMALIZE_NFC", NormalizeForm::kNfc) | |||||
| .value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc) | |||||
| .value("DE_NORMALIZE_NFD", NormalizeForm::kNfd) | |||||
| .value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd) | |||||
| .export_values(); | |||||
| })); | |||||
| #endif | |||||
| PYBIND_REGISTER( | |||||
| JiebaTokenizerOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::JiebaTokenizerOperation, TensorOperation, std::shared_ptr<text::JiebaTokenizerOperation>>( | |||||
| *m, "JiebaTokenizerOperation") | |||||
| .def( | |||||
| py::init([](const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode, bool with_offsets) { | |||||
| auto jieba_tokenizer = std::make_shared<text::JiebaTokenizerOperation>(hmm_path, mp_path, mode, with_offsets); | |||||
| THROW_IF_ERROR(jieba_tokenizer->ValidateParams()); | |||||
| return jieba_tokenizer; | |||||
| })) | |||||
| .def("add_word", [](text::JiebaTokenizerOperation &self, const std::string word, int64_t freq) { | |||||
| THROW_IF_ERROR(self.AddWord(word, freq)); | |||||
| }); | |||||
| })); | |||||
| PYBIND_REGISTER(LookupOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::LookupOperation, TensorOperation, std::shared_ptr<text::LookupOperation>>( | |||||
| *m, "LookupOperation") | |||||
| .def(py::init([](const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token, | |||||
| const std::string &data_type) { | |||||
| auto lookup = std::make_shared<text::LookupOperation>(vocab, unknown_token, data_type); | |||||
| THROW_IF_ERROR(lookup->ValidateParams()); | |||||
| return lookup; | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER(NgramOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::NgramOperation, TensorOperation, std::shared_ptr<text::NgramOperation>>( | |||||
| *m, "NgramOperation") | |||||
| .def( | |||||
| py::init([](const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad, | |||||
| const std::pair<std::string, int32_t> &right_pad, const std::string &separator) { | |||||
| auto ngram = std::make_shared<text::NgramOperation>(ngrams, left_pad, right_pad, separator); | |||||
| THROW_IF_ERROR(ngram->ValidateParams()); | |||||
| return ngram; | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER( | |||||
| SentencePieceTokenizerOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::SentencePieceTokenizerOperation, TensorOperation, | |||||
| std::shared_ptr<text::SentencePieceTokenizerOperation>>(*m, "SentencePieceTokenizerOperation") | |||||
| .def(py::init([](const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type) { | |||||
| auto SentencePieceTokenizer = std::make_shared<text::SentencePieceTokenizerOperation>(vocab, out_type); | |||||
| THROW_IF_ERROR(SentencePieceTokenizer->ValidateParams()); | |||||
| return SentencePieceTokenizer; | |||||
| })) | |||||
| .def(py::init([](const std::string &vocab_path, SPieceTokenizerOutType out_type) { | |||||
| auto sentence_piece_tokenizer = std::make_shared<text::SentencePieceTokenizerOperation>(vocab_path, out_type); | |||||
| THROW_IF_ERROR(sentence_piece_tokenizer->ValidateParams()); | |||||
| return sentence_piece_tokenizer; | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER( | |||||
| SlidingWindowOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::SlidingWindowOperation, TensorOperation, std::shared_ptr<text::SlidingWindowOperation>>( | |||||
| *m, "SlidingWindowOperation") | |||||
| .def(py::init([](const int32_t width, const int32_t axis) { | |||||
| auto sliding_window = std::make_shared<text::SlidingWindowOperation>(width, axis); | |||||
| THROW_IF_ERROR(sliding_window->ValidateParams()); | |||||
| return sliding_window; | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER(ToNumberOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::ToNumberOperation, TensorOperation, std::shared_ptr<text::ToNumberOperation>>( | |||||
| *m, "ToNumberOperation") | |||||
| .def(py::init([](std::string data_type) { | |||||
| auto to_number = std::make_shared<text::ToNumberOperation>(data_type); | |||||
| THROW_IF_ERROR(to_number->ValidateParams()); | |||||
| return to_number; | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER(TruncateSequencePairOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::TruncateSequencePairOperation, TensorOperation, | |||||
| std::shared_ptr<text::TruncateSequencePairOperation>>( | |||||
| *m, "TruncateSequencePairOperation") | |||||
| .def(py::init([](int32_t max_length) { | |||||
| auto truncate_sequence_pair = std::make_shared<text::TruncateSequencePairOperation>(max_length); | |||||
| THROW_IF_ERROR(truncate_sequence_pair->ValidateParams()); | |||||
| return truncate_sequence_pair; | |||||
| })); | |||||
| })); | |||||
| PYBIND_REGISTER(UnicodeCharTokenizerOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::UnicodeCharTokenizerOperation, TensorOperation, | |||||
| std::shared_ptr<text::UnicodeCharTokenizerOperation>>( | |||||
| *m, "UnicodeCharTokenizerOperation") | |||||
| .def(py::init([](bool with_offsets) { | |||||
| auto unicode_char_tokenizer = std::make_shared<text::UnicodeCharTokenizerOperation>(with_offsets); | |||||
| THROW_IF_ERROR(unicode_char_tokenizer->ValidateParams()); | |||||
| return unicode_char_tokenizer; | |||||
| })); | |||||
| })); | |||||
| // TODO(alexyuyue): Need to decouple WordpieceTokenizerOp into WordpieceTokenizerOperation once it is supported in C++ | |||||
| PYBIND_REGISTER(WordpieceTokenizerOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>( | |||||
| *m, "WordpieceTokenizerOp") | |||||
| .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, | |||||
| const bool &>()); | |||||
| })); | |||||
| PYBIND_REGISTER(JiebaMode, 0, ([](const py::module *m) { | |||||
| (void)py::enum_<JiebaMode>(*m, "JiebaMode", py::arithmetic()) | |||||
| .value("DE_JIEBA_MIX", JiebaMode::kMix) | |||||
| .value("DE_JIEBA_MP", JiebaMode::kMp) | |||||
| .value("DE_JIEBA_HMM", JiebaMode::kHmm) | |||||
| .export_values(); | |||||
| })); | |||||
| PYBIND_REGISTER(SPieceTokenizerLoadType, 0, ([](const py::module *m) { | |||||
| (void)py::enum_<SPieceTokenizerLoadType>(*m, "SPieceTokenizerLoadType", py::arithmetic()) | |||||
| .value("DE_SPIECE_TOKENIZER_LOAD_KFILE", SPieceTokenizerLoadType::kFile) | |||||
| .value("DE_SPIECE_TOKENIZER_LOAD_KMODEL", SPieceTokenizerLoadType::kModel) | |||||
| .export_values(); | |||||
| })); | |||||
| PYBIND_REGISTER(SPieceTokenizerOutType, 0, ([](const py::module *m) { | |||||
| (void)py::enum_<SPieceTokenizerOutType>(*m, "SPieceTokenizerOutType", py::arithmetic()) | |||||
| .value("DE_SPIECE_TOKENIZER_OUTTYPE_KString", SPieceTokenizerOutType::kString) | |||||
| .value("DE_SPIECE_TOKENIZER_OUTTYPE_KINT", SPieceTokenizerOutType::kInt) | |||||
| .export_values(); | |||||
| })); | |||||
| } // namespace dataset | |||||
| } // namespace mindspore | |||||
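The replacement file registers the IR-level `text::*Operation` classes instead of the kernels. Each `py::init` is a factory lambda that builds the operation and immediately runs `ValidateParams()`, so invalid arguments are rejected when the Python object is created rather than when the pipeline executes. A rough sketch of the behaviour this gives the Python layer (module alias assumed as above; the failing call is only an assumption about what `ValidateParams()` rejects):

```python
import mindspore._c_dataengine as cde  # assumed extension module alias

ok = cde.SlidingWindowOperation(3, 0)      # ValidateParams() passes inside the init lambda
try:
    cde.TruncateSequencePairOperation(-1)  # assumed invalid: non-positive max_length
except Exception as err:                   # THROW_IF_ERROR surfaces as a Python exception
    print("rejected at construction:", err)
```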
| @@ -314,9 +314,31 @@ Status JiebaTokenizerOperation::ValidateParams() { | |||||
| std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() { | std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() { | ||||
| std::shared_ptr<JiebaTokenizerOp> tensor_op = | std::shared_ptr<JiebaTokenizerOp> tensor_op = | ||||
| std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_); | std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_); | ||||
| for (auto &word : words_list_) { | |||||
| Status rc = tensor_op->AddWord(word.first, word.second); | |||||
| if (rc.IsError()) { | |||||
| MS_LOG(ERROR) << rc; | |||||
| return {}; | |||||
| } | |||||
| } | |||||
| return tensor_op; | return tensor_op; | ||||
| } | } | ||||
| Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) { | |||||
| if (word.empty()) { | |||||
| std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| if (freq < 0) { | |||||
| std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| words_list_.emplace_back(word, freq); | |||||
| return Status::OK(); | |||||
| } | |||||
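`AddWord` never touches the kernel directly; it only appends to `words_list_`, and `Build()` (above) replays the buffered words onto the freshly created `JiebaTokenizerOp`. A small Python analogue of this buffer-then-replay pattern, purely illustrative (the names here are hypothetical, not MindSpore API):

```python
# Hypothetical Python analogue of JiebaTokenizerOperation's deferred AddWord.
class DeferredTokenizer:
    def __init__(self):
        self.words = []                  # mirrors words_list_

    def add_word(self, word, freq=0):
        if not word:
            raise ValueError("word must not be empty")
        if freq < 0:
            raise ValueError("freq must be >= 0")
        self.words.append((word, freq))  # buffered, applied only at build time

    def build(self):
        kernel = {"custom_words": {}}    # stand-in for the JiebaTokenizerOp kernel
        for word, freq in self.words:    # replay, like the loop added to Build()
            kernel["custom_words"][word] = freq
        return kernel
```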
| // LookupOperation | // LookupOperation | ||||
| LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token, | LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token, | ||||
| const std::string &data_type) | const std::string &data_type) | ||||
| @@ -330,12 +352,13 @@ Status LookupOperation::ValidateParams() { | |||||
| MS_LOG(ERROR) << err_msg; | MS_LOG(ERROR) << err_msg; | ||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | RETURN_STATUS_SYNTAX_ERROR(err_msg); | ||||
| } | } | ||||
| default_id_ = vocab_->Lookup(unknown_token_); | |||||
| if (default_id_ == Vocab::kNoTokenExists) { | |||||
| std::string err_msg = "Lookup: \"" + unknown_token_ + "\" doesn't exist in vocab."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| if (!unknown_token_.empty()) { | |||||
| default_id_ = vocab_->Lookup(unknown_token_); | |||||
| if (default_id_ == Vocab::kNoTokenExists) { | |||||
| std::string err_msg = "Lookup: \"" + unknown_token_ + "\" doesn't exist in vocab."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| } | } | ||||
| if (!IsTypeNumeric(data_type_)) { | if (!IsTypeNumeric(data_type_)) { | ||||
| @@ -331,11 +331,14 @@ class JiebaTokenizerOperation : public TensorOperation { | |||||
| std::string Name() const override { return kJiebaTokenizerOperation; } | std::string Name() const override { return kJiebaTokenizerOperation; } | ||||
| Status AddWord(const std::string &word, int64_t freq = 0); | |||||
| private: | private: | ||||
| std::string hmm_path_; | std::string hmm_path_; | ||||
| std::string mp_path_; | std::string mp_path_; | ||||
| JiebaMode mode_; | JiebaMode mode_; | ||||
| bool with_offsets_; | bool with_offsets_; | ||||
| std::vector<std::pair<std::string, int64_t>> words_list_; | |||||
| }; | }; | ||||
| class LookupOperation : public TensorOperation { | class LookupOperation : public TensorOperation { | ||||
| @@ -383,3 +383,7 @@ def check_tensor_op(param, param_name): | |||||
| """check whether param is a tensor op or a callable Python function""" | """check whether param is a tensor op or a callable Python function""" | ||||
| if not isinstance(param, cde.TensorOp) and not callable(param) and not getattr(param, 'parse', None): | if not isinstance(param, cde.TensorOp) and not callable(param) and not getattr(param, 'parse', None): | ||||
| raise TypeError("{0} is neither a c_transform op (TensorOperation) nor a callable pyfunc.".format(param_name)) | raise TypeError("{0} is neither a c_transform op (TensorOperation) nor a callable pyfunc.".format(param_name)) | ||||
| def replace_none(value, default): | |||||
| return value if value is not None else default | |||||
| @@ -55,9 +55,10 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che | |||||
| check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset, \ | check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset, \ | ||||
| check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \ | check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \ | ||||
| check_random_dataset, check_split, check_bucket_batch_by_length, check_cluedataset, check_save, check_csvdataset, \ | check_random_dataset, check_split, check_bucket_batch_by_length, check_cluedataset, check_save, check_csvdataset, \ | ||||
| check_paddeddataset, check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send, replace_none | |||||
| check_paddeddataset, check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send | |||||
| from ..core.config import get_callback_timeout, _init_device_info | from ..core.config import get_callback_timeout, _init_device_info | ||||
| from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist | from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist | ||||
| from ..core.validator_helpers import replace_none | |||||
| try: | try: | ||||
| context = import_module("mindspore.context") | context = import_module("mindspore.context") | ||||
| @@ -1323,7 +1323,3 @@ def check_to_device_send(method): | |||||
| return method(self, *args, **kwargs) | return method(self, *args, **kwargs) | ||||
| return new_method | return new_method | ||||
| def replace_none(value, default): | |||||
| return value if value is not None else default | |||||
| @@ -58,9 +58,13 @@ from .validators import check_lookup, check_jieba_add_dict, \ | |||||
| check_wordpiece_tokenizer, check_regex_tokenizer, check_basic_tokenizer, check_ngram, check_pair_truncate, \ | check_wordpiece_tokenizer, check_regex_tokenizer, check_basic_tokenizer, check_ngram, check_pair_truncate, \ | ||||
| check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow | check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow | ||||
| from ..core.datatypes import mstype_to_detype | from ..core.datatypes import mstype_to_detype | ||||
| from ..core.validator_helpers import replace_none | |||||
| class TextTensorOperation: | |||||
| def parse(self): | |||||
| raise NotImplementedError("TextTensorOperation has to implement parse method.") | |||||
| class Lookup(cde.LookupOp): | |||||
| class Lookup(TextTensorOperation): | |||||
| """ | """ | ||||
| Lookup operator that looks up a word to an id. | Lookup operator that looks up a word to an id. | ||||
| @@ -82,10 +86,15 @@ class Lookup(cde.LookupOp): | |||||
| @check_lookup | @check_lookup | ||||
| def __init__(self, vocab, unknown_token=None, data_type=mstype.int32): | def __init__(self, vocab, unknown_token=None, data_type=mstype.int32): | ||||
| super().__init__(vocab, unknown_token, mstype_to_detype(data_type)) | |||||
| self.vocab = vocab | |||||
| self.unknown_token = replace_none(unknown_token, '') | |||||
| self.data_type = data_type | |||||
| def parse(self): | |||||
| return cde.LookupOperation(self.vocab, self.unknown_token, str(mstype_to_detype(self.data_type))) | |||||
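Note the None handling: `unknown_token=None` is stored as `''` via `replace_none`, and on the C++ side (the `LookupOperation::ValidateParams` change in this patch) an empty `unknown_token` skips the default-id lookup, so out-of-vocabulary words are reported when the kernel runs instead of at construction. A hedged usage sketch (the vocabulary contents are made up for illustration):

```python
from mindspore.dataset import text
import mindspore.common.dtype as mstype

vocab = text.Vocab.from_list(["hello", "world", "<unk>"])   # illustrative vocabulary
strict = text.Lookup(vocab, unknown_token="<unk>", data_type=mstype.int32)
lenient = text.Lookup(vocab)   # unknown_token=None -> '' -> no default id configured
ir_node = strict.parse()       # builds cde.LookupOperation lazily
```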
| class SlidingWindow(cde.SlidingWindowOp): | |||||
| class SlidingWindow(TextTensorOperation): | |||||
| """ | """ | ||||
| TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension axis | TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension axis | ||||
| is a slice of data starting at the corresponding position, with a specified width. | is a slice of data starting at the corresponding position, with a specified width. | ||||
| @@ -114,10 +123,14 @@ class SlidingWindow(cde.SlidingWindowOp): | |||||
| @check_slidingwindow | @check_slidingwindow | ||||
| def __init__(self, width, axis=0): | def __init__(self, width, axis=0): | ||||
| super().__init__(width, axis) | |||||
| self.width = width | |||||
| self.axis = axis | |||||
| def parse(self): | |||||
| return cde.SlidingWindowOperation(self.width, self.axis) | |||||
| class Ngram(cde.NgramOp): | |||||
| class Ngram(TextTensorOperation): | |||||
| """ | """ | ||||
| TensorOp to generate n-gram from a 1-D string Tensor. | TensorOp to generate n-gram from a 1-D string Tensor. | ||||
| @@ -145,7 +158,13 @@ class Ngram(cde.NgramOp): | |||||
| @check_ngram | @check_ngram | ||||
| def __init__(self, n, left_pad=("", 0), right_pad=("", 0), separator=" "): | def __init__(self, n, left_pad=("", 0), right_pad=("", 0), separator=" "): | ||||
| super().__init__(n, left_pad[1], right_pad[1], left_pad[0], right_pad[0], separator) | |||||
| self.ngrams = n | |||||
| self.left_pad = left_pad | |||||
| self.right_pad = right_pad | |||||
| self.separator = separator | |||||
| def parse(self): | |||||
| return cde.NgramOperation(self.ngrams, self.left_pad, self.right_pad, self.separator) | |||||
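The Ngram wrapper changes shape as well: the old binding took the pad widths and pad tokens as four separate positional arguments, while `NgramOperation` takes the `(token, width)` pairs whole. A short usage sketch based on the signature shown in this diff:

```python
from mindspore.dataset import text

# 2-grams and 3-grams, padded on both sides with "_" of width 1, joined by a space.
ngram_op = text.Ngram([2, 3], left_pad=("_", 1), right_pad=("_", 1), separator=" ")
ir_node = ngram_op.parse()   # -> cde.NgramOperation([2, 3], ("_", 1), ("_", 1), " ")
```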
| DE_C_INTER_JIEBA_MODE = { | DE_C_INTER_JIEBA_MODE = { | ||||
| @@ -155,7 +174,7 @@ DE_C_INTER_JIEBA_MODE = { | |||||
| } | } | ||||
| class JiebaTokenizer(cde.JiebaTokenizerOp): | |||||
| class JiebaTokenizer(TextTensorOperation): | |||||
| """ | """ | ||||
| Tokenize Chinese string into words based on dictionary. | Tokenize Chinese string into words based on dictionary. | ||||
| @@ -196,11 +215,19 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): | |||||
| self.mode = mode | self.mode = mode | ||||
| self.__check_path__(hmm_path) | self.__check_path__(hmm_path) | ||||
| self.hmm_path = hmm_path | |||||
| self.__check_path__(mp_path) | self.__check_path__(mp_path) | ||||
| self.mp_path = mp_path | |||||
| self.with_offsets = with_offsets | self.with_offsets = with_offsets | ||||
| super().__init__(hmm_path, mp_path, | |||||
| DE_C_INTER_JIEBA_MODE[mode], | |||||
| self.with_offsets) | |||||
| self.words = [] | |||||
| def parse(self): | |||||
| jieba_tokenizer = cde.JiebaTokenizerOperation(self.hmm_path, self.mp_path, | |||||
| DE_C_INTER_JIEBA_MODE[self.mode], | |||||
| self.with_offsets) | |||||
| for word in self.words: | |||||
| jieba_tokenizer.add_word(word[0], word[1]) | |||||
| return jieba_tokenizer | |||||
| @check_jieba_add_word | @check_jieba_add_word | ||||
| def add_word(self, word, freq=None): | def add_word(self, word, freq=None): | ||||
| @@ -225,9 +252,9 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): | |||||
| """ | """ | ||||
| if freq is None: | if freq is None: | ||||
| super().add_word(word, 0) | |||||
| self.words.append((word, 0)) | |||||
| else: | else: | ||||
| super().add_word(word, freq) | |||||
| self.words.append((word, freq)) | |||||
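Because the wrapper no longer owns a live kernel, `add_word` only records `(word, freq)` tuples; `parse()` creates the `JiebaTokenizerOperation` and replays them through its `add_word` binding (backed by `JiebaTokenizerOperation::AddWord` in C++). A usage sketch; the dictionary paths are placeholders for real jieba model files:

```python
from mindspore.dataset import text

tokenizer = text.JiebaTokenizer("hmm_model.utf8", "jieba.dict.utf8",
                                mode=text.JiebaMode.MIX)   # paths are placeholders
tokenizer.add_word("MindSpore")        # buffered as ("MindSpore", 0)
tokenizer.add_word("昇思", freq=100)    # buffered with an explicit frequency
ir_node = tokenizer.parse()            # builds the IR op, then replays both words
```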
| @check_jieba_add_dict | @check_jieba_add_dict | ||||
| def add_dict(self, user_dict): | def add_dict(self, user_dict): | ||||
| @@ -308,7 +335,7 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): | |||||
| " jieba mode file {} is not exist.".format(model_path)) | " jieba mode file {} is not exist.".format(model_path)) | ||||
| class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp): | |||||
| class UnicodeCharTokenizer(TextTensorOperation): | |||||
| """ | """ | ||||
| Tokenize a scalar tensor of UTF-8 string to Unicode characters. | Tokenize a scalar tensor of UTF-8 string to Unicode characters. | ||||
| @@ -332,9 +359,12 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp): | |||||
| @check_with_offsets | @check_with_offsets | ||||
| def __init__(self, with_offsets=False): | def __init__(self, with_offsets=False): | ||||
| self.with_offsets = with_offsets | self.with_offsets = with_offsets | ||||
| super().__init__(self.with_offsets) | |||||
| def parse(self): | |||||
| return cde.UnicodeCharTokenizerOperation(self.with_offsets) | |||||
| # TODO(alexyuyue): Need to decouple WordpieceTokenizerOp into WordpieceTokenizerOperation once it is supported in C++ | |||||
| class WordpieceTokenizer(cde.WordpieceTokenizerOp): | class WordpieceTokenizer(cde.WordpieceTokenizerOp): | ||||
| """ | """ | ||||
| Tokenize scalar token or 1-D tokens to 1-D subword tokens. | Tokenize scalar token or 1-D tokens to 1-D subword tokens. | ||||
| @@ -386,7 +416,7 @@ DE_C_INTER_SENTENCEPIECE_OUTTYPE = { | |||||
| } | } | ||||
| class SentencePieceTokenizer(cde.SentencePieceTokenizerOp): | |||||
| class SentencePieceTokenizer(TextTensorOperation): | |||||
| """ | """ | ||||
| Tokenize scalar token or 1-D tokens to tokens by sentencepiece. | Tokenize scalar token or 1-D tokens to tokens by sentencepiece. | ||||
| @@ -404,19 +434,15 @@ class SentencePieceTokenizer(cde.SentencePieceTokenizerOp): | |||||
| """ | """ | ||||
| def __init__(self, mode, out_type): | def __init__(self, mode, out_type): | ||||
| self.mode = mode | |||||
| self.out_type = out_type | self.out_type = out_type | ||||
| if isinstance(mode, str): | |||||
| model_path, model_filename = os.path.split(mode) | |||||
| super().__init__(model_path, model_filename, | |||||
| DE_C_INTER_SENTENCEPIECE_LOADTYPE[SPieceTokenizerLoadType.FILE], | |||||
| DE_C_INTER_SENTENCEPIECE_OUTTYPE[out_type]) | |||||
| elif isinstance(mode, cde.SentencePieceVocab): | |||||
| super().__init__(mode, DE_C_INTER_SENTENCEPIECE_LOADTYPE[SPieceTokenizerLoadType.MODEL], | |||||
| DE_C_INTER_SENTENCEPIECE_OUTTYPE[out_type]) | |||||
| def parse(self): | |||||
| return cde.SentencePieceTokenizerOperation(self.mode, DE_C_INTER_SENTENCEPIECE_OUTTYPE[self.out_type]) | |||||
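`SentencePieceTokenizer` now stores `mode` unchanged and passes it straight to the binding, which has two overloads (one for a `SentencePieceVocab` object, one for a model-file path), so the Python-side `isinstance` branching and the load-type enum are no longer needed here. A usage sketch, assuming `SPieceTokenizerOutType` is exported from `mindspore.dataset.text`; the model file name is a placeholder:

```python
from mindspore.dataset import text
from mindspore.dataset.text import SPieceTokenizerOutType

# "m.model" is a placeholder for a trained sentencepiece model file;
# passing a SentencePieceVocab object instead selects the other C++ overload.
tokenizer = text.SentencePieceTokenizer("m.model", out_type=SPieceTokenizerOutType.STRING)
ir_node = tokenizer.parse()   # -> cde.SentencePieceTokenizerOperation(mode, out_type)
```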
| if platform.system().lower() != 'windows': | if platform.system().lower() != 'windows': | ||||
| class WhitespaceTokenizer(cde.WhitespaceTokenizerOp): | |||||
| class WhitespaceTokenizer(TextTensorOperation): | |||||
| """ | """ | ||||
| Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces, such as: ' ', '\\\\t', '\\\\r', '\\\\n'. | Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces, such as: ' ', '\\\\t', '\\\\r', '\\\\n'. | ||||
| @@ -444,10 +470,12 @@ if platform.system().lower() != 'windows': | |||||
| @check_with_offsets | @check_with_offsets | ||||
| def __init__(self, with_offsets=False): | def __init__(self, with_offsets=False): | ||||
| self.with_offsets = with_offsets | self.with_offsets = with_offsets | ||||
| super().__init__(self.with_offsets) | |||||
| def parse(self): | |||||
| return cde.WhitespaceTokenizerOperation(self.with_offsets) | |||||
| class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp): | |||||
| class UnicodeScriptTokenizer(TextTensorOperation): | |||||
| """ | """ | ||||
| Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries. | Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries. | ||||
| @@ -475,12 +503,16 @@ if platform.system().lower() != 'windows': | |||||
| @check_unicode_script_tokenizer | @check_unicode_script_tokenizer | ||||
| def __init__(self, keep_whitespace=False, with_offsets=False): | def __init__(self, keep_whitespace=False, with_offsets=False): | ||||
| keep_whitespace = replace_none(keep_whitespace, False) | |||||
| with_offsets = replace_none(with_offsets, False) | |||||
| self.keep_whitespace = keep_whitespace | self.keep_whitespace = keep_whitespace | ||||
| self.with_offsets = with_offsets | self.with_offsets = with_offsets | ||||
| super().__init__(self.keep_whitespace, self.with_offsets) | |||||
| def parse(self): | |||||
| return cde.UnicodeScriptTokenizerOperation(self.keep_whitespace, self.with_offsets) | |||||
| class CaseFold(cde.CaseFoldOp): | |||||
| class CaseFold(TextTensorOperation): | |||||
| """ | """ | ||||
| Apply case fold operation on UTF-8 string tensor. | Apply case fold operation on UTF-8 string tensor. | ||||
| @@ -494,6 +526,9 @@ if platform.system().lower() != 'windows': | |||||
| >>> data1 = data1.map(operations=case_op) | >>> data1 = data1.map(operations=case_op) | ||||
| """ | """ | ||||
| def parse(self): | |||||
| return cde.CaseFoldOperation() | |||||
| DE_C_INTER_NORMALIZE_FORM = { | DE_C_INTER_NORMALIZE_FORM = { | ||||
| NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE, | NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE, | ||||
| @@ -504,7 +539,7 @@ if platform.system().lower() != 'windows': | |||||
| } | } | ||||
| class NormalizeUTF8(cde.NormalizeUTF8Op): | |||||
| class NormalizeUTF8(TextTensorOperation): | |||||
| """ | """ | ||||
| Apply normalize operation on UTF-8 string tensor. | Apply normalize operation on UTF-8 string tensor. | ||||
| @@ -534,11 +569,14 @@ if platform.system().lower() != 'windows': | |||||
| if not isinstance(normalize_form, NormalizeForm): | if not isinstance(normalize_form, NormalizeForm): | ||||
| raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.") | raise TypeError("Wrong input type for normalization_form, should be enum of 'NormalizeForm'.") | ||||
| normalize_form = replace_none(normalize_form, NormalizeForm.NFKC) | |||||
| self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form] | self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form] | ||||
| super().__init__(self.normalize_form) | |||||
| def parse(self): | |||||
| return cde.NormalizeUTF8Operation(self.normalize_form) | |||||
| class RegexReplace(cde.RegexReplaceOp): | |||||
| class RegexReplace(TextTensorOperation): | |||||
| """ | """ | ||||
| Replace UTF-8 string tensor with 'replace' according to regular expression 'pattern'. | Replace UTF-8 string tensor with 'replace' according to regular expression 'pattern'. | ||||
| @@ -566,10 +604,12 @@ if platform.system().lower() != 'windows': | |||||
| self.pattern = pattern | self.pattern = pattern | ||||
| self.replace = replace | self.replace = replace | ||||
| self.replace_all = replace_all | self.replace_all = replace_all | ||||
| super().__init__(self.pattern, self.replace, self.replace_all) | |||||
| def parse(self): | |||||
| return cde.RegexReplaceOperation(self.pattern, self.replace, self.replace_all) | |||||
| class RegexTokenizer(cde.RegexTokenizerOp): | |||||
| class RegexTokenizer(TextTensorOperation): | |||||
| """ | """ | ||||
| Tokenize a scalar tensor of UTF-8 string by regex expression pattern. | Tokenize a scalar tensor of UTF-8 string by regex expression pattern. | ||||
| @@ -606,10 +646,12 @@ if platform.system().lower() != 'windows': | |||||
| self.delim_pattern = delim_pattern | self.delim_pattern = delim_pattern | ||||
| self.keep_delim_pattern = keep_delim_pattern | self.keep_delim_pattern = keep_delim_pattern | ||||
| self.with_offsets = with_offsets | self.with_offsets = with_offsets | ||||
| super().__init__(self.delim_pattern, self.keep_delim_pattern, self.with_offsets) | |||||
| def parse(self): | |||||
| return cde.RegexTokenizerOperation(self.delim_pattern, self.keep_delim_pattern, self.with_offsets) | |||||
| class BasicTokenizer(cde.BasicTokenizerOp): | |||||
| class BasicTokenizer(TextTensorOperation): | |||||
| """ | """ | ||||
| Tokenize a scalar tensor of UTF-8 string by specific rules. | Tokenize a scalar tensor of UTF-8 string by specific rules. | ||||
| @@ -661,11 +703,13 @@ if platform.system().lower() != 'windows': | |||||
| self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] | self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] | ||||
| self.preserve_unused_token = preserve_unused_token | self.preserve_unused_token = preserve_unused_token | ||||
| self.with_offsets = with_offsets | self.with_offsets = with_offsets | ||||
| super().__init__(self.lower_case, self.keep_whitespace, self.normalization_form, | |||||
| self.preserve_unused_token, self.with_offsets) | |||||
| def parse(self): | |||||
| return cde.BasicTokenizerOperation(self.lower_case, self.keep_whitespace, self.normalization_form, | |||||
| self.preserve_unused_token, self.with_offsets) | |||||
| class BertTokenizer(cde.BertTokenizerOp): | |||||
| class BertTokenizer(TextTensorOperation): | |||||
| """ | """ | ||||
| Tokenizer used for Bert text process. | Tokenizer used for Bert text process. | ||||
| @@ -725,12 +769,14 @@ if platform.system().lower() != 'windows': | |||||
| self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] | self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] | ||||
| self.preserve_unused_token = preserve_unused_token | self.preserve_unused_token = preserve_unused_token | ||||
| self.with_offsets = with_offsets | self.with_offsets = with_offsets | ||||
| super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token, | |||||
| self.lower_case, self.keep_whitespace, self.normalization_form, | |||||
| self.preserve_unused_token, self.with_offsets) | |||||
| def parse(self): | |||||
| return cde.BertTokenizerOperation(self.vocab, self.suffix_indicator, self.max_bytes_per_token, | |||||
| self.unknown_token, self.lower_case, self.keep_whitespace, | |||||
| self.normalization_form, self.preserve_unused_token, self.with_offsets) | |||||
| class TruncateSequencePair(cde.TruncateSequencePairOp): | |||||
| class TruncateSequencePair(TextTensorOperation): | |||||
| """ | """ | ||||
| Truncate a pair of rank-1 tensors such that the total length is less than max_length. | Truncate a pair of rank-1 tensors such that the total length is less than max_length. | ||||
| @@ -757,10 +803,13 @@ class TruncateSequencePair(cde.TruncateSequencePairOp): | |||||
| @check_pair_truncate | @check_pair_truncate | ||||
| def __init__(self, max_length): | def __init__(self, max_length): | ||||
| super().__init__(max_length) | |||||
| self.max_length = max_length | |||||
| def parse(self): | |||||
| return cde.TruncateSequencePairOperation(self.max_length) | |||||
| class ToNumber(cde.ToNumberOp): | |||||
| class ToNumber(TextTensorOperation): | |||||
| """ | """ | ||||
| Tensor operation to convert every element of a string tensor to a number. | Tensor operation to convert every element of a string tensor to a number. | ||||
| @@ -789,7 +838,9 @@ class ToNumber(cde.ToNumberOp): | |||||
| def __init__(self, data_type): | def __init__(self, data_type): | ||||
| data_type = mstype_to_detype(data_type) | data_type = mstype_to_detype(data_type) | ||||
| self.data_type = str(data_type) | self.data_type = str(data_type) | ||||
| super().__init__(data_type) | |||||
| def parse(self): | |||||
| return cde.ToNumberOperation(self.data_type) | |||||
| class PythonTokenizer: | class PythonTokenizer: | ||||
| @@ -81,11 +81,11 @@ def parse_padding(padding): | |||||
| padding = tuple(padding) | padding = tuple(padding) | ||||
| return padding | return padding | ||||
| class TensorOperation: | |||||
| class ImageTensorOperation: | |||||
| def parse(self): | def parse(self): | ||||
| raise NotImplementedError("TensorOperation has to implement parse method.") | |||||
| raise NotImplementedError("ImageTensorOperation has to implement parse method.") | |||||
| class AutoContrast(TensorOperation): | |||||
| class AutoContrast(ImageTensorOperation): | |||||
| """ | """ | ||||
| Apply automatic contrast on input image. | Apply automatic contrast on input image. | ||||
| @@ -112,7 +112,7 @@ class AutoContrast(TensorOperation): | |||||
| return cde.AutoContrastOperation(self.cutoff, self.ignore) | return cde.AutoContrastOperation(self.cutoff, self.ignore) | ||||
| class RandomSharpness(TensorOperation): | |||||
| class RandomSharpness(ImageTensorOperation): | |||||
| """ | """ | ||||
| Adjust the sharpness of the input image by a fixed or random degree. Degree of 0.0 gives a blurred image, | Adjust the sharpness of the input image by a fixed or random degree. Degree of 0.0 gives a blurred image, | ||||
| degree of 1.0 gives the original image, and degree of 2.0 gives a sharpened image. | degree of 1.0 gives the original image, and degree of 2.0 gives a sharpened image. | ||||
| @@ -140,7 +140,7 @@ class RandomSharpness(TensorOperation): | |||||
| return cde.RandomSharpnessOperation(self.degrees) | return cde.RandomSharpnessOperation(self.degrees) | ||||
| class Equalize(TensorOperation): | |||||
| class Equalize(ImageTensorOperation): | |||||
| """ | """ | ||||
| Apply histogram equalization on input image. | Apply histogram equalization on input image. | ||||
| @@ -153,7 +153,7 @@ class Equalize(TensorOperation): | |||||
| return cde.EqualizeOperation() | return cde.EqualizeOperation() | ||||
| class Invert(TensorOperation): | |||||
| class Invert(ImageTensorOperation): | |||||
| """ | """ | ||||
| Apply invert on input image in RGB mode. | Apply invert on input image in RGB mode. | ||||
| @@ -166,7 +166,7 @@ class Invert(TensorOperation): | |||||
| return cde.InvertOperation() | return cde.InvertOperation() | ||||
| class Decode(TensorOperation): | |||||
| class Decode(ImageTensorOperation): | |||||
| """ | """ | ||||
| Decode the input image in RGB mode. | Decode the input image in RGB mode. | ||||
| @@ -203,7 +203,7 @@ class Decode(TensorOperation): | |||||
| return cde.DecodeOperation(self.rgb) | return cde.DecodeOperation(self.rgb) | ||||
| class CutMixBatch(TensorOperation): | |||||
| class CutMixBatch(ImageTensorOperation): | |||||
| """ | """ | ||||
| Apply CutMix transformation on input batch of images and labels. | Apply CutMix transformation on input batch of images and labels. | ||||
| Note that you need to make labels into one-hot format and batch before calling this function. | Note that you need to make labels into one-hot format and batch before calling this function. | ||||
| @@ -235,7 +235,7 @@ class CutMixBatch(TensorOperation): | |||||
| return cde.CutMixBatchOperation(DE_C_IMAGE_BATCH_FORMAT[self.image_batch_format], self.alpha, self.prob) | return cde.CutMixBatchOperation(DE_C_IMAGE_BATCH_FORMAT[self.image_batch_format], self.alpha, self.prob) | ||||
| class CutOut(TensorOperation): | |||||
| class CutOut(ImageTensorOperation): | |||||
| """ | """ | ||||
| Randomly cut (mask) out a given number of square patches from the input NumPy image array. | Randomly cut (mask) out a given number of square patches from the input NumPy image array. | ||||
| @@ -258,7 +258,7 @@ class CutOut(TensorOperation): | |||||
| return cde.CutOutOperation(self.length, self.num_patches) | return cde.CutOutOperation(self.length, self.num_patches) | ||||
| class MixUpBatch(TensorOperation): | |||||
| class MixUpBatch(ImageTensorOperation): | |||||
| """ | """ | ||||
| Apply MixUp transformation on input batch of images and labels. Each image is multiplied by a random weight (lambda) | Apply MixUp transformation on input batch of images and labels. Each image is multiplied by a random weight (lambda) | ||||
| and then added to a randomly selected image from the batch multiplied by (1 - lambda). The same formula is also | and then added to a randomly selected image from the batch multiplied by (1 - lambda). The same formula is also | ||||
| @@ -286,7 +286,7 @@ class MixUpBatch(TensorOperation): | |||||
| return cde.MixUpBatchOperation(self.alpha) | return cde.MixUpBatchOperation(self.alpha) | ||||
| class Normalize(TensorOperation): | |||||
| class Normalize(ImageTensorOperation): | |||||
| """ | """ | ||||
| Normalize the input image with respect to mean and standard deviation. | Normalize the input image with respect to mean and standard deviation. | ||||
| @@ -333,7 +333,7 @@ class Normalize(TensorOperation): | |||||
| return cde.NormalizeOperation(self.mean, self.std) | return cde.NormalizeOperation(self.mean, self.std) | ||||
| class NormalizePad(TensorOperation): | |||||
| class NormalizePad(ImageTensorOperation): | |||||
| """ | """ | ||||
| Normalize the input image with respect to mean and standard deviation then pad an extra channel with value zero. | Normalize the input image with respect to mean and standard deviation then pad an extra channel with value zero. | ||||
| @@ -380,7 +380,7 @@ class NormalizePad(TensorOperation): | |||||
| return cde.NormalizePadOperation(self.mean, self.std, self.dtype) | return cde.NormalizePadOperation(self.mean, self.std, self.dtype) | ||||
| class RandomAffine(TensorOperation): | |||||
| class RandomAffine(ImageTensorOperation): | |||||
| """ | """ | ||||
| Apply Random affine transformation to the input image. | Apply Random affine transformation to the input image. | ||||
| @@ -486,7 +486,7 @@ class RandomAffine(TensorOperation): | |||||
| self.fill_value) | self.fill_value) | ||||
| class RandomCrop(TensorOperation): | |||||
| class RandomCrop(ImageTensorOperation): | |||||
| """ | """ | ||||
| Crop the input image at a random location. | Crop the input image at a random location. | ||||
| @@ -551,7 +551,7 @@ class RandomCrop(TensorOperation): | |||||
| return cde.RandomCropOperation(self.size, self.padding, self.pad_if_needed, self.fill_value, border_type) | return cde.RandomCropOperation(self.size, self.padding, self.pad_if_needed, self.fill_value, border_type) | ||||
| class RandomCropWithBBox(TensorOperation): | |||||
| class RandomCropWithBBox(ImageTensorOperation): | |||||
| """ | """ | ||||
| Crop the input image at a random location and adjust bounding boxes accordingly. | Crop the input image at a random location and adjust bounding boxes accordingly. | ||||
| @@ -615,7 +615,7 @@ class RandomCropWithBBox(TensorOperation): | |||||
| border_type) | border_type) | ||||
| class RandomHorizontalFlip(TensorOperation): | |||||
| class RandomHorizontalFlip(ImageTensorOperation): | |||||
| """ | """ | ||||
| Flip the input image horizontally, randomly with a given probability. | Flip the input image horizontally, randomly with a given probability. | ||||
| @@ -636,7 +636,7 @@ class RandomHorizontalFlip(TensorOperation): | |||||
| return cde.RandomHorizontalFlipOperation(self.prob) | return cde.RandomHorizontalFlipOperation(self.prob) | ||||
| class RandomHorizontalFlipWithBBox(TensorOperation): | |||||
| class RandomHorizontalFlipWithBBox(ImageTensorOperation): | |||||
| """ | """ | ||||
| Flip the input image horizontally, randomly with a given probability and adjust bounding boxes accordingly. | Flip the input image horizontally, randomly with a given probability and adjust bounding boxes accordingly. | ||||
| @@ -657,7 +657,7 @@ class RandomHorizontalFlipWithBBox(TensorOperation): | |||||
| return cde.RandomHorizontalFlipWithBBoxOperation(self.prob) | return cde.RandomHorizontalFlipWithBBoxOperation(self.prob) | ||||
| class RandomPosterize(TensorOperation): | |||||
| class RandomPosterize(ImageTensorOperation): | |||||
| """ | """ | ||||
| Reduce the number of bits for each color channel. | Reduce the number of bits for each color channel. | ||||
| @@ -685,7 +685,7 @@ class RandomPosterize(TensorOperation): | |||||
| return cde.RandomPosterizeOperation(bits) | return cde.RandomPosterizeOperation(bits) | ||||
| class RandomVerticalFlip(TensorOperation): | |||||
| class RandomVerticalFlip(ImageTensorOperation): | |||||
| """ | """ | ||||
| Flip the input image vertically, randomly with a given probability. | Flip the input image vertically, randomly with a given probability. | ||||
| @@ -706,7 +706,7 @@ class RandomVerticalFlip(TensorOperation): | |||||
| return cde.RandomVerticalFlipOperation(self.prob) | return cde.RandomVerticalFlipOperation(self.prob) | ||||
| class RandomVerticalFlipWithBBox(TensorOperation): | |||||
| class RandomVerticalFlipWithBBox(ImageTensorOperation): | |||||
| """ | """ | ||||
| Flip the input image vertically, randomly with a given probability and adjust bounding boxes accordingly. | Flip the input image vertically, randomly with a given probability and adjust bounding boxes accordingly. | ||||
| @@ -727,7 +727,7 @@ class RandomVerticalFlipWithBBox(TensorOperation): | |||||
| return cde.RandomVerticalFlipWithBBoxOperation(self.prob) | return cde.RandomVerticalFlipWithBBoxOperation(self.prob) | ||||
| class BoundingBoxAugment(TensorOperation): | |||||
| class BoundingBoxAugment(ImageTensorOperation): | |||||
| """ | """ | ||||
| Apply a given image transform on a random selection of bounding box regions of a given image. | Apply a given image transform on a random selection of bounding box regions of a given image. | ||||
| @@ -760,7 +760,7 @@ class BoundingBoxAugment(TensorOperation): | |||||
| return cde.BoundingBoxAugmentOperation(transform, self.ratio) | return cde.BoundingBoxAugmentOperation(transform, self.ratio) | ||||
| class Resize(TensorOperation): | |||||
| class Resize(ImageTensorOperation): | |||||
| """ | """ | ||||
| Resize the input image to the given size. | Resize the input image to the given size. | ||||
| @@ -816,7 +816,7 @@ class Resize(TensorOperation): | |||||
| return cde.ResizeOperation(self.size, DE_C_INTER_MODE[self.interpolation]) | return cde.ResizeOperation(self.size, DE_C_INTER_MODE[self.interpolation]) | ||||
| class ResizeWithBBox(TensorOperation): | |||||
| class ResizeWithBBox(ImageTensorOperation): | |||||
| """ | """ | ||||
| Resize the input image to the given size and adjust bounding boxes accordingly. | Resize the input image to the given size and adjust bounding boxes accordingly. | ||||
| @@ -855,7 +855,7 @@ class ResizeWithBBox(TensorOperation): | |||||
| return cde.ResizeWithBBoxOperation(size, DE_C_INTER_MODE[self.interpolation]) | return cde.ResizeWithBBoxOperation(size, DE_C_INTER_MODE[self.interpolation]) | ||||
| class RandomResizedCropWithBBox(TensorOperation): | |||||
| class RandomResizedCropWithBBox(ImageTensorOperation): | |||||
| """ | """ | ||||
| Crop the input image to a random size and aspect ratio and adjust bounding boxes accordingly. | Crop the input image to a random size and aspect ratio and adjust bounding boxes accordingly. | ||||
| @@ -904,7 +904,7 @@ class RandomResizedCropWithBBox(TensorOperation): | |||||
| DE_C_INTER_MODE[self.interpolation], self.max_attempts) | DE_C_INTER_MODE[self.interpolation], self.max_attempts) | ||||
| class RandomResizedCrop(TensorOperation): | |||||
| class RandomResizedCrop(ImageTensorOperation): | |||||
| """ | """ | ||||
| Crop the input image to a random size and aspect ratio. | Crop the input image to a random size and aspect ratio. | ||||
| @@ -954,7 +954,7 @@ class RandomResizedCrop(TensorOperation): | |||||
| self.max_attempts) | self.max_attempts) | ||||
| class CenterCrop(TensorOperation): | |||||
| class CenterCrop(ImageTensorOperation): | |||||
| """ | """ | ||||
| Crops the input image at the center to the given size. | Crops the input image at the center to the given size. | ||||
| @@ -984,7 +984,7 @@ class CenterCrop(TensorOperation): | |||||
| return cde.CenterCropOperation(self.size) | return cde.CenterCropOperation(self.size) | ||||
| class RandomColor(TensorOperation): | |||||
| class RandomColor(ImageTensorOperation): | |||||
| """ | """ | ||||
| Adjust the color of the input image by a fixed or random degree. | Adjust the color of the input image by a fixed or random degree. | ||||
| This operation works only with 3-channel color images. | This operation works only with 3-channel color images. | ||||
| @@ -1008,7 +1008,7 @@ class RandomColor(TensorOperation): | |||||
| return cde.RandomColorOperation(*self.degrees) | return cde.RandomColorOperation(*self.degrees) | ||||
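For orientation, a minimal Python-side sketch of using this operation; the import path and the degrees range are illustrative assumptions, not part of this change:

    import mindspore.dataset.vision.c_transforms as c_vision

    # degrees is a (min, max) factor range sampled per image; a factor of 1.0 leaves
    # the 3-channel color image unchanged, smaller/larger factors reduce/boost color.
    random_color_op = c_vision.RandomColor(degrees=(0.5, 1.5))
    # dataset = dataset.map(operations=[random_color_op], input_columns=["image"])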
| class RandomColorAdjust(TensorOperation): | |||||
| class RandomColorAdjust(ImageTensorOperation): | |||||
| """ | """ | ||||
| Randomly adjust the brightness, contrast, saturation, and hue of the input image. | Randomly adjust the brightness, contrast, saturation, and hue of the input image. | ||||
| @@ -1060,7 +1060,7 @@ class RandomColorAdjust(TensorOperation): | |||||
| return cde.RandomColorAdjustOperation(self.brightness, self.contrast, self.saturation, self.hue) | return cde.RandomColorAdjustOperation(self.brightness, self.contrast, self.saturation, self.hue) | ||||
| class RandomRotation(TensorOperation): | |||||
| class RandomRotation(ImageTensorOperation): | |||||
| """ | """ | ||||
| Rotate the input image by a random angle. | Rotate the input image by a random angle. | ||||
| @@ -1116,7 +1116,7 @@ class RandomRotation(TensorOperation): | |||||
| return cde.RandomRotationOperation(degrees, interpolation, expand, center, fill_value) | return cde.RandomRotationOperation(degrees, interpolation, expand, center, fill_value) | ||||
| class Rescale(TensorOperation): | |||||
| class Rescale(ImageTensorOperation): | |||||
| """ | """ | ||||
| Tensor operation to rescale the input image. | Tensor operation to rescale the input image. | ||||
| @@ -1155,7 +1155,7 @@ class Rescale(TensorOperation): | |||||
| return cde.RescaleOperation(self.rescale, self.shift) | return cde.RescaleOperation(self.rescale, self.shift) | ||||
| class RandomResize(TensorOperation): | |||||
| class RandomResize(ImageTensorOperation): | |||||
| """ | """ | ||||
| Tensor operation to resize the input image using a randomly selected interpolation mode. | Tensor operation to resize the input image using a randomly selected interpolation mode. | ||||
| @@ -1187,7 +1187,7 @@ class RandomResize(TensorOperation): | |||||
| return cde.RandomResizeOperation(size) | return cde.RandomResizeOperation(size) | ||||
| class RandomResizeWithBBox(TensorOperation): | |||||
| class RandomResizeWithBBox(ImageTensorOperation): | |||||
| """ | """ | ||||
| Tensor operation to resize the input image using a randomly selected interpolation mode and adjust | Tensor operation to resize the input image using a randomly selected interpolation mode and adjust | ||||
| bounding boxes accordingly. | bounding boxes accordingly. | ||||
| @@ -1220,7 +1220,7 @@ class RandomResizeWithBBox(TensorOperation): | |||||
| return cde.RandomResizeWithBBoxOperation(size) | return cde.RandomResizeWithBBoxOperation(size) | ||||
| class HWC2CHW(TensorOperation): | |||||
| class HWC2CHW(ImageTensorOperation): | |||||
| """ | """ | ||||
| Transpose the input image from shape (H, W, C) to shape (C, H, W). | Transpose the input image from shape (H, W, C) to shape (C, H, W). | ||||
| @@ -1253,7 +1253,7 @@ class HWC2CHW(TensorOperation): | |||||
| return cde.HwcToChwOperation() | return cde.HwcToChwOperation() | ||||
| class RandomCropDecodeResize(TensorOperation): | |||||
| class RandomCropDecodeResize(ImageTensorOperation): | |||||
| """ | """ | ||||
| Equivalent to RandomResizedCrop, but crops before decoding. | Equivalent to RandomResizedCrop, but crops before decoding. | ||||
| @@ -1305,7 +1305,7 @@ class RandomCropDecodeResize(TensorOperation): | |||||
| self.max_attempts) | self.max_attempts) | ||||
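As a rough illustration of the "crop before decode" behaviour described above, a hedged Python sketch; the import path and parameter values are assumptions:

    import mindspore.dataset.vision.c_transforms as c_vision

    # Only the randomly chosen crop region of the raw JPEG is decoded and then resized,
    # which avoids decoding the full image as Decode() followed by RandomResizedCrop() would.
    crop_decode_resize = c_vision.RandomCropDecodeResize(size=(224, 224),
                                                         scale=(0.08, 1.0),
                                                         ratio=(3. / 4., 4. / 3.))
    # dataset = dataset.map(operations=[crop_decode_resize], input_columns=["image"])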
| class Pad(TensorOperation): | |||||
| class Pad(ImageTensorOperation): | |||||
| """ | """ | ||||
| Pads the image according to padding parameters. | Pads the image according to padding parameters. | ||||
| @@ -1370,7 +1370,7 @@ class Pad(TensorOperation): | |||||
| return img.as_array() | return img.as_array() | ||||
| class UniformAugment(TensorOperation): | |||||
| class UniformAugment(ImageTensorOperation): | |||||
| """ | """ | ||||
| Tensor operation to randomly select and apply augmentations from a list of transforms. | Tensor operation to randomly select and apply augmentations from a list of transforms. | ||||
| @@ -1407,7 +1407,7 @@ class UniformAugment(TensorOperation): | |||||
| return cde.UniformAugOperation(transforms, self.num_ops) | return cde.UniformAugOperation(transforms, self.num_ops) | ||||
| class RandomSelectSubpolicy(TensorOperation): | |||||
| class RandomSelectSubpolicy(ImageTensorOperation): | |||||
| """ | """ | ||||
| Choose a random sub-policy from a list to be applied on the input image. A sub-policy is a list of tuples | Choose a random sub-policy from a list to be applied on the input image. A sub-policy is a list of tuples | ||||
| (op, prob), where op is a TensorOp operation and prob is the probability that this op will be applied. Once | (op, prob), where op is a TensorOp operation and prob is the probability that this op will be applied. Once | ||||
| @@ -1446,7 +1446,7 @@ class RandomSelectSubpolicy(TensorOperation): | |||||
| return cde.RandomSelectSubpolicyOperation(policy) | return cde.RandomSelectSubpolicyOperation(policy) | ||||
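To make the (op, prob) structure of a sub-policy concrete, a minimal hedged sketch; the transforms and probabilities below are illustrative only:

    import mindspore.dataset.vision.c_transforms as c_vision

    # Each inner list is one sub-policy; one sub-policy is chosen at random per image,
    # then each of its ops is applied independently with the given probability.
    policy = [
        [(c_vision.RandomHorizontalFlip(0.5), 0.8), (c_vision.RandomColorAdjust(brightness=0.5), 0.5)],
        [(c_vision.RandomVerticalFlip(0.5), 0.7)],
    ]
    select_subpolicy_op = c_vision.RandomSelectSubpolicy(policy)
    # dataset = dataset.map(operations=[select_subpolicy_op], input_columns=["image"])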
| class SoftDvppDecodeResizeJpeg(TensorOperation): | |||||
| class SoftDvppDecodeResizeJpeg(ImageTensorOperation): | |||||
| """ | """ | ||||
| Tensor operation to decode and resize a JPEG image using the simulation algorithm of the | Tensor operation to decode and resize a JPEG image using the simulation algorithm of the | ||||
| Ascend series chip DVPP module. | Ascend series chip DVPP module. | ||||
| @@ -1486,7 +1486,7 @@ class SoftDvppDecodeResizeJpeg(TensorOperation): | |||||
| return cde.SoftDvppDecodeResizeJpegOperation(self.size) | return cde.SoftDvppDecodeResizeJpegOperation(self.size) | ||||
| class SoftDvppDecodeRandomCropResizeJpeg(TensorOperation): | |||||
| class SoftDvppDecodeRandomCropResizeJpeg(ImageTensorOperation): | |||||
| """ | """ | ||||
| Tensor operation to decode, randomly crop, and resize a JPEG image using the simulation algorithm of the | Tensor operation to decode, randomly crop, and resize a JPEG image using the simulation algorithm of the | ||||
| Ascend series chip DVPP module. | Ascend series chip DVPP module. | ||||
| @@ -1531,7 +1531,7 @@ class SoftDvppDecodeRandomCropResizeJpeg(TensorOperation): | |||||
| return cde.SoftDvppDecodeRandomCropResizeJpegOperation(self.size, self.scale, self.ratio, self.max_attempts) | return cde.SoftDvppDecodeRandomCropResizeJpegOperation(self.size, self.scale, self.ratio, self.max_attempts) | ||||
| class RandomSolarize(TensorOperation): | |||||
| class RandomSolarize(ImageTensorOperation): | |||||
| """ | """ | ||||
| Invert all pixel values above a threshold. | Invert all pixel values above a threshold. | ||||
| @@ -877,6 +877,229 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail) { | |||||
| EXPECT_EQ(jieba_tokenizer3, nullptr); | EXPECT_EQ(jieba_tokenizer3, nullptr); | ||||
| } | } | ||||
| TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) { | |||||
| // Testing the AddWord method of JiebaTokenizer when freq is not provided (default 0). | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord."; | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt"; | |||||
| std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; | |||||
| std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create jieba_tokenizer operation on ds | |||||
| std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer = | |||||
| text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp); | |||||
| EXPECT_NE(jieba_tokenizer, nullptr); | |||||
| // Add word with freq not provided (default 0) | |||||
| jieba_tokenizer->AddWord("男默女泪"); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, std::shared_ptr<Tensor>> row; | |||||
| iter->GetNextRow(&row); | |||||
| std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"}; | |||||
| uint64_t i = 0; | |||||
| while (row.size() != 0) { | |||||
| auto ind = row["text"]; | |||||
| std::shared_ptr<Tensor> expected_tensor; | |||||
| Tensor::CreateFromVector(expected, &expected_tensor); | |||||
| EXPECT_EQ(*ind, *expected_tensor); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| EXPECT_EQ(i, 1); | |||||
| // Manually terminate the pipeline | |||||
| iter->Stop(); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) { | |||||
| // Testing the AddWord method of JiebaTokenizer when freq is explicitly set to 0. | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord1."; | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt"; | |||||
| std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; | |||||
| std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create jieba_tokenizer operation on ds | |||||
| std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer = | |||||
| text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp); | |||||
| EXPECT_NE(jieba_tokenizer, nullptr); | |||||
| // Add word with freq explicitly set to 0 | |||||
| jieba_tokenizer->AddWord("男默女泪", 0); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, std::shared_ptr<Tensor>> row; | |||||
| iter->GetNextRow(&row); | |||||
| std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"}; | |||||
| uint64_t i = 0; | |||||
| while (row.size() != 0) { | |||||
| auto ind = row["text"]; | |||||
| std::shared_ptr<Tensor> expected_tensor; | |||||
| Tensor::CreateFromVector(expected, &expected_tensor); | |||||
| EXPECT_EQ(*ind, *expected_tensor); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| EXPECT_EQ(i, 1); | |||||
| // Manually terminate the pipeline | |||||
| iter->Stop(); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) { | |||||
| // Testing the parameter AddWord of JiebaTokenizer when the freq is 10. | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord2."; | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt"; | |||||
| std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; | |||||
| std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create jieba_tokenizer operation on ds | |||||
| std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer = | |||||
| text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp); | |||||
| EXPECT_NE(jieba_tokenizer, nullptr); | |||||
| // Add word with freq 10 | |||||
| jieba_tokenizer->AddWord("男默女泪", 10); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, std::shared_ptr<Tensor>> row; | |||||
| iter->GetNextRow(&row); | |||||
| std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"}; | |||||
| uint64_t i = 0; | |||||
| while (row.size() != 0) { | |||||
| auto ind = row["text"]; | |||||
| std::shared_ptr<Tensor> expected_tensor; | |||||
| Tensor::CreateFromVector(expected, &expected_tensor); | |||||
| EXPECT_EQ(*ind, *expected_tensor); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| EXPECT_EQ(i, 1); | |||||
| // Manually terminate the pipeline | |||||
| iter->Stop(); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) { | |||||
| // Testing the AddWord method of JiebaTokenizer when freq is 20000, which changes the segmentation result. | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord3."; | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt"; | |||||
| std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; | |||||
| std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create jieba_tokenizer operation on ds | |||||
| std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer = | |||||
| text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp); | |||||
| EXPECT_NE(jieba_tokenizer, nullptr); | |||||
| // Add word with freq 20000 | |||||
| jieba_tokenizer->AddWord("江大桥", 20000); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, std::shared_ptr<Tensor>> row; | |||||
| iter->GetNextRow(&row); | |||||
| std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"}; | |||||
| uint64_t i = 0; | |||||
| while (row.size() != 0) { | |||||
| auto ind = row["text"]; | |||||
| std::shared_ptr<Tensor> expected_tensor; | |||||
| Tensor::CreateFromVector(expected, &expected_tensor); | |||||
| EXPECT_EQ(*ind, *expected_tensor); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| EXPECT_EQ(i, 1); | |||||
| // Manually terminate the pipeline | |||||
| iter->Stop(); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) { | |||||
| // Testing the AddWord method of JiebaTokenizer with invalid parameters. | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWordFail."; | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; | |||||
| std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; | |||||
| std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Testing AddWord with an empty word parameter | |||||
| std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer = | |||||
| text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp); | |||||
| EXPECT_NE(jieba_tokenizer, nullptr); | |||||
| EXPECT_NE(jieba_tokenizer->AddWord("", 10), Status::OK()); | |||||
| // Testing AddWord with a negative freq parameter | |||||
| std::shared_ptr<text::JiebaTokenizerOperation> jieba_tokenizer1 = | |||||
| text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp); | |||||
| EXPECT_NE(jieba_tokenizer1, nullptr); | |||||
| EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK()); | |||||
| } | |||||
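The Python-side counterpart of the AddWord cases above might look roughly like the following sketch; the file paths are placeholders and the add_word call is assumed to mirror the C++ AddWord used in these tests:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    HMM_FILE = "jiebadict/hmm_model.utf8"   # placeholder dictionary paths
    MP_FILE = "jiebadict/jieba.dict.utf8"

    tokenizer = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP)
    # A high enough frequency changes how the surrounding sentence is segmented,
    # mirroring the 20000-frequency case exercised in TestJiebaTokenizerAddWord3.
    tokenizer.add_word("江大桥", 20000)

    dataset = ds.TextFileDataset("testJiebaDataset/6.txt", shuffle=False)
    dataset = dataset.map(operations=tokenizer, input_columns=["text"])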
| TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) { | TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) { | ||||
| // Testing the parameter of SlidingWindow interface when the axis is 0. | // Testing the parameter of SlidingWindow interface when the axis is 0. | ||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess."; | MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess."; | ||||
| @@ -166,7 +166,8 @@ def test_lookup_cast_type(): | |||||
| assert test_config("unk") == np.dtype("int32") | assert test_config("unk") == np.dtype("int32") | ||||
| # test exception, data_type isn't the correct type | # test exception, data_type isn't the correct type | ||||
| assert "tldr is not of type (<class 'mindspore._c_expression.typing.Type'>,)" in test_config("unk", "tldr") | assert "tldr is not of type (<class 'mindspore._c_expression.typing.Type'>,)" in test_config("unk", "tldr") | ||||
| assert "Lookup doesn't support string to string lookup" in test_config("w1", mstype.string) | |||||
| assert "Lookup does not support a string to string mapping, data_type can only be numeric." in \ | |||||
| test_config("w1", mstype.string) | |||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
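As the updated error message states, Lookup only maps strings to numeric ids; a minimal hedged sketch of valid usage, with an illustrative vocabulary:

    import mindspore.dataset.text as text
    import mindspore.common.dtype as mstype

    vocab = text.Vocab.from_list(["w1", "w2", "w3"], special_tokens=["<unk>"], special_first=True)
    # data_type must be numeric; passing mstype.string triggers the error asserted above.
    lookup_op = text.Lookup(vocab, unknown_token="<unk>", data_type=mstype.int32)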