Merge pull request !23008 from Isaac/FilterWikipediaXML
| @@ -65,6 +65,17 @@ PYBIND_REGISTER(CaseFoldOperation, 1, ([](const py::module *m) { | |||
| })); | |||
| })); | |||
// Expose text::FilterWikipediaXMLOperation to Python as "FilterWikipediaXMLOperation".
// The op takes no parameters; ValidateParams() is still invoked so that any future
// parameter checks surface as Python exceptions at construction time.
PYBIND_REGISTER(FilterWikipediaXMLOperation, 1, ([](const py::module *m) {
                  (void)py::class_<text::FilterWikipediaXMLOperation, TensorOperation,
                                   std::shared_ptr<text::FilterWikipediaXMLOperation>>(*m,
                                                                                       "FilterWikipediaXMLOperation")
                    .def(py::init([]() {
                      auto filter_wikipedia_xml = std::make_shared<text::FilterWikipediaXMLOperation>();
                      THROW_IF_ERROR(filter_wikipedia_xml->ValidateParams());
                      return filter_wikipedia_xml;
                    }));
                }));
| PYBIND_REGISTER( | |||
| NormalizeUTF8Operation, 1, ([](const py::module *m) { | |||
| (void)py::class_<text::NormalizeUTF8Operation, TensorOperation, std::shared_ptr<text::NormalizeUTF8Operation>>( | |||
| @@ -105,6 +105,11 @@ std::shared_ptr<TensorOperation> BertTokenizer::Parse() { | |||
| CaseFold::CaseFold() = default; | |||
| std::shared_ptr<TensorOperation> CaseFold::Parse() { return std::make_shared<CaseFoldOperation>(); } | |||
| // FilterWikipediaXML | |||
| FilterWikipediaXML::FilterWikipediaXML() {} | |||
| std::shared_ptr<TensorOperation> FilterWikipediaXML::Parse() { return std::make_shared<FilterWikipediaXMLOperation>(); } | |||
| #endif | |||
| // JiebaTokenizer | |||
| @@ -173,6 +173,21 @@ class MS_API CaseFold final : public TensorTransform { | |||
| /// \return Shared pointer to the TensorOperation object. | |||
| std::shared_ptr<TensorOperation> Parse() override; | |||
| }; | |||
/// \brief Filter Wikipedia XML dumps to "clean" text consisting only of lowercase
///     letters (a-z, converted from A-Z) and spaces (never consecutive).
class FilterWikipediaXML final : public TensorTransform {
 public:
  /// \brief Constructor.
  FilterWikipediaXML();

  /// \brief Destructor
  ~FilterWikipediaXML() = default;

 protected:
  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to the TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;
};
| #endif | |||
| /// \brief Tokenize a Chinese string into words based on the dictionary. | |||
| @@ -125,6 +125,7 @@ constexpr char kVerticalFlipOp[] = "VerticalFlipOp"; | |||
| constexpr char kBasicTokenizerOp[] = "BasicTokenizerOp"; | |||
| constexpr char kBertTokenizerOp[] = "BertTokenizerOp"; | |||
| constexpr char kCaseFoldOp[] = "CaseFoldOp"; | |||
| constexpr char kFilterWikipediaXMLOp[] = "FilterWikipediaXMLOp"; | |||
| constexpr char kJiebaTokenizerOp[] = "JiebaTokenizerOp"; | |||
| constexpr char kLookupOp[] = "LookupOp"; | |||
| constexpr char kNgramOp[] = "NgramOp"; | |||
| @@ -21,6 +21,7 @@ | |||
| #include "minddata/dataset/text/kernels/basic_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/bert_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/case_fold_op.h" | |||
| #include "minddata/dataset/text/kernels/filter_wikipedia_xml_op.h" | |||
| #endif | |||
| #include "minddata/dataset/text/kernels/jieba_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/lookup_op.h" | |||
| @@ -137,6 +138,16 @@ std::shared_ptr<TensorOp> CaseFoldOperation::Build() { | |||
| std::shared_ptr<CaseFoldOp> tensor_op = std::make_shared<CaseFoldOp>(); | |||
| return tensor_op; | |||
| } | |||
| // FilterWikipediaXMLOperation | |||
| FilterWikipediaXMLOperation::FilterWikipediaXMLOperation() {} | |||
| Status FilterWikipediaXMLOperation::ValidateParams() { return Status::OK(); } | |||
| std::shared_ptr<TensorOp> FilterWikipediaXMLOperation::Build() { | |||
| std::shared_ptr<FilterWikipediaXMLOp> tensor_op = std::make_shared<FilterWikipediaXMLOp>(); | |||
| return tensor_op; | |||
| } | |||
| #endif | |||
| // JiebaTokenizerOperation | |||
| @@ -37,6 +37,7 @@ namespace text { | |||
| constexpr char kBasicTokenizerOperation[] = "BasicTokenizer"; | |||
| constexpr char kBertTokenizerOperation[] = "BertTokenizer"; | |||
| constexpr char kCaseFoldOperation[] = "CaseFold"; | |||
| constexpr char kFilterWikipediaXMLOperation[] = "FilterWikipediaXML"; | |||
| constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer"; | |||
| constexpr char kLookupOperation[] = "Lookup"; | |||
| constexpr char kNgramOperation[] = "Ngram"; | |||
| @@ -116,6 +117,19 @@ class CaseFoldOperation : public TensorOperation { | |||
| std::string Name() const override { return kCaseFoldOperation; } | |||
| }; | |||
// IR node for the FilterWikipediaXML text operation.  The op is parameter-less,
// so ValidateParams() always succeeds and Build() simply constructs the kernel.
class FilterWikipediaXMLOperation : public TensorOperation {
 public:
  FilterWikipediaXMLOperation();

  ~FilterWikipediaXMLOperation() = default;

  // Create the runtime FilterWikipediaXMLOp kernel.
  std::shared_ptr<TensorOp> Build() override;

  Status ValidateParams() override;

  std::string Name() const override { return kFilterWikipediaXMLOperation; }
};
| #endif | |||
| class JiebaTokenizerOperation : public TensorOperation { | |||
| @@ -5,6 +5,7 @@ if(NOT (CMAKE_SYSTEM_NAME MATCHES "Windows")) | |||
| basic_tokenizer_op.cc | |||
| bert_tokenizer_op.cc | |||
| case_fold_op.cc | |||
| filter_wikipedia_xml_op.cc | |||
| normalize_utf8_op.cc | |||
| regex_replace_op.cc | |||
| regex_tokenizer_op.cc | |||
| @@ -0,0 +1,117 @@ | |||
| /** | |||
| * Copyright 2022 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "minddata/dataset/text/kernels/filter_wikipedia_xml_op.h" | |||
| #include <memory> | |||
| #include <string_view> | |||
| #include <vector> | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| std::map<icu::UnicodeString, icu::UnicodeString> patterns = {{R"(<.*>)", ""}, | |||
| {R"(&)", "&"}, | |||
| {"<", "<"}, | |||
| {">", ">"}, | |||
| {R"(<ef[^<]*<\/ef>)", ""}, | |||
| {"<[^>]*>", ""}, | |||
| {R"(\[http:[^] ]*)", "["}, | |||
| {R"(\|thumb)", ""}, | |||
| {R"(\|left)", ""}, | |||
| {R"(\|right)", ""}, | |||
| {R"(\|\d+px)", ""}, | |||
| {R"(\[\[image:[^\[\]]*\|)", ""}, | |||
| {R"(\[\[category:([^|\]]*)[^]]*\]\])", "[[$1]]"}, | |||
| {R"(\[\[[a-z\-]*:[^\]]*\]\])", ""}, | |||
| {R"(\[\[[^\|\]]*\|)", "[["}, | |||
| {R"(\{\{[^\}]*\}\})", ""}, | |||
| {R"(\{[^\}]*\})", ""}, | |||
| {R"(\[)", ""}, | |||
| {R"(\])", ""}, | |||
| {"&[^;]*;", " "}, | |||
| {"A", "a"}, | |||
| {"B", "b"}, | |||
| {"C", "c"}, | |||
| {"D", "d"}, | |||
| {"E", "e"}, | |||
| {"F", "f"}, | |||
| {"G", "g"}, | |||
| {"H", "h"}, | |||
| {"I", "i"}, | |||
| {"J", "j"}, | |||
| {"K", "k"}, | |||
| {"L", "l"}, | |||
| {"M", "m"}, | |||
| {"N", "n"}, | |||
| {"O", "o"}, | |||
| {"P", "p"}, | |||
| {"Q", "q"}, | |||
| {"R", ""}, | |||
| {"S", "s"}, | |||
| {"T", "t"}, | |||
| {"U", "u"}, | |||
| {"V", "v"}, | |||
| {"W", "w"}, | |||
| {"X", "x"}, | |||
| {"Y", "y"}, | |||
| {"Z", "z"}, | |||
| {"0", " zero "}, | |||
| {"1", " one "}, | |||
| {"2", " two "}, | |||
| {"3", " three "}, | |||
| {"4", " four "}, | |||
| {"5", " five "}, | |||
| {"6", " six "}, | |||
| {"7", " seven "}, | |||
| {"8", " eight "}, | |||
| {"9", " nine "}, | |||
| {R"([^a-z\n]+)", " "}, | |||
| {R"(\n )", ""}, | |||
| {R"(\s+)", " "}, | |||
| {R"(\n\s*\n)", R"(\n)"}}; | |||
| Status FilterWikipediaXMLOp::FilterWikipediaXML(const std::string_view &text, std::string *out) const { | |||
| if (((text).find("#redirect") == -1) && ((text).find("#REDIRECT") == -1)) { | |||
| (*out) = text; | |||
| UErrorCode icu_error = U_ZERO_ERROR; | |||
| for (auto pattern_iter = patterns.begin(); pattern_iter != patterns.end(); pattern_iter++) { | |||
| icu::RegexMatcher matcher(pattern_iter->first, 0, icu_error); | |||
| CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), | |||
| "RegexReplace: create icu RegexMatcher failed, you may input an error pattern."); | |||
| CHECK_FAIL_RETURN_UNEXPECTED((out != nullptr), "FilterWikipediaXML: icu init failed."); | |||
| icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(*out); | |||
| matcher.reset(unicode_text); | |||
| icu::UnicodeString unicode_out = matcher.replaceAll(pattern_iter->second, icu_error); | |||
| CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "FilterWikipediaXML: FilterWikipediaXML failed."); | |||
| (*out) = ""; | |||
| unicode_out.trim().toUTF8String(*out); | |||
| } | |||
| } else { | |||
| (*out) = ""; | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| Status FilterWikipediaXMLOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) { | |||
| IO_CHECK(input, output); | |||
| CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "RegexReplace: input is not of type string."); | |||
| std::vector<std::string> strs(input->Size()); | |||
| auto iter = input->begin<std::string_view>(); | |||
| RETURN_IF_NOT_OK(FilterWikipediaXML(*iter, &strs[0])); | |||
| RETURN_IF_NOT_OK(Tensor::CreateFromVector(strs, input->shape(), output)); | |||
| return Status::OK(); | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,49 @@ | |||
| /** | |||
| * Copyright 2022 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_FILTER_WIKIPEDIA_XML_OP_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_FILTER_WIKIPEDIA_XML_OP_H_ | |||
| #include <map> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "unicode/errorcode.h" | |||
| #include "unicode/regex.h" | |||
| #include "unicode/utypes.h" | |||
| #include "minddata/dataset/core/tensor.h" | |||
| #include "minddata/dataset/kernels/tensor_op.h" | |||
| #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
/// \brief Kernel that filters Wikipedia XML dump text down to "clean" lowercase text.
class FilterWikipediaXMLOp : public TensorOp {
 public:
  /// \brief Constructor.
  FilterWikipediaXMLOp() {}

  /// \brief Destructor.
  ~FilterWikipediaXMLOp() override = default;

  /// \brief Filter the string elements of the input tensor.
  /// \param[in] input Input tensor (expected to contain strings).
  /// \param[out] output Tensor holding the filtered strings.
  /// \return Status code.
  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

  std::string Name() const override { return kFilterWikipediaXMLOp; }

 private:
  /// \brief Apply the wikifil-style regex substitutions to one string.
  /// \param[in] text Input text.
  /// \param[out] out Cleaned output text; emptied for redirect pages.
  /// \return Status code.
  Status FilterWikipediaXML(const std::string_view &text, std::string *out) const;
};
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_FILTER_WIKIPEDIA_XML_OP_H_ | |||
| @@ -758,6 +758,22 @@ if platform.system().lower() != 'windows': | |||
| return cde.CaseFoldOperation() | |||
class FilterWikipediaXML(TextTensorOperation):
    """
    Filter Wikipedia XML dumps to "clean" text consisting only of lowercase letters (a-z, converted from A-Z),
    and spaces (never consecutive).

    Note:
        FilterWikipediaXML is not supported on Windows platform yet.

    Examples:
        >>> replace_op = text.FilterWikipediaXML()
        >>> text_file_dataset = text_file_dataset.map(operations=replace_op)
    """

    def parse(self):
        # Build the C++ IR node; the operation takes no parameters.
        return cde.FilterWikipediaXMLOperation()
| class NormalizeUTF8(TextTensorOperation): | |||
| """ | |||
| Apply normalize operation on UTF-8 string tensor. | |||
| @@ -769,6 +769,55 @@ TEST_F(MindDataTestPipeline, TestCaseFoldSuccess) { | |||
| iter->Stop(); | |||
| } | |||
| /// Feature: FilterWikipediaXML | |||
| /// Description: test FilterWikipediaXML in pipeline mode | |||
| /// Expectation: the data is processed successfully | |||
| TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) { | |||
| // Testing the parameter of FilterWikipediaXML interface . | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFilterWikipediaXMLSuccess."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/2.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create filter_wikipedia_xml operation on ds | |||
| std::shared_ptr<TensorTransform> filter_wikipedia_xml = std::make_shared<text::FilterWikipediaXML>(); | |||
| EXPECT_NE(filter_wikipedia_xml, nullptr); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({filter_wikipedia_xml}, {"text"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| std::vector<std::string> expected = {"welcome to beijing","",""}; | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| auto ind = row["text"]; | |||
| std::shared_ptr<Tensor> de_expected_tensor; | |||
| ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor)); | |||
| mindspore::MSTensor ms_expected_tensor = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||
| EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor); | |||
| ASSERT_OK(iter->GetNextRow(&row)); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 3); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess) { | |||
| // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is false. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess."; | |||
| @@ -254,6 +254,24 @@ TEST_F(MindDataTestExecute, TestCrop) { | |||
| EXPECT_EQ(image.Shape()[1], 15); | |||
| } | |||
| /// Feature: FilterWikipediaXMLEager | |||
| /// Description: Test FilterWikipediaXML's Eager mode | |||
| /// Expectation: Run successfully | |||
| TEST_F(MindDataTestExecute, TestFilterWikipediaXMLEager) { | |||
| // Test FilterWikipediaXML's Eager mode | |||
| MS_LOG(INFO) << "Doing MindDataTestExecute-TestFilterWikipediaXMLEager."; | |||
| std::vector<std::string> origin = {"中国","Wcdma","Pang","Yuchao"}; | |||
| TensorShape input_shape({2, 2}); | |||
| std::shared_ptr<Tensor> de_tensor; | |||
| Tensor::CreateFromVector(origin, input_shape, &de_tensor); | |||
| std::shared_ptr<TensorTransform> filter = std::make_shared<text::FilterWikipediaXML>(); | |||
| auto input = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor)); | |||
| mindspore::dataset::Execute Transform({filter}); | |||
| Status s = Transform(input, &input); | |||
| ASSERT_TRUE(s.IsOk()); | |||
| } | |||
| TEST_F(MindDataTestExecute, TestFrequencyMasking) { | |||
| MS_LOG(INFO) << "Doing MindDataTestExecute-TestFrequencyMasking."; | |||
| std::shared_ptr<Tensor> input_tensor_; | |||
| @@ -0,0 +1,3 @@ | |||
| Welcome to Beijing! | |||
| 中国 | |||
| #redirect | |||
| @@ -0,0 +1,71 @@ | |||
| # Copyright 2022 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Testing FilterWikipediaXML op | |||
| """ | |||
| import numpy as np | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.text.transforms as a_c_trans | |||
| from mindspore import log as logger | |||
def count_unequal_element(data_expected, data_me):
    """Assert that two numpy arrays have identical shape and contents.

    Args:
        data_expected (numpy.ndarray): the expected values.
        data_me (numpy.ndarray): the values produced by the op under test.
    """
    assert data_expected.shape == data_me.shape
    # `data_expected == data_me` on a multi-element array yields an array of
    # booleans, which raises ValueError when used directly as an assert
    # condition; reduce it explicitly so comparisons work for any size.
    assert (data_expected == data_me).all()
def test_filter_wikipedia_xml_eager():
    """
    Feature: FilterWikipediaXML
    Description: test FilterWikipediaXML in eager mode
    Expectation: the data is processed successfully
    """
    logger.info("test FilterWikipediaXML in eager mode")
    # Raw text and the cleaned text we expect back from the op.
    original = np.array(["Welcome to China"], dtype=np.unicode_)
    expected = np.array(["welcome to china"], dtype=np.unicode_)
    op = a_c_trans.FilterWikipediaXML()
    count_unequal_element(expected, op(original))
def test_filter_wikipedia_xml_pipeline():
    """
    Feature: FilterWikipediaXML
    Description: test FilterWikipediaXML in pipeline mode
    Expectation: the data is processed successfully
    """
    logger.info("test FilterWikipediaXML in pipeline mode")
    # Original text
    input02 = np.array(["Welcome to China", "中国", "ABC"], dtype=np.unicode_)
    # Expect text
    expected = np.array(["welcome to china", "", "abc"], dtype=np.string_)
    dataset = ds.NumpySlicesDataset(input02, ["text"], shuffle=False)
    filter_wikipedia_xml_op = a_c_trans.FilterWikipediaXML()
    # Filtered waveform by filter_wikipedia_xml
    dataset = dataset.map(input_columns=["text"], operations=filter_wikipedia_xml_op, num_parallel_workers=8)
    i = 0
    for data in dataset.create_dict_iterator(output_numpy=True):
        count_unequal_element(np.array(expected[i]), data['text'])
        i += 1
    # Guard against the pipeline silently yielding fewer rows than expected;
    # without this, an empty iterator would pass the test vacuously.
    assert i == len(expected)
if __name__ == "__main__":
    # Allow running this test module directly without pytest.
    test_filter_wikipedia_xml_eager()
    test_filter_wikipedia_xml_pipeline()