Browse Source

!23008 [assistant][ops][FilterWikipediaXML]

Merge pull request !23008 from Isaac/FilterWikipediaXML
feature/build-system-rewrite
i-robot Gitee 4 years ago
parent
commit
02d63d7ad9
No known key found for this signature in database GPG Key ID: 173E9B9CA92EEF8F
14 changed files with 381 additions and 0 deletions
  1. +11
    -0
      mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc
  2. +5
    -0
      mindspore/ccsrc/minddata/dataset/api/text.cc
  3. +15
    -0
      mindspore/ccsrc/minddata/dataset/include/dataset/text.h
  4. +1
    -0
      mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
  5. +11
    -0
      mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc
  6. +14
    -0
      mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.h
  7. +1
    -0
      mindspore/ccsrc/minddata/dataset/text/kernels/CMakeLists.txt
  8. +117
    -0
      mindspore/ccsrc/minddata/dataset/text/kernels/filter_wikipedia_xml_op.cc
  9. +49
    -0
      mindspore/ccsrc/minddata/dataset/text/kernels/filter_wikipedia_xml_op.h
  10. +16
    -0
      mindspore/python/mindspore/dataset/text/transforms.py
  11. +49
    -0
      tests/ut/cpp/dataset/c_api_text_test.cc
  12. +18
    -0
      tests/ut/cpp/dataset/execute_test.cc
  13. +3
    -0
      tests/ut/data/dataset/testTokenizerData/2.txt
  14. +71
    -0
      tests/ut/python/dataset/test_filter_wikipedia_xml.py

+ 11
- 0
mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/text/kernels/ir/bindings.cc View File

@@ -65,6 +65,17 @@ PYBIND_REGISTER(CaseFoldOperation, 1, ([](const py::module *m) {
}));
}));

// Python binding: exposes the C++ IR node text::FilterWikipediaXMLOperation to the
// Python layer as "FilterWikipediaXMLOperation" (constructed with no arguments,
// mirroring the parameterless transform).
PYBIND_REGISTER(FilterWikipediaXMLOperation, 1, ([](const py::module *m) {
                  (void)py::class_<text::FilterWikipediaXMLOperation, TensorOperation,
                                   std::shared_ptr<text::FilterWikipediaXMLOperation>>(*m,
                                                                                      "FilterWikipediaXMLOperation")
                    .def(py::init([]() {
                      auto filter_wikipedia_xml = std::make_shared<text::FilterWikipediaXMLOperation>();
                      // Validation is a no-op today (no parameters), kept for uniformity with other ops.
                      THROW_IF_ERROR(filter_wikipedia_xml->ValidateParams());
                      return filter_wikipedia_xml;
                    }));
                }));

PYBIND_REGISTER(
NormalizeUTF8Operation, 1, ([](const py::module *m) {
(void)py::class_<text::NormalizeUTF8Operation, TensorOperation, std::shared_ptr<text::NormalizeUTF8Operation>>(


+ 5
- 0
mindspore/ccsrc/minddata/dataset/api/text.cc View File

@@ -105,6 +105,11 @@ std::shared_ptr<TensorOperation> BertTokenizer::Parse() {
CaseFold::CaseFold() = default;

std::shared_ptr<TensorOperation> CaseFold::Parse() { return std::make_shared<CaseFoldOperation>(); }

// FilterWikipediaXML
// Defaulted to match the style of the sibling transforms (e.g. CaseFold::CaseFold() = default).
FilterWikipediaXML::FilterWikipediaXML() = default;

// Converts the public TensorTransform facade into its IR node.
std::shared_ptr<TensorOperation> FilterWikipediaXML::Parse() { return std::make_shared<FilterWikipediaXMLOperation>(); }
#endif

// JiebaTokenizer


+ 15
- 0
mindspore/ccsrc/minddata/dataset/include/dataset/text.h View File

@@ -173,6 +173,21 @@ class MS_API CaseFold final : public TensorTransform {
/// \return Shared pointer to the TensorOperation object.
std::shared_ptr<TensorOperation> Parse() override;
};

/// \brief Filter Wikipedia XML dumps down to "clean" text (lowercase letters and single spaces).
class FilterWikipediaXML final : public TensorTransform {
 public:
  /// \brief Constructor.
  FilterWikipediaXML();

  /// \brief Destructor.
  ~FilterWikipediaXML() = default;

 protected:
  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to the TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;
};
#endif

/// \brief Tokenize a Chinese string into words based on the dictionary.


+ 1
- 0
mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h View File

@@ -125,6 +125,7 @@ constexpr char kVerticalFlipOp[] = "VerticalFlipOp";
constexpr char kBasicTokenizerOp[] = "BasicTokenizerOp";
constexpr char kBertTokenizerOp[] = "BertTokenizerOp";
constexpr char kCaseFoldOp[] = "CaseFoldOp";
constexpr char kFilterWikipediaXMLOp[] = "FilterWikipediaXMLOp";
constexpr char kJiebaTokenizerOp[] = "JiebaTokenizerOp";
constexpr char kLookupOp[] = "LookupOp";
constexpr char kNgramOp[] = "NgramOp";


+ 11
- 0
mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc View File

@@ -21,6 +21,7 @@
#include "minddata/dataset/text/kernels/basic_tokenizer_op.h"
#include "minddata/dataset/text/kernels/bert_tokenizer_op.h"
#include "minddata/dataset/text/kernels/case_fold_op.h"
#include "minddata/dataset/text/kernels/filter_wikipedia_xml_op.h"
#endif
#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
#include "minddata/dataset/text/kernels/lookup_op.h"
@@ -137,6 +138,16 @@ std::shared_ptr<TensorOp> CaseFoldOperation::Build() {
std::shared_ptr<CaseFoldOp> tensor_op = std::make_shared<CaseFoldOp>();
return tensor_op;
}

// FilterWikipediaXMLOperation
FilterWikipediaXMLOperation::FilterWikipediaXMLOperation() {}

Status FilterWikipediaXMLOperation::ValidateParams() { return Status::OK(); }

std::shared_ptr<TensorOp> FilterWikipediaXMLOperation::Build() {
std::shared_ptr<FilterWikipediaXMLOp> tensor_op = std::make_shared<FilterWikipediaXMLOp>();
return tensor_op;
}
#endif

// JiebaTokenizerOperation


+ 14
- 0
mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.h View File

@@ -37,6 +37,7 @@ namespace text {
constexpr char kBasicTokenizerOperation[] = "BasicTokenizer";
constexpr char kBertTokenizerOperation[] = "BertTokenizer";
constexpr char kCaseFoldOperation[] = "CaseFold";
constexpr char kFilterWikipediaXMLOperation[] = "FilterWikipediaXML";
constexpr char kJiebaTokenizerOperation[] = "JiebaTokenizer";
constexpr char kLookupOperation[] = "Lookup";
constexpr char kNgramOperation[] = "Ngram";
@@ -116,6 +117,19 @@ class CaseFoldOperation : public TensorOperation {

std::string Name() const override { return kCaseFoldOperation; }
};

// IR node for the FilterWikipediaXML text transform; carries no parameters.
class FilterWikipediaXMLOperation : public TensorOperation {
 public:
  FilterWikipediaXMLOperation();

  ~FilterWikipediaXMLOperation() = default;

  // Creates the runtime FilterWikipediaXMLOp kernel.
  std::shared_ptr<TensorOp> Build() override;

  // Always OK — there are no parameters to validate.
  Status ValidateParams() override;

  std::string Name() const override { return kFilterWikipediaXMLOperation; }
};
#endif

class JiebaTokenizerOperation : public TensorOperation {


+ 1
- 0
mindspore/ccsrc/minddata/dataset/text/kernels/CMakeLists.txt View File

@@ -5,6 +5,7 @@ if(NOT (CMAKE_SYSTEM_NAME MATCHES "Windows"))
basic_tokenizer_op.cc
bert_tokenizer_op.cc
case_fold_op.cc
filter_wikipedia_xml_op.cc
normalize_utf8_op.cc
regex_replace_op.cc
regex_tokenizer_op.cc


+ 117
- 0
mindspore/ccsrc/minddata/dataset/text/kernels/filter_wikipedia_xml_op.cc View File

@@ -0,0 +1,117 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/text/kernels/filter_wikipedia_xml_op.h"

#include <memory>
#include <string_view>
#include <utility>
#include <vector>

namespace mindspore {
namespace dataset {
std::map<icu::UnicodeString, icu::UnicodeString> patterns = {{R"(<.*>)", ""},
{R"(&amp;)", "&"},
{"&lt;", "<"},
{"&gt;", ">"},
{R"(<ef[^<]*<\/ef>)", ""},
{"<[^>]*>", ""},
{R"(\[http:[^] ]*)", "["},
{R"(\|thumb)", ""},
{R"(\|left)", ""},
{R"(\|right)", ""},
{R"(\|\d+px)", ""},
{R"(\[\[image:[^\[\]]*\|)", ""},
{R"(\[\[category:([^|\]]*)[^]]*\]\])", "[[$1]]"},
{R"(\[\[[a-z\-]*:[^\]]*\]\])", ""},
{R"(\[\[[^\|\]]*\|)", "[["},
{R"(\{\{[^\}]*\}\})", ""},
{R"(\{[^\}]*\})", ""},
{R"(\[)", ""},
{R"(\])", ""},
{"&[^;]*;", " "},
{"A", "a"},
{"B", "b"},
{"C", "c"},
{"D", "d"},
{"E", "e"},
{"F", "f"},
{"G", "g"},
{"H", "h"},
{"I", "i"},
{"J", "j"},
{"K", "k"},
{"L", "l"},
{"M", "m"},
{"N", "n"},
{"O", "o"},
{"P", "p"},
{"Q", "q"},
{"R", ""},
{"S", "s"},
{"T", "t"},
{"U", "u"},
{"V", "v"},
{"W", "w"},
{"X", "x"},
{"Y", "y"},
{"Z", "z"},
{"0", " zero "},
{"1", " one "},
{"2", " two "},
{"3", " three "},
{"4", " four "},
{"5", " five "},
{"6", " six "},
{"7", " seven "},
{"8", " eight "},
{"9", " nine "},
{R"([^a-z\n]+)", " "},
{R"(\n )", ""},
{R"(\s+)", " "},
{R"(\n\s*\n)", R"(\n)"}};

// Filters a single text element through every (pattern, replacement) rule.
// Redirect pages (containing "#redirect"/"#REDIRECT") are emptied entirely.
// \param[in] text The raw article text.
// \param[out] out The filtered text (cleared for redirect pages).
Status FilterWikipediaXMLOp::FilterWikipediaXML(const std::string_view &text, std::string *out) const {
  // Validate the output pointer BEFORE dereferencing it (the previous version
  // wrote *out = text first and only checked for nullptr inside the loop).
  CHECK_FAIL_RETURN_UNEXPECTED(out != nullptr, "FilterWikipediaXML: output string must not be null.");
  // Compare against npos, not -1: find() returns std::string_view::size_type.
  if (text.find("#redirect") != std::string_view::npos || text.find("#REDIRECT") != std::string_view::npos) {
    out->clear();
    return Status::OK();
  }
  *out = std::string(text);
  UErrorCode icu_error = U_ZERO_ERROR;
  for (const auto &pattern : patterns) {
    icu::RegexMatcher matcher(pattern.first, 0, icu_error);
    CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error),
                                 "FilterWikipediaXML: create icu RegexMatcher failed, you may input an error pattern.");
    // Re-encode the intermediate result and apply the next replacement rule.
    icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(*out);
    matcher.reset(unicode_text);
    icu::UnicodeString unicode_out = matcher.replaceAll(pattern.second, icu_error);
    CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "FilterWikipediaXML: FilterWikipediaXML failed.");
    out->clear();
    unicode_out.trim().toUTF8String(*out);
  }
  return Status::OK();
}

// Applies FilterWikipediaXML to EVERY element of the input string tensor.
// The previous version filtered only the first element (*iter -> strs[0]),
// leaving the remaining elements of a multi-element tensor empty, and
// took &strs[0] which is undefined for an empty tensor.
Status FilterWikipediaXMLOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING,
                               "FilterWikipediaXML: input is not of type string.");
  std::vector<std::string> strs(input->Size());
  size_t i = 0;
  for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); ++iter, ++i) {
    RETURN_IF_NOT_OK(FilterWikipediaXML(*iter, &strs[i]));
  }
  // Preserve the input shape in the output tensor.
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(strs, input->shape(), output));
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

+ 49
- 0
mindspore/ccsrc/minddata/dataset/text/kernels/filter_wikipedia_xml_op.h View File

@@ -0,0 +1,49 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_FILTER_WIKIPEDIA_XML_OP_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_FILTER_WIKIPEDIA_XML_OP_H_

#include <map>
#include <memory>
#include <string>

#include "unicode/errorcode.h"
#include "unicode/regex.h"
#include "unicode/utypes.h"

#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
#include "minddata/dataset/util/status.h"

namespace mindspore {
namespace dataset {
// Runtime kernel that cleans Wikipedia XML/wiki markup out of string tensors
// (rules live in the `patterns` table in filter_wikipedia_xml_op.cc).
class FilterWikipediaXMLOp : public TensorOp {
 public:
  FilterWikipediaXMLOp() {}

  ~FilterWikipediaXMLOp() override = default;

  // Filters the input string tensor and writes the result to *output.
  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

  std::string Name() const override { return kFilterWikipediaXMLOp; }

 private:
  // Filters one text element; output is emptied for #redirect/#REDIRECT pages.
  Status FilterWikipediaXML(const std::string_view &text, std::string *out) const;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_FILTER_WIKIPEDIA_XML_OP_H_

+ 16
- 0
mindspore/python/mindspore/dataset/text/transforms.py View File

@@ -758,6 +758,22 @@ if platform.system().lower() != 'windows':
return cde.CaseFoldOperation()


class FilterWikipediaXML(TextTensorOperation):
    """
    Filter Wikipedia XML dumps to "clean" text consisting only of lowercase letters (a-z, converted from A-Z),
    and spaces (never consecutive).

    Note:
        FilterWikipediaXML is not supported on Windows platform yet.

    Examples:
        >>> replace_op = text.FilterWikipediaXML()
        >>> text_file_dataset = text_file_dataset.map(operations=replace_op)
    """

    def parse(self):
        # Build the C++ IR node; the transform takes no parameters.
        return cde.FilterWikipediaXMLOperation()


class NormalizeUTF8(TextTensorOperation):
"""
Apply normalize operation on UTF-8 string tensor.


+ 49
- 0
tests/ut/cpp/dataset/c_api_text_test.cc View File

@@ -769,6 +769,55 @@ TEST_F(MindDataTestPipeline, TestCaseFoldSuccess) {
iter->Stop();
}

/// Feature: FilterWikipediaXML
/// Description: test FilterWikipediaXML in pipeline mode
/// Expectation: the data is processed successfully
TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) {
  // Testing the parameter of FilterWikipediaXML interface.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFilterWikipediaXMLSuccess.";

  // Create a TextFile dataset.
  // 2.txt holds three lines: "Welcome to Beijing!", "中国", "#redirect".
  std::string data_file = datasets_root_path_ + "/testTokenizerData/2.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create filter_wikipedia_xml operation on ds
  std::shared_ptr<TensorTransform> filter_wikipedia_xml = std::make_shared<text::FilterWikipediaXML>();
  EXPECT_NE(filter_wikipedia_xml, nullptr);

  // Create Map operation on ds
  ds = ds->Map({filter_wikipedia_xml}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  // Non-latin text and #redirect lines are expected to filter to empty strings.
  std::vector<std::string> expected = {"welcome to beijing","",""};

  uint64_t i = 0;

  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All three input lines should have been consumed.
  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess) {
// Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is false.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess.";


+ 18
- 0
tests/ut/cpp/dataset/execute_test.cc View File

@@ -254,6 +254,24 @@ TEST_F(MindDataTestExecute, TestCrop) {
EXPECT_EQ(image.Shape()[1], 15);
}

/// Feature: FilterWikipediaXMLEager
/// Description: Test FilterWikipediaXML's Eager mode
/// Expectation: Run successfully
TEST_F(MindDataTestExecute, TestFilterWikipediaXMLEager) {
  // Test FilterWikipediaXML's Eager mode
  MS_LOG(INFO) << "Doing MindDataTestExecute-TestFilterWikipediaXMLEager.";
  // A 2x2 string tensor; only the Status is checked, not the filtered values.
  std::vector<std::string> origin = {"中国","Wcdma","Pang","Yuchao"};
  TensorShape input_shape({2, 2});
  std::shared_ptr<Tensor> de_tensor;
  // NOTE(review): CreateFromVector's Status is ignored here — consider ASSERT_OK.
  Tensor::CreateFromVector(origin, input_shape, &de_tensor);
  std::shared_ptr<TensorTransform> filter = std::make_shared<text::FilterWikipediaXML>();
  auto input = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
  mindspore::dataset::Execute Transform({filter});
  Status s = Transform(input, &input);

  ASSERT_TRUE(s.IsOk());
}

TEST_F(MindDataTestExecute, TestFrequencyMasking) {
MS_LOG(INFO) << "Doing MindDataTestExecute-TestFrequencyMasking.";
std::shared_ptr<Tensor> input_tensor_;


+ 3
- 0
tests/ut/data/dataset/testTokenizerData/2.txt View File

@@ -0,0 +1,3 @@
Welcome to Beijing!
中国
#redirect

+ 71
- 0
tests/ut/python/dataset/test_filter_wikipedia_xml.py View File

@@ -0,0 +1,71 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing FilterWikipediaXML op
"""
import numpy as np

import mindspore.dataset as ds
import mindspore.dataset.text.transforms as a_c_trans
from mindspore import log as logger


def count_unequal_element(data_expected, data_me):
    """Assert two numpy arrays are identical in both shape and content.

    The previous `assert data_expected == data_me` raises
    ``ValueError: The truth value of an array ... is ambiguous`` for any
    multi-element array; np.array_equal performs the element-wise comparison
    and reduces it to a single boolean.
    """
    assert data_expected.shape == data_me.shape
    assert np.array_equal(data_expected, data_me)


def test_filter_wikipedia_xml_eager():
    """
    Feature: FilterWikipediaXML
    Description: test FilterWikipediaXML in eager mode
    Expectation: the data is processed successfully
    """
    logger.info("test FilterWikipediaXML in eager mode")

    op = a_c_trans.FilterWikipediaXML()
    # Raw text and the text expected after filtering (lowercased).
    raw_text = np.array(["Welcome to China"], dtype=np.unicode_)
    cleaned = np.array(["welcome to china"], dtype=np.unicode_)
    count_unequal_element(cleaned, op(raw_text))


def test_filter_wikipedia_xml_pipeline():
    """
    Feature: FilterWikipediaXML
    Description: test FilterWikipediaXML in pipeline mode
    Expectation: the data is processed successfully
    """
    logger.info("test FilterWikipediaXML in pipeline mode")

    # Raw samples and the per-row text expected after filtering.
    samples = np.array(["Welcome to China", "中国", "ABC"], dtype=np.unicode_)
    cleaned = np.array(["welcome to china", "", "abc"], dtype=np.string_)
    dataset = ds.NumpySlicesDataset(samples, ["text"], shuffle=False)
    # Filter each row's text through FilterWikipediaXML.
    dataset = dataset.map(input_columns=["text"], operations=a_c_trans.FilterWikipediaXML(),
                          num_parallel_workers=8)
    for idx, row in enumerate(dataset.create_dict_iterator(output_numpy=True)):
        count_unequal_element(np.array(cleaned[idx]), row['text'])


# Allow running this test module directly, outside the pytest runner.
if __name__ == "__main__":
    test_filter_wikipedia_xml_eager()
    test_filter_wikipedia_xml_pipeline()

Loading…
Cancel
Save