@@ -34,6 +34,49 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};

TEST_F(MindDataTestPipeline, TestCaseFoldSuccess) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaseFoldSuccess.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a CaseFold operation on ds
  std::shared_ptr<TensorOperation> casefold = text::CaseFold();
  EXPECT_NE(casefold, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({casefold}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate over the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<std::string> expected = {"welcome to beijing!", "北京欢迎您!", "我喜欢english!", "  "};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> expected_tensor;
    Tensor::CreateScalar(expected[i], &expected_tensor);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}

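// Note: CaseFold applies Unicode case folding, so cased characters in the input
// are mapped to their folded (lowercase) form, as the lowercased expectations
// above show; the whitespace-only last line of 1.txt passes through unchanged.
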
TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess) {
  // Test the JiebaTokenizer interface with mode JiebaMode::kMp and with_offsets false.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess.";
@@ -472,6 +515,514 @@ TEST_F(MindDataTestPipeline, TestTextOperationName) {
  EXPECT_EQ(correct_name, sentence_piece_tokenizer_op->Name());
}

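// The four NormalizeUTF8 tests below cover the Unicode normalization forms:
// kNfc composes characters canonically and kNfd decomposes them canonically,
// while kNfkc and kNfkd additionally apply compatibility mappings (for example,
// a superscript digit such as the one in "2⁵" becomes a plain "5").
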
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success) {
  // Test the NormalizeUTF8 interface with normalize_form NormalizeForm::kNfkc.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a NormalizeUTF8 operation on ds
  std::shared_ptr<TensorOperation> normalizeutf8 = text::NormalizeUTF8(NormalizeForm::kNfkc);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate over the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> expected_tensor;
    Tensor::CreateScalar(expected[i], &expected_tensor);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success1) {
  // Test the NormalizeUTF8 interface with normalize_form NormalizeForm::kNfc.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a NormalizeUTF8 operation on ds
  std::shared_ptr<TensorOperation> normalizeutf8 = text::NormalizeUTF8(NormalizeForm::kNfc);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate over the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> expected_tensor;
    Tensor::CreateScalar(expected[i], &expected_tensor);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success2) {
  // Test the NormalizeUTF8 interface with normalize_form NormalizeForm::kNfd.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success2.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a NormalizeUTF8 operation on ds
  std::shared_ptr<TensorOperation> normalizeutf8 = text::NormalizeUTF8(NormalizeForm::kNfd);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate over the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> expected_tensor;
    Tensor::CreateScalar(expected[i], &expected_tensor);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}

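// Note: the kNfc and kNfd expectations above render identically; the two forms
// differ only in whether each character is stored composed (a single code point)
// or decomposed (a base character followed by combining marks).
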
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success3) {
  // Test the NormalizeUTF8 interface with normalize_form NormalizeForm::kNfkd.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success3.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a NormalizeUTF8 operation on ds
  std::shared_ptr<TensorOperation> normalizeutf8 = text::NormalizeUTF8(NormalizeForm::kNfkd);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate over the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> expected_tensor;
    Tensor::CreateScalar(expected[i], &expected_tensor);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess) {
  // Test the UnicodeCharTokenizer interface with the default with_offsets.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a UnicodeCharTokenizer operation on ds
  std::shared_ptr<TensorOperation> unicodechar_tokenizer = text::UnicodeCharTokenizer();
  EXPECT_NE(unicodechar_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({unicodechar_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate over the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<std::vector<std::string>> expected = {
    {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
    {"北", "京", "欢", "迎", "您", "!"},
    {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
    {" ", " "}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> expected_tensor;
    int x = expected[i].size();
    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}

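// Note: offsets_start and offsets_limit in the with_offsets tests below are byte
// offsets into the UTF-8 input, not character indices. ASCII characters occupy
// one byte, which is why the offsets for "Welcome to Beijing!" advance by one,
// while the CJK characters occupy three bytes each, which is why the offsets for
// "北京欢迎您" advance by three.
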
TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess1) {
  // Test the UnicodeCharTokenizer interface with with_offsets set to true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a UnicodeCharTokenizer operation on ds
  std::shared_ptr<TensorOperation> unicodechar_tokenizer = text::UnicodeCharTokenizer(true);
  EXPECT_NE(unicodechar_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({unicodechar_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
               {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate over the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<std::vector<std::string>> expected = {
    {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
    {"北", "京", "欢", "迎", "您", "!"},
    {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
    {" ", " "}};

  std::vector<std::vector<uint32_t>> expected_offsets_start = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
    {0, 3, 6, 9, 12, 15},
    {0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16},
    {0, 1}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {
    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
    {3, 6, 9, 12, 15, 18},
    {3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17},
    {1, 2}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["offsets_start"];
    auto ind1 = row["offsets_limit"];
    auto token = row["token"];
    std::shared_ptr<Tensor> expected_tensor;
    std::shared_ptr<Tensor> expected_tensor_offsets_start;
    std::shared_ptr<Tensor> expected_tensor_offsets_limit;
    int x = expected[i].size();
    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
    Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &expected_tensor_offsets_start);
    Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &expected_tensor_offsets_limit);
    EXPECT_EQ(*ind, *expected_tensor_offsets_start);
    EXPECT_EQ(*ind1, *expected_tensor_offsets_limit);
    EXPECT_EQ(*token, *expected_tensor);

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}

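// The UnicodeScriptTokenizer tests below split each line wherever the Unicode
// script changes (for example between Han characters and Latin letters), so
// "我喜欢English!" yields the tokens "我喜欢", "English" and "!". The
// keep_whitespace parameter controls whether whitespace runs are emitted as
// tokens or dropped.
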
TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) {
  // Test the UnicodeScriptTokenizer interface with the default keep_whitespace and with_offsets.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a UnicodeScriptTokenizer operation on ds
  std::shared_ptr<TensorOperation> unicodescript_tokenizer = text::UnicodeScriptTokenizer();
  EXPECT_NE(unicodescript_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({unicodescript_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate over the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<std::vector<std::string>> expected = {
    {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> expected_tensor;
    int x = expected[i].size();
    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}

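// Note: when keep_whitespace is false, the whitespace-only last line of 1.txt
// produces a single empty token (with offsets_start 0 and offsets_limit 0 in
// the with_offsets case below), so the row count still comes to 4.
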
TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess1) {
  // Test the UnicodeScriptTokenizer interface with keep_whitespace true and with_offsets false.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a UnicodeScriptTokenizer operation on ds
  std::shared_ptr<TensorOperation> unicodescript_tokenizer = text::UnicodeScriptTokenizer(true);
  EXPECT_NE(unicodescript_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({unicodescript_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate over the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<std::vector<std::string>> expected = {
    {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {"  "}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> expected_tensor;
    int x = expected[i].size();
    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess2) {
  // Test the UnicodeScriptTokenizer interface with keep_whitespace false and with_offsets true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess2.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a UnicodeScriptTokenizer operation on ds
  std::shared_ptr<TensorOperation> unicodescript_tokenizer = text::UnicodeScriptTokenizer(false, true);
  EXPECT_NE(unicodescript_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
               {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate over the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<std::vector<std::string>> expected = {
    {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};

  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 18, 19}, {15, 18}, {9, 16, 17}, {0}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["offsets_start"];
    auto ind1 = row["offsets_limit"];
    auto token = row["token"];
    std::shared_ptr<Tensor> expected_tensor;
    std::shared_ptr<Tensor> expected_tensor_offsets_start;
    std::shared_ptr<Tensor> expected_tensor_offsets_limit;
    int x = expected[i].size();
    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
    Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &expected_tensor_offsets_start);
    Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &expected_tensor_offsets_limit);
    EXPECT_EQ(*ind, *expected_tensor_offsets_start);
    EXPECT_EQ(*ind1, *expected_tensor_offsets_limit);
    EXPECT_EQ(*token, *expected_tensor);

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess3) {
  // Test the UnicodeScriptTokenizer interface with keep_whitespace true and with_offsets true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess3.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a UnicodeScriptTokenizer operation on ds
  std::shared_ptr<TensorOperation> unicodescript_tokenizer = text::UnicodeScriptTokenizer(true, true);
  EXPECT_NE(unicodescript_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
               {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate over the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<std::vector<std::string>> expected = {
    {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {"  "}};

  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 7, 8, 10, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 8, 10, 11, 18, 19}, {15, 18}, {9, 16, 17}, {2}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["offsets_start"];
    auto ind1 = row["offsets_limit"];
    auto token = row["token"];
    std::shared_ptr<Tensor> expected_tensor;
    std::shared_ptr<Tensor> expected_tensor_offsets_start;
    std::shared_ptr<Tensor> expected_tensor_offsets_limit;
    int x = expected[i].size();
    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
    Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &expected_tensor_offsets_start);
    Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &expected_tensor_offsets_limit);
    EXPECT_EQ(*ind, *expected_tensor_offsets_start);
    EXPECT_EQ(*ind1, *expected_tensor_offsets_limit);
    EXPECT_EQ(*token, *expected_tensor);

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}

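// The WhitespaceTokenizer tests below split each line on runs of whitespace and
// drop the whitespace itself, leaving only the non-whitespace tokens.
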
TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess) {
  // Test the WhitespaceTokenizer interface with the default with_offsets.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess.";