You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

tokenizer_op_test.cc 3.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <memory>
  17. #include <string>
  18. #include <string_view>
  19. #include "common/common.h"
  20. #include "dataset/text/kernels/unicode_char_tokenizer_op.h"
  21. #include "gtest/gtest.h"
  22. #include "utils/log_adapter.h"
  23. using namespace mindspore::dataset;
  24. class MindDataTestTokenizerOp : public UT::Common {
  25. public:
  26. void CheckEqual(const std::shared_ptr<Tensor> &o,
  27. const std::vector<dsize_t> &index,
  28. const std::string &expect) {
  29. std::string_view str;
  30. Status s = o->GetItemAt(&str, index);
  31. EXPECT_TRUE(s.IsOk());
  32. EXPECT_EQ(str, expect);
  33. }
  34. };
  35. TEST_F(MindDataTestTokenizerOp, TestUnicodeCharTokenizerOp) {
  36. MS_LOG(INFO) << "Doing TestUnicodeCharTokenizerOp.";
  37. std::unique_ptr<UnicodeCharTokenizerOp> op(new UnicodeCharTokenizerOp());
  38. std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Hello World!");
  39. std::shared_ptr<Tensor> output;
  40. Status s = op->Compute(input, &output);
  41. EXPECT_TRUE(s.IsOk());
  42. EXPECT_EQ(output->Size(), 12);
  43. EXPECT_EQ(output->Rank(), 1);
  44. MS_LOG(INFO) << "Out tensor1: " << output->ToString();
  45. CheckEqual(output, {0}, "H");
  46. CheckEqual(output, {1}, "e");
  47. CheckEqual(output, {2}, "l");
  48. CheckEqual(output, {3}, "l");
  49. CheckEqual(output, {4}, "o");
  50. CheckEqual(output, {5}, " ");
  51. CheckEqual(output, {6}, "W");
  52. CheckEqual(output, {7}, "o");
  53. CheckEqual(output, {8}, "r");
  54. CheckEqual(output, {9}, "l");
  55. CheckEqual(output, {10}, "d");
  56. CheckEqual(output, {11}, "!");
  57. input = std::make_shared<Tensor>("中国 你好!");
  58. s = op->Compute(input, &output);
  59. EXPECT_TRUE(s.IsOk());
  60. EXPECT_EQ(output->Size(), 6);
  61. EXPECT_EQ(output->Rank(), 1);
  62. MS_LOG(INFO) << "Out tensor2: " << output->ToString();
  63. CheckEqual(output, {0}, "中");
  64. CheckEqual(output, {1}, "国");
  65. CheckEqual(output, {2}, " ");
  66. CheckEqual(output, {3}, "你");
  67. CheckEqual(output, {4}, "好");
  68. CheckEqual(output, {5}, "!");
  69. input = std::make_shared<Tensor>("中");
  70. s = op->Compute(input, &output);
  71. EXPECT_TRUE(s.IsOk());
  72. EXPECT_EQ(output->Size(), 1);
  73. EXPECT_EQ(output->Rank(), 1);
  74. MS_LOG(INFO) << "Out tensor3: " << output->ToString();
  75. CheckEqual(output, {0}, "中");
  76. input = std::make_shared<Tensor>("H");
  77. s = op->Compute(input, &output);
  78. EXPECT_TRUE(s.IsOk());
  79. EXPECT_EQ(output->Size(), 1);
  80. EXPECT_EQ(output->Rank(), 1);
  81. MS_LOG(INFO) << "Out tensor4: " << output->ToString();
  82. CheckEqual(output, {0}, "H");
  83. input = std::make_shared<Tensor>(" ");
  84. s = op->Compute(input, &output);
  85. EXPECT_TRUE(s.IsOk());
  86. EXPECT_EQ(output->Size(), 2);
  87. EXPECT_EQ(output->Rank(), 1);
  88. MS_LOG(INFO) << "Out tensor5: " << output->ToString();
  89. CheckEqual(output, {0}, " ");
  90. CheckEqual(output, {1}, " ");
  91. input = std::make_shared<Tensor>("");
  92. s = op->Compute(input, &output);
  93. EXPECT_TRUE(s.IsOk());
  94. EXPECT_EQ(output->Size(), 1);
  95. EXPECT_EQ(output->Rank(), 1);
  96. MS_LOG(INFO) << "Out tensor6: " << output->ToString();
  97. CheckEqual(output, {0}, "");
  98. }