You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

test_eager_text.py 2.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. # Copyright 2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. import numpy as np
  16. import mindspore.dataset.text.transforms as T
  17. import mindspore.common.dtype as mstype
  18. from mindspore import log as logger
  19. def test_sliding_window():
  20. txt = ["Welcome", "to", "Beijing", "!"]
  21. sliding_window = T.SlidingWindow(width=2)
  22. txt = sliding_window(txt)
  23. logger.info("Result: {}".format(txt))
  24. expected = [['Welcome', 'to'], ['to', 'Beijing'], ['Beijing', '!']]
  25. np.testing.assert_equal(txt, expected)
  26. def test_to_number():
  27. txt = ["123456"]
  28. to_number = T.ToNumber(mstype.int32)
  29. txt = to_number(txt)
  30. logger.info("Result: {}, type: {}".format(txt, type(txt[0])))
  31. assert txt == 123456
  32. def test_whitespace_tokenizer():
  33. txt = "Welcome to Beijing !"
  34. txt = T.WhitespaceTokenizer()(txt)
  35. logger.info("Tokenize result: {}".format(txt))
  36. expected = ['Welcome', 'to', 'Beijing', '!']
  37. np.testing.assert_equal(txt, expected)
  38. def test_python_tokenizer():
  39. # whitespace tokenizer
  40. def my_tokenizer(line):
  41. words = line.split()
  42. if not words:
  43. return [""]
  44. return words
  45. txt1 = np.array("Welcome to Beijing !".encode())
  46. txt1 = T.PythonTokenizer(my_tokenizer)(txt1)
  47. logger.info("Tokenize result: {}".format(txt1))
  48. txt2 = np.array("Welcome to Beijing !")
  49. txt2 = T.PythonTokenizer(my_tokenizer)(txt2)
  50. logger.info("Tokenize result: {}".format(txt2))
  51. expected = ['Welcome', 'to', 'Beijing', '!']
  52. np.testing.assert_equal(txt1, expected)
  53. np.testing.assert_equal(txt2, expected)
  54. if __name__ == '__main__':
  55. test_sliding_window()
  56. test_to_number()
  57. test_whitespace_tokenizer()
  58. test_python_tokenizer()