|
- # Copyright 2021 Huawei Technologies Co., Ltd
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ==============================================================================
- import numpy as np
- import mindspore.dataset.text.transforms as T
- import mindspore.common.dtype as mstype
- from mindspore import log as logger
-
- def test_sliding_window():
- txt = ["Welcome", "to", "Beijing", "!"]
- sliding_window = T.SlidingWindow(width=2)
- txt = sliding_window(txt)
- logger.info("Result: {}".format(txt))
-
- expected = [['Welcome', 'to'], ['to', 'Beijing'], ['Beijing', '!']]
- np.testing.assert_equal(txt, expected)
-
-
- def test_to_number():
- txt = ["123456"]
- to_number = T.ToNumber(mstype.int32)
- txt = to_number(txt)
- logger.info("Result: {}, type: {}".format(txt, type(txt[0])))
-
- assert txt == 123456
-
-
- def test_whitespace_tokenizer():
- txt = "Welcome to Beijing !"
- txt = T.WhitespaceTokenizer()(txt)
- logger.info("Tokenize result: {}".format(txt))
-
- expected = ['Welcome', 'to', 'Beijing', '!']
- np.testing.assert_equal(txt, expected)
-
-
- def test_python_tokenizer():
- # whitespace tokenizer
- def my_tokenizer(line):
- words = line.split()
- if not words:
- return [""]
- return words
- txt1 = np.array("Welcome to Beijing !".encode())
- txt1 = T.PythonTokenizer(my_tokenizer)(txt1)
- logger.info("Tokenize result: {}".format(txt1))
-
- txt2 = np.array("Welcome to Beijing !")
- txt2 = T.PythonTokenizer(my_tokenizer)(txt2)
- logger.info("Tokenize result: {}".format(txt2))
-
- expected = ['Welcome', 'to', 'Beijing', '!']
- np.testing.assert_equal(txt1, expected)
- np.testing.assert_equal(txt2, expected)
-
-
- if __name__ == '__main__':
- test_sliding_window()
- test_to_number()
- test_whitespace_tokenizer()
- test_python_tokenizer()
|