Fix PythonTokenizer: require NumPy array input and decode bytes arrays before tokenizing

4 years ago · f99204b292
--- a/mindspore/dataset/text/transforms.py
+++ b/mindspore/dataset/text/transforms.py
@@ -533,7 +533,9 @@ class PythonTokenizer:
        self.random = False

    def __call__(self, in_array):
        """
        Tokenize a NumPy text array with the user-supplied tokenizer.

        Args:
            in_array (numpy.ndarray): Text to tokenize. An array whose dtype
                is ``np.bytes_`` is first converted with ``to_str``.

        Returns:
            The value returned by ``self.tokenizer`` for the (possibly
            decoded) array.

        Raises:
            TypeError: If ``in_array`` is not a NumPy array.
        """
        # Only the ndarray guard is kept: the dtype check below needs
        # `in_array.dtype`, which a plain `str` does not provide, and the
        # error message already states that a NumPy array is required.
        if not isinstance(in_array, np.ndarray):
            raise TypeError("input should be a NumPy array. Got {}.".format(type(in_array)))
        # Bytes arrays are decoded so the tokenizer always receives str data.
        if in_array.dtype.type is np.bytes_:
            in_array = to_str(in_array)
        tokens = self.tokenizer(in_array)
        return tokens
--- a/mindspore/dataset/text/utils.py
+++ b/mindspore/dataset/text/utils.py
@@ -216,7 +216,7 @@ def to_str(array, encoding='utf8'):
    """

    if not isinstance(array, np.ndarray):
        raise ValueError('input should be a NumPy array.')
        raise TypeError('input should be a NumPy array.')

    return np.char.decode(array, encoding)

--- a/tests/ut/python/dataset/test_eager_text.py
+++ b/tests/ut/python/dataset/test_eager_text.py
@@ -52,12 +52,17 @@ def test_python_tokenizer():
        if not words:
            return [""]
        return words
    txt = "Welcome to Beijing !"
    txt = T.PythonTokenizer(my_tokenizer)(txt)
    logger.info("Tokenize result: {}".format(txt))
    txt1 = np.array("Welcome to Beijing !".encode())
    txt1 = T.PythonTokenizer(my_tokenizer)(txt1)
    logger.info("Tokenize result: {}".format(txt1))

    txt2 = np.array("Welcome to Beijing !")
    txt2 = T.PythonTokenizer(my_tokenizer)(txt2)
    logger.info("Tokenize result: {}".format(txt2))

    expected = ['Welcome', 'to', 'Beijing', '!']
    np.testing.assert_equal(txt, expected)
    np.testing.assert_equal(txt1, expected)
    np.testing.assert_equal(txt2, expected)


 if __name__ == '__main__':