@@ -533,7 +533,9 @@ class PythonTokenizer:
         self.random = False

     def __call__(self, in_array):
-        if not isinstance(in_array, str):
+        if not isinstance(in_array, np.ndarray):
+            raise TypeError("input should be a NumPy array. Got {}.".format(type(in_array)))
+        if in_array.dtype.type is np.bytes_:
             in_array = to_str(in_array)
         tokens = self.tokenizer(in_array)
         return tokens
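With this hunk, __call__ rejects anything that is not a NumPy array up front, raising TypeError rather than silently coercing, and only routes through to_str when the array actually holds bytes. Below is a minimal standalone sketch of that control flow, not the project code: call and its default whitespace tokenizer are hypothetical stand-ins, while the guard and the np.bytes_ dtype check mirror the lines added above.

    import numpy as np

    def to_str(array, encoding='utf8'):
        # Decode a bytes ndarray into a str ndarray (see the to_str hunk below).
        if not isinstance(array, np.ndarray):
            raise TypeError('input should be a NumPy array.')
        return np.char.decode(array, encoding)

    def call(in_array, tokenizer=lambda s: str(s).split()):
        # Same validation order as the patched __call__:
        # 1) type check, 2) bytes -> str decode, 3) user tokenizer.
        if not isinstance(in_array, np.ndarray):
            raise TypeError("input should be a NumPy array. Got {}.".format(type(in_array)))
        if in_array.dtype.type is np.bytes_:
            in_array = to_str(in_array)
        return tokenizer(in_array)

    print(call(np.array("Welcome to Beijing !".encode())))  # ['Welcome', 'to', 'Beijing', '!']
    print(call(np.array("Welcome to Beijing !")))           # ['Welcome', 'to', 'Beijing', '!']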
@@ -216,7 +216,7 @@ def to_str(array, encoding='utf8'):
     """
     if not isinstance(array, np.ndarray):
-        raise ValueError('input should be a NumPy array.')
+        raise TypeError('input should be a NumPy array.')

     return np.char.decode(array, encoding)
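Switching ValueError to TypeError matters for callers that catch by exception type: a wrong input type conventionally raises TypeError, while ValueError is reserved for a correct type carrying a bad value. For reference, the decode path itself is plain NumPy; a quick self-contained check of what np.char.decode does elementwise:

    import numpy as np

    arr = np.array(["Welcome".encode(), "Beijing".encode()])
    assert arr.dtype.type is np.bytes_       # fixed-width bytes dtype, e.g. |S7
    decoded = np.char.decode(arr, 'utf8')    # elementwise bytes -> str
    assert decoded.dtype.type is np.str_     # unicode dtype, e.g. <U7
    print(decoded.tolist())                  # ['Welcome', 'Beijing']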
@@ -52,12 +52,17 @@ def test_python_tokenizer():
         if not words:
             return [""]
         return words

-    txt = "Welcome to Beijing !"
-    txt = T.PythonTokenizer(my_tokenizer)(txt)
-    logger.info("Tokenize result: {}".format(txt))
+    txt1 = np.array("Welcome to Beijing !".encode())
+    txt1 = T.PythonTokenizer(my_tokenizer)(txt1)
+    logger.info("Tokenize result: {}".format(txt1))
+    txt2 = np.array("Welcome to Beijing !")
+    txt2 = T.PythonTokenizer(my_tokenizer)(txt2)
+    logger.info("Tokenize result: {}".format(txt2))
     expected = ['Welcome', 'to', 'Beijing', '!']
-    np.testing.assert_equal(txt, expected)
+    np.testing.assert_equal(txt1, expected)
+    np.testing.assert_equal(txt2, expected)

 if __name__ == '__main__':
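The updated test exercises both accepted dtypes: np.array(s.encode()) produces a zero-dimensional bytes_ array that takes the decode branch, and np.array(s) a zero-dimensional str_ array that skips it. The complementary negative case, which the old code accepted, is a plain Python string. A throwaway sketch of the new guard in isolation (validate is a hypothetical extraction of the check added in the first hunk):

    import numpy as np

    def validate(in_array):
        # Same guard the patched __call__ applies before tokenizing.
        if not isinstance(in_array, np.ndarray):
            raise TypeError("input should be a NumPy array. Got {}.".format(type(in_array)))

    for bad in ("Welcome to Beijing !", b"Welcome", ["Welcome"]):
        try:
            validate(bad)
        except TypeError as err:
            print(err)
    # input should be a NumPy array. Got <class 'str'>.
    # input should be a NumPy array. Got <class 'bytes'>.
    # input should be a NumPy array. Got <class 'list'>.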