Browse Source

fix python tokenizer

tags/v1.2.0-rc1
YangLuo 4 years ago
parent
commit
f99204b292
3 changed files with 13 additions and 6 deletions
  1. +3
    -1
      mindspore/dataset/text/transforms.py
  2. +1
    -1
      mindspore/dataset/text/utils.py
  3. +9
    -4
      tests/ut/python/dataset/test_eager_text.py

+ 3
- 1
mindspore/dataset/text/transforms.py View File

@@ -533,7 +533,9 @@ class PythonTokenizer:
         self.random = False
 
     def __call__(self, in_array):
-        if not isinstance(in_array, str):
+        if not isinstance(in_array, np.ndarray):
+            raise TypeError("input should be a NumPy array. Got {}.".format(type(in_array)))
+        if in_array.dtype.type is np.bytes_:
             in_array = to_str(in_array)
         tokens = self.tokenizer(in_array)
         return tokens


+ 1
- 1
mindspore/dataset/text/utils.py View File

@@ -216,7 +216,7 @@ def to_str(array, encoding='utf8'):
     """
 
     if not isinstance(array, np.ndarray):
-        raise ValueError('input should be a NumPy array.')
+        raise TypeError('input should be a NumPy array.')
 
     return np.char.decode(array, encoding)




+ 9
- 4
tests/ut/python/dataset/test_eager_text.py View File

@@ -52,12 +52,17 @@ def test_python_tokenizer():
         if not words:
             return [""]
         return words
-    txt = "Welcome to Beijing !"
-    txt = T.PythonTokenizer(my_tokenizer)(txt)
-    logger.info("Tokenize result: {}".format(txt))
+    txt1 = np.array("Welcome to Beijing !".encode())
+    txt1 = T.PythonTokenizer(my_tokenizer)(txt1)
+    logger.info("Tokenize result: {}".format(txt1))
+
+    txt2 = np.array("Welcome to Beijing !")
+    txt2 = T.PythonTokenizer(my_tokenizer)(txt2)
+    logger.info("Tokenize result: {}".format(txt2))
 
     expected = ['Welcome', 'to', 'Beijing', '!']
-    np.testing.assert_equal(txt, expected)
+    np.testing.assert_equal(txt1, expected)
+    np.testing.assert_equal(txt2, expected)
 
 
 if __name__ == '__main__':


Loading…
Cancel
Save