# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing tokenizer ops in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as nlp

DATA_FILE = "../data/dataset/testTokenizerData/1.txt"
NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt"
REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt"
REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt"
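# The contents of the test data files can be inferred from the assertions
# below; for example, 1.txt is assumed to hold the four lines
# "Welcome to Beijing!", "北京欢迎您!", "我喜欢English!" and a single space.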


def split_by_unicode_char(input_strs):
    """
    Split UTF-8 strings into lists of Unicode characters
    """
    out = []
    for s in input_strs:
        out.append([c for c in s])
    return out
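# A minimal usage sketch: split_by_unicode_char(["ab", "北京"]) returns
# [['a', 'b'], ['北', '京']], i.e. one token per Unicode code point, which is
# the shape of output the UnicodeCharTokenizer op is expected to produce.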


def test_unicode_char_tokenizer():
    """
    Test UnicodeCharTokenizer
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", " ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = nlp.UnicodeCharTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text']).tolist()
        tokens.append(text)
    logger.info("The out tokens are: {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens


def test_whitespace_tokenizer():
    """
    Test WhitespaceTokenizer
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您!"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = nlp.WhitespaceTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text']).tolist()
        tokens.append(text)
    logger.info("The out tokens are: {}".format(tokens))
    assert whitespace_strs == tokens
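# Note: as the expected outputs in the two tests below show, UnicodeScriptTokenizer
# splits text where the Unicode script changes (e.g. between Latin and Han
# characters) and at punctuation; keep_whitespace controls whether whitespace
# runs are emitted as tokens.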


def test_unicode_script_tokenizer():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=False
    """
    unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
                           ["北京欢迎您", "!"],
                           ["我喜欢", "English", "!"],
                           [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=False)
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text']).tolist()
        tokens.append(text)
    logger.info("The out tokens are: {}".format(tokens))
    assert unicode_script_strs == tokens


def test_unicode_script_tokenizer2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "!"],
                            ["我喜欢", "English", "!"],
                            [" "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=True)
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text']).tolist()
        tokens.append(text)
    logger.info("The out tokens are: {}".format(tokens))
    assert unicode_script_strs2 == tokens
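# Note: CaseFold performs Unicode case folding, which for the inputs below
# amounts to plain lowercasing of the Latin letters; Han characters are left
# unchanged.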


def test_case_fold():
    """
    Test CaseFold
    """
    expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    op = nlp.CaseFold()
    dataset = dataset.map(operations=op)
    lower_strs = []
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text']).tolist()
        lower_strs.append(text)
    assert lower_strs == expect_strs
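# Note: the four Unicode normalization forms checked below differ as follows:
# NFC/NFD apply only canonical composition/decomposition, while NFKC/NFKD also
# apply compatibility mappings (e.g. the ligature "ﬁ" U+FB01 becomes "fi" and
# the superscript "⁵" U+2075 becomes "5"), matching the expected byte sequences.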


def test_normalize_utf8():
    """
    Test NormalizeUTF8
    """
    def normalize(normalize_form):
        dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False)
        normalize_op = nlp.NormalizeUTF8(normalize_form=normalize_form)
        dataset = dataset.map(operations=normalize_op)
        out_bytes = []
        out_texts = []
        for i in dataset.create_dict_iterator():
            out_bytes.append(i['text'])
            out_texts.append(nlp.to_str(i['text']).tolist())
        logger.info("The out bytes are: {}".format(out_bytes))
        logger.info("The out texts are: {}".format(out_texts))
        return out_bytes

    expect_normalize_data = [
        # NFC
        [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'],
        # NFKC
        [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'fi', b'25', b'\xe1\xb9\xa9'],
        # NFD
        [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'],
        # NFKD
        [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'fi', b'25', b's\xcc\xa3\xcc\x87']
    ]
    assert normalize(nlp.utils.NormalizeForm.NFC) == expect_normalize_data[0]
    assert normalize(nlp.utils.NormalizeForm.NFKC) == expect_normalize_data[1]
    assert normalize(nlp.utils.NormalizeForm.NFD) == expect_normalize_data[2]
    assert normalize(nlp.utils.NormalizeForm.NFKD) == expect_normalize_data[3]
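# Note: the replace patterns below use Unicode property classes: \p{Ll} matches
# lowercase letters, \p{Cc} control characters and \p{Cf} format characters,
# while \s+ matches runs of whitespace.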


def test_regex_replace():
    """
    Test RegexReplace
    """
    def regex_replace(first, last, expect_str, pattern, replace):
        dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        replace_op = nlp.RegexReplace(pattern, replace)
        dataset = dataset.map(operations=replace_op)
        out_text = []
        for i in dataset.create_dict_iterator():
            text = nlp.to_str(i['text']).tolist()
            out_text.append(text)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))
        assert expect_str == out_text

    regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_')
    regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "")
    regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "")
    regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "")
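# Note: based on the cases below, RegexTokenizer splits on matches of
# delim_pattern and keeps a matched delimiter as its own token when it also
# matches keep_delim_pattern; \p{Han} matches Han ideographs, \p{P} punctuation,
# \p{S} symbols and \p{N} numbers.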


def test_regex_tokenizer():
    """
    Test RegexTokenizer
    """
    def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern):
        dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        tokenizer_op = nlp.RegexTokenizer(delim_pattern, keep_delim_pattern)
        dataset = dataset.map(operations=tokenizer_op)
        out_text = []
        count = 0
        for i in dataset.create_dict_iterator():
            text = nlp.to_str(i['text']).tolist()
            np.testing.assert_array_equal(text, expect_str[count])
            count += 1
            out_text.append(text)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))

    regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "")
    regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+")
    regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}")
    regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
    regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "")
    regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "")


if __name__ == '__main__':
    test_unicode_char_tokenizer()
    test_whitespace_tokenizer()
    test_unicode_script_tokenizer()
    test_unicode_script_tokenizer2()
    test_case_fold()
    test_normalize_utf8()
    test_regex_replace()
    test_regex_tokenizer()