test_text_tokenizer.py
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
  15. """
  16. Testing UnicodeCharTokenizer op in DE
  17. """
  18. import numpy as np
  19. import mindspore.dataset as ds
  20. from mindspore import log as logger
  21. import mindspore.dataset.text as text
  22. DATA_FILE = "../data/dataset/testTokenizerData/1.txt"
  23. NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt"
  24. REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt"
  25. REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt"


def split_by_unicode_char(input_strs):
    """
    Split UTF-8 strings into lists of Unicode characters
    """
    out = []
    for s in input_strs:
        out.append([c for c in s])
    return out


def test_unicode_char_tokenizer_default():
    """
    Test UnicodeCharTokenizer with the default parameters
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您！", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The out tokens is : {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens


def test_unicode_char_tokenizer_with_offsets():
    """
    Test UnicodeCharTokenizer with with_offsets=True
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您！", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer(with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
    # Offsets are byte positions (start inclusive, limit exclusive) within each UTF-8 encoded line
    expected_offsets_start = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
                              [0, 3, 6, 9, 12, 15], [0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16], [0, 1]]
    expected_offsets_limit = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                              [3, 6, 9, 12, 15, 18], [3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17], [1, 2]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is : {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens


def test_whitespace_tokenizer_default():
    """
    Test WhitespaceTokenizer with the default parameters
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您！"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The out tokens is : {}".format(tokens))
    assert whitespace_strs == tokens


def test_whitespace_tokenizer_with_offsets():
    """
    Test WhitespaceTokenizer with with_offsets=True
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您！"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer(with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
    expected_offsets_start = [[0, 8, 11], [0], [0], [0]]
    expected_offsets_limit = [[7, 10, 19], [18], [17], [0]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is : {}".format(tokens))
    assert whitespace_strs == tokens


def test_unicode_script_tokenizer_default():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=False
    """
    unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
                           ["北京欢迎您", "！"],
                           ["我喜欢", "English", "!"],
                           [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False)
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The out tokens is : {}".format(tokens))
    assert unicode_script_strs == tokens


def test_unicode_script_tokenizer_default2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "！"],
                            ["我喜欢", "English", "!"],
                            ["  "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True)
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator(num_epochs=1):
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The out tokens is : {}".format(tokens))
    assert unicode_script_strs2 == tokens


def test_unicode_script_tokenizer_with_offsets():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=False and with_offsets=True
    """
    unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
                           ["北京欢迎您", "！"],
                           ["我喜欢", "English", "!"],
                           [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False, with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
    expected_offsets_start = [[0, 8, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 10, 18, 19], [15, 18], [9, 16, 17], [0]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is : {}".format(tokens))
    assert unicode_script_strs == tokens


def test_unicode_script_tokenizer_with_offsets2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True and with_offsets=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "！"],
                            ["我喜欢", "English", "!"],
                            ["  "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
    dataset = dataset.map(operations=tokenizer, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    tokens = []
    expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17], [2]]
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1):
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is : {}".format(tokens))
    assert unicode_script_strs2 == tokens


def test_case_fold():
    """
    Test CaseFold
    """
    expect_strs = ["welcome to beijing!", "北京欢迎您！", "我喜欢english!", "  "]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    op = text.CaseFold()
    dataset = dataset.map(operations=op)
    lower_strs = []
    for i in dataset.create_dict_iterator(num_epochs=1):
        token = text.to_str(i['text']).tolist()
        lower_strs.append(token)
    assert lower_strs == expect_strs


def test_normalize_utf8():
    """
    Test NormalizeUTF8
    """
    def normalize(normalize_form):
        dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False)
        normalize_op = text.NormalizeUTF8(normalize_form=normalize_form)
        dataset = dataset.map(operations=normalize_op)
        out_bytes = []
        out_texts = []
        for i in dataset.create_dict_iterator(num_epochs=1):
            out_bytes.append(i['text'])
            out_texts.append(text.to_str(i['text']).tolist())
        logger.info("The out bytes is : {}".format(out_bytes))
        logger.info("The out texts is: {}".format(out_texts))
        return out_bytes

    expect_normalize_data = [
        # NFC
        [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'],
        # NFKC
        [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'fi', b'25', b'\xe1\xb9\xa9'],
        # NFD
        [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'],
        # NFKD
        [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'fi', b'25', b's\xcc\xa3\xcc\x87']
    ]
    assert normalize(text.utils.NormalizeForm.NFC) == expect_normalize_data[0]
    assert normalize(text.utils.NormalizeForm.NFKC) == expect_normalize_data[1]
    assert normalize(text.utils.NormalizeForm.NFD) == expect_normalize_data[2]
    assert normalize(text.utils.NormalizeForm.NFKD) == expect_normalize_data[3]


def test_regex_replace():
    """
    Test RegexReplace
    """
    def regex_replace(first, last, expect_str, pattern, replace):
        # Apply RegexReplace to lines [first, last] of the data file and check the result
        dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        replace_op = text.RegexReplace(pattern, replace)
        dataset = dataset.map(operations=replace_op)
        out_text = []
        for i in dataset.create_dict_iterator(num_epochs=1):
            token = text.to_str(i['text']).tolist()
            out_text.append(token)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))
        assert expect_str == out_text

    regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_')
    regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "")
    regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "")
    regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "")


def test_regex_tokenizer_default():
    """
    Test RegexTokenizer with the default parameters
    """
    def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern):
        # Tokenize lines [first, last] of the data file and compare against expect_str
        dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern)
        dataset = dataset.map(operations=tokenizer_op)
        out_text = []
        count = 0
        for i in dataset.create_dict_iterator(num_epochs=1):
            token = text.to_str(i['text']).tolist()
            np.testing.assert_array_equal(token, expect_str[count])
            count += 1
            out_text.append(token)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))

    regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "")
    regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+")
    regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}")
    regex_tokenizer(3, 3, [['12', '￥+', '36', '￥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
    regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "")
    regex_tokenizer(3, 3, [['￥+', '￥=?']], r"[\p{N}]+", "")


def test_regex_tokenizer_with_offsets():
    """
    Test RegexTokenizer with with_offsets=True
    """
    def regex_tokenizer(first, last, expect_str, expected_offsets_start, expected_offsets_limit, delim_pattern,
                        keep_delim_pattern):
        dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
        dataset = dataset.map(operations=tokenizer_op, input_columns=['text'],
                              output_columns=['token', 'offsets_start', 'offsets_limit'],
                              column_order=['token', 'offsets_start', 'offsets_limit'])
        out_text = []
        count = 0
        for i in dataset.create_dict_iterator(num_epochs=1):
            token = text.to_str(i['token']).tolist()
            np.testing.assert_array_equal(token, expect_str[count])
            np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
            np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
            count += 1
            out_text.append(token)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))

    regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], [[0, 8, 11]], [[7, 10, 20]], "\\s+", "")
    regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], [[0, 7, 8, 10, 11]], [[7, 8, 10, 11, 20]],
                    "\\s+", "\\s+")
    regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], [[0, 3, 6, 9, 12, 15]],
                    [[3, 6, 9, 12, 15, 35]], r"\p{Han}", r"\p{Han}")
    regex_tokenizer(3, 3, [['12', '￥+', '36', '￥=?']], [[0, 2, 6, 8]], [[2, 6, 8, 13]],
                    r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
    regex_tokenizer(3, 3, [['12', '36']], [[0, 6]], [[2, 8]], r"[\p{P}|\p{S}]+", "")
    regex_tokenizer(3, 3, [['￥+', '￥=?']], [[2, 8]], [[6, 13]], r"[\p{N}]+", "")


if __name__ == '__main__':
    test_unicode_char_tokenizer_default()
    test_unicode_char_tokenizer_with_offsets()
    test_whitespace_tokenizer_default()
    test_whitespace_tokenizer_with_offsets()
    test_unicode_script_tokenizer_default()
    test_unicode_script_tokenizer_default2()
    test_unicode_script_tokenizer_with_offsets()
    test_unicode_script_tokenizer_with_offsets2()
    test_case_fold()
    test_normalize_utf8()
    test_regex_replace()
    test_regex_tokenizer_default()
    test_regex_tokenizer_with_offsets()