You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

test_text_jieba_tokenizer.py 22 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507
  1. # Copyright 2020-2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. import numpy as np
  16. import pytest
  17. import mindspore.dataset as ds
  18. from mindspore.dataset.text import JiebaTokenizer
  19. from mindspore.dataset.text import JiebaMode, to_str
  20. from mindspore import log as logger
# Input data fixtures shared by the tests below (paths relative to the test cwd).
DATA_FILE = "../data/dataset/testJiebaDataset/3.txt"
DATA_ALL_FILE = "../data/dataset/testJiebaDataset/*"
# Pre-built jieba model files: the HMM model and the MP (max-probability) dictionary.
HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8"
MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8"
  25. def test_jieba_callable():
  26. """
  27. Test jieba tokenizer op is callable
  28. """
  29. logger.info("test_jieba_callable")
  30. jieba_op1 = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  31. jieba_op2 = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM)
  32. # test one tensor
  33. text1 = "今天天气太好了我们一起去外面玩吧"
  34. text2 = "男默女泪市长江大桥"
  35. assert np.array_equal(jieba_op1(text1), ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'])
  36. assert np.array_equal(jieba_op2(text1), ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧'])
  37. jieba_op1.add_word("男默女泪")
  38. assert np.array_equal(jieba_op1(text2), ['男默女泪', '市', '长江大桥'])
  39. # test input multiple tensors
  40. with pytest.raises(RuntimeError) as info:
  41. _ = jieba_op1(text1, text2)
  42. assert "JiebaTokenizerOp: input should be one column data." in str(info.value)
  43. def test_jieba_1():
  44. """Test jieba tokenizer with MP mode"""
  45. data = ds.TextFileDataset(DATA_FILE)
  46. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  47. data = data.map(operations=jieba_op, input_columns=["text"],
  48. num_parallel_workers=1)
  49. expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
  50. ret = []
  51. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  52. ret = to_str(i["text"])
  53. for index, item in enumerate(ret):
  54. assert item == expect[index]
  55. def test_jieba_1_1():
  56. """Test jieba tokenizer with HMM mode"""
  57. data = ds.TextFileDataset(DATA_FILE)
  58. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM)
  59. data = data.map(operations=jieba_op, input_columns=["text"],
  60. num_parallel_workers=1)
  61. expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
  62. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  63. ret = to_str(i["text"])
  64. for index, item in enumerate(ret):
  65. assert item == expect[index]
  66. def test_jieba_1_2():
  67. """Test jieba tokenizer with HMM MIX"""
  68. data = ds.TextFileDataset(DATA_FILE)
  69. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX)
  70. data = data.map(operations=jieba_op, input_columns=["text"],
  71. num_parallel_workers=1)
  72. expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
  73. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  74. ret = to_str(i["text"])
  75. for index, item in enumerate(ret):
  76. assert item == expect[index]
  77. def test_jieba_2():
  78. """Test add_word"""
  79. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  80. data = ds.TextFileDataset(DATA_FILE4)
  81. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  82. jieba_op.add_word("男默女泪")
  83. expect = ['男默女泪', '市', '长江大桥']
  84. data = data.map(operations=jieba_op, input_columns=["text"],
  85. num_parallel_workers=2)
  86. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  87. ret = to_str(i["text"])
  88. for index, item in enumerate(ret):
  89. assert item == expect[index]
  90. def test_jieba_2_1():
  91. """Test add_word with freq"""
  92. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  93. data = ds.TextFileDataset(DATA_FILE4)
  94. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  95. jieba_op.add_word("男默女泪", 10)
  96. data = data.map(operations=jieba_op, input_columns=["text"],
  97. num_parallel_workers=2)
  98. expect = ['男默女泪', '市', '长江大桥']
  99. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  100. ret = to_str(i["text"])
  101. for index, item in enumerate(ret):
  102. assert item == expect[index]
  103. def test_jieba_2_2():
  104. """Test add_word with invalid None Input"""
  105. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  106. try:
  107. jieba_op.add_word(None)
  108. except ValueError:
  109. pass
  110. def test_jieba_2_3():
  111. """Test add_word with freq, the value of freq affects the result of segmentation"""
  112. DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
  113. data = ds.TextFileDataset(DATA_FILE4)
  114. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  115. jieba_op.add_word("江大桥", 20000)
  116. data = data.map(operations=jieba_op, input_columns=["text"],
  117. num_parallel_workers=2)
  118. expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
  119. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  120. ret = to_str(i["text"])
  121. for index, item in enumerate(ret):
  122. assert item == expect[index]
  123. def test_jieba_3():
  124. """Test add_dict with dict"""
  125. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  126. user_dict = {
  127. "男默女泪": 10
  128. }
  129. data = ds.TextFileDataset(DATA_FILE4)
  130. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  131. jieba_op.add_dict(user_dict)
  132. data = data.map(operations=jieba_op, input_columns=["text"],
  133. num_parallel_workers=1)
  134. expect = ['男默女泪', '市', '长江大桥']
  135. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  136. ret = to_str(i["text"])
  137. for index, item in enumerate(ret):
  138. assert item == expect[index]
  139. def test_jieba_3_1():
  140. """Test add_dict with dict"""
  141. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  142. user_dict = {
  143. "男默女泪": 10,
  144. "江大桥": 20000
  145. }
  146. data = ds.TextFileDataset(DATA_FILE4)
  147. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  148. jieba_op.add_dict(user_dict)
  149. data = data.map(operations=jieba_op, input_columns=["text"],
  150. num_parallel_workers=1)
  151. expect = ['男默女泪', '市长', '江大桥']
  152. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  153. ret = to_str(i["text"])
  154. for index, item in enumerate(ret):
  155. assert item == expect[index]
  156. def test_jieba_4():
  157. DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
  158. DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"
  159. data = ds.TextFileDataset(DATA_FILE4)
  160. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  161. jieba_op.add_dict(DICT_FILE)
  162. data = data.map(operations=jieba_op, input_columns=["text"],
  163. num_parallel_workers=1)
  164. expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
  165. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  166. ret = to_str(i["text"])
  167. for index, item in enumerate(ret):
  168. assert item == expect[index]
  169. def test_jieba_4_1():
  170. """Test add dict with invalid file path"""
  171. DICT_FILE = ""
  172. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  173. try:
  174. jieba_op.add_dict(DICT_FILE)
  175. except ValueError:
  176. pass
  177. def test_jieba_5():
  178. """Test add dict with file path"""
  179. DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
  180. data = ds.TextFileDataset(DATA_FILE4)
  181. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  182. jieba_op.add_word("江大桥", 20000)
  183. data = data.map(operations=jieba_op, input_columns=["text"],
  184. num_parallel_workers=1)
  185. expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
  186. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  187. ret = to_str(i["text"])
  188. for index, item in enumerate(ret):
  189. assert item == expect[index]
  190. def test_jieba_with_offsets_1():
  191. """Test jieba tokenizer with MP mode"""
  192. data = ds.TextFileDataset(DATA_FILE)
  193. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  194. data = data.map(operations=jieba_op, input_columns=["text"],
  195. output_columns=["token", "offsets_start", "offsets_limit"],
  196. column_order=["token", "offsets_start", "offsets_limit"],
  197. num_parallel_workers=1)
  198. expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
  199. expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
  200. expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
  201. ret = []
  202. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  203. ret = to_str(i["token"])
  204. for index, item in enumerate(ret):
  205. assert item == expect[index]
  206. for index, item in enumerate(i["offsets_start"]):
  207. assert item == expected_offsets_start[index]
  208. for index, item in enumerate(i["offsets_limit"]):
  209. assert item == expected_offsets_limit[index]
  210. def test_jieba_with_offsets_1_1():
  211. """Test jieba tokenizer with HMM mode"""
  212. data = ds.TextFileDataset(DATA_FILE)
  213. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM, with_offsets=True)
  214. data = data.map(operations=jieba_op, input_columns=["text"],
  215. output_columns=["token", "offsets_start", "offsets_limit"],
  216. column_order=["token", "offsets_start", "offsets_limit"],
  217. num_parallel_workers=1)
  218. expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
  219. expected_offsets_start = [0, 6, 12, 15, 18, 21, 27, 33, 36, 42, 45]
  220. expected_offsets_limit = [6, 12, 15, 18, 21, 27, 33, 36, 42, 45, 48]
  221. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  222. ret = to_str(i["token"])
  223. for index, item in enumerate(ret):
  224. assert item == expect[index]
  225. for index, item in enumerate(i["offsets_start"]):
  226. assert item == expected_offsets_start[index]
  227. for index, item in enumerate(i["offsets_limit"]):
  228. assert item == expected_offsets_limit[index]
  229. def test_jieba_with_offsets_1_2():
  230. """Test jieba tokenizer with HMM MIX"""
  231. data = ds.TextFileDataset(DATA_FILE)
  232. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX, with_offsets=True)
  233. data = data.map(operations=jieba_op, input_columns=["text"],
  234. output_columns=["token", "offsets_start", "offsets_limit"],
  235. column_order=["token", "offsets_start", "offsets_limit"],
  236. num_parallel_workers=1)
  237. expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
  238. expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
  239. expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
  240. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  241. ret = to_str(i["token"])
  242. for index, item in enumerate(ret):
  243. assert item == expect[index]
  244. for index, item in enumerate(i["offsets_start"]):
  245. assert item == expected_offsets_start[index]
  246. for index, item in enumerate(i["offsets_limit"]):
  247. assert item == expected_offsets_limit[index]
  248. def test_jieba_with_offsets_2():
  249. """Test add_word"""
  250. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  251. data = ds.TextFileDataset(DATA_FILE4)
  252. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  253. jieba_op.add_word("男默女泪")
  254. expect = ['男默女泪', '市', '长江大桥']
  255. data = data.map(operations=jieba_op, input_columns=["text"],
  256. output_columns=["token", "offsets_start", "offsets_limit"],
  257. column_order=["token", "offsets_start", "offsets_limit"],
  258. num_parallel_workers=2)
  259. expected_offsets_start = [0, 12, 15]
  260. expected_offsets_limit = [12, 15, 27]
  261. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  262. ret = to_str(i["token"])
  263. for index, item in enumerate(ret):
  264. assert item == expect[index]
  265. for index, item in enumerate(i["offsets_start"]):
  266. assert item == expected_offsets_start[index]
  267. for index, item in enumerate(i["offsets_limit"]):
  268. assert item == expected_offsets_limit[index]
  269. def test_jieba_with_offsets_2_1():
  270. """Test add_word with freq"""
  271. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  272. data = ds.TextFileDataset(DATA_FILE4)
  273. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  274. jieba_op.add_word("男默女泪", 10)
  275. data = data.map(operations=jieba_op, input_columns=["text"],
  276. output_columns=["token", "offsets_start", "offsets_limit"],
  277. column_order=["token", "offsets_start", "offsets_limit"],
  278. num_parallel_workers=2)
  279. expect = ['男默女泪', '市', '长江大桥']
  280. expected_offsets_start = [0, 12, 15]
  281. expected_offsets_limit = [12, 15, 27]
  282. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  283. ret = to_str(i["token"])
  284. for index, item in enumerate(ret):
  285. assert item == expect[index]
  286. for index, item in enumerate(i["offsets_start"]):
  287. assert item == expected_offsets_start[index]
  288. for index, item in enumerate(i["offsets_limit"]):
  289. assert item == expected_offsets_limit[index]
  290. def test_jieba_with_offsets_2_2():
  291. """Test add_word with freq, the value of freq affects the result of segmentation"""
  292. DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
  293. data = ds.TextFileDataset(DATA_FILE4)
  294. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  295. jieba_op.add_word("江大桥", 20000)
  296. data = data.map(operations=jieba_op, input_columns=["text"],
  297. output_columns=["token", "offsets_start", "offsets_limit"],
  298. column_order=["token", "offsets_start", "offsets_limit"],
  299. num_parallel_workers=2)
  300. expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
  301. expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
  302. expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
  303. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  304. ret = to_str(i["token"])
  305. for index, item in enumerate(ret):
  306. assert item == expect[index]
  307. for index, item in enumerate(i["offsets_start"]):
  308. assert item == expected_offsets_start[index]
  309. for index, item in enumerate(i["offsets_limit"]):
  310. assert item == expected_offsets_limit[index]
  311. def test_jieba_with_offsets_3():
  312. """Test add_dict with dict"""
  313. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  314. user_dict = {
  315. "男默女泪": 10
  316. }
  317. data = ds.TextFileDataset(DATA_FILE4)
  318. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  319. jieba_op.add_dict(user_dict)
  320. data = data.map(operations=jieba_op, input_columns=["text"],
  321. output_columns=["token", "offsets_start", "offsets_limit"],
  322. column_order=["token", "offsets_start", "offsets_limit"],
  323. num_parallel_workers=1)
  324. expect = ['男默女泪', '市', '长江大桥']
  325. expected_offsets_start = [0, 12, 15]
  326. expected_offsets_limit = [12, 15, 27]
  327. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  328. ret = to_str(i["token"])
  329. for index, item in enumerate(ret):
  330. assert item == expect[index]
  331. for index, item in enumerate(i["offsets_start"]):
  332. assert item == expected_offsets_start[index]
  333. for index, item in enumerate(i["offsets_limit"]):
  334. assert item == expected_offsets_limit[index]
  335. def test_jieba_with_offsets_3_1():
  336. """Test add_dict with dict"""
  337. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  338. user_dict = {
  339. "男默女泪": 10,
  340. "江大桥": 20000
  341. }
  342. data = ds.TextFileDataset(DATA_FILE4)
  343. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  344. jieba_op.add_dict(user_dict)
  345. data = data.map(operations=jieba_op, input_columns=["text"],
  346. output_columns=["token", "offsets_start", "offsets_limit"],
  347. column_order=["token", "offsets_start", "offsets_limit"],
  348. num_parallel_workers=1)
  349. expect = ['男默女泪', '市长', '江大桥']
  350. expected_offsets_start = [0, 12, 18]
  351. expected_offsets_limit = [12, 18, 27]
  352. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  353. ret = to_str(i["token"])
  354. for index, item in enumerate(ret):
  355. assert item == expect[index]
  356. for index, item in enumerate(i["offsets_start"]):
  357. assert item == expected_offsets_start[index]
  358. for index, item in enumerate(i["offsets_limit"]):
  359. assert item == expected_offsets_limit[index]
  360. def test_jieba_with_offsets_4():
  361. DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
  362. DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"
  363. data = ds.TextFileDataset(DATA_FILE4)
  364. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  365. jieba_op.add_dict(DICT_FILE)
  366. data = data.map(operations=jieba_op, input_columns=["text"],
  367. output_columns=["token", "offsets_start", "offsets_limit"],
  368. column_order=["token", "offsets_start", "offsets_limit"],
  369. num_parallel_workers=1)
  370. expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
  371. expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
  372. expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
  373. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  374. ret = to_str(i["token"])
  375. for index, item in enumerate(ret):
  376. assert item == expect[index]
  377. for index, item in enumerate(i["offsets_start"]):
  378. assert item == expected_offsets_start[index]
  379. for index, item in enumerate(i["offsets_limit"]):
  380. assert item == expected_offsets_limit[index]
  381. def test_jieba_with_offsets_5():
  382. """Test add dict with file path"""
  383. DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
  384. data = ds.TextFileDataset(DATA_FILE4)
  385. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  386. jieba_op.add_word("江大桥", 20000)
  387. data = data.map(operations=jieba_op, input_columns=["text"],
  388. output_columns=["token", "offsets_start", "offsets_limit"],
  389. column_order=["token", "offsets_start", "offsets_limit"],
  390. num_parallel_workers=1)
  391. expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
  392. expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
  393. expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
  394. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  395. ret = to_str(i["token"])
  396. for index, item in enumerate(ret):
  397. assert item == expect[index]
  398. for index, item in enumerate(i["offsets_start"]):
  399. assert item == expected_offsets_start[index]
  400. for index, item in enumerate(i["offsets_limit"]):
  401. assert item == expected_offsets_limit[index]
  402. def gen():
  403. text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S')
  404. yield (text,)
  405. def pytoken_op(input_data):
  406. te = str(to_str(input_data))
  407. tokens = []
  408. tokens.append(te[:5].encode("UTF8"))
  409. tokens.append(te[5:10].encode("UTF8"))
  410. tokens.append(te[10:].encode("UTF8"))
  411. return np.array(tokens, dtype='S')
  412. def test_jieba_6():
  413. data = ds.GeneratorDataset(gen, column_names=["text"])
  414. data = data.map(operations=pytoken_op, input_columns=["text"],
  415. num_parallel_workers=1)
  416. expect = ['今天天气太', '好了我们一', '起去外面玩吧']
  417. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  418. ret = to_str(i["text"])
  419. for index, item in enumerate(ret):
  420. assert item == expect[index]
  421. if __name__ == "__main__":
  422. test_jieba_callable()
  423. test_jieba_1()
  424. test_jieba_1_1()
  425. test_jieba_1_2()
  426. test_jieba_2()
  427. test_jieba_2_1()
  428. test_jieba_2_2()
  429. test_jieba_3()
  430. test_jieba_3_1()
  431. test_jieba_4()
  432. test_jieba_4_1()
  433. test_jieba_5()
  434. test_jieba_5()
  435. test_jieba_6()
  436. test_jieba_with_offsets_1()
  437. test_jieba_with_offsets_1_1()
  438. test_jieba_with_offsets_1_2()
  439. test_jieba_with_offsets_2()
  440. test_jieba_with_offsets_2_1()
  441. test_jieba_with_offsets_2_2()
  442. test_jieba_with_offsets_3()
  443. test_jieba_with_offsets_3_1()
  444. test_jieba_with_offsets_4()
  445. test_jieba_with_offsets_5()