You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

test_text_jieba_tokenizer.py 22 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501
  1. # Copyright 2020-2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. import numpy as np
  16. import mindspore.dataset as ds
  17. from mindspore.dataset.text import JiebaTokenizer
  18. from mindspore.dataset.text import JiebaMode, to_str
  19. from mindspore import log as logger
# Fixture paths, relative to the test working directory.
DATA_FILE = "../data/dataset/testJiebaDataset/3.txt"  # text file with one sentence (see `expect` lists below)
DATA_ALL_FILE = "../data/dataset/testJiebaDataset/*"  # glob covering all jieba test data files
HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8"  # HMM model file for JiebaTokenizer
MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8"  # MP (max-probability) dictionary file
  24. def test_jieba_callable():
  25. """
  26. Test jieba tokenizer op is callable
  27. """
  28. logger.info("test_jieba_callable")
  29. jieba_op1 = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  30. jieba_op2 = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM)
  31. text1 = "今天天气太好了我们一起去外面玩吧"
  32. text2 = "男默女泪市长江大桥"
  33. assert np.array_equal(jieba_op1(text1), ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'])
  34. assert np.array_equal(jieba_op2(text1), ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧'])
  35. jieba_op1.add_word("男默女泪")
  36. assert np.array_equal(jieba_op1(text2), ['男默女泪', '市', '长江大桥'])
  37. def test_jieba_1():
  38. """Test jieba tokenizer with MP mode"""
  39. data = ds.TextFileDataset(DATA_FILE)
  40. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  41. data = data.map(operations=jieba_op, input_columns=["text"],
  42. num_parallel_workers=1)
  43. expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
  44. ret = []
  45. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  46. ret = to_str(i["text"])
  47. for index, item in enumerate(ret):
  48. assert item == expect[index]
  49. def test_jieba_1_1():
  50. """Test jieba tokenizer with HMM mode"""
  51. data = ds.TextFileDataset(DATA_FILE)
  52. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM)
  53. data = data.map(operations=jieba_op, input_columns=["text"],
  54. num_parallel_workers=1)
  55. expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
  56. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  57. ret = to_str(i["text"])
  58. for index, item in enumerate(ret):
  59. assert item == expect[index]
  60. def test_jieba_1_2():
  61. """Test jieba tokenizer with HMM MIX"""
  62. data = ds.TextFileDataset(DATA_FILE)
  63. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX)
  64. data = data.map(operations=jieba_op, input_columns=["text"],
  65. num_parallel_workers=1)
  66. expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
  67. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  68. ret = to_str(i["text"])
  69. for index, item in enumerate(ret):
  70. assert item == expect[index]
  71. def test_jieba_2():
  72. """Test add_word"""
  73. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  74. data = ds.TextFileDataset(DATA_FILE4)
  75. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  76. jieba_op.add_word("男默女泪")
  77. expect = ['男默女泪', '市', '长江大桥']
  78. data = data.map(operations=jieba_op, input_columns=["text"],
  79. num_parallel_workers=2)
  80. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  81. ret = to_str(i["text"])
  82. for index, item in enumerate(ret):
  83. assert item == expect[index]
  84. def test_jieba_2_1():
  85. """Test add_word with freq"""
  86. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  87. data = ds.TextFileDataset(DATA_FILE4)
  88. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  89. jieba_op.add_word("男默女泪", 10)
  90. data = data.map(operations=jieba_op, input_columns=["text"],
  91. num_parallel_workers=2)
  92. expect = ['男默女泪', '市', '长江大桥']
  93. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  94. ret = to_str(i["text"])
  95. for index, item in enumerate(ret):
  96. assert item == expect[index]
  97. def test_jieba_2_2():
  98. """Test add_word with invalid None Input"""
  99. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  100. try:
  101. jieba_op.add_word(None)
  102. except ValueError:
  103. pass
  104. def test_jieba_2_3():
  105. """Test add_word with freq, the value of freq affects the result of segmentation"""
  106. DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
  107. data = ds.TextFileDataset(DATA_FILE4)
  108. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  109. jieba_op.add_word("江大桥", 20000)
  110. data = data.map(operations=jieba_op, input_columns=["text"],
  111. num_parallel_workers=2)
  112. expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
  113. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  114. ret = to_str(i["text"])
  115. for index, item in enumerate(ret):
  116. assert item == expect[index]
  117. def test_jieba_3():
  118. """Test add_dict with dict"""
  119. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  120. user_dict = {
  121. "男默女泪": 10
  122. }
  123. data = ds.TextFileDataset(DATA_FILE4)
  124. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  125. jieba_op.add_dict(user_dict)
  126. data = data.map(operations=jieba_op, input_columns=["text"],
  127. num_parallel_workers=1)
  128. expect = ['男默女泪', '市', '长江大桥']
  129. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  130. ret = to_str(i["text"])
  131. for index, item in enumerate(ret):
  132. assert item == expect[index]
  133. def test_jieba_3_1():
  134. """Test add_dict with dict"""
  135. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  136. user_dict = {
  137. "男默女泪": 10,
  138. "江大桥": 20000
  139. }
  140. data = ds.TextFileDataset(DATA_FILE4)
  141. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  142. jieba_op.add_dict(user_dict)
  143. data = data.map(operations=jieba_op, input_columns=["text"],
  144. num_parallel_workers=1)
  145. expect = ['男默女泪', '市长', '江大桥']
  146. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  147. ret = to_str(i["text"])
  148. for index, item in enumerate(ret):
  149. assert item == expect[index]
  150. def test_jieba_4():
  151. DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
  152. DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"
  153. data = ds.TextFileDataset(DATA_FILE4)
  154. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  155. jieba_op.add_dict(DICT_FILE)
  156. data = data.map(operations=jieba_op, input_columns=["text"],
  157. num_parallel_workers=1)
  158. expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
  159. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  160. ret = to_str(i["text"])
  161. for index, item in enumerate(ret):
  162. assert item == expect[index]
  163. def test_jieba_4_1():
  164. """Test add dict with invalid file path"""
  165. DICT_FILE = ""
  166. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  167. try:
  168. jieba_op.add_dict(DICT_FILE)
  169. except ValueError:
  170. pass
  171. def test_jieba_5():
  172. """Test add dict with file path"""
  173. DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
  174. data = ds.TextFileDataset(DATA_FILE4)
  175. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
  176. jieba_op.add_word("江大桥", 20000)
  177. data = data.map(operations=jieba_op, input_columns=["text"],
  178. num_parallel_workers=1)
  179. expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
  180. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  181. ret = to_str(i["text"])
  182. for index, item in enumerate(ret):
  183. assert item == expect[index]
  184. def test_jieba_with_offsets_1():
  185. """Test jieba tokenizer with MP mode"""
  186. data = ds.TextFileDataset(DATA_FILE)
  187. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  188. data = data.map(operations=jieba_op, input_columns=["text"],
  189. output_columns=["token", "offsets_start", "offsets_limit"],
  190. column_order=["token", "offsets_start", "offsets_limit"],
  191. num_parallel_workers=1)
  192. expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
  193. expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
  194. expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
  195. ret = []
  196. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  197. ret = to_str(i["token"])
  198. for index, item in enumerate(ret):
  199. assert item == expect[index]
  200. for index, item in enumerate(i["offsets_start"]):
  201. assert item == expected_offsets_start[index]
  202. for index, item in enumerate(i["offsets_limit"]):
  203. assert item == expected_offsets_limit[index]
  204. def test_jieba_with_offsets_1_1():
  205. """Test jieba tokenizer with HMM mode"""
  206. data = ds.TextFileDataset(DATA_FILE)
  207. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM, with_offsets=True)
  208. data = data.map(operations=jieba_op, input_columns=["text"],
  209. output_columns=["token", "offsets_start", "offsets_limit"],
  210. column_order=["token", "offsets_start", "offsets_limit"],
  211. num_parallel_workers=1)
  212. expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
  213. expected_offsets_start = [0, 6, 12, 15, 18, 21, 27, 33, 36, 42, 45]
  214. expected_offsets_limit = [6, 12, 15, 18, 21, 27, 33, 36, 42, 45, 48]
  215. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  216. ret = to_str(i["token"])
  217. for index, item in enumerate(ret):
  218. assert item == expect[index]
  219. for index, item in enumerate(i["offsets_start"]):
  220. assert item == expected_offsets_start[index]
  221. for index, item in enumerate(i["offsets_limit"]):
  222. assert item == expected_offsets_limit[index]
  223. def test_jieba_with_offsets_1_2():
  224. """Test jieba tokenizer with HMM MIX"""
  225. data = ds.TextFileDataset(DATA_FILE)
  226. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX, with_offsets=True)
  227. data = data.map(operations=jieba_op, input_columns=["text"],
  228. output_columns=["token", "offsets_start", "offsets_limit"],
  229. column_order=["token", "offsets_start", "offsets_limit"],
  230. num_parallel_workers=1)
  231. expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
  232. expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
  233. expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
  234. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  235. ret = to_str(i["token"])
  236. for index, item in enumerate(ret):
  237. assert item == expect[index]
  238. for index, item in enumerate(i["offsets_start"]):
  239. assert item == expected_offsets_start[index]
  240. for index, item in enumerate(i["offsets_limit"]):
  241. assert item == expected_offsets_limit[index]
  242. def test_jieba_with_offsets_2():
  243. """Test add_word"""
  244. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  245. data = ds.TextFileDataset(DATA_FILE4)
  246. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  247. jieba_op.add_word("男默女泪")
  248. expect = ['男默女泪', '市', '长江大桥']
  249. data = data.map(operations=jieba_op, input_columns=["text"],
  250. output_columns=["token", "offsets_start", "offsets_limit"],
  251. column_order=["token", "offsets_start", "offsets_limit"],
  252. num_parallel_workers=2)
  253. expected_offsets_start = [0, 12, 15]
  254. expected_offsets_limit = [12, 15, 27]
  255. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  256. ret = to_str(i["token"])
  257. for index, item in enumerate(ret):
  258. assert item == expect[index]
  259. for index, item in enumerate(i["offsets_start"]):
  260. assert item == expected_offsets_start[index]
  261. for index, item in enumerate(i["offsets_limit"]):
  262. assert item == expected_offsets_limit[index]
  263. def test_jieba_with_offsets_2_1():
  264. """Test add_word with freq"""
  265. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  266. data = ds.TextFileDataset(DATA_FILE4)
  267. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  268. jieba_op.add_word("男默女泪", 10)
  269. data = data.map(operations=jieba_op, input_columns=["text"],
  270. output_columns=["token", "offsets_start", "offsets_limit"],
  271. column_order=["token", "offsets_start", "offsets_limit"],
  272. num_parallel_workers=2)
  273. expect = ['男默女泪', '市', '长江大桥']
  274. expected_offsets_start = [0, 12, 15]
  275. expected_offsets_limit = [12, 15, 27]
  276. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  277. ret = to_str(i["token"])
  278. for index, item in enumerate(ret):
  279. assert item == expect[index]
  280. for index, item in enumerate(i["offsets_start"]):
  281. assert item == expected_offsets_start[index]
  282. for index, item in enumerate(i["offsets_limit"]):
  283. assert item == expected_offsets_limit[index]
  284. def test_jieba_with_offsets_2_2():
  285. """Test add_word with freq, the value of freq affects the result of segmentation"""
  286. DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
  287. data = ds.TextFileDataset(DATA_FILE4)
  288. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  289. jieba_op.add_word("江大桥", 20000)
  290. data = data.map(operations=jieba_op, input_columns=["text"],
  291. output_columns=["token", "offsets_start", "offsets_limit"],
  292. column_order=["token", "offsets_start", "offsets_limit"],
  293. num_parallel_workers=2)
  294. expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
  295. expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
  296. expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
  297. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  298. ret = to_str(i["token"])
  299. for index, item in enumerate(ret):
  300. assert item == expect[index]
  301. for index, item in enumerate(i["offsets_start"]):
  302. assert item == expected_offsets_start[index]
  303. for index, item in enumerate(i["offsets_limit"]):
  304. assert item == expected_offsets_limit[index]
  305. def test_jieba_with_offsets_3():
  306. """Test add_dict with dict"""
  307. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  308. user_dict = {
  309. "男默女泪": 10
  310. }
  311. data = ds.TextFileDataset(DATA_FILE4)
  312. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  313. jieba_op.add_dict(user_dict)
  314. data = data.map(operations=jieba_op, input_columns=["text"],
  315. output_columns=["token", "offsets_start", "offsets_limit"],
  316. column_order=["token", "offsets_start", "offsets_limit"],
  317. num_parallel_workers=1)
  318. expect = ['男默女泪', '市', '长江大桥']
  319. expected_offsets_start = [0, 12, 15]
  320. expected_offsets_limit = [12, 15, 27]
  321. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  322. ret = to_str(i["token"])
  323. for index, item in enumerate(ret):
  324. assert item == expect[index]
  325. for index, item in enumerate(i["offsets_start"]):
  326. assert item == expected_offsets_start[index]
  327. for index, item in enumerate(i["offsets_limit"]):
  328. assert item == expected_offsets_limit[index]
  329. def test_jieba_with_offsets_3_1():
  330. """Test add_dict with dict"""
  331. DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
  332. user_dict = {
  333. "男默女泪": 10,
  334. "江大桥": 20000
  335. }
  336. data = ds.TextFileDataset(DATA_FILE4)
  337. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  338. jieba_op.add_dict(user_dict)
  339. data = data.map(operations=jieba_op, input_columns=["text"],
  340. output_columns=["token", "offsets_start", "offsets_limit"],
  341. column_order=["token", "offsets_start", "offsets_limit"],
  342. num_parallel_workers=1)
  343. expect = ['男默女泪', '市长', '江大桥']
  344. expected_offsets_start = [0, 12, 18]
  345. expected_offsets_limit = [12, 18, 27]
  346. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  347. ret = to_str(i["token"])
  348. for index, item in enumerate(ret):
  349. assert item == expect[index]
  350. for index, item in enumerate(i["offsets_start"]):
  351. assert item == expected_offsets_start[index]
  352. for index, item in enumerate(i["offsets_limit"]):
  353. assert item == expected_offsets_limit[index]
  354. def test_jieba_with_offsets_4():
  355. DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
  356. DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"
  357. data = ds.TextFileDataset(DATA_FILE4)
  358. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  359. jieba_op.add_dict(DICT_FILE)
  360. data = data.map(operations=jieba_op, input_columns=["text"],
  361. output_columns=["token", "offsets_start", "offsets_limit"],
  362. column_order=["token", "offsets_start", "offsets_limit"],
  363. num_parallel_workers=1)
  364. expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
  365. expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
  366. expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
  367. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  368. ret = to_str(i["token"])
  369. for index, item in enumerate(ret):
  370. assert item == expect[index]
  371. for index, item in enumerate(i["offsets_start"]):
  372. assert item == expected_offsets_start[index]
  373. for index, item in enumerate(i["offsets_limit"]):
  374. assert item == expected_offsets_limit[index]
  375. def test_jieba_with_offsets_5():
  376. """Test add dict with file path"""
  377. DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
  378. data = ds.TextFileDataset(DATA_FILE4)
  379. jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
  380. jieba_op.add_word("江大桥", 20000)
  381. data = data.map(operations=jieba_op, input_columns=["text"],
  382. output_columns=["token", "offsets_start", "offsets_limit"],
  383. column_order=["token", "offsets_start", "offsets_limit"],
  384. num_parallel_workers=1)
  385. expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
  386. expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
  387. expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
  388. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  389. ret = to_str(i["token"])
  390. for index, item in enumerate(ret):
  391. assert item == expect[index]
  392. for index, item in enumerate(i["offsets_start"]):
  393. assert item == expected_offsets_start[index]
  394. for index, item in enumerate(i["offsets_limit"]):
  395. assert item == expected_offsets_limit[index]
  396. def gen():
  397. text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S')
  398. yield (text,)
  399. def pytoken_op(input_data):
  400. te = str(to_str(input_data))
  401. tokens = []
  402. tokens.append(te[:5].encode("UTF8"))
  403. tokens.append(te[5:10].encode("UTF8"))
  404. tokens.append(te[10:].encode("UTF8"))
  405. return np.array(tokens, dtype='S')
  406. def test_jieba_6():
  407. data = ds.GeneratorDataset(gen, column_names=["text"])
  408. data = data.map(operations=pytoken_op, input_columns=["text"],
  409. num_parallel_workers=1)
  410. expect = ['今天天气太', '好了我们一', '起去外面玩吧']
  411. for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
  412. ret = to_str(i["text"])
  413. for index, item in enumerate(ret):
  414. assert item == expect[index]
  415. if __name__ == "__main__":
  416. test_jieba_callable()
  417. test_jieba_1()
  418. test_jieba_1_1()
  419. test_jieba_1_2()
  420. test_jieba_2()
  421. test_jieba_2_1()
  422. test_jieba_2_2()
  423. test_jieba_3()
  424. test_jieba_3_1()
  425. test_jieba_4()
  426. test_jieba_4_1()
  427. test_jieba_5()
  428. test_jieba_5()
  429. test_jieba_6()
  430. test_jieba_with_offsets_1()
  431. test_jieba_with_offsets_1_1()
  432. test_jieba_with_offsets_1_2()
  433. test_jieba_with_offsets_2()
  434. test_jieba_with_offsets_2_1()
  435. test_jieba_with_offsets_2_2()
  436. test_jieba_with_offsets_3()
  437. test_jieba_with_offsets_3_1()
  438. test_jieba_with_offsets_4()
  439. test_jieba_with_offsets_5()