
test_text_jieba_tokenizer.py

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import pytest

import mindspore.dataset as ds
from mindspore.dataset.text import JiebaTokenizer
from mindspore.dataset.text import JiebaMode, to_str

DATA_FILE = "../data/dataset/testJiebaDataset/3.txt"
DATA_ALL_FILE = "../data/dataset/testJiebaDataset/*"
HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8"
MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8"
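
# JiebaTokenizer takes both resource files: as in upstream jieba, MP_FILE is
# the word/frequency dictionary behind maximum-probability (MP) segmentation
# and HMM_FILE is the hidden-Markov model behind HMM segmentation;
# JiebaMode.MIX uses both.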


def test_jieba_1():
    """Test jieba tokenizer with MP mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_1_1():
    """Test jieba tokenizer with HMM mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_1_2():
    """Test jieba tokenizer with MIX mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]
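
# Note: for this sentence MP and MIX yield identical tokens while HMM mode
# segments more finely; MIX presumably falls back to the HMM model only for
# spans the dictionary cannot cover.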


def test_jieba_2():
    """Test add_word"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("男默女泪")
    expect = ['男默女泪', '市', '长江大桥']
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=2)
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_2_1():
    """Test add_word with freq"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("男默女泪", 10)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=2)
    expect = ['男默女泪', '市', '长江大桥']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_2_2():
    """Test add_word with invalid None input"""
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    with pytest.raises(ValueError):
        jieba_op.add_word(None)


def test_jieba_2_3():
    """Test add_word with freq, the value of freq affects the result of segmentation"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=2)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_3():
    """Test add_dict with dict"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {
        "男默女泪": 10
    }
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_dict(user_dict)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['男默女泪', '市', '长江大桥']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_3_1():
    """Test add_dict with a dict whose word freq affects segmentation"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {
        "男默女泪": 10,
        "江大桥": 20000
    }
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_dict(user_dict)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['男默女泪', '市长', '江大桥']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]
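
# Compared with test_jieba_3, giving '江大桥' a high frequency flips the
# segmentation of '市长江大桥' from '市' + '长江大桥' to '市长' + '江大桥'.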


def test_jieba_4():
    """Test add_dict with a user dict file"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
    DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_dict(DICT_FILE)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_4_1():
    """Test add_dict with invalid file path"""
    DICT_FILE = ""
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    with pytest.raises(ValueError):
        jieba_op.add_dict(DICT_FILE)


def test_jieba_5():
    """Test add_word with freq using a single worker"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_with_offsets_1():
    """Test jieba tokenizer with MP mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]
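
# With with_offsets=True the offsets are byte positions in the UTF-8 input:
# each Chinese character here encodes to 3 bytes, so the 4-character token
# '今天天气' spans bytes [0, 12) and the 16-character sentence ends at byte 48.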


def test_jieba_with_offsets_1_1():
    """Test jieba tokenizer with HMM mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM, with_offsets=True)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
    expected_offsets_start = [0, 6, 12, 15, 18, 21, 27, 33, 36, 42, 45]
    expected_offsets_limit = [6, 12, 15, 18, 21, 27, 33, 36, 42, 45, 48]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_1_2():
    """Test jieba tokenizer with MIX mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX, with_offsets=True)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_2():
    """Test add_word"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("男默女泪")
    expect = ['男默女泪', '市', '长江大桥']
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=2)
    expected_offsets_start = [0, 12, 15]
    expected_offsets_limit = [12, 15, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_2_1():
    """Test add_word with freq"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("男默女泪", 10)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=2)
    expect = ['男默女泪', '市', '长江大桥']
    expected_offsets_start = [0, 12, 15]
    expected_offsets_limit = [12, 15, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_2_2():
    """Test add_word with freq, the value of freq affects the result of segmentation"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=2)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
    expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_3():
    """Test add_dict with dict"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {
        "男默女泪": 10
    }
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_dict(user_dict)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['男默女泪', '市', '长江大桥']
    expected_offsets_start = [0, 12, 15]
    expected_offsets_limit = [12, 15, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_3_1():
    """Test add_dict with a dict whose word freq affects segmentation"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {
        "男默女泪": 10,
        "江大桥": 20000
    }
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_dict(user_dict)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['男默女泪', '市长', '江大桥']
    expected_offsets_start = [0, 12, 18]
    expected_offsets_limit = [12, 18, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_4():
    """Test add_dict with a user dict file"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
    DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_dict(DICT_FILE)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_5():
    """Test add_word with freq using a single worker"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=1)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
    expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def gen():
    # A single UTF-8 encoded sentence as a NumPy byte string.
    text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S')
    yield (text,)


def pytoken_op(input_data):
    # Decode the bytes, then split at fixed character positions (5/5/6).
    te = str(to_str(input_data))
    tokens = []
    tokens.append(te[:5].encode("UTF8"))
    tokens.append(te[5:10].encode("UTF8"))
    tokens.append(te[10:].encode("UTF8"))
    return np.array(tokens, dtype='S')
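
# pytoken_op stands in for a user-defined tokenizer: any Python callable that
# maps an input column to a NumPy array can be passed to map() as an operation.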


def test_jieba_6():
    """Test a pure Python tokenizer op applied through map"""
    data = ds.GeneratorDataset(gen, column_names=["text"])
    data = data.map(operations=pytoken_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气太', '好了我们一', '起去外面玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


if __name__ == "__main__":
    test_jieba_1()
    test_jieba_1_1()
    test_jieba_1_2()
    test_jieba_2()
    test_jieba_2_1()
    test_jieba_2_2()
    test_jieba_2_3()
    test_jieba_3()
    test_jieba_3_1()
    test_jieba_4()
    test_jieba_4_1()
    test_jieba_5()
    test_jieba_6()
    test_jieba_with_offsets_1()
    test_jieba_with_offsets_1_1()
    test_jieba_with_offsets_1_2()
    test_jieba_with_offsets_2()
    test_jieba_with_offsets_2_1()
    test_jieba_with_offsets_2_2()
    test_jieba_with_offsets_3()
    test_jieba_with_offsets_3_1()
    test_jieba_with_offsets_4()
    test_jieba_with_offsets_5()