You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

fastnlp_tutorial_1.ipynb 36 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "id": "cdc25fcd",
  6. "metadata": {},
  7. "source": [
  8. "# T1. dataset 和 vocabulary 的基本使用\n",
  9. "\n",
  10. "  1   dataset 的使用与结构\n",
  11. " \n",
  12. "    1.1   dataset 的结构与创建\n",
  13. "\n",
  14. "    1.2   dataset 的数据预处理\n",
  15. "\n",
  16. "    1.3   延伸:instance 和 field\n",
  17. "\n",
  18. "  2   vocabulary 的结构与使用\n",
  19. "\n",
  20. "    2.1   vocabulary 的创建与修改\n",
  21. "\n",
  22. "    2.2   vocabulary 与 OOV 问题\n",
  23. "\n",
  24. "  3   dataset 和 vocabulary 的组合使用\n",
  25. " \n",
  26. "    3.1   从 dataframe 中加载 dataset\n",
  27. "\n",
  28. "    3.2   从 dataset 中获取 vocabulary"
  29. ]
  30. },
  31. {
  32. "cell_type": "markdown",
  33. "id": "0eb18a22",
  34. "metadata": {},
  35. "source": [
  36. "## 1. dataset 的基本使用\n",
  37. "\n",
  38. "### 1.1 dataset 的结构与创建\n",
  39. "\n",
  40. "在`fastNLP 0.8`中,使用`DataSet`模块表示数据集,**`dataset`类似于关系型数据库中的数据表**(下文统一为小写`dataset`)\n",
  41. "\n",
  42. "  **主要包含`field`字段和`instance`实例两个元素**,对应`table`中的`field`字段和`record`记录\n",
  43. "\n",
  44. "在`fastNLP 0.8`中,`DataSet`模块被定义在`fastNLP.core.dataset`路径下,导入该模块后,最简单的\n",
  45. "\n",
  46. "  初始化方法,即将字典形式的表格 **`{'field1': column1, 'field2': column2, ...}`** 传入构造函数"
  47. ]
  48. },
  49. {
  50. "cell_type": "code",
  51. "execution_count": 1,
  52. "id": "a1d69ad2",
  53. "metadata": {},
  54. "outputs": [
  55. {
  56. "data": {
  57. "text/html": [
  58. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
  59. "</pre>\n"
  60. ],
  61. "text/plain": [
  62. "\n"
  63. ]
  64. },
  65. "metadata": {},
  66. "output_type": "display_data"
  67. },
  68. {
  69. "name": "stdout",
  70. "output_type": "stream",
  71. "text": [
  72. "+-----+------------------------+------------------------+-----+\n",
  73. "| idx | sentence | words | num |\n",
  74. "+-----+------------------------+------------------------+-----+\n",
  75. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  76. "| 1 | I like apples . | ['I', 'like', 'appl... | 4 |\n",
  77. "| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
  78. "+-----+------------------------+------------------------+-----+\n"
  79. ]
  80. }
  81. ],
  82. "source": [
  83. "from fastNLP.core.dataset import DataSet\n",
  84. "\n",
  85. "data = {'idx': [0, 1, 2], \n",
  86. " 'sentence':[\"This is an apple .\", \"I like apples .\", \"Apples are good for our health .\"],\n",
  87. " 'words': [['This', 'is', 'an', 'apple', '.'], \n",
  88. " ['I', 'like', 'apples', '.'], \n",
  89. " ['Apples', 'are', 'good', 'for', 'our', 'health', '.']],\n",
  90. " 'num': [5, 4, 7]}\n",
  91. "\n",
  92. "dataset = DataSet(data)\n",
  93. "print(dataset)"
  94. ]
  95. },
  96. {
  97. "cell_type": "markdown",
  98. "id": "9260fdc6",
  99. "metadata": {},
  100. "source": [
  101. "&emsp; 在`dataset`的实例中,字段`field`的名称和实例`instance`中的字符串也可以是中文"
  102. ]
  103. },
  104. {
  105. "cell_type": "code",
  106. "execution_count": 2,
  107. "id": "3d72ef00",
  108. "metadata": {},
  109. "outputs": [
  110. {
  111. "name": "stdout",
  112. "output_type": "stream",
  113. "text": [
  114. "+------+--------------------+------------------------+------+\n",
  115. "| 序号 | 句子 | 字符 | 长度 |\n",
  116. "+------+--------------------+------------------------+------+\n",
  117. "| 0 | 生活就像海洋, | ['生', '活', '就', ... | 7 |\n",
  118. "| 1 | 只有意志坚强的人, | ['只', '有', '意', ... | 9 |\n",
  119. "| 2 | 才能到达彼岸。 | ['才', '能', '到', ... | 7 |\n",
  120. "+------+--------------------+------------------------+------+\n"
  121. ]
  122. }
  123. ],
  124. "source": [
  125. "temp = {'序号': [0, 1, 2], \n",
  126. " '句子':[\"生活就像海洋,\", \"只有意志坚强的人,\", \"才能到达彼岸。\"],\n",
  127. " '字符': [['生', '活', '就', '像', '海', '洋', ','], \n",
  128. " ['只', '有', '意', '志', '坚', '强', '的', '人', ','], \n",
  129. " ['才', '能', '到', '达', '彼', '岸', '。']],\n",
  130. " '长度': [7, 9, 7]}\n",
  131. "\n",
  132. "chinese = DataSet(temp)\n",
  133. "print(chinese)"
  134. ]
  135. },
  136. {
  137. "cell_type": "markdown",
  138. "id": "202e5490",
  139. "metadata": {},
  140. "source": [
  141. "在`dataset`中,使用`drop`方法可以删除满足条件的实例,这里使用了python中的`lambda`表达式\n",
  142. "\n",
  143. "&emsp; 注一:在`drop`方法中,通过设置`inplace`参数将删除对应实例后的`dataset`作为一个新的实例生成"
  144. ]
  145. },
  146. {
  147. "cell_type": "code",
  148. "execution_count": 3,
  149. "id": "09b478f8",
  150. "metadata": {},
  151. "outputs": [
  152. {
  153. "name": "stdout",
  154. "output_type": "stream",
  155. "text": [
  156. "1969418794120 1971237588872\n",
  157. "+-----+------------------------+------------------------+-----+\n",
  158. "| idx | sentence | words | num |\n",
  159. "+-----+------------------------+------------------------+-----+\n",
  160. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  161. "| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
  162. "+-----+------------------------+------------------------+-----+\n",
  163. "+-----+------------------------+------------------------+-----+\n",
  164. "| idx | sentence | words | num |\n",
  165. "+-----+------------------------+------------------------+-----+\n",
  166. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  167. "| 1 | I like apples . | ['I', 'like', 'appl... | 4 |\n",
  168. "| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
  169. "+-----+------------------------+------------------------+-----+\n"
  170. ]
  171. }
  172. ],
  173. "source": [
  174. "dropped = dataset\n",
  175. "dropped = dropped.drop(lambda ins:ins['num'] < 5, inplace=False)\n",
  176. "print(id(dropped), id(dataset))\n",
  177. "print(dropped)\n",
  178. "print(dataset)"
  179. ]
  180. },
  181. {
  182. "cell_type": "markdown",
  183. "id": "aa277674",
  184. "metadata": {},
  185. "source": [
  186. "&emsp; 注二:在`fastNLP 0.8`中,**对`dataset`使用等号**,**其效果是传引用**,**而不是拷贝**(这是 python 赋值的标准语义,并非`fastNLP`特有)\n",
  187. "\n",
  188. "&emsp; &emsp; 如下所示,**`dropped`和`dataset`具有相同`id`**,**对`dropped`执行删除操作`dataset`同时会被修改**"
  189. ]
  190. },
  191. {
  192. "cell_type": "code",
  193. "execution_count": 4,
  194. "id": "77c8583a",
  195. "metadata": {},
  196. "outputs": [
  197. {
  198. "name": "stdout",
  199. "output_type": "stream",
  200. "text": [
  201. "1971237588872 1971237588872\n",
  202. "+-----+------------------------+------------------------+-----+\n",
  203. "| idx | sentence | words | num |\n",
  204. "+-----+------------------------+------------------------+-----+\n",
  205. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  206. "| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
  207. "+-----+------------------------+------------------------+-----+\n",
  208. "+-----+------------------------+------------------------+-----+\n",
  209. "| idx | sentence | words | num |\n",
  210. "+-----+------------------------+------------------------+-----+\n",
  211. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  212. "| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
  213. "+-----+------------------------+------------------------+-----+\n"
  214. ]
  215. }
  216. ],
  217. "source": [
  218. "dropped = dataset\n",
  219. "dropped.drop(lambda ins:ins['num'] < 5)\n",
  220. "print(id(dropped), id(dataset))\n",
  221. "print(dropped)\n",
  222. "print(dataset)"
  223. ]
  224. },
  225. {
  226. "cell_type": "markdown",
  227. "id": "a76199dc",
  228. "metadata": {},
  229. "source": [
  230. "在`dataset`中,使用`delete_instance`方法可以删除对应序号的`instance`实例,序号从0开始"
  231. ]
  232. },
  233. {
  234. "cell_type": "code",
  235. "execution_count": 5,
  236. "id": "d8824b40",
  237. "metadata": {},
  238. "outputs": [
  239. {
  240. "name": "stdout",
  241. "output_type": "stream",
  242. "text": [
  243. "+-----+--------------------+------------------------+-----+\n",
  244. "| idx | sentence | words | num |\n",
  245. "+-----+--------------------+------------------------+-----+\n",
  246. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  247. "| 1 | I like apples . | ['I', 'like', 'appl... | 4 |\n",
  248. "+-----+--------------------+------------------------+-----+\n"
  249. ]
  250. }
  251. ],
  252. "source": [
  253. "dataset = DataSet(data)\n",
  254. "dataset.delete_instance(2)\n",
  255. "print(dataset)"
  256. ]
  257. },
  258. {
  259. "cell_type": "markdown",
  260. "id": "f4fa9f33",
  261. "metadata": {},
  262. "source": [
  263. "在`dataset`中,使用`delete_field`方法可以删除对应名称的`field`字段"
  264. ]
  265. },
  266. {
  267. "cell_type": "code",
  268. "execution_count": 6,
  269. "id": "f68ddb40",
  270. "metadata": {},
  271. "outputs": [
  272. {
  273. "name": "stdout",
  274. "output_type": "stream",
  275. "text": [
  276. "+-----+--------------------+------------------------------+\n",
  277. "| idx | sentence | words |\n",
  278. "+-----+--------------------+------------------------------+\n",
  279. "| 0 | This is an apple . | ['This', 'is', 'an', 'app... |\n",
  280. "| 1 | I like apples . | ['I', 'like', 'apples', '... |\n",
  281. "+-----+--------------------+------------------------------+\n"
  282. ]
  283. }
  284. ],
  285. "source": [
  286. "dataset.delete_field('num')\n",
  287. "print(dataset)"
  288. ]
  289. },
  290. {
  291. "cell_type": "markdown",
  292. "id": "b1e9d42c",
  293. "metadata": {},
  294. "source": [
  295. "### 1.2 dataset 的数据预处理\n",
  296. "\n",
  297. "在`dataset`模块中,`apply`、`apply_field`、`apply_more`和`apply_field_more`函数可以进行简单的数据预处理\n",
  298. "\n",
  299. "&emsp; **`apply`和`apply_more`针对整条实例**,**`apply_field`和`apply_field_more`仅针对实例的部分字段**\n",
  300. "\n",
  301. "&emsp; **`apply`和`apply_field`仅针对单个字段**,**`apply_more`和`apply_field_more`则可以针对多个字段**\n",
  302. "\n",
  303. "&emsp; **`apply`和`apply_field`返回的是个列表**,**`apply_more`和`apply_field_more`返回的是个字典**\n",
  304. "\n",
  305. "***\n",
  306. "\n",
  307. "`apply`的参数包括一个函数`func`和一个新字段名`new_field_name`,函数`func`的处理对象是`dataset`模块中\n",
  308. "\n",
  309. "&emsp; 的每个`instance`实例,函数`func`的处理结果存放在`new_field_name`对应的新建字段内"
  310. ]
  311. },
  312. {
  313. "cell_type": "code",
  314. "execution_count": 7,
  315. "id": "72a0b5f9",
  316. "metadata": {},
  317. "outputs": [
  318. {
  319. "data": {
  320. "application/vnd.jupyter.widget-view+json": {
  321. "model_id": "",
  322. "version_major": 2,
  323. "version_minor": 0
  324. },
  325. "text/plain": [
  326. "Output()"
  327. ]
  328. },
  329. "metadata": {},
  330. "output_type": "display_data"
  331. },
  332. {
  333. "data": {
  334. "text/html": [
  335. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
  336. ],
  337. "text/plain": []
  338. },
  339. "metadata": {},
  340. "output_type": "display_data"
  341. },
  342. {
  343. "data": {
  344. "text/html": [
  345. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
  346. "</pre>\n"
  347. ],
  348. "text/plain": [
  349. "\n"
  350. ]
  351. },
  352. "metadata": {},
  353. "output_type": "display_data"
  354. },
  355. {
  356. "name": "stdout",
  357. "output_type": "stream",
  358. "text": [
  359. "+-----+------------------------------+------------------------------+\n",
  360. "| idx | sentence | words |\n",
  361. "+-----+------------------------------+------------------------------+\n",
  362. "| 0 | This is an apple . | ['This', 'is', 'an', 'app... |\n",
  363. "| 1 | I like apples . | ['I', 'like', 'apples', '... |\n",
  364. "| 2 | Apples are good for our h... | ['Apples', 'are', 'good',... |\n",
  365. "+-----+------------------------------+------------------------------+\n"
  366. ]
  367. }
  368. ],
  369. "source": [
  370. "data = {'idx': [0, 1, 2], \n",
  371. " 'sentence':[\"This is an apple .\", \"I like apples .\", \"Apples are good for our health .\"], }\n",
  372. "dataset = DataSet(data)\n",
  373. "dataset.apply(lambda ins: ins['sentence'].split(), new_field_name='words')\n",
  374. "print(dataset)"
  375. ]
  376. },
  377. {
  378. "cell_type": "markdown",
  379. "id": "c10275ee",
  380. "metadata": {},
  381. "source": [
  382. "&emsp; **`apply`使用的函数可以是一个基于`lambda`表达式的匿名函数**,**也可以是一个自定义的函数**"
  383. ]
  384. },
  385. {
  386. "cell_type": "code",
  387. "execution_count": 8,
  388. "id": "b1a8631f",
  389. "metadata": {},
  390. "outputs": [
  391. {
  392. "data": {
  393. "text/html": [
  394. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
  395. ],
  396. "text/plain": []
  397. },
  398. "metadata": {},
  399. "output_type": "display_data"
  400. },
  401. {
  402. "data": {
  403. "text/html": [
  404. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
  405. ],
  406. "text/plain": []
  407. },
  408. "metadata": {},
  409. "output_type": "display_data"
  410. },
  411. {
  412. "data": {
  413. "text/html": [
  414. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
  415. "</pre>\n"
  416. ],
  417. "text/plain": [
  418. "\n"
  419. ]
  420. },
  421. "metadata": {},
  422. "output_type": "display_data"
  423. },
  424. {
  425. "name": "stdout",
  426. "output_type": "stream",
  427. "text": [
  428. "+-----+------------------------------+------------------------------+\n",
  429. "| idx | sentence | words |\n",
  430. "+-----+------------------------------+------------------------------+\n",
  431. "| 0 | This is an apple . | ['This', 'is', 'an', 'app... |\n",
  432. "| 1 | I like apples . | ['I', 'like', 'apples', '... |\n",
  433. "| 2 | Apples are good for our h... | ['Apples', 'are', 'good',... |\n",
  434. "+-----+------------------------------+------------------------------+\n"
  435. ]
  436. }
  437. ],
  438. "source": [
  439. "dataset = DataSet(data)\n",
  440. "\n",
  441. "def get_words(instance):\n",
  442. " sentence = instance['sentence']\n",
  443. " words = sentence.split()\n",
  444. " return words\n",
  445. "\n",
  446. "dataset.apply(get_words, new_field_name='words')\n",
  447. "print(dataset)"
  448. ]
  449. },
  450. {
  451. "cell_type": "markdown",
  452. "id": "64abf745",
  453. "metadata": {},
  454. "source": [
  455. "`apply_field`的参数,除了函数`func`外还有`field_name`和`new_field_name`,该函数`func`的处理对象仅\n",
  456. "\n",
  457. "&emsp; 是`dataset`模块中的每个`field_name`对应的字段内容,处理结果存放在`new_field_name`对应的新建字段内"
  458. ]
  459. },
  460. {
  461. "cell_type": "code",
  462. "execution_count": 9,
  463. "id": "057c1d2c",
  464. "metadata": {},
  465. "outputs": [
  466. {
  467. "data": {
  468. "text/html": [
  469. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
  470. ],
  471. "text/plain": []
  472. },
  473. "metadata": {},
  474. "output_type": "display_data"
  475. },
  476. {
  477. "data": {
  478. "text/html": [
  479. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
  480. ],
  481. "text/plain": []
  482. },
  483. "metadata": {},
  484. "output_type": "display_data"
  485. },
  486. {
  487. "data": {
  488. "text/html": [
  489. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
  490. "</pre>\n"
  491. ],
  492. "text/plain": [
  493. "\n"
  494. ]
  495. },
  496. "metadata": {},
  497. "output_type": "display_data"
  498. },
  499. {
  500. "name": "stdout",
  501. "output_type": "stream",
  502. "text": [
  503. "+-----+------------------------------+------------------------------+\n",
  504. "| idx | sentence | words |\n",
  505. "+-----+------------------------------+------------------------------+\n",
  506. "| 0 | This is an apple . | ['This', 'is', 'an', 'app... |\n",
  507. "| 1 | I like apples . | ['I', 'like', 'apples', '... |\n",
  508. "| 2 | Apples are good for our h... | ['Apples', 'are', 'good',... |\n",
  509. "+-----+------------------------------+------------------------------+\n"
  510. ]
  511. }
  512. ],
  513. "source": [
  514. "dataset = DataSet(data)\n",
  515. "dataset.apply_field(lambda sent:sent.split(), field_name='sentence', new_field_name='words')\n",
  516. "print(dataset)"
  517. ]
  518. },
  519. {
  520. "cell_type": "markdown",
  521. "id": "5a9cc8b2",
  522. "metadata": {},
  523. "source": [
  524. "`apply_more`的参数只有函数`func`,函数`func`的处理对象是`dataset`模块中的每个`instance`实例\n",
  525. "\n",
  526. "&emsp; 要求函数`func`返回一个字典,根据字典的`key-value`确定存储在`dataset`中的字段名称与内容"
  527. ]
  528. },
  529. {
  530. "cell_type": "code",
  531. "execution_count": 10,
  532. "id": "51e2f02c",
  533. "metadata": {},
  534. "outputs": [
  535. {
  536. "data": {
  537. "text/html": [
  538. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
  539. ],
  540. "text/plain": []
  541. },
  542. "metadata": {},
  543. "output_type": "display_data"
  544. },
  545. {
  546. "data": {
  547. "text/html": [
  548. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
  549. ],
  550. "text/plain": []
  551. },
  552. "metadata": {},
  553. "output_type": "display_data"
  554. },
  555. {
  556. "data": {
  557. "text/html": [
  558. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
  559. "</pre>\n"
  560. ],
  561. "text/plain": [
  562. "\n"
  563. ]
  564. },
  565. "metadata": {},
  566. "output_type": "display_data"
  567. },
  568. {
  569. "name": "stdout",
  570. "output_type": "stream",
  571. "text": [
  572. "+-----+------------------------+------------------------+-----+\n",
  573. "| idx | sentence | words | num |\n",
  574. "+-----+------------------------+------------------------+-----+\n",
  575. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  576. "| 1 | I like apples . | ['I', 'like', 'appl... | 4 |\n",
  577. "| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
  578. "+-----+------------------------+------------------------+-----+\n"
  579. ]
  580. }
  581. ],
  582. "source": [
  583. "dataset = DataSet(data)\n",
  584. "dataset.apply_more(lambda ins:{'words': ins['sentence'].split(), 'num': len(ins['sentence'].split())})\n",
  585. "print(dataset)"
  586. ]
  587. },
  588. {
  589. "cell_type": "markdown",
  590. "id": "02d2b7ef",
  591. "metadata": {},
  592. "source": [
  593. "`apply_field_more`的参数,除了函数`func`外还有`field_name`,函数`func`的处理对象仅是每个`field_name`对应的字段内容\n",
  594. "\n",
  595. "&emsp; 同样要求函数`func`返回一个字典,根据字典的`key-value`确定存储在`dataset`中的字段名称与内容"
  596. ]
  597. },
  598. {
  599. "cell_type": "code",
  600. "execution_count": 11,
  601. "id": "db4295d5",
  602. "metadata": {},
  603. "outputs": [
  604. {
  605. "data": {
  606. "text/html": [
  607. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
  608. ],
  609. "text/plain": []
  610. },
  611. "metadata": {},
  612. "output_type": "display_data"
  613. },
  614. {
  615. "data": {
  616. "text/html": [
  617. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
  618. ],
  619. "text/plain": []
  620. },
  621. "metadata": {},
  622. "output_type": "display_data"
  623. },
  624. {
  625. "data": {
  626. "text/html": [
  627. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
  628. "</pre>\n"
  629. ],
  630. "text/plain": [
  631. "\n"
  632. ]
  633. },
  634. "metadata": {},
  635. "output_type": "display_data"
  636. },
  637. {
  638. "name": "stdout",
  639. "output_type": "stream",
  640. "text": [
  641. "+-----+------------------------+------------------------+-----+\n",
  642. "| idx | sentence | words | num |\n",
  643. "+-----+------------------------+------------------------+-----+\n",
  644. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  645. "| 1 | I like apples . | ['I', 'like', 'appl... | 4 |\n",
  646. "| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
  647. "+-----+------------------------+------------------------+-----+\n"
  648. ]
  649. }
  650. ],
  651. "source": [
  652. "dataset = DataSet(data)\n",
  653. "dataset.apply_field_more(lambda sent:{'words': sent.split(), 'num': len(sent.split())}, \n",
  654. " field_name='sentence')\n",
  655. "print(dataset)"
  656. ]
  657. },
  658. {
  659. "cell_type": "markdown",
  660. "id": "9c09e592",
  661. "metadata": {},
  662. "source": [
  663. "### 1.3 延伸:instance 和 field\n",
  664. "\n",
  665. "在`fastNLP 0.8`中,使用`Instance`模块表示数据集`dataset`中的每条数据,被称为实例\n",
  666. "\n",
  667. "&emsp; 构造方式类似于构造一个字典,通过键值相同的`Instance`列表,也可以初始化一个`dataset`,代码如下"
  668. ]
  669. },
  670. {
  671. "cell_type": "code",
  672. "execution_count": 12,
  673. "id": "012f537c",
  674. "metadata": {},
  675. "outputs": [],
  676. "source": [
  677. "from fastNLP.core.dataset import DataSet\n",
  678. "from fastNLP.core.dataset import Instance\n",
  679. "\n",
  680. "dataset = DataSet([\n",
  681. " Instance(sentence=\"This is an apple .\",\n",
  682. " words=['This', 'is', 'an', 'apple', '.'],\n",
  683. " num=5),\n",
  684. " Instance(sentence=\"I like apples .\",\n",
  685. " words=['I', 'like', 'apples', '.'],\n",
  686. " num=4),\n",
  687. " Instance(sentence=\"Apples are good for our health .\",\n",
  688. " words=['Apples', 'are', 'good', 'for', 'our', 'health', '.'],\n",
  689. " num=7),\n",
  690. " ])"
  691. ]
  692. },
  693. {
  694. "cell_type": "markdown",
  695. "id": "2fafb1ef",
  696. "metadata": {},
  697. "source": [
  698. "&emsp; 通过`items`、`keys`和`values`方法,可以分别获得`dataset`的`item`列表、`key`列表、`value`列表"
  699. ]
  700. },
  701. {
  702. "cell_type": "code",
  703. "execution_count": 13,
  704. "id": "a4c1c10d",
  705. "metadata": {},
  706. "outputs": [
  707. {
  708. "name": "stdout",
  709. "output_type": "stream",
  710. "text": [
  711. "dict_items([('sentence', 'This is an apple .'), ('words', ['This', 'is', 'an', 'apple', '.']), ('num', 5)])\n",
  712. "dict_keys(['sentence', 'words', 'num'])\n",
  713. "dict_values(['This is an apple .', ['This', 'is', 'an', 'apple', '.'], 5])\n"
  714. ]
  715. }
  716. ],
  717. "source": [
  718. "ins = Instance(sentence=\"This is an apple .\", words=['This', 'is', 'an', 'apple', '.'], num=5)\n",
  719. "\n",
  720. "print(ins.items())\n",
  721. "print(ins.keys())\n",
  722. "print(ins.values())"
  723. ]
  724. },
  725. {
  726. "cell_type": "markdown",
  727. "id": "b5459a2d",
  728. "metadata": {},
  729. "source": [
  730. "&emsp; 通过`add_field`方法,可以在`Instance`实例中,通过参数`field_name`添加字段,通过参数`field`赋值"
  731. ]
  732. },
  733. {
  734. "cell_type": "code",
  735. "execution_count": 14,
  736. "id": "55376402",
  737. "metadata": {},
  738. "outputs": [
  739. {
  740. "name": "stdout",
  741. "output_type": "stream",
  742. "text": [
  743. "+--------------------+------------------------+-----+-----+\n",
  744. "| sentence | words | num | idx |\n",
  745. "+--------------------+------------------------+-----+-----+\n",
  746. "| This is an apple . | ['This', 'is', 'an'... | 5 | 0 |\n",
  747. "+--------------------+------------------------+-----+-----+\n"
  748. ]
  749. }
  750. ],
  751. "source": [
  752. "ins.add_field(field_name='idx', field=0)\n",
  753. "print(ins)"
  754. ]
  755. },
  756. {
  757. "cell_type": "markdown",
  758. "id": "49caaa9c",
  759. "metadata": {},
  760. "source": [
  761. "在`fastNLP 0.8`中,使用`FieldArray`模块表示数据集`dataset`中的每个字段(注:没有`field`类)\n",
  762. "\n",
  763. "&emsp; 通过`get_all_fields`方法可以获取`dataset`的字段列表\n",
  764. "\n",
  765. "&emsp; 通过`get_field_names`方法可以获取`dataset`的字段名称列表,代码如下"
  766. ]
  767. },
  768. {
  769. "cell_type": "code",
  770. "execution_count": 15,
  771. "id": "fe15f4c1",
  772. "metadata": {},
  773. "outputs": [
  774. {
  775. "data": {
  776. "text/plain": [
  777. "{'sentence': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879d08>,\n",
  778. " 'words': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879d88>,\n",
  779. " 'num': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879e08>}"
  780. ]
  781. },
  782. "execution_count": 15,
  783. "metadata": {},
  784. "output_type": "execute_result"
  785. }
  786. ],
  787. "source": [
  788. "dataset.get_all_fields()"
  789. ]
  790. },
  791. {
  792. "cell_type": "code",
  793. "execution_count": 16,
  794. "id": "5433815c",
  795. "metadata": {},
  796. "outputs": [
  797. {
  798. "data": {
  799. "text/plain": [
  800. "['num', 'sentence', 'words']"
  801. ]
  802. },
  803. "execution_count": 16,
  804. "metadata": {},
  805. "output_type": "execute_result"
  806. }
  807. ],
  808. "source": [
  809. "dataset.get_field_names()"
  810. ]
  811. },
  812. {
  813. "cell_type": "markdown",
  814. "id": "4964eeed",
  815. "metadata": {},
  816. "source": [
  817. "其他`dataset`的基本使用:通过`in`或者`has_field`方法可以判断`dataset`中是否包含某种字段\n",
  818. "\n",
  819. "&emsp; 通过`rename_field`方法可以更改`dataset`中的字段名称;通过`concat`方法可以实现两个`dataset`的拼接\n",
  820. "\n",
  821. "&emsp; 通过`len`可以统计`dataset`中的实例数目;`dataset`的全部变量与函数可以通过`dir(dataset)`查询"
  822. ]
  823. },
  824. {
  825. "cell_type": "code",
  826. "execution_count": 17,
  827. "id": "25ce5488",
  828. "metadata": {},
  829. "outputs": [
  830. {
  831. "name": "stdout",
  832. "output_type": "stream",
  833. "text": [
  834. "3 False\n",
  835. "6 True\n",
  836. "+------------------------------+------------------------------+--------+\n",
  837. "| sentence | words | length |\n",
  838. "+------------------------------+------------------------------+--------+\n",
  839. "| This is an apple . | ['This', 'is', 'an', 'app... | 5 |\n",
  840. "| I like apples . | ['I', 'like', 'apples', '... | 4 |\n",
  841. "| Apples are good for our h... | ['Apples', 'are', 'good',... | 7 |\n",
  842. "| This is an apple . | ['This', 'is', 'an', 'app... | 5 |\n",
  843. "| I like apples . | ['I', 'like', 'apples', '... | 4 |\n",
  844. "| Apples are good for our h... | ['Apples', 'are', 'good',... | 7 |\n",
  845. "+------------------------------+------------------------------+--------+\n"
  846. ]
  847. }
  848. ],
  849. "source": [
  850. "print(len(dataset), dataset.has_field('length')) \n",
  851. "if 'num' in dataset:\n",
  852. " dataset.rename_field('num', 'length')\n",
  853. "elif 'length' in dataset:\n",
  854. " dataset.rename_field('length', 'num')\n",
  855. "dataset.concat(dataset)\n",
  856. "print(len(dataset), dataset.has_field('length')) \n",
  857. "print(dataset) "
  858. ]
  859. },
  860. {
  861. "cell_type": "markdown",
  862. "id": "e30a6cd7",
  863. "metadata": {},
  864. "source": [
  865. "## 2. vocabulary 的结构与使用\n",
  866. "\n",
  867. "### 2.1 vocabulary 的创建与修改\n",
  868. "\n",
  869. "在`fastNLP 0.8`中,使用`Vocabulary`模块表示词汇表,**`vocabulary`的核心是从单词到序号的映射**\n",
  870. "\n",
  871. "&emsp; 可以直接通过构造函数实例化,通过查找`word2idx`属性,可以找到`vocabulary`映射对应的字典实现\n",
  872. "\n",
  873. "&emsp; **默认补零`padding`用`<pad>`表示**,**对应序号为0**;**未知单词`unknown`用`<unk>`表示**,**对应序号1**\n",
  874. "\n",
  875. "&emsp; 通过打印`vocabulary`可以看到词汇表中的单词列表,其中,`padding`和`unknown`不会显示"
  876. ]
  877. },
  878. {
  879. "cell_type": "code",
  880. "execution_count": 18,
  881. "id": "3515e096",
  882. "metadata": {},
  883. "outputs": [
  884. {
  885. "name": "stdout",
  886. "output_type": "stream",
  887. "text": [
  888. "Vocabulary([]...)\n",
  889. "{'<pad>': 0, '<unk>': 1}\n",
  890. "<pad> 0\n",
  891. "<unk> 1\n"
  892. ]
  893. }
  894. ],
  895. "source": [
  896. "from fastNLP.core.vocabulary import Vocabulary\n",
  897. "\n",
  898. "vocab = Vocabulary()\n",
  899. "print(vocab)\n",
  900. "print(vocab.word2idx)\n",
  901. "print(vocab.padding, vocab.padding_idx)\n",
  902. "print(vocab.unknown, vocab.unknown_idx)"
  903. ]
  904. },
  905. {
  906. "cell_type": "markdown",
  907. "id": "640be126",
  908. "metadata": {},
  909. "source": [
  910. "在`vocabulary`中,通过`add_word`方法或`add_word_lst`方法,可以单独或批量添加单词\n",
  911. "\n",
  912. "&emsp; 通过`len`或`word_count`属性,可以显示`vocabulary`的单词量和每个单词添加的次数"
  913. ]
  914. },
  915. {
  916. "cell_type": "code",
  917. "execution_count": 19,
  918. "id": "88c7472a",
  919. "metadata": {},
  920. "outputs": [
  921. {
  922. "name": "stdout",
  923. "output_type": "stream",
  924. "text": [
  925. "5 Counter({'生活': 1, '就像': 1, '海洋': 1})\n",
  926. "6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n"
  927. ]
  928. }
  929. ],
  930. "source": [
  931. "vocab.add_word_lst(['生活', '就像', '海洋'])\n",
  932. "print(len(vocab), vocab.word_count)\n",
  933. "vocab.add_word('只有')\n",
  934. "print(len(vocab), vocab.word_count)"
  935. ]
  936. },
  937. {
  938. "cell_type": "markdown",
  939. "id": "f9ec8b28",
  940. "metadata": {},
  941. "source": [
  942. "&emsp; **通过`to_word`方法可以找到序号对应的单词**,**通过`to_index`方法可以找到单词对应的序号**\n",
  943. "\n",
  944. "&emsp; &emsp; 由于序号0和序号1已经被占用,所以**新加入的词的序号从2开始计数**,如`'生活'`对应2\n",
  945. "\n",
  946. "&emsp; &emsp; 通过`has_word`方法可以判断单词是否在词汇表中,没有的单词被判做`<unk>`"
  947. ]
  948. },
  949. {
  950. "cell_type": "code",
  951. "execution_count": 20,
  952. "id": "3447acde",
  953. "metadata": {},
  954. "outputs": [
  955. {
  956. "name": "stdout",
  957. "output_type": "stream",
  958. "text": [
  959. "<pad> 0\n",
  960. "<unk> 1\n",
  961. "生活 2\n",
  962. "只有 5\n",
  963. "彼岸 1 False\n"
  964. ]
  965. }
  966. ],
  967. "source": [
  968. "print(vocab.to_word(0), vocab.to_index('<pad>'))\n",
  969. "print(vocab.to_word(1), vocab.to_index('<unk>'))\n",
  970. "print(vocab.to_word(2), vocab.to_index('生活'))\n",
  971. "print(vocab.to_word(5), vocab.to_index('只有'))\n",
  972. "print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))"
  973. ]
  974. },
  975. {
  976. "cell_type": "markdown",
  977. "id": "b4e36850",
  978. "metadata": {},
  979. "source": [
  980. "**`vocabulary`允许反复添加相同单词**,**可以通过`word_count`属性看到相应单词被添加的次数**\n",
  981. "\n",
  982. "&emsp; 但其中没有`<unk>`和`<pad>`,`vocabulary`的全部变量与函数可以通过`dir(vocabulary)`查询"
  983. ]
  984. },
  985. {
  986. "cell_type": "code",
  987. "execution_count": 21,
  988. "id": "490b101c",
  989. "metadata": {},
  990. "outputs": [
  991. {
  992. "name": "stdout",
  993. "output_type": "stream",
  994. "text": [
  995. "13 Counter({'生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '人': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n",
  996. "彼岸 12 True\n"
  997. ]
  998. }
  999. ],
  1000. "source": [
  1001. "vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '才', '能', '到达', '彼岸'])\n",
  1002. "print(len(vocab), vocab.word_count)\n",
  1003. "print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))"
  1004. ]
  1005. },
  1006. {
  1007. "cell_type": "markdown",
  1008. "id": "23e32a63",
  1009. "metadata": {},
  1010. "source": [
  1011. "### 2.2 vocabulary 与 OOV 问题\n",
  1012. "\n",
  1013. "在`vocabulary`模块初始化的时候,可以通过指定`unknown`和`padding`为`None`,限制其存在\n",
  1014. "\n",
  1015. "&emsp; 此时添加单词直接从0开始标号,如果遇到未知单词会直接报错,即 out of vocabulary"
  1016. ]
  1017. },
  1018. {
  1019. "cell_type": "code",
  1020. "execution_count": 22,
  1021. "id": "a99ff909",
  1022. "metadata": {},
  1023. "outputs": [
  1024. {
  1025. "name": "stdout",
  1026. "output_type": "stream",
  1027. "text": [
  1028. "{'positive': 0, 'negative': 1}\n",
  1029. "ValueError: word `neutral` not in vocabulary\n"
  1030. ]
  1031. }
  1032. ],
  1033. "source": [
  1034. "vocab = Vocabulary(unknown=None, padding=None)\n",
  1035. "\n",
  1036. "vocab.add_word_lst(['positive', 'negative'])\n",
  1037. "print(vocab.word2idx)\n",
  1038. "\n",
  1039. "try:\n",
  1040. " print(vocab.to_index('neutral'))\n",
  1041. "except ValueError:\n",
  1042. " print(\"ValueError: word `neutral` not in vocabulary\")"
  1043. ]
  1044. },
  1045. {
  1046. "cell_type": "markdown",
  1047. "id": "618da6bd",
  1048. "metadata": {},
  1049. "source": [
  1050. "&emsp; 相应的,如果只指定其中的`unknown`,则编号会后移一个,同时遇到未知单词全部当做`<unk>`"
  1051. ]
  1052. },
  1053. {
  1054. "cell_type": "code",
  1055. "execution_count": 23,
  1056. "id": "432f74c1",
  1057. "metadata": {},
  1058. "outputs": [
  1059. {
  1060. "name": "stdout",
  1061. "output_type": "stream",
  1062. "text": [
  1063. "{'<unk>': 0, 'positive': 1, 'negative': 2}\n",
  1064. "0 <unk>\n"
  1065. ]
  1066. }
  1067. ],
  1068. "source": [
  1069. "vocab = Vocabulary(unknown='<unk>', padding=None)\n",
  1070. "\n",
  1071. "vocab.add_word_lst(['positive', 'negative'])\n",
  1072. "print(vocab.word2idx)\n",
  1073. "\n",
  1074. "print(vocab.to_index('neutral'), vocab.to_word(vocab.to_index('neutral')))"
  1075. ]
  1076. },
  1077. {
  1078. "cell_type": "markdown",
  1079. "id": "b6263f73",
  1080. "metadata": {},
  1081. "source": [
  1082. "## 3. dataset 和 vocabulary 的组合使用\n",
  1083. " \n",
  1084. "### 3.1 从 dataframe 中加载 dataset\n",
  1085. "\n"
  1086. ]
  1087. },
  1088. {
  1089. "cell_type": "markdown",
  1090. "id": "89059713",
  1091. "metadata": {},
  1092. "source": []
  1093. },
  1094. {
  1095. "cell_type": "code",
  1096. "execution_count": null,
  1097. "id": "3dbd985d",
  1098. "metadata": {},
  1099. "outputs": [],
  1100. "source": []
  1101. },
  1102. {
  1103. "cell_type": "code",
  1104. "execution_count": null,
  1105. "id": "4f634586",
  1106. "metadata": {},
  1107. "outputs": [],
  1108. "source": []
  1109. },
  1110. {
  1111. "cell_type": "markdown",
  1112. "id": "5ba13989",
  1113. "metadata": {},
  1114. "source": [
  1115. "### 3.2 从 dataset 中获取 vocabulary"
  1116. ]
  1117. },
  1118. {
  1119. "cell_type": "code",
  1120. "execution_count": null,
  1121. "id": "a2de615b",
  1122. "metadata": {},
  1123. "outputs": [],
  1124. "source": []
  1125. },
  1126. {
  1127. "cell_type": "code",
  1128. "execution_count": null,
  1129. "id": "5f5eed18",
  1130. "metadata": {},
  1131. "outputs": [],
  1132. "source": []
  1133. }
  1134. ],
  1135. "metadata": {
  1136. "kernelspec": {
  1137. "display_name": "Python 3 (ipykernel)",
  1138. "language": "python",
  1139. "name": "python3"
  1140. },
  1141. "language_info": {
  1142. "codemirror_mode": {
  1143. "name": "ipython",
  1144. "version": 3
  1145. },
  1146. "file_extension": ".py",
  1147. "mimetype": "text/x-python",
  1148. "name": "python",
  1149. "nbconvert_exporter": "python",
  1150. "pygments_lexer": "ipython3",
  1151. "version": "3.7.4"
  1152. }
  1153. },
  1154. "nbformat": 4,
  1155. "nbformat_minor": 5
  1156. }