{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# T2. Basic usage of dataloader and tokenizer\n",
"\n",
"  1   dataloader in fastNLP\n",
"\n",
"    1.1   structure and usage of databundle\n",
"\n",
"    1.2   structure and usage of dataloader\n",
"\n",
"  2   tokenizer in fastNLP\n",
" \n",
"    2.1   loading traditional GloVe word embeddings\n",
" \n",
"    2.2   the concept of PreTrainedTokenizer\n",
"\n",
"    2.3   basic usage of BertTokenizer\n",
"\n",
"  3   example: the complete loading process of the NG20 dataset\n",
" \n",
"    3.1   processing the dataset with BertTokenizer\n",
"\n",
"    3.2   "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. dataloader in fastNLP\n",
"\n",
"### 1.1 structure and usage of databundle\n",
"\n",
"In `fastNLP 0.8`, between the common data loading module `DataLoader` and the dataset module `DataSet`, there is\n",
"\n",
"  an intermediate module, the **data bundle module `DataBundle`**, which can be imported from the `fastNLP.io` path\n",
"\n",
"In `fastNLP 0.8`, **a `databundle` contains several `dataset`s and `vocabulary` objects**,\n",
"\n",
"  stored in the `datasets` and `vocabs` variables respectively, so before studying the `databundle` module\n",
"\n",
"  we first **review `dataset` and `vocabulary`**; **do you know roughly what the following code does?**\n",
"\n",
"Background: `NG20`, short for [`News Group 20`](http://qwone.com/~jason/20Newsgroups/), is a news text classification dataset with 20 top-level categories and a number of subcategories\n",
"\n",
"  The dataset has a training split `'ng20_train.csv'` and a test split `'ng20_test.csv'`; each record\n",
"\n",
"  contains a `'label'` field and a `'text'` field; `sample(frac=1)[:10]` shuffles the data and reads the first ten records"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processing: 0%| | 0/10 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processing: 0%| | 0/10 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+------------------------------------------+\n",
"| label | text |\n",
"+-------+------------------------------------------+\n",
"| talk | ['mwilson', 'ncratl', 'atlantaga', 'n... |\n",
"| talk | ['ch981', 'cleveland', 'freenet', 'ed... |\n",
"| rec | ['mbeaving', 'bnr', 'ca', '\\\\(', 'bea... |\n",
"| soc | ['jayne', 'mmalt', 'guild', 'org', '\\... |\n",
"| talk | ['jrutledg', 'cs', 'ulowell', 'edu', ... |\n",
"| talk | ['cramer', 'optilink', 'com', '\\\\(', ... |\n",
"| comp | ['triton', 'unm', 'edu', '\\\\(', 'larr... |\n",
"| rec | ['ingres', 'com', '\\\\(', 'bruce', '\\\\... |\n",
"| comp | ['ldo', 'waikato', 'ac', 'nz', '\\\\(',... |\n",
"| misc | ['rebecca', 'rpi', 'edu', '\\\\(', 'ezr... |\n",
"+-------+------------------------------------------+\n",
"{'<pad>': 0, '<unk>': 1, 'rec': 2, 'talk': 3, 'comp': 4, 'soc': 5, 'misc': 6, 'sci': 7}\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"from fastNLP import DataSet\n",
"from fastNLP import Vocabulary\n",
"from fastNLP.io import DataBundle\n",
"\n",
"datasets = {}\n",
"datasets['train'] = DataSet.from_pandas(pd.read_csv('./data/ng20_train.csv').sample(frac=1)[:10])\n",
"datasets['train'].apply_more(lambda ins:{'label': ins['label'].lower().split('.')[0], \n",
" 'text': ins['text'].lower().split()},\n",
" progress_bar='tqdm')\n",
"datasets['test'] = DataSet.from_pandas(pd.read_csv('./data/ng20_test.csv').sample(frac=1)[:10])\n",
"datasets['test'].apply_more(lambda ins:{'label': ins['label'].lower().split('.')[0], \n",
" 'text': ins['text'].lower().split()},\n",
" progress_bar='tqdm')\n",
"print(datasets['train'])\n",
"\n",
"vocabs = {}\n",
"vocabs['label'] = Vocabulary().from_dataset(datasets['train'].concat(datasets['test'], inplace=False), field_name='label')\n",
"vocabs['text'] = Vocabulary().from_dataset(datasets['train'].concat(datasets['test'], inplace=False), field_name='text')\n",
"print(vocabs['label'].word2idx)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"A `DataBundle` packs the processed data: the datasets (e.g., separate train, dev and test splits) and the vocabulary for each field.\n",
" Such an object is usually produced by the `load` method of the various Loaders in `fastNLP`; its contents can be accessed as follows"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 2 datasets:\n",
"\ttrain has 10 instances.\n",
"\ttest has 10 instances.\n",
"In total 2 vocabs:\n",
"\tlabel has 8 entries.\n",
"\ttext has 1687 entries.\n",
"\n"
]
}
],
"source": [
"data_bundle = DataBundle(datasets=datasets, vocabs=vocabs)\n",
"print(data_bundle)"
]
},
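{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a small usage sketch (assuming the `get_dataset` and `get_vocab` accessors of `DataBundle`), individual parts of the bundle can be retrieved by name:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a usage sketch, assuming DataBundle exposes get_dataset / get_vocab accessors\n",
"train_set = data_bundle.get_dataset('train')    # fetch one dataset by name\n",
"label_vocab = data_bundle.get_vocab('label')    # fetch one vocabulary by field name\n",
"print(len(train_set), len(label_vocab))"
]
},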
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.2 structure and usage of dataloader"
]
},
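{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sketch of `dataloader` usage (assuming the `prepare_torch_dataloader` helper that `fastNLP 0.8` provides; treat the exact call as an assumption), the `datasets` built above can be wrapped into a batched `dataloader`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a minimal sketch, assuming fastNLP 0.8 provides prepare_torch_dataloader\n",
"from fastNLP import prepare_torch_dataloader\n",
"\n",
"# wrap the processed DataSet in a dataloader that batches and pads its fields\n",
"train_dataloader = prepare_torch_dataloader(datasets['train'], batch_size=2, shuffle=True)\n",
"print(len(train_dataloader))  # number of batches per epoch"
]
},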
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. tokenizer in fastNLP\n",
"\n",
"### 2.1 loading traditional GloVe word embeddings"
]
},
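{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sketch of the traditional approach (assuming a local copy of the plain-text `glove.6B.100d.txt` file from the [GloVe project page](https://nlp.stanford.edu/projects/glove/); the path below is hypothetical), the embeddings can be parsed into a word-to-vector dict:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# a minimal sketch, assuming a local glove.6B.100d.txt (one word + 100 floats per line)\n",
"glove = {}\n",
"with open('./data/glove.6B.100d.txt', encoding='utf-8') as f:\n",
"    for line in f:\n",
"        word, *values = line.rstrip().split(' ')\n",
"        glove[word] = np.asarray(values, dtype=np.float32)\n",
"\n",
"print(len(glove), glove['the'].shape)"
]
},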
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.2 the concept of PreTrainedTokenizer\n",
"\n",
"In `fastNLP 0.8`, **the `PreTrainedTokenizer` module is used to annotate the words of a dataset with token ids**\n",
"\n",
"&emsp; Note that downloading and importing the `PreTrainedTokenizer` module **requires the `transformers` package to be installed**\n",
"\n",
"&emsp; This is because the `PreTrainedTokenizer` module in `fastNLP 0.8` is implemented on top of the `Huggingface Transformers` library\n",
"\n",
"**`Huggingface Transformers` is an open-source library of pretrained language models built on the `transformer` architecture**\n",
"\n",
"&emsp; It contains many classic `transformer`-based pretrained models, such as `BERT`, `BART`, `RoBERTa`, `GPT2` and `CPT`\n",
"\n",
"&emsp; For more details, see the `Huggingface Transformers` [paper](https://arxiv.org/pdf/1910.03771.pdf), [official documentation](https://huggingface.co/transformers/) and [code repository](https://github.com/huggingface/transformers)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.3 basic usage of BertTokenizer\n",
"\n",
"In `fastNLP 0.8`, several subclasses are derived from the base class `PreTrainedTokenizer`, implementing tokenization based on `BERT` and other models\n",
"\n",
"&emsp; This section takes the `BertTokenizer` module as an example to show how a `PreTrainedTokenizer` is used in practice\n",
"\n",
"**Initializing a `BertTokenizer` involves two steps: importing the module and loading the data.** First import\n",
"\n",
"&emsp; `BertTokenizer` from `fastNLP.transformers.torch`, then download the specified `tokenizer` with the `from_pretrained` method\n",
"\n",
"&emsp; Here, **`'bert-base-uncased'` selects the pretrained `BERT` variant that the `tokenizer` uses**: words are not case-sensitive,\n",
"\n",
"&emsp; &emsp; **`L=12` layers**, **hidden size `H=768`**, **`A=12` self-attention heads**, **`110M` parameters in total**\n",
"\n",
"&emsp; The model parameters are downloaded automatically into the `~/.cache/huggingface/transformers` folder under the home directory"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"from fastNLP.transformers.torch import BertTokenizer\n",
"\n",
"tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dir(tokenizer)"
]
},
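{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick usage sketch (assuming the `fastNLP` wrapper mirrors the standard `Huggingface` `BertTokenizer` interface), a sentence can be tokenized, encoded into ids and decoded back:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a usage sketch, assuming the wrapper mirrors the Huggingface BertTokenizer API\n",
"text = 'fastNLP makes tokenization easy!'\n",
"\n",
"tokens = tokenizer.tokenize(text)   # split into WordPiece tokens\n",
"ids = tokenizer.encode(text)        # token ids, with [CLS] and [SEP] added\n",
"print(tokens)\n",
"print(ids)\n",
"print(tokenizer.decode(ids))        # map the ids back to a string"
]
},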
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. example: the complete loading process of the NG20 dataset\n",
"\n",
"### 3.1 processing the dataset with BertTokenizer\n",
"\n",
"In `fastNLP 0.8`, loading `NG20` end to end takes a few steps: **read the `csv` file into a `DataSet`**,\n",
"\n",
"&emsp; **tokenize the `'text'` field with `BertTokenizer`**, **index the `'label'` field with a `Vocabulary`**,\n",
"\n",
"&emsp; and finally **configure the padding and the ignored fields**, as the following cells show"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from fastNLP import DataSet\n",
"from fastNLP import Vocabulary\n",
"\n",
"dataset = DataSet.from_pandas(pd.read_csv('./data/ng20_test.csv'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from functools import partial\n",
"\n",
"encode = partial(tokenizer.encode_plus, max_length=100, truncation=True,\n",
"                 return_attention_mask=True)\n",
"# adds three new fields: input_ids, attention_mask and token_type_ids\n",
"dataset.apply_field_more(encode, field_name='text')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"target_vocab = Vocabulary(padding=None, unknown=None)\n",
"\n",
"# build the label vocabulary from the dataset, then index 'label' into a new 'labels' field\n",
"target_vocab.from_dataset(dataset, field_name='label')\n",
"target_vocab.index_dataset(dataset, field_name='label', new_field_name='labels')\n",
"# the pad value of input_ids must be set to the tokenizer's pad value\n",
"dataset.set_pad('input_ids', pad_val=tokenizer.pad_token_id)\n",
"dataset.set_ignore('label', 'text')  # the raw str field 'label' is no longer needed, so keep it out of batch outputs"
]
}
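,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To round off the loading process, the processed `dataset` can be wrapped into a batched `dataloader` (again a sketch assuming the `prepare_torch_dataloader` helper from `fastNLP 0.8`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a sketch, assuming fastNLP 0.8 provides prepare_torch_dataloader\n",
"from fastNLP import prepare_torch_dataloader\n",
"\n",
"dataloader = prepare_torch_dataloader(dataset, batch_size=4, shuffle=True)\n",
"for batch in dataloader:\n",
"    print(batch['input_ids'].shape, batch['labels'].shape)\n",
"    break"
]
}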
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}