|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416 |
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# E2. 使用 continuous prompt 完成 SST2 分类"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
- "</pre>\n"
- ],
- "text/plain": [
- "\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "4.18.0\n"
- ]
- }
- ],
- "source": [
- "import torch\n",
- "from torch.optim import AdamW\n",
- "from torch.utils.data import DataLoader, Dataset\n",
- "\n",
- "import torch.nn as nn\n",
- "\n",
- "import transformers\n",
- "from transformers import AutoTokenizer\n",
- "from transformers import AutoModelForSequenceClassification\n",
- "\n",
- "import sys\n",
- "sys.path.append('..')\n",
- "\n",
- "import fastNLP\n",
- "from fastNLP import Trainer\n",
- "from fastNLP.core.metrics import Accuracy\n",
- "\n",
- "print(transformers.__version__)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "GLUE_TASKS = [\"cola\", \"mnli\", \"mnli-mm\", \"mrpc\", \"qnli\", \"qqp\", \"rte\", \"sst2\", \"stsb\", \"wnli\"]\n",
- "\n",
- "task = \"sst2\"\n",
- "model_checkpoint = \"distilbert-base-uncased\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "class ClassModel(nn.Module):\n",
- " def __init__(self, model_checkpoint, num_labels, pre_seq_len):\n",
- " nn.Module.__init__(self)\n",
- " self.num_labels = num_labels\n",
- " self.back_bone = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, \n",
- " num_labels=num_labels)\n",
- " self.embeddings = self.back_bone.get_input_embeddings()\n",
- "\n",
- " for param in self.back_bone.parameters():\n",
- " param.requires_grad = False\n",
- " \n",
- " self.pre_seq_len = pre_seq_len\n",
- " self.prefix_tokens = torch.arange(self.pre_seq_len).long()\n",
- " self.prefix_encoder = nn.Embedding(self.pre_seq_len, self.embeddings.embedding_dim)\n",
- " \n",
- " def get_prompt(self, batch_size):\n",
- " prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(self.back_bone.device)\n",
- " prompts = self.prefix_encoder(prefix_tokens)\n",
- " return prompts\n",
- "\n",
- " def forward(self, input_ids, attention_mask, labels):\n",
- " \n",
- " batch_size = input_ids.shape[0]\n",
- " raw_embedding = self.embeddings(input_ids)\n",
- " \n",
- " prompts = self.get_prompt(batch_size=batch_size)\n",
- " inputs_embeds = torch.cat((prompts, raw_embedding), dim=1)\n",
- " prefix_attention_mask = torch.ones(batch_size, self.pre_seq_len).to(self.back_bone.device)\n",
- " attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1)\n",
- "\n",
- " outputs = self.back_bone(inputs_embeds=inputs_embeds, \n",
- " attention_mask=attention_mask, labels=labels)\n",
- " return outputs\n",
- "\n",
- " def train_step(self, input_ids, attention_mask, labels):\n",
- " return {\"loss\": self(input_ids, attention_mask, labels).loss}\n",
- "\n",
- " def evaluate_step(self, input_ids, attention_mask, labels):\n",
- " pred = self(input_ids, attention_mask, labels).logits\n",
- " pred = torch.max(pred, dim=-1)[1]\n",
- " return {\"pred\": pred, \"target\": labels}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']\n",
- "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
- "- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
- "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']\n",
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
- ]
- }
- ],
- "source": [
- "num_labels = 3 if task.startswith(\"mnli\") else 1 if task == \"stsb\" else 2\n",
- "\n",
- "model = ClassModel(num_labels=num_labels, model_checkpoint=model_checkpoint, pre_seq_len=16)\n",
- "\n",
- "# Generally, simple classification tasks prefer shorter prompts (less than 20)\n",
- "\n",
- "optimizers = AdamW(params=model.parameters(), lr=5e-3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Using the latest cached version of the module from /remote-home/xrliu/.cache/huggingface/modules/datasets_modules/datasets/glue/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad (last modified on Thu May 26 15:30:15 2022) since it couldn't be found locally at glue., or remotely on the Hugging Face Hub.\n",
- "Reusing dataset glue (/remote-home/xrliu/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "1b73650d43f245ac8a5501dc91c6fe8c",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- " 0%| | 0/3 [00:00<?, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "from datasets import load_dataset, load_metric\n",
- "\n",
- "dataset = load_dataset(\"glue\", \"mnli\" if task == \"mnli-mm\" else task)\n",
- "\n",
- "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Loading cached processed dataset at /remote-home/xrliu/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-294e481a713c5754.arrow\n",
- "Loading cached processed dataset at /remote-home/xrliu/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-ed9d9258aaf0fb54.arrow\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "0be84915c90f460896b8e67299e09df4",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- " 0%| | 0/2 [00:00<?, ?ba/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "def preprocess_function(examples):\n",
- " return tokenizer(examples['sentence'], truncation=True)\n",
- "\n",
- "encoded_dataset = dataset.map(preprocess_function, batched=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "class TestDistilBertDataset(Dataset):\n",
- " def __init__(self, dataset):\n",
- " super(TestDistilBertDataset, self).__init__()\n",
- " self.dataset = dataset\n",
- "\n",
- " def __len__(self):\n",
- " return len(self.dataset)\n",
- "\n",
- " def __getitem__(self, item):\n",
- " item = self.dataset[item]\n",
- " return item[\"input_ids\"], item[\"attention_mask\"], [item[\"label\"]] "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "def test_bert_collate_fn(batch):\n",
- " input_ids, atten_mask, labels = [], [], []\n",
- " max_length = [0] * 3\n",
- " for each_item in batch:\n",
- " input_ids.append(each_item[0])\n",
- " max_length[0] = max(max_length[0], len(each_item[0]))\n",
- " atten_mask.append(each_item[1])\n",
- " max_length[1] = max(max_length[1], len(each_item[1]))\n",
- " labels.append(each_item[2])\n",
- " max_length[2] = max(max_length[2], len(each_item[2]))\n",
- "\n",
- " for i in range(3):\n",
- " each = (input_ids, atten_mask, labels)[i]\n",
- " for item in each:\n",
- " item.extend([0] * (max_length[i] - len(item)))\n",
- " return {\"input_ids\": torch.cat([torch.tensor([item]) for item in input_ids], dim=0),\n",
- " \"attention_mask\": torch.cat([torch.tensor([item]) for item in atten_mask], dim=0),\n",
- " \"labels\": torch.cat([torch.tensor(item) for item in labels], dim=0)}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "dataset_train = TestDistilBertDataset(encoded_dataset[\"train\"])\n",
- "dataloader_train = DataLoader(dataset=dataset_train, \n",
- " batch_size=32, shuffle=True, collate_fn=test_bert_collate_fn)\n",
- "dataset_valid = TestDistilBertDataset(encoded_dataset[\"validation\"])\n",
- "dataloader_valid = DataLoader(dataset=dataset_valid, \n",
- " batch_size=32, shuffle=False, collate_fn=test_bert_collate_fn)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [],
- "source": [
- "trainer = Trainer(\n",
- " model=model,\n",
- " driver='torch',\n",
- " device='cuda',\n",
- " n_epochs=10,\n",
- " optimizers=optimizers,\n",
- " train_dataloader=dataloader_train,\n",
- " evaluate_dataloaders=dataloader_valid,\n",
- " metrics={'acc': Accuracy()}\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
- ],
- "text/plain": []
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
- ],
- "text/plain": []
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
- "</pre>\n"
- ],
- "text/plain": [
- "\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "trainer.run(num_eval_batch_per_dl=10)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
- ],
- "text/plain": []
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
- ],
- "text/plain": []
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "{'acc#acc': 0.644495, 'total#acc': 872.0, 'correct#acc': 562.0}"
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "trainer.evaluator.run()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.13"
- },
- "pycharm": {
- "stem_cell": {
- "cell_type": "raw",
- "metadata": {
- "collapsed": false
- },
- "source": []
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 1
- }
|