CNN for sentence classification (tags/v0.1.0)
| @@ -0,0 +1,110 @@ | |||||
| # Byte-compiled / optimized / DLL files | |||||
| __pycache__/ | |||||
| *.py[cod] | |||||
| *$py.class | |||||
| # C extensions | |||||
| *.so | |||||
| # Distribution / packaging | |||||
| .Python | |||||
| build/ | |||||
| develop-eggs/ | |||||
| dist/ | |||||
| downloads/ | |||||
| eggs/ | |||||
| .eggs/ | |||||
| lib/ | |||||
| lib64/ | |||||
| parts/ | |||||
| sdist/ | |||||
| var/ | |||||
| wheels/ | |||||
| *.egg-info/ | |||||
| .installed.cfg | |||||
| *.egg | |||||
| MANIFEST | |||||
| # PyInstaller | |||||
| # Usually these files are written by a python script from a template | |||||
| # before PyInstaller builds the exe, so as to inject date/other infos into it. | |||||
| *.manifest | |||||
| *.spec | |||||
| # Installer logs | |||||
| pip-log.txt | |||||
| pip-delete-this-directory.txt | |||||
| # Unit test / coverage reports | |||||
| htmlcov/ | |||||
| .tox/ | |||||
| .coverage | |||||
| .coverage.* | |||||
| .cache | |||||
| nosetests.xml | |||||
| coverage.xml | |||||
| *.cover | |||||
| .hypothesis/ | |||||
| .pytest_cache/ | |||||
| # Translations | |||||
| *.mo | |||||
| *.pot | |||||
| # Django stuff: | |||||
| *.log | |||||
| local_settings.py | |||||
| db.sqlite3 | |||||
| # Flask stuff: | |||||
| instance/ | |||||
| .webassets-cache | |||||
| # Scrapy stuff: | |||||
| .scrapy | |||||
| # Sphinx documentation | |||||
| docs/_build/ | |||||
| # PyBuilder | |||||
| target/ | |||||
| # Jupyter Notebook | |||||
| .ipynb_checkpoints | |||||
| # pyenv | |||||
| .python-version | |||||
| # celery beat schedule file | |||||
| celerybeat-schedule | |||||
| # SageMath parsed files | |||||
| *.sage.py | |||||
| # Environments | |||||
| .env | |||||
| .venv | |||||
| env/ | |||||
| venv/ | |||||
| ENV/ | |||||
| env.bak/ | |||||
| venv.bak/ | |||||
| # Spyder project settings | |||||
| .spyderproject | |||||
| .spyproject | |||||
| # Rope project settings | |||||
| .ropeproject | |||||
| # mkdocs documentation | |||||
| /site | |||||
| # mypy | |||||
| .mypy_cache | |||||
| #custom | |||||
| GoogleNews-vectors-negative300.bin/ | |||||
| GoogleNews-vectors-negative300.bin.gz | |||||
| models/ | |||||
| *.swp | |||||
| @@ -0,0 +1,77 @@ | |||||
| ## Introduction | |||||
| This is the implementation of [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) paper in PyTorch. | |||||
| * MR dataset, non-static model (word2vec trained by Mikolov et al. (2013) on 100 billion words of Google News) | |||||
| * It can run on both CPU and GPU | |||||
| * The best accuracy is 82.61%, which is better than 81.5% in the paper | |||||
| (by Jingyuan Liu @ Fudan University; email: fdjingyuan@outlook.com. Discussion is welcome!) | |||||
| ## Requirement | |||||
| * python 3.6 | |||||
| * pytorch > 0.1 | |||||
| * numpy | |||||
| * gensim | |||||
| ## Run | |||||
| STEP 1 | |||||
| Install the required packages, e.g. gensim (the other required packages are installed the same way) | |||||
| ``` | |||||
| pip install gensim | |||||
| ``` | |||||
| STEP 2 | |||||
| Download the MR dataset and the word2vec resources | |||||
| * MR dataset: you can download the dataset from (https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz) | |||||
| * word2vec: you can download the file from (https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit) | |||||
| Since this file is larger than 1.5 GB, it is not included in the repository. After downloading it, remember to adjust the default path in the method def word_embeddings(self, path = './GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'): | |||||
| STEP 3 | |||||
| train the model | |||||
| ``` | |||||
| python train.py | |||||
| ``` | |||||
| The training progress is printed to the screen, for example: | |||||
| ``` | |||||
| Epoch [1/20], Iter [100/192] Loss: 0.7008 | |||||
| Test Accuracy: 71.869159 % | |||||
| Epoch [2/20], Iter [100/192] Loss: 0.5957 | |||||
| Test Accuracy: 75.700935 % | |||||
| Epoch [3/20], Iter [100/192] Loss: 0.4934 | |||||
| Test Accuracy: 78.130841 % | |||||
| ...... | |||||
| Epoch [20/20], Iter [100/192] Loss: 0.0364 | |||||
| Test Accuracy: 81.495327 % | |||||
| Best Accuracy: 82.616822 % | |||||
| Best Model: models/cnn.pkl | |||||
| ``` | |||||
| ## Hyperparameters | |||||
| According to the paper and experiment, I set: | |||||
| |Epoch|Kernel Size|dropout|learning rate|batch size| | |||||
| |---|---|---|---|---| | |||||
| |20|\(h,300,100\)|0.5|0.0001|50| | |||||
| h = [3,4,5] | |||||
| If the test accuracy does not improve after an epoch, the learning rate is multiplied by 0.8. | |||||
| ## Result | |||||
| I only tried one dataset: MR. (The other 6 datasets in the paper are SST-1, SST-2, Subj, TREC, CR, and MPQA.) | |||||
| There are four models in paper: CNN-rand, CNN-static, CNN-non-static, CNN-multichannel. | |||||
| I have tried CNN-non-static: a model with pre-trained vectors from word2vec. | |||||
| All word vectors (the pretrained ones as well as those of unknown words, which are randomly initialized) are fine-tuned for each task | |||||
| (this model has almost the best performance and is the most difficult to implement among the four models) | |||||
| |Dataset|Class Size|Best Result|Kim's Paper Result| | |||||
| |---|---|---|---| | |||||
| |MR|2|82.617%(CNN-non-static)|81.5%(CNN-nonstatic)| | |||||
| ## Reference | |||||
| * [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) | |||||
| * https://github.com/Shawn1993/cnn-text-classification-pytorch | |||||
| * https://github.com/junwang4/CNN-sentence-classification-pytorch-2017/blob/master/utils.py | |||||
| @@ -0,0 +1,149 @@ | |||||
| import re | |||||
| import sys | |||||
| import itertools | |||||
| import numpy as np | |||||
| from torch.utils.data import Dataset, DataLoader | |||||
| import random | |||||
| import os | |||||
| import pickle | |||||
| import codecs | |||||
| from gensim import corpora | |||||
| import gensim | |||||
# Ordered (pattern, replacement) rewrite rules applied by clean_str.
# The backslash-bearing replacements are raw strings so the Python compiler
# does not see invalid escape sequences such as "\(" (a SyntaxWarning on
# recent interpreters). Their runtime values are unchanged.
_CLEAN_STR_RULES = (
    # Replace every character outside the allowed set with a space.
    (r"[^A-Za-z0-9(),!?\'\`]", " "),
    # Split common English contractions off the preceding word.
    (r"\'s", " 's"),
    (r"\'ve", " 've"),
    (r"n\'t", " n't"),
    (r"\'re", " 're"),
    (r"\'d", " 'd"),
    (r"\'ll", " 'll"),
    # Surround punctuation with spaces so it becomes its own token.
    (r",", " , "),
    (r"!", " ! "),
    # NOTE: "(", ")" and "?" are emitted with a leading backslash ("\(",
    # "\)", "\?"), exactly as the original code did. This is kept on purpose
    # so the resulting vocabulary does not change.
    (r"\(", r" \( "),
    (r"\)", r" \) "),
    (r"\?", r" \? "),
    # Collapse runs of whitespace created by the rules above.
    (r"\s{2,}", " "),
)


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    The rules in ``_CLEAN_STR_RULES`` are applied in order with ``re.sub``
    and the result is stripped of leading/trailing whitespace. Case is
    preserved.

    Args:
        string: the raw sentence.

    Returns:
        The cleaned sentence, with tokens separated by single spaces.
    """
    for pattern, replacement in _CLEAN_STR_RULES:
        string = re.sub(pattern, replacement, string)
    return string.strip()
def pad_sentences(sentence, padding_word=" <PAD/>", sequence_length=64):
    """Pad (or truncate) a sentence to exactly ``sequence_length`` tokens.

    Tokens are the whitespace-separated pieces of ``sentence``.

    Args:
        sentence: the sentence to pad, tokens separated by whitespace.
        padding_word: text appended once per missing token. It must start
            with whitespace so that each copy becomes a separate token.
        sequence_length: target number of tokens (defaults to 64, the value
            that used to be hard-coded).

    Returns:
        ``sentence`` followed by as many copies of ``padding_word`` as are
        needed to reach ``sequence_length`` tokens. A sentence that already
        has more tokens is truncated to its first ``sequence_length`` tokens
        (joined by single spaces), so every result has the same token count
        and can be stacked into a fixed-size batch.
    """
    tokens = sentence.split()
    missing = sequence_length - len(tokens)
    if missing < 0:
        # Too long: the original returned the sentence unchanged here, which
        # produced rows of unequal length. Truncate instead.
        return " ".join(tokens[:sequence_length])
    return sentence + padding_word * missing
| #data loader | |||||
class MRDataset(Dataset):
    """Movie-review (MR) polarity dataset as padded sequences of word ids.

    On construction the positive and negative sentence files are read,
    cleaned with ``clean_str``, padded with ``pad_sentences``, labelled
    (positive = 1, negative = 0), shuffled with ``random.shuffle`` and
    converted to arrays of word ids.
    """

    def __init__(self,
                 pos_path="./rt-polaritydata/rt-polarity.pos",
                 neg_path="./rt-polaritydata/rt-polarity.neg"):
        """Load, clean, pad, label, shuffle and index the corpus.

        Args:
            pos_path: file with one positive sentence per line.
            neg_path: file with one negative sentence per line.
        """
        positive_examples = self._load_examples(pos_path)
        negative_examples = self._load_examples(neg_path)
        # All padded sentences: positives first, then negatives.
        self.examples = positive_examples + negative_examples
        # Same sentences as lists of tokens.
        self.sentences_texts = [sample.split() for sample in self.examples]

        # word dictionary
        dictionary = corpora.Dictionary(self.sentences_texts)
        # token -> id mapping, like {"human": 0, "a": 1, ...}
        self.word2id_dict = dictionary.token2id

        # Labels aligned with self.examples: positive is 1, negative is 0.
        # (The attribute name keeps its historical spelling.)
        self.lables = [1] * len(positive_examples) + [0] * len(negative_examples)

        # Shuffle sentences together with their labels.
        examples_lables = list(zip(self.examples, self.lables))
        random.shuffle(examples_lables)
        # Shuffled list of (padded sentence, label) pairs.
        self.MRDataset_frame = examples_lables

        # Shuffled list of (int64 array of word ids, label) pairs.
        self.MRDataset_wordid = [
            (
                np.array([self.word2id_dict[word] for word in text.split()],
                         dtype=np.int64),
                label,
            )
            for text, label in self.MRDataset_frame
        ]

    @staticmethod
    def _load_examples(path):
        """Read one sentence per line from ``path``; clean and pad each one."""
        with codecs.open(path, encoding='ISO-8859-1') as f:
            lines = list(f.readlines())
        # s.strip() removes the trailing newline before cleaning and padding.
        return [pad_sentences(clean_str(s.strip())) for s in lines]

    def word_embeddings(self, path = './GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'):
        """Build the initial embedding matrix from a binary word2vec file.

        Args:
            path: location of the word2vec file in binary format.

        Returns:
            A ``(vocabulary size, 300)`` numpy array. Rows of words found in
            the word2vec model hold the pretrained vector; all other rows
            keep their random initialisation, uniform in [-0.25, 0.25).
        """
        # Announce the wait before the slow load starts, not after it.
        print('Please wait ... (it could take a while to load the file : {})'.format(path))
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

        word_dict = self.word2id_dict
        embedding_weights = np.random.uniform(-0.25, 0.25, (len(word_dict), 300))
        for word, word_id in word_dict.items():
            # Membership is tested on the KeyedVectors object itself because
            # the older ``model.wv.vocab`` attribute is gone in gensim 4.
            if word in model:
                embedding_weights[word_id, :] = model[word]
        return embedding_weights

    def __len__(self):
        """Return the number of (sentence, label) samples."""
        return len(self.MRDataset_frame)

    def __getitem__(self, idx):
        """Return the (word-id array, label) sample(s) at ``idx``.

        ``idx`` is passed straight to the underlying list, so slices work.
        """
        return self.MRDataset_wordid[idx]

    def getsent(self, idx):
        """Return the word-id array of sample ``idx``."""
        return self.MRDataset_wordid[idx][0]

    def getlabel(self, idx):
        """Return the label (1 = positive, 0 = negative) of sample ``idx``."""
        return self.MRDataset_wordid[idx][1]

    def word2id(self):
        """Return the token -> id dictionary."""
        return self.word2id_dict

    def id2word(self):
        """Return a new id -> token dictionary (inverse of ``word2id``)."""
        return {word_id: word for word, word_id in self.word2id_dict.items()}
class train_set(Dataset):
    """Dataset view over an in-memory sequence of training samples."""

    def __init__(self, samples):
        # The sequence is stored as given; no copy is made.
        self.train_frame = samples

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        frame = self.train_frame
        return frame[idx]

    def __len__(self):
        """Return how many training samples are held."""
        frame = self.train_frame
        return len(frame)
class test_set(Dataset):
    """Dataset view over an in-memory sequence of held-out test samples."""

    def __init__(self, samples):
        # The sequence is stored as given; no copy is made.
        self.test_frame = samples

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        frame = self.test_frame
        return frame[idx]

    def __len__(self):
        """Return how many test samples are held."""
        frame = self.test_frame
        return len(frame)
| @@ -0,0 +1,43 @@ | |||||
| import os | |||||
| import sys | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from torch.autograd import Variable | |||||
| import dataset | |||||
class CNN_text(nn.Module):
    """Convolutional sentence classifier with two output classes.

    Word ids are embedded, convolved by one ``Conv2d`` per kernel height
    (each spanning the full embedding width), passed through ReLU,
    max-pooled over the sequence dimension, concatenated, passed through
    dropout and finally through a linear layer that produces 2 logits.
    """

    def __init__(self, kernel_h=(3, 4, 5), kernel_num=100, embed_num=1000,
                 embed_dim=300, dropout=0.5, L2_constrain=3, batchsize=50,
                 pretrained_embeddings=None):
        """Create the layers.

        Args:
            kernel_h: iterable of convolution kernel heights (tokens per
                window); one convolution branch is created per entry.
            kernel_num: number of output channels of each branch.
            embed_num: vocabulary size (rows of the embedding table).
            embed_dim: embedding dimension; also the kernel width.
            dropout: dropout probability applied before the linear layer.
            L2_constrain: accepted for interface compatibility; not used.
            batchsize: accepted for interface compatibility; not used.
            pretrained_embeddings: optional numpy array of shape
                ``(embed_num, embed_dim)`` copied into the embedding table.
        """
        super(CNN_text, self).__init__()
        # Materialise once so len() works and a caller's list is not shared.
        kernel_h = tuple(kernel_h)

        self.embedding = nn.Embedding(embed_num, embed_dim)
        self.dropout = nn.Dropout(dropout)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))

        # the network structure
        # Conv2d input is (N, C, H, W); with the defaults and 64-token input
        # the K=3 branch outputs (N, 100, 62, 1).
        # Sizes follow the constructor arguments instead of fixed 100/300.
        self.conv1 = nn.ModuleList(
            [nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in kernel_h])
        self.fc1 = nn.Linear(len(kernel_h) * kernel_num, 2)

    def max_pooling(self, x):
        """Run every convolution branch on ``x`` and max-pool over time.

        Args:
            x: embedded batch of shape ``(N, 1, L, embed_dim)``.

        Returns:
            Tensor of shape ``(N, len(kernel_h) * kernel_num)`` holding the
            concatenated pooled features of all branches.
        """
        pooled = []
        for conv in self.conv1:
            feature_map = F.relu(conv(x)).squeeze(3)  # (N, C, L - K + 1)
            # Pool over the whole remaining length, leaving one value per channel.
            pooled.append(
                F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))  # (N, C)
        return torch.cat(pooled, 1)

    def forward(self, x):
        """Compute class logits for a batch of word-id sequences.

        Args:
            x: integer tensor of word ids with shape ``(N, L)``; ``L`` must
                be at least the largest kernel height.

        Returns:
            Tensor of shape ``(N, 2)`` with unnormalised class scores.
        """
        x = self.embedding(x)   # (N, L, embed_dim)
        x = x.unsqueeze(1)      # (N, 1, L, embed_dim)
        x = self.max_pooling(x)
        x = self.dropout(x)
        x = self.fc1(x)
        return x
| @@ -0,0 +1,102 @@ | |||||
| import os | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torchvision.datasets as dsets | |||||
| import torchvision.transforms as transforms | |||||
| import dataset as dst | |||||
| from model import CNN_text | |||||
| from torch.autograd import Variable | |||||
| from sklearn import cross_validation | |||||
| from sklearn import datasets | |||||
# Hyper Parameters
batch_size = 50
learning_rate = 0.0001
num_epochs = 20
# Use the GPU only when one is actually available, so the script also runs
# on CPU-only machines.
cuda = torch.cuda.is_available()

# split Dataset: first 90% for training, the remaining 10% for testing
# (MRDataset shuffles its samples on construction).
dataset = dst.MRDataset()
length = len(dataset)
split_index = int(0.9 * length)
train_dataset = dst.train_set(dataset[:split_index])
test_dataset = dst.test_set(dataset[split_index:])

# Data Loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

# cnn
cnn = CNN_text(embed_num=len(dataset.word2id()), pretrained_embeddings=dataset.word_embeddings())
if cuda:
    cnn.cuda()

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)

# train and test
best_acc = None

for epoch in range(num_epochs):
    # Train the Model
    cnn.train()
    for i, (sents, labels) in enumerate(train_loader):
        if cuda:
            sents = sents.cuda()
            labels = labels.cuda()

        optimizer.zero_grad()
        outputs = cnn(sents)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            # len(train_loader) is the true number of batches per epoch,
            # including a final partial batch. loss.item() reads the scalar
            # loss; indexing a 0-dim tensor with [0] raises an error.
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(train_loader), loss.item()))

    # Test the Model
    cnn.eval()
    correct = 0
    total = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for sents, labels in test_loader:
            if cuda:
                sents = sents.cuda()
                labels = labels.cuda()
            outputs = cnn(sents)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            # .item() keeps the running count a plain Python int instead of
            # accumulating inside a small-integer tensor.
            correct += (predicted == labels).sum().item()

    acc = 100. * correct / total
    print('Test Accuracy: %f %%' % (acc))

    if best_acc is None or acc > best_acc:
        best_acc = acc
        os.makedirs("models", exist_ok=True)
        torch.save(cnn.state_dict(), 'models/cnn.pkl')
    else:
        # Decay the learning rate and push it into the optimizer; changing
        # only the Python variable has no effect on training.
        learning_rate = learning_rate * 0.8
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate

print("Best Accuracy: %f %%" % best_acc)
print("Best Model: models/cnn.pkl")