@@ -0,0 +1,110 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache

# custom
GoogleNews-vectors-negative300.bin/
GoogleNews-vectors-negative300.bin.gz
models/
*.swp
@@ -0,0 +1,77 @@
## Introduction
This is a PyTorch implementation of the [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) paper.
* MR dataset, non-static model (word2vec trained by Mikolov et al. (2013) on 100 billion words of Google News)
* It can run on both CPU and GPU
* The best accuracy is 82.61%, which is better than the 81.5% reported in the paper

(by Jingyuan Liu @ Fudan University; email: fdjingyuan@outlook.com. Discussion is welcome!)

## Requirement
* python 3.6
* pytorch > 0.1
* numpy
* gensim

## Run
STEP 1
Install the required packages, e.g. gensim (the other dependencies are installed the same way):
```
pip install gensim
```

STEP 2
Download the MR dataset and the word2vec resources:
* MR dataset: download from https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz
* word2vec: download from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

Since the word2vec file is more than 1.5 GB, it is not included in this repository. After downloading it, remember to modify the path in `def word_embeddings(path = './GoogleNews-vectors-negative300.bin/')` accordingly.
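As a quick sanity check that the file is in place, you can try loading it directly (a minimal sketch; the path below matches the default used in dataset.py, adjust it to your local copy):
```
import gensim

# adjust this path to wherever you unpacked the downloaded file
path = './GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
print(model['good'].shape)  # expect (300,)
```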
STEP 3
Train the model:
```
python train.py
```
You will see progress printed to the screen, like:
```
Epoch [1/20], Iter [100/192] Loss: 0.7008
Test Accuracy: 71.869159 %
Epoch [2/20], Iter [100/192] Loss: 0.5957
Test Accuracy: 75.700935 %
Epoch [3/20], Iter [100/192] Loss: 0.4934
Test Accuracy: 78.130841 %
......
Epoch [20/20], Iter [100/192] Loss: 0.0364
Test Accuracy: 81.495327 %
Best Accuracy: 82.616822 %
Best Model: models/cnn.pkl
```

## Hyperparameters
Following the paper and my experiments, I set:

|Epoch|Kernel Size|Dropout|Learning Rate|Batch Size|
|---|---|---|---|---|
|20|(h, 300, 100)|0.5|0.0001|50|

where h = [3, 4, 5].
If the test accuracy does not improve after an epoch, the learning rate is multiplied by 0.8.
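Concretely, the decay step in train.py updates the optimizer in place; a minimal sketch of that logic:
```
# if the epoch did not improve the best accuracy so far, decay the learning rate
learning_rate *= 0.8
for param_group in optimizer.param_groups:
    param_group['lr'] = learning_rate
```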
## Result
I only tried one dataset: MR. (The paper evaluates on six others: SST-1, SST-2, Subj, TREC, CR, MPQA.)
There are four models in the paper: CNN-rand, CNN-static, CNN-non-static, CNN-multichannel.
I have implemented CNN-non-static: a model initialized with pre-trained vectors from word2vec, in which all words, including the unknown ones that are randomly initialized, are fine-tuned for each task.
(Among the four models it achieves nearly the best performance and is the most difficult to implement.)
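For reference, the only difference between the static and non-static variants is whether the embedding weights are updated by the optimizer; a minimal illustrative sketch (sizes and names are placeholders):
```
import numpy as np
import torch
import torch.nn as nn

vocab_size, embed_dim = 1000, 300  # illustrative sizes
pretrained = np.random.uniform(-0.25, 0.25, (vocab_size, embed_dim))  # stand-in for word2vec weights

embedding = nn.Embedding(vocab_size, embed_dim)
embedding.weight.data.copy_(torch.from_numpy(pretrained))
embedding.weight.requires_grad = True    # CNN-non-static: fine-tune the vectors
# embedding.weight.requires_grad = False # CNN-static: keep them frozen
```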
|Dataset|Class Size|Best Result|Kim's Paper Result|
|---|---|---|---|
|MR|2|82.617% (CNN-non-static)|81.5% (CNN-non-static)|

## Reference
* [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882)
* https://github.com/Shawn1993/cnn-text-classification-pytorch
* https://github.com/junwang4/CNN-sentence-classification-pytorch-2017/blob/master/utils.py
@@ -0,0 +1,149 @@
import re
import sys
import itertools
import numpy as np
from torch.utils.data import Dataset, DataLoader
import random
import os
import pickle
import codecs
from gensim import corpora
import gensim


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()
def pad_sentences(sentence, padding_word=" <PAD/>"):
    # pad every sentence to a fixed length of 64 tokens
    # (assumes no sentence exceeds 64 tokens, which holds for the MR dataset)
    sequence_length = 64
    sent = sentence.split()
    padded_sentence = sentence + padding_word * (sequence_length - len(sent))
    return padded_sentence
# data loader
class MRDataset(Dataset):
    def __init__(self):
        # load positive and negative sentences from files
        with codecs.open("./rt-polaritydata/rt-polarity.pos", encoding='ISO-8859-1') as f:
            positive_examples = list(f.readlines())
        with codecs.open("./rt-polaritydata/rt-polarity.neg", encoding='ISO-8859-1') as f:
            negative_examples = list(f.readlines())
        # s.strip(): remove "\n"; clean_str(): normalize; pad_sentences(): pad to fixed length
        positive_examples = [pad_sentences(clean_str(s.strip())) for s in positive_examples]
        negative_examples = [pad_sentences(clean_str(s.strip())) for s in negative_examples]
        self.examples = positive_examples + negative_examples
        self.sentences_texts = [sample.split() for sample in self.examples]

        # word dictionary
        dictionary = corpora.Dictionary(self.sentences_texts)
        self.word2id_dict = dictionary.token2id  # a dict like {"human": 0, "a": 1, ...}

        # set labels: positive is 1, negative is 0
        positive_labels = [1 for _ in positive_examples]
        negative_labels = [0 for _ in negative_examples]
        self.labels = positive_labels + negative_labels
        examples_labels = list(zip(self.examples, self.labels))
        random.shuffle(examples_labels)
        self.MRDataset_frame = examples_labels

        # transform words to ids
        self.MRDataset_wordid = \
            [(
                np.array([self.word2id_dict[word] for word in sent[0].split()], dtype=np.int64),
                sent[1]
            ) for sent in self.MRDataset_frame]
    def word_embeddings(self, path='./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'):
        # build the embedding matrix from the pre-trained Google News vectors
        print('Please wait ... (it could take a while to load the file : {})'.format(path))
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
        word_dict = self.word2id_dict
        # unknown words are initialized uniformly in [-0.25, 0.25], as in the paper
        embedding_weights = np.random.uniform(-0.25, 0.25, (len(self.word2id_dict), 300))
        for word in word_dict:
            word_id = word_dict[word]
            if word in model.vocab:  # gensim < 4.0; use model.key_to_index on gensim >= 4.0
                embedding_weights[word_id, :] = model[word]
        return embedding_weights
    def __len__(self):
        return len(self.MRDataset_frame)

    def __getitem__(self, idx):
        sample = self.MRDataset_wordid[idx]
        return sample

    def getsent(self, idx):
        sample = self.MRDataset_wordid[idx][0]
        return sample

    def getlabel(self, idx):
        label = self.MRDataset_wordid[idx][1]
        return label

    def word2id(self):
        return self.word2id_dict

    def id2word(self):
        id2word_dict = {val: key for key, val in self.word2id_dict.items()}
        return id2word_dict


class train_set(Dataset):
    def __init__(self, samples):
        self.train_frame = samples

    def __len__(self):
        return len(self.train_frame)

    def __getitem__(self, idx):
        return self.train_frame[idx]


class test_set(Dataset):
    def __init__(self, samples):
        self.test_frame = samples

    def __len__(self):
        return len(self.test_frame)

    def __getitem__(self, idx):
        return self.test_frame[idx]
@@ -0,0 +1,61 @@
import os
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset
import dataset

"""
# some information, kept for reference
mode = "static"
use_pretrained_embedding = "gensim.word2vec"
print('MODE = {}'.format(mode))
print('EMBEDDING = {}\n'.format(use_pretrained_embedding))
embedding_weights = dataset.MRDataset().word_embeddings()
embed_num = len(embedding_weights)
embed_dim = 300
class_num = 2
len_sentence = 64
print('embedding size = {}'.format(embed_num))
print('embedding dimension = {}'.format(embed_dim))
print('sentence len n = {}'.format(len_sentence))
print('num of classes = {}'.format(class_num))
"""
class CNN_text(nn.Module):
    def __init__(self, kernel_h=[3, 4, 5], kernel_num=100, embed_num=1000, embed_dim=300,
                 dropout=0.5, L2_constrain=3, batchsize=50, pretrained_embeddings=None):
        super(CNN_text, self).__init__()
        self.embedding = nn.Embedding(embed_num, embed_dim)
        self.dropout = nn.Dropout(dropout)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
        # the network structure
        # Conv2d: input (N,C,H,W), output (50,100,64-K+1,1) for each kernel height K
        self.conv1 = nn.ModuleList([nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in kernel_h])
        # one kernel_num-dim feature per kernel size, concatenated: 3*100 = 300
        self.fc1 = nn.Linear(len(kernel_h) * kernel_num, 2)
    def max_pooling(self, x, conv):
        # helper kept for reference; forward() applies the same ops inline
        x = F.relu(conv(x)).squeeze(3)             # (N,C,L), e.g. (50,100,62)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)  # (50,100,1) -> (50,100)
        return x
    def forward(self, x):
        x = self.embedding(x)  # (N,H,W) = (50,64,300)
        x = x.unsqueeze(1)     # (N,C,H,W) = (50,1,64,300)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.conv1]  # [(50,100,62), (50,100,61), (50,100,60)]
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]   # [(50,100), (50,100), (50,100)]
        x = torch.cat(x, 1)    # (50,300)
        x = self.dropout(x)
        x = self.fc1(x)
        return x
@@ -0,0 +1,100 @@
import os
import torch
import torch.nn as nn
import dataset as dst
from model import CNN_text
from torch.autograd import Variable
# Hyper Parameters
batch_size = 50
learning_rate = 0.0001
num_epochs = 20
cuda = True

# split the dataset: 90% train, 10% test
dataset = dst.MRDataset()
length = len(dataset)
train_dataset = dataset[:int(0.9 * length)]
test_dataset = dataset[int(0.9 * length):]
train_dataset = dst.train_set(train_dataset)
test_dataset = dst.test_set(test_dataset)

# Data Loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

cnn = CNN_text(embed_num=len(dataset.word2id()), pretrained_embeddings=dataset.word_embeddings())
if cuda:
    cnn.cuda()

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)
best_acc = None

for epoch in range(num_epochs):
    # Train the Model
    cnn.train()
    for i, (sents, labels) in enumerate(train_loader):
        sents = Variable(sents)
        labels = Variable(labels)
        if cuda:
            sents = sents.cuda()
            labels = labels.cuda()
        optimizer.zero_grad()
        outputs = cnn(sents)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            # loss.data[0] works on PyTorch < 0.4; use loss.item() on newer versions
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(train_dataset) // batch_size, loss.data[0]))
    # Test the Model
    cnn.eval()
    correct = 0
    total = 0
    for sents, labels in test_loader:
        sents = Variable(sents)
        if cuda:
            sents = sents.cuda()
            labels = labels.cuda()
        outputs = cnn(sents)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    acc = 100. * correct / total
    print('Test Accuracy: %f %%' % acc)

    if best_acc is None or acc > best_acc:
        best_acc = acc
        if os.path.exists("models") is False:
            os.makedirs("models")
        torch.save(cnn.state_dict(), 'models/cnn.pkl')
    else:
        # decay the learning rate in the optimizer itself, not just the local variable
        learning_rate = learning_rate * 0.8
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate

print("Best Accuracy: %f %%" % best_acc)
print("Best Model: models/cnn.pkl")