import re
import random
import codecs

import numpy as np
import gensim
from gensim import corpora
from torch.utils.data import Dataset, DataLoader


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", r" \( ", string)
    string = re.sub(r"\)", r" \) ", string)
    string = re.sub(r"\?", r" \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()


def pad_sentences(sentence, padding_word="<PAD/>"):
    """Pad a cleaned sentence to a fixed length of 64 tokens.

    Padding with a real token (rather than spaces, which .split() would
    discard) keeps every example the same length, so batches can be stacked
    by the DataLoader. Sentences longer than 64 tokens are left unchanged.
    """
    sequence_length = 64
    sent = sentence.split()
    padded_sentence = sentence + (" " + padding_word) * (sequence_length - len(sent))
    return padded_sentence


# data loader
class MRDataset(Dataset):
    def __init__(self):
        # load positive and negative sentences from files
        with codecs.open("./rt-polaritydata/rt-polarity.pos", encoding='ISO-8859-1') as f:
            positive_examples = f.readlines()
        with codecs.open("./rt-polaritydata/rt-polarity.neg", encoding='ISO-8859-1') as f:
            negative_examples = f.readlines()

        # s.strip(): remove the trailing "\n"; then clean and pad each sentence
        positive_examples = [pad_sentences(clean_str(s.strip())) for s in positive_examples]
        negative_examples = [pad_sentences(clean_str(s.strip())) for s in negative_examples]
        self.examples = positive_examples + negative_examples
        self.sentences_texts = [sample.split() for sample in self.examples]

        # word dictionary mapping each token to an integer id, e.g. {"human": 0, "a": 1, ...}
        dictionary = corpora.Dictionary(self.sentences_texts)
        self.word2id_dict = dictionary.token2id

        # set labels: positive is 1, negative is 0
        positive_labels = [1 for _ in positive_examples]
        negative_labels = [0 for _ in negative_examples]
        self.labels = positive_labels + negative_labels

        examples_labels = list(zip(self.examples, self.labels))
        random.shuffle(examples_labels)
        self.MRDataset_frame = examples_labels

        # transform each word to its id
        self.MRDataset_wordid = [
            (np.array([self.word2id_dict[word] for word in sent[0].split()],
                      dtype=np.int64),
             sent[1])
            for sent in self.MRDataset_frame
        ]

    def word_embeddings(self, path='./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'):
        """Build a (vocab_size, 300) embedding matrix from pretrained word2vec vectors.

        Words not found in the pretrained model keep a random initialization
        in [-0.25, 0.25].
        """
        print('Please wait ... (it could take a while to load the file: {})'.format(path))
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
        word_dict = self.word2id_dict
        embedding_weights = np.random.uniform(-0.25, 0.25, (len(word_dict), 300))
        for word, word_id in word_dict.items():
            if word in model:  # KeyedVectors supports membership tests directly
                embedding_weights[word_id, :] = model[word]
        return embedding_weights

    def __len__(self):
        return len(self.MRDataset_frame)

    def __getitem__(self, idx):
        return self.MRDataset_wordid[idx]

    def getsent(self, idx):
        return self.MRDataset_wordid[idx][0]

    def getlabel(self, idx):
        return self.MRDataset_wordid[idx][1]

    def word2id(self):
        return self.word2id_dict

    def id2word(self):
        return {val: key for key, val in self.word2id_dict.items()}


class train_set(Dataset):
    def __init__(self, samples):
        self.train_frame = samples

    def __len__(self):
        return len(self.train_frame)

    def __getitem__(self, idx):
        return self.train_frame[idx]


class test_set(Dataset):
    def __init__(self, samples):
        self.test_frame = samples

    def __len__(self):
        return len(self.test_frame)

    def __getitem__(self, idx):
        return self.test_frame[idx]
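

# --- Example usage (a minimal sketch, not part of the original module) ---
# Shows how the pieces above are meant to fit together: build the dataset,
# split it into train/test subsets, and batch the subsets with DataLoader.
# The 90/10 split ratio and batch size of 32 are assumptions for illustration.
if __name__ == "__main__":
    dataset = MRDataset()
    split = int(len(dataset) * 0.9)
    train_samples = [dataset[i] for i in range(split)]
    test_samples = [dataset[i] for i in range(split, len(dataset))]

    train_loader = DataLoader(train_set(train_samples), batch_size=32, shuffle=True)
    test_loader = DataLoader(test_set(test_samples), batch_size=32, shuffle=False)

    # Each batch stacks the fixed-length id sequences into a (batch, 64)
    # LongTensor with a matching (batch,) tensor of 0/1 labels.
    for sents, labels in train_loader:
        print(sents.shape, labels.shape)
        break

    # Pretrained embedding matrix for an nn.Embedding layer; commented out
    # here because it requires the GoogleNews vectors on disk:
    # embedding_weights = dataset.word_embeddings()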