|
|
|
import codecs
import math
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd

import mindspore.dataset as ds
|
|
|
|
|
|
|
|
|
|
|
@@ -36,9 +37,56 @@ class Generator(): |
|
|
|
return len(self.input_list) |
|
|
|
|
|
|
|
|
|
|
|
class DataProcessor:
    """
    preprocess dataset

    Base class for the concrete dataset preprocessors below. Subclasses
    are expected to populate ``self.Vocab`` (word -> index), ``self.train``,
    ``self.test`` and ``self.doConvert`` before the dataset helpers are used.
    """

    def get_dict_len(self):
        """
        get number of different words in the whole dataset

        Returns -1 if text2vec() has not been run yet (self.doConvert False).
        """
        if self.doConvert:
            return len(self.Vocab)
        return -1

    def collect_weight(self, glove_path, embed_size):
        """ collect weight

        Build an embedding weight matrix from a GloVe-format text file
        (one line per word: ``word v1 v2 ...``). Words absent from
        self.Vocab are skipped; words absent from the GloVe file keep a
        zero row.

        Args:
            glove_path: path of the GloVe vectors file.
            embed_size: embedding dimension.

        Returns:
            float32 np.ndarray of shape (vocab_size, embed_size).
        """
        vocab_size = self.get_dict_len()
        embedding_index = {}
        # Fix: use a context manager so the GloVe file handle is closed
        # (the original `open()` was never closed).
        with open(glove_path) as f:
            for line in f:
                values = line.split()
                word = values[0]
                vec = np.array(values[1:], dtype='float32')
                embedding_index[word] = vec
        weight_np = np.zeros((vocab_size, embed_size)).astype(np.float32)

        for word, vec in embedding_index.items():
            try:
                index = self.Vocab[word]
            except KeyError:
                continue
            weight_np[index, :] = vec
        return weight_np

    def create_train_dataset(self, epoch_size, batch_size, collect_weight=False, glove_path='', embed_size=50):
        """
        Create the training GeneratorDataset over self.train.

        When collect_weight is True, also dump the GloVe weight matrix to
        ./weight.txt as a side effect.
        """
        if collect_weight:
            weight_np = self.collect_weight(glove_path, embed_size)
            np.savetxt('./weight.txt', weight_np)
        dataset = ds.GeneratorDataset(source=Generator(input_list=self.train),
                                      column_names=["data", "label"], shuffle=False)
        # drop_remainder keeps every batch exactly batch_size samples
        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)
        return dataset

    def create_test_dataset(self, batch_size):
        """Create the test GeneratorDataset over self.test."""
        dataset = ds.GeneratorDataset(source=Generator(input_list=self.test),
                                      column_names=["data", "label"], shuffle=False)
        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)
        return dataset
|
|
|
|
|
|
|
class MovieReview(DataProcessor): |
|
|
|
""" |
|
|
|
preprocess MovieReview dataset |
|
|
|
""" |
|
|
|
def __init__(self, root_dir, maxlen, split): |
|
|
|
""" |
|
|
|
@@ -151,7 +199,6 @@ class MovieReview: |
|
|
|
# Vocab = {word : index} |
|
|
|
self.Vocab = dict() |
|
|
|
|
|
|
|
# self.Vocab['None'] |
|
|
|
for SentenceLabel in self.Pos+self.Neg: |
|
|
|
vector = [0]*maxlen |
|
|
|
for index, word in enumerate(SentenceLabel[0]): |
|
|
|
@@ -185,27 +232,256 @@ class MovieReview: |
|
|
|
self.train = [i for item in pos_temp+neg_temp for i in item] |
|
|
|
|
|
|
|
random.shuffle(self.train) |
|
|
|
# random.shuffle(self.test) |
|
|
|
|
|
|
|
class Subjectivity(DataProcessor):
    """
    preprocess Subjectivity dataset
    """

    def __init__(self, root_dir, maxlen, split):
        """
        Args:
            root_dir: directory holding the Subjectivity data files
                ('quote*' files -> subjective/pos, 'plot*' files ->
                objective/neg — see read_data()).
            maxlen: max sentence length used by text2vec().
            split: ratio of training set to test set.

        Raises:
            ValueError: if root_dir does not exist or is not a directory.
        """
        self.path = root_dir
        self.feelMap = {
            'neg': 0,
            'pos': 1
        }
        self.files = []
        self.doConvert = False
        mypath = Path(self.path)

        if not mypath.exists() or not mypath.is_dir():
            print("please check the root_dir!")
            raise ValueError

        # walk through the root_dir; `break` keeps only the top level
        for root, _, filename in os.walk(self.path):
            for each in filename:
                self.files.append(os.path.join(root, each))
            break

        # begin to read data
        self.word_num = 0
        self.minlen = float("inf")
        self.maxlen = float("-inf")
        self.Pos = []
        self.Neg = []
        for filename in self.files:
            self.read_data(filename)
        self.PosNeg = self.Pos + self.Neg
        self.text2vec(maxlen=maxlen)
        self.split_dataset(split=split)

    def read_data(self, filePath):
        """
        read text into memory

        input:
            filePath: the path where the data is stored in
        """
        # One-pass deletion of punctuation/digits; order-equivalent to the
        # long .replace() chain it replaces. '<b>' and '%' are removed
        # afterwards, matching the original replacement order.
        cleaner = str.maketrans('', '', '\n"\'.,[]():-\\0123456789`=$/*;')
        with open(filePath, 'r', encoding="ISO-8859-1") as f:
            for sentence in f.readlines():
                sentence = sentence.translate(cleaner).replace('<b>', '').replace('%', '')
                sentence = [word for word in sentence.split(' ') if word]
                if sentence:
                    self.word_num += len(sentence)
                    self.maxlen = max(self.maxlen, len(sentence))
                    self.minlen = min(self.minlen, len(sentence))
                    if 'quote' in filePath:
                        # 'quote' files hold subjective (positive) samples
                        self.Pos.append([sentence, self.feelMap['pos']])
                    elif 'plot' in filePath:
                        # 'plot' files hold objective (negative) samples
                        self.Neg.append([sentence, self.feelMap['neg']])

    def text2vec(self, maxlen):
        """
        convert the sentence into a vector in an int type

        input:
            maxlen: max length of the sentence
        """
        # Vocab = {word : index}
        self.Vocab = dict()

        for SentenceLabel in self.Pos+self.Neg:
            vector = [0]*maxlen
            for index, word in enumerate(SentenceLabel[0]):
                if index >= maxlen:
                    break
                if word not in self.Vocab:
                    # new word: assign the next free index
                    self.Vocab[word] = len(self.Vocab)
                    vector[index] = len(self.Vocab) - 1
                else:
                    vector[index] = self.Vocab[word]
            SentenceLabel[0] = vector
        self.doConvert = True

    def split_dataset(self, split):
        """
        split the dataset into training set and test set

        input:
            split: the ratio of training set to test set
        """
        trunk_pos_size = math.ceil((1-split)*len(self.Pos))
        trunk_neg_size = math.ceil((1-split)*len(self.Neg))
        trunk_num = int(1/(1-split))
        pos_temp = list()
        neg_temp = list()
        for index in range(trunk_num):
            pos_temp.append(self.Pos[index*trunk_pos_size:(index+1)*trunk_pos_size])
            neg_temp.append(self.Neg[index*trunk_neg_size:(index+1)*trunk_neg_size])
        # trunk #2 becomes the test set; the rest is flattened into train
        self.test = pos_temp.pop(2)+neg_temp.pop(2)
        self.train = [i for item in pos_temp+neg_temp for i in item]

        random.shuffle(self.train)
|
|
|
|
|
|
|
class SST2(DataProcessor):
    """
    preprocess SST2 dataset
    """

    def __init__(self, root_dir, maxlen, split):
        """
        Args:
            root_dir: directory holding the SST2 tsv files; only files
                whose name contains 'train' or 'dev' are read.
            maxlen: max sentence length used by text2vec().
            split: kept for interface consistency with the other
                preprocessors; the train/dev files already define the
                split (see split_dataset()).

        Raises:
            ValueError: if root_dir does not exist or is not a directory.
        """
        self.path = root_dir
        self.files = []
        self.train = []
        self.test = []
        self.doConvert = False
        mypath = Path(self.path)

        if not mypath.exists() or not mypath.is_dir():
            print("please check the root_dir!")
            raise ValueError

        # walk through the root_dir; `break` keeps only the top level
        for root, _, filename in os.walk(self.path):
            for each in filename:
                self.files.append(os.path.join(root, each))
            break

        # begin to read data
        self.word_num = 0
        self.minlen = float("inf")
        self.maxlen = float("-inf")
        for filename in self.files:
            if 'train' in filename or 'dev' in filename:
                # Re-encode the file to UTF-8 in place before parsing.
                # Fix: the original leaked both codecs handles and called
                # read_data() before the write handle was closed, so the
                # re-encoded content was not guaranteed to be flushed.
                with codecs.open(filename, 'r') as f:
                    ff = f.read()
                with codecs.open(filename, 'w', 'utf-8') as file_object:
                    file_object.write(ff)
                self.read_data(filename)
        self.text2vec(maxlen=maxlen)
        self.split_dataset(split=split)

    def read_data(self, filePath):
        """
        read text into memory

        input:
            filePath: the path where the data is stored in
        """
        # One-pass deletion of punctuation/digits; order-equivalent to the
        # long .replace() chain it replaces. '<b>' and '%' are removed
        # afterwards, matching the original replacement order.
        cleaner = str.maketrans('', '', '\n"\'.,[]():-\\0123456789`=$/*;')
        df = pd.read_csv(filePath, delimiter='\t')
        for sentence, label in zip(df['sentence'], df['label']):
            sentence = sentence.translate(cleaner).replace('<b>', '').replace('%', '')
            sentence = [word for word in sentence.split(' ') if word]
            if sentence:
                self.word_num += len(sentence)
                self.maxlen = max(self.maxlen, len(sentence))
                self.minlen = min(self.minlen, len(sentence))
                if 'train' in filePath:
                    self.train.append([sentence, label])
                elif 'dev' in filePath:
                    self.test.append([sentence, label])

    def text2vec(self, maxlen):
        """
        convert the sentence into a vector in an int type

        input:
            maxlen: max length of the sentence
        """
        # Vocab = {word : index}
        self.Vocab = dict()

        for SentenceLabel in self.train+self.test:
            vector = [0]*maxlen
            for index, word in enumerate(SentenceLabel[0]):
                if index >= maxlen:
                    break
                if word not in self.Vocab:
                    # new word: assign the next free index
                    self.Vocab[word] = len(self.Vocab)
                    vector[index] = len(self.Vocab) - 1
                else:
                    vector[index] = self.Vocab[word]
            SentenceLabel[0] = vector
        self.doConvert = True

    def split_dataset(self, split):
        """
        split the dataset into training set and test set

        Train and test are already separated by source file ('train' vs
        'dev'), so `split` is unused here; only the training set is
        shuffled.

        input:
            split: the ratio of training set to test set
        """
        random.shuffle(self.train)