| @@ -1,2 +1,2 @@ | |||||
| # FastNLP | # FastNLP | ||||
| FastNLP | |||||
| FastNLP | |||||
| @@ -0,0 +1,8 @@ | |||||
| SpaCy "Doc" | |||||
| https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/tokens/doc.pyx#L80 | |||||
| SpaCy "Vocab" | |||||
| https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/vocab.pyx#L25 | |||||
| SpaCy "Token" | |||||
| https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/tokens/token.pyx#L27 | |||||
| @@ -1,3 +1,6 @@ | |||||
| from saver.logger import Logger | |||||
class Action(object):
    """
    Base class for Trainer and Tester.

    Owns a file logger and the mini-batch helpers shared by both subclasses.
    """

    def __init__(self):
        super(Action, self).__init__()
        self.logger = Logger("logger_output.txt")

    def load_config(self, args):
        """Load a configuration. Must be overridden by subclasses."""
        raise NotImplementedError

    def load_dataset(self, args):
        """Load a data set. Must be overridden by subclasses."""
        raise NotImplementedError

    def log(self, string):
        """Forward ``string`` to the logger created in ``__init__``."""
        self.logger.log(string)

    def batchify(self, batch_size, X, Y=None):
        """
        Split ``X`` (and optionally ``Y``) into consecutive full batches.

        Samples that do not fill a complete batch at the end are dropped.

        :param batch_size: int, number of samples per batch (must be positive)
        :param X: feature matrix of size [n_sample, m_feature]; anything that
                  supports ``len()`` and slicing (tensor, array, list)
        :param Y: label vector of size [n_sample, 1] (optional)
        :return iteration: int, the number of steps in each epoch
                generator: generator yielding one tuple of slices per step,
                           ``(x_batch,)`` or ``(x_batch, y_batch)``
        :raises ValueError: if ``batch_size`` is not a positive number
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be positive, got {}".format(batch_size))
        # len() gives the size of the first dimension for tensors, arrays and
        # lists alike, whereas X.size() is only callable on tensors.
        n_samples = len(X)
        num_iter = n_samples // batch_size
        data = (X,) if Y is None else (X, Y)
        return num_iter, self._batch_generate(batch_size, num_iter, *data)

    @staticmethod
    def _batch_generate(batch_size, num_iter, *data):
        """Yield ``num_iter`` tuples holding the same slice of every item in ``data``."""
        for step in range(num_iter):
            start = batch_size * step
            end = start + batch_size
            yield tuple(x[start:end] for x in data)

    def make_log(self, *args):
        """Build a log record; the base implementation returns a fixed placeholder."""
        return "log"
| @@ -1,9 +1,87 @@ | |||||
| from collections import namedtuple | |||||
| import numpy as np | |||||
| from action.action import Action | from action.action import Action | ||||
class Tester(Action):
    """
    Runs a model controller over a data set in test mode, batch by batch,
    optionally keeping the predictions and the mean loss.
    """

    TestConfig = namedtuple("config", ["validate_in_training", "save_dev_input", "save_output",
                                       "save_loss", "batch_size"])

    def __init__(self, test_args):
        """
        :param test_args: named tuple carrying the fields of ``Tester.TestConfig``
        """
        super(Tester, self).__init__()
        self.validate_in_training = test_args.validate_in_training
        self.save_dev_input = test_args.save_dev_input
        # prepared inputs, cached across calls when save_dev_input is set
        self.valid_x = None
        self.valid_y = None
        self.save_output = test_args.save_output
        self.output = None
        self.save_loss = test_args.save_loss
        self.mean_loss = None
        self.batch_size = test_args.batch_size

    def test(self, network, data):
        """
        Evaluate ``network`` on ``data``.

        :param network: model controller providing ``mode``, ``prepare_input``,
                        ``data_forward`` and ``get_loss``
        :param data: raw data handed to ``network.prepare_input``
        :return: None; results are exposed through ``loss`` and ``result``
        """
        print("testing")
        network.mode(test=True)  # turn on the testing mode

        if self.save_dev_input and self.valid_x is not None:
            # reuse the inputs prepared by an earlier call
            valid_x, valid_y = self.valid_x, self.valid_y
        else:
            valid_x, valid_y = network.prepare_input(data)
            if self.save_dev_input:
                self.valid_x, self.valid_y = valid_x, valid_y

        # split into batches by self.batch_size
        _, test_batch_generator = self.batchify(self.batch_size, valid_x, valid_y)

        batch_output = list()
        loss_history = list()
        for step, (batch_x, batch_y) in enumerate(test_batch_generator):
            # forward pass from tests input to predicted output
            prediction = network.data_forward(batch_x)
            loss = network.get_loss(prediction, batch_y)

            if self.save_output:
                batch_output.append(prediction.data)
            if self.save_loss:
                loss_history.append(loss)
            self.log(self.make_log(step, loss))

        if self.save_loss:
            # with no full batch there is nothing to average; report NaN
            # explicitly instead of relying on np.mean of an empty array
            if loss_history:
                self.mean_loss = np.mean(np.array(loss_history))
            else:
                self.mean_loss = float("nan")
        if self.save_output:
            self.output = self.make_output(batch_output)

    @property
    def loss(self):
        """Mean loss of the last ``test`` call (None until save_loss has produced one)."""
        return self.mean_loss

    @property
    def result(self):
        """Concatenated predictions of the last ``test`` call (None until save_output has produced them)."""
        return self.output

    @staticmethod
    def make_output(batch_outputs):
        """Construct the full prediction from the per-batch outputs."""
        if not batch_outputs:
            # np.concatenate raises on an empty sequence
            return np.array([])
        return np.concatenate(batch_outputs, axis=0)

    def load_config(self, args):
        raise NotImplementedError

    def load_dataset(self, args):
        raise NotImplementedError
| @@ -1,14 +1,91 @@ | |||||
| from action.action import Action | |||||
| from collections import namedtuple | |||||
| from .action import Action | |||||
| from .tester import Tester | |||||
class Trainer(Action):
    """
    Trainer for common training logic of all models
    """
    TrainConfig = namedtuple("config", ["epochs", "validate", "save_when_better",
                                        "log_per_step", "log_validation", "batch_size"])

    def __init__(self, train_args):
        """
        :param train_args: namedtuple carrying the fields of ``Trainer.TrainConfig``
        """
        super(Trainer, self).__init__()
        self.n_epochs = train_args.epochs
        self.validate = train_args.validate
        self.save_when_better = train_args.save_when_better
        self.log_per_step = train_args.log_per_step
        self.log_validation = train_args.log_validation
        self.batch_size = train_args.batch_size

    def train(self, network, train_data, dev_data):
        """
        Run the training loop and, if configured, validate after every epoch.

        :param network: the model controller
        :param train_data: raw data for training
        :param dev_data: raw data for validation
        :return: None
        """
        train_x, train_y = network.prepare_input(train_data)

        test_args = Tester.TestConfig(save_output=True, validate_in_training=True,
                                      save_dev_input=True, save_loss=True, batch_size=self.batch_size)
        evaluator = Tester(test_args)

        best_loss = 1e10

        for epoch in range(self.n_epochs):
            network.mode(test=False)  # turn on the train mode
            network.define_optimizer()

            # A generator can be consumed only once, so build a fresh one for
            # every epoch; a single shared generator is empty from epoch 2 on.
            _, train_batch_generator = self.batchify(self.batch_size, train_x, train_y)

            for step, (batch_x, batch_y) in enumerate(train_batch_generator):
                prediction = network.data_forward(batch_x)
                loss = network.get_loss(prediction, batch_y)
                network.grad_backward()

                if step % self.log_per_step == 0:
                    print("step ", step)
                    self.log(self.make_log(epoch, step, loss))

            #################### evaluate over dev set  ###################
            if self.validate:
                # give all controls to tester
                evaluator.test(network, dev_data)

                if self.log_validation:
                    self.log(self.make_valid_log(epoch, evaluator.loss))
                if evaluator.loss < best_loss:
                    best_loss = evaluator.loss
                    if self.save_when_better:
                        self.save_model(network)

        # finish training

    def make_log(self, *args):
        """Build a training log record; currently a fixed placeholder."""
        return "make a log"

    def make_valid_log(self, *args):
        """Build a validation log record; currently a fixed placeholder."""
        return "make a valid log"

    def save_model(self, model):
        """Ask the model controller to persist itself."""
        model.save()

    def load_data(self, data_name):
        print("load data")

    def load_config(self, args):
        raise NotImplementedError

    def load_dataset(self, args):
        raise NotImplementedError
| @@ -13,3 +13,19 @@ class BaseLoader(object): | |||||
| with open(self.data_path, "r", encoding="utf-8") as f: | with open(self.data_path, "r", encoding="utf-8") as f: | ||||
| text = f.read() | text = f.read() | ||||
| return text | return text | ||||
class ToyLoader0(BaseLoader):
    """
    For charLM
    """

    def __init__(self, name, path):
        super(ToyLoader0, self).__init__(name, path)

    def load(self):
        """
        Read the file at ``self.data_path`` and return its lower-cased,
        whitespace-separated tokens, with the ``<unk>`` marker rewritten
        to ``unk``.

        :return: list of str
        """
        # explicit encoding, matching BaseLoader, so decoding does not depend
        # on the platform default
        with open(self.data_path, "r", encoding="utf-8") as f:
            corpus = f.read().lower()
        # the marker is a literal string, so no regular expression is needed
        corpus = corpus.replace("<unk>", "unk")
        return corpus.split()
| @@ -10,5 +10,4 @@ class ConfigLoader(BaseLoader): | |||||
| @staticmethod | @staticmethod | ||||
| def parse(string): | def parse(string): | ||||
| # To do | |||||
| return string | |||||
| raise NotImplementedError | |||||
| @@ -0,0 +1,82 @@ | |||||
| import numpy as np | |||||
class BaseModel(object):
    """PyTorch base model for all models"""

    def __init__(self):
        pass

    def prepare_input(self, data):
        """
        :param data: str, raw input vector(?)
        :return (X, Y): tuple, input features and labels
        """
        raise NotImplementedError

    def mode(self, test=False):
        """Switch between training (``test=False``) and testing behaviour."""
        raise NotImplementedError

    def data_forward(self, *x):
        # required by PyTorch nn
        raise NotImplementedError

    def grad_backward(self):
        raise NotImplementedError

    def get_loss(self, pred, truth):
        raise NotImplementedError

    def define_optimizer(self):
        """Create the optimizer; Trainer.train calls this at the start of every epoch."""
        raise NotImplementedError

    def save(self):
        """Persist the model; Trainer.save_model calls this."""
        raise NotImplementedError


class ToyModel(BaseModel):
    """This is for code testing."""

    def __init__(self):
        super(ToyModel, self).__init__()
        self.test_mode = False
        self.weight = np.random.rand(5, 1)
        self.bias = np.random.rand()
        self._loss = 0

    def prepare_input(self, data):
        """Split a 2-D array into features (all but the last column) and labels (last column)."""
        return data[:, :-1], data[:, -1]

    def mode(self, test=False):
        self.test_mode = test

    def data_forward(self, x):
        """Linear map of ``x`` with the random weight and bias; returns a column vector."""
        return np.matmul(x, self.weight) + self.bias

    def grad_backward(self):
        print("loss gradient backward")

    def get_loss(self, pred, truth):
        """
        Mean squared error between predictions and labels.

        Both arguments are flattened first: ``data_forward`` returns shape
        (n, 1) while ``prepare_input`` returns labels of shape (n,), and
        subtracting those directly would broadcast to an (n, n) matrix.
        """
        diff = np.asarray(pred).reshape(-1) - np.asarray(truth).reshape(-1)
        self._loss = np.mean(np.square(diff))
        return self._loss

    def define_optimizer(self):
        # nothing to optimise in the toy model
        pass

    def save(self):
        # nothing to persist in the toy model
        pass
class Vocabulary(object):
    """
    A collection of lookup tables.
    """

    def __init__(self):
        self.word_set = None
        self.word2idx = None
        self.emb_matrix = None

    def lookup(self, word):
        """
        Return the embedding row stored for ``word``.

        :param word: key to look up
        :return: ``self.emb_matrix[self.word2idx[word]]``
        :raises LookupError: if the word is unknown or the tables have not
                             been populated yet
        """
        if self.word_set is not None and word in self.word_set:
            return self.emb_matrix[self.word2idx[word]]
        # raise (not return) the error so callers cannot mistake it for an embedding
        raise LookupError("The key {} does not exist.".format(word))
class Document(object):
    """
    A sequence of tokens.
    Every token is a character that carries linguistic attributes.
    """

    def __init__(self):
        # placeholder for the wrapped pandas.dataframe
        self.dataframe = None
| @@ -0,0 +1,356 @@ | |||||
| import os | |||||
| from collections import namedtuple | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| import torch.optim as optim | |||||
| from torch.autograd import Variable | |||||
| from model.base_model import BaseModel | |||||
| USE_GPU = True | |||||
class CharLM(BaseModel):
    """
    Controller of the Character-level Neural Language Model.

    Holds the ``charLM`` network together with its loss criterion, optimizer,
    LSTM hidden state and the vocabulary statistics produced by ``preprocess``.

    To do:
        - where the data goes, call data savers.
    """
    DataTuple = namedtuple("DataTuple", ["feature", "label"])

    def __init__(self, lstm_batch_size, lstm_seq_len):
        """
        :param lstm_batch_size: int, number of sequences per LSTM batch
        :param lstm_seq_len: int, number of words per sequence
        """
        super(CharLM, self).__init__()
        # Settings: should come from config loader or pre-processing
        self.word_embed_dim = 300
        self.char_embedding_dim = 15
        self.cnn_batch_size = lstm_batch_size * lstm_seq_len
        self.lstm_seq_len = lstm_seq_len
        self.lstm_batch_size = lstm_batch_size
        self.num_epoch = 10
        self.old_PPL = 100000
        self.best_PPL = 100000

        # These parameters are set by pre-processing.
        self.max_word_len = None
        self.num_char = None
        self.vocab_size = None
        self.preprocess("./data_for_tests/charlm.txt")
        self.data = None  # named tuple to store all data set
        self.data_ready = False
        self.criterion = nn.CrossEntropyLoss()
        self._loss = None
        self.use_gpu = USE_GPU

        # word_emb_dim == hidden_size / num of hidden units
        self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)),
                       to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)))

        self.model = charLM(self.char_embedding_dim,
                            self.word_embed_dim,
                            self.vocab_size,
                            self.num_char,
                            use_gpu=self.use_gpu)
        for param in self.model.parameters():
            nn.init.uniform(param.data, -0.05, 0.05)

        self.learning_rate = 0.1
        self.optimizer = None

    def prepare_input(self, raw_text):
        """
        :param raw_text: raw input text consisting of words
        :return: torch.Tensor, torch.Tensor
            feature matrix, label vector
        :raises ValueError: if ``raw_text`` is empty
        This function is only called once in Trainer.train, but may called multiple times in Tester.test
        So Tester will save test input for frequent calls.
        """
        if not os.path.exists("cache/prep.pt"):
            self.preprocess("./data_for_tests/charlm.txt")  # To do: This is not good. Need to fix..
        objects = torch.load("cache/prep.pt")
        word_dict = objects["word_dict"]
        char_dict = objects["char_dict"]
        max_word_len = self.max_word_len
        print("word/char dictionary built. Start making inputs.")

        words = raw_text
        if len(words) == 0:
            # the label construction below indexes words[-1]
            raise ValueError("raw_text must contain at least one word")
        input_vec = np.array(text2vec(words, char_dict, max_word_len))
        # Labels are next-word index in word_dict with the same length as inputs;
        # the last word is its own label so both arrays have equal length.
        input_label = np.array([word_dict[w] for w in words[1:]] + [word_dict[words[-1]]])
        feature_input = torch.from_numpy(input_vec)
        label_input = torch.from_numpy(input_label)
        return feature_input, label_input

    def mode(self, test=False):
        """Put the wrapped network into eval mode (``test=True``) or train mode."""
        if test:
            self.model.eval()
        else:
            self.model.train()

    def data_forward(self, x):
        """
        :param x: Tensor of size [lstm_batch_size, lstm_seq_len, max_word_len+2]
        :return: Tensor of size [num_words, ?]
        """
        # additional processing of inputs after batching: keep only as many
        # rows as fill whole sequences, then group them into sequences
        num_seq = x.size()[0] // self.lstm_seq_len
        x = x[:num_seq * self.lstm_seq_len, :]
        x = x.view(-1, self.lstm_seq_len, self.max_word_len + 2)

        # detach hidden state of LSTM from last batch; kept as a tuple, the
        # same (h, c) form self.hidden is initialised with
        hidden = tuple(state.detach() for state in self.hidden)
        output, self.hidden = self.model(to_var(x), hidden)
        return output

    def grad_backward(self):
        """Back-propagate the loss stored by ``get_loss``, clip gradients and step the optimizer."""
        self.model.zero_grad()
        self._loss.backward()
        torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
        self.optimizer.step()

    def get_loss(self, predict, truth):
        """Compute the criterion on ``predict`` / ``truth`` and remember it for ``grad_backward``."""
        self._loss = self.criterion(predict, to_var(truth))
        return self._loss.data  # No pytorch data structure exposed outsides

    def define_optimizer(self):
        # redefine optimizer for every new epoch
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate, momentum=0.85)

    def save(self):
        print("network saved")
        # torch.save(self.model, "cache/model.pkl")

    def preprocess(self, all_text_files):
        """
        Build the word and character dictionaries from ``all_text_files``,
        record vocabulary statistics on ``self`` and cache the dictionaries
        in ``cache/prep.pt``.

        :param all_text_files: path passed on to ``create_word_char_dict``
        """
        word_dict, char_dict = create_word_char_dict(all_text_files)
        num_char = len(char_dict)
        self.vocab_size = len(word_dict)
        char_dict["BOW"] = num_char + 1
        char_dict["EOW"] = num_char + 2
        char_dict["PAD"] = 0
        self.num_char = num_char + 3
        # char_dict maps each character (plus the BOW/EOW/PAD markers) to an
        # int index; PAD is 0 and get_char_dict numbers characters from 1
        reverse_word_dict = {value: key for key, value in word_dict.items()}
        self.max_word_len = max([len(word) for word in word_dict])
        objects = {
            "word_dict": word_dict,
            "char_dict": char_dict,
            "reverse_word_dict": reverse_word_dict,
        }
        # torch.save does not create missing directories
        os.makedirs("cache", exist_ok=True)
        torch.save(objects, "cache/prep.pt")
        print("Preprocess done.")
| """ | |||||
| Global Functions | |||||
| """ | |||||
def batch_generator(x, batch_size):
    """
    Partition ``x`` into consecutive batches along its first dimension.

    Trailing items that do not fill a whole batch are dropped.

    :param x: sliceable sequence (tensor, array or list), e.g. a tensor of
              shape [num_words, in_channel, height, width]
    :param batch_size: int, number of items per batch
    :return: generator of slices of ``x``, each of length ``batch_size``
    """
    # len() equals the first-dimension size for tensors and also works for
    # arrays and lists, which have no callable size()
    num_step = len(x) // batch_size
    for t in range(num_step):
        yield x[t * batch_size:(t + 1) * batch_size]
def text2vec(words, char_dict, max_word_len):
    """
    Encode every word as a list of character indices.

    Each word is right-padded with the "PAD" index up to ``max_word_len`` and
    then wrapped in the "BOW" / "EOW" indices.

    :return: list of list of int
    """
    encoded = []
    for token in words:
        ids = [char_dict[symbol] for symbol in token]
        shortfall = max_word_len - len(ids)
        if shortfall > 0:
            ids.extend([char_dict["PAD"]] * shortfall)
        encoded.append([char_dict["BOW"]] + ids + [char_dict["EOW"]])
    return encoded
def read_data(file_name):
    """
    Read a text file and return its lower-cased, whitespace-separated tokens.

    The ``<unk>`` marker is rewritten to ``unk`` after lower-casing.

    :param file_name: path of the text file
    :return: list of str
    """
    # explicit encoding so the result does not depend on the platform default
    with open(file_name, "r", encoding="utf-8") as f:
        corpus = f.read().lower()
    # the marker is a literal string, so no regular expression is needed
    corpus = corpus.replace("<unk>", "unk")
    return corpus.split()
def get_char_dict(vocabulary):
    """
    Number every distinct character of ``vocabulary`` in order of first
    appearance, starting at 1.

    :param vocabulary: iterable of words
    :return: dict mapping character to int
    """
    index_of = {}
    for token in vocabulary:
        for symbol in token:
            if symbol in index_of:
                continue
            # one entry per character seen so far, so the next index is size + 1
            index_of[symbol] = len(index_of) + 1
    return index_of
def create_word_char_dict(*file_name):
    """
    Build the word and character dictionaries from one or more text files.

    :param file_name: paths of the text files, each read with ``read_data``
    :return: (word_dict, char_dict); word_dict maps every distinct word to an
             int index, char_dict is ``get_char_dict`` applied to those words
    """
    text = []
    for path in file_name:
        text += read_data(path)
    # sort the distinct words: the iteration order of a set of strings depends
    # on the per-process string hash seed, so unsorted indices would differ
    # between runs
    word_dict = {word: ix for ix, word in enumerate(sorted(set(text)))}
    char_dict = get_char_dict(word_dict)
    return word_dict, char_dict
def to_var(x):
    """Wrap ``x`` in a Variable, first moving it to the GPU when CUDA is available and USE_GPU is set."""
    on_gpu = torch.cuda.is_available() and USE_GPU
    tensor = x.cuda() if on_gpu else x
    return Variable(tensor)
| """ | |||||
| Neural Network | |||||
| """ | |||||
class Highway(nn.Module):
    """Highway network: a gated mix of a transformed input and the input itself."""

    def __init__(self, input_size):
        super(Highway, self).__init__()
        self.fc1 = nn.Linear(input_size, input_size, bias=True)
        self.fc2 = nn.Linear(input_size, input_size, bias=True)

    def forward(self, x):
        # t is the transform gate; (1 - t) carries the input through unchanged
        t = torch.sigmoid(self.fc1(x))
        return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)


class charLM(nn.Module):
    """Character-level Neural Language Model
    CNN + highway network + LSTM
    # Input:
        3D tensor of character indices with shape [num_seq, seq_len, max_word_len+2]
        and the LSTM hidden state tuple
    # Output:
        2D Tensor with shape [num_seq*seq_len, vocab_size] and the new hidden state
    # Arguments:
        char_emb_dim: the size of each character's embedding
        word_emb_dim: the size of each word's embedding
        vocab_size: num of unique words
        num_char: num of characters
        use_gpu: True or False
    """

    def __init__(self, char_emb_dim, word_emb_dim,
                 vocab_size, num_char, use_gpu):
        super(charLM, self).__init__()
        self.char_emb_dim = char_emb_dim
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size

        # char embedding layer
        self.char_embed = nn.Embedding(num_char, char_emb_dim)

        # list of tuples: (the number of filter, width)
        # self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]
        self.filter_num_width = [(25, 1), (50, 2), (75, 3)]

        # convolutions of filters with different sizes.
        # nn.ModuleList registers each convolution as a sub-module, so its
        # weights show up in parameters() (and therefore get initialised,
        # optimised and saved); a plain Python list hides them from the module.
        self.convolutions = nn.ModuleList([
            nn.Conv2d(
                1,  # in_channel
                out_channel,  # out_channel
                kernel_size=(char_emb_dim, filter_width),  # (height, width)
                bias=True
            )
            for out_channel, filter_width in self.filter_num_width
        ])

        self.highway_input_dim = sum([x for x, y in self.filter_num_width])

        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

        # highway net
        self.highway1 = Highway(self.highway_input_dim)
        self.highway2 = Highway(self.highway_input_dim)

        # LSTM
        self.lstm_num_layers = 2

        self.lstm = nn.LSTM(input_size=self.highway_input_dim,
                            hidden_size=self.word_emb_dim,
                            num_layers=self.lstm_num_layers,
                            bias=True,
                            dropout=0.5,
                            batch_first=True)

        # output layer
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

        # Every layer is a registered sub-module, so one call moves them all.
        # Only do so when CUDA is actually present (the same check to_var
        # applies to the inputs), otherwise .cuda() would raise.
        if use_gpu and torch.cuda.is_available():
            self.cuda()

    def forward(self, x, hidden):
        # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
        # Return: Variable of Tensor with shape [num_words, len(word_dict)]
        lstm_batch_size = x.size()[0]
        lstm_seq_len = x.size()[1]

        x = x.contiguous().view(-1, x.size()[2])
        # [num_seq*seq_len, max_word_len+2]

        x = self.char_embed(x)
        # [num_seq*seq_len, max_word_len+2, char_emb_dim]

        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
        # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

        x = self.conv_layers(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.batch_norm(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.highway1(x)
        x = self.highway2(x)
        # [num_seq*seq_len, total_num_filters]

        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
        # [num_seq, seq_len, total_num_filters]

        x, hidden = self.lstm(x, hidden)
        # [num_seq, seq_len, hidden_size]  (the LSTM is batch_first)

        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]

        x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1)
        # [num_seq*seq_len, hidden_size]

        x = self.linear(x)
        # [num_seq*seq_len, vocab_size]
        return x, hidden

    def conv_layers(self, x):
        """Apply every convolution, max-pool over the word width and concatenate the results."""
        chosen_list = list()
        for conv in self.convolutions:
            feature_map = torch.tanh(conv(x))
            # (batch_size, out_channel, 1, max_word_len+2-width+1)
            chosen = torch.max(feature_map, 3)[0]
            # Flatten everything after the batch dimension explicitly; a bare
            # squeeze() would also remove the batch dimension when it is 1.
            chosen = chosen.view(chosen.size(0), -1)
            # (batch_size, out_channel)
            chosen_list.append(chosen)

        # (batch_size, total_num_filers)
        return torch.cat(chosen_list, 1)
| @@ -1,17 +1,12 @@ | |||||
| import os | import os | ||||
| import torch | |||||
| import | |||||
| import torch | |||||
| import torch.nn as nn | import torch.nn as nn | ||||
| import torchvision.datasets as dsets | |||||
| import torchvision.transforms as transforms | |||||
| import dataset as dst | |||||
| from model import CNN_text | |||||
| .dataset as dst | |||||
| from .model import CNN_text | |||||
| from torch.autograd import Variable | from torch.autograd import Variable | ||||
| from sklearn import cross_validation | |||||
| from sklearn import datasets | |||||
| # Hyper Parameters | # Hyper Parameters | ||||
| batch_size = 50 | batch_size = 50 | ||||
| learning_rate = 0.0001 | learning_rate = 0.0001 | ||||
| @@ -51,8 +46,7 @@ if cuda: | |||||
| criterion = nn.CrossEntropyLoss() | criterion = nn.CrossEntropyLoss() | ||||
| optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate) | optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate) | ||||
| #train and test | |||||
| # train and tests | |||||
| best_acc = None | best_acc = None | ||||
| for epoch in range(num_epochs): | for epoch in range(num_epochs): | ||||
| @@ -1,12 +1,12 @@ | |||||
| import os | import os | ||||
| from collections import namedtuple | |||||
| import numpy as np | |||||
| import torch | import torch | ||||
| from torch.autograd import Variable | |||||
| import torch.nn as nn | import torch.nn as nn | ||||
| import torch.nn.functional as F | |||||
| import numpy as np | |||||
| from model import charLM | |||||
| from torch.autograd import Variable | |||||
| from utilities import * | from utilities import * | ||||
| from collections import namedtuple | |||||
| def to_var(x): | def to_var(x): | ||||
| if torch.cuda.is_available(): | if torch.cuda.is_available(): | ||||
| @@ -76,18 +76,18 @@ if __name__ == "__main__": | |||||
| if os.path.exists("cache/data_sets.pt") is False: | if os.path.exists("cache/data_sets.pt") is False: | ||||
| test_text = read_data("./test.txt") | |||||
| test_text = read_data("./tests.txt") | |||||
| test_set = np.array(text2vec(test_text, char_dict, max_word_len)) | test_set = np.array(text2vec(test_text, char_dict, max_word_len)) | ||||
| # Labels are next-word index in word_dict with the same length as inputs | # Labels are next-word index in word_dict with the same length as inputs | ||||
| test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]]) | test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]]) | ||||
| category = {"test": test_set, "tlabel":test_label} | |||||
| category = {"tests": test_set, "tlabel": test_label} | |||||
| torch.save(category, "cache/data_sets.pt") | torch.save(category, "cache/data_sets.pt") | ||||
| else: | else: | ||||
| data_sets = torch.load("cache/data_sets.pt") | data_sets = torch.load("cache/data_sets.pt") | ||||
| test_set = data_sets["test"] | |||||
| test_set = data_sets["tests"] | |||||
| test_label = data_sets["tlabel"] | test_label = data_sets["tlabel"] | ||||
| train_set = data_sets["tdata"] | train_set = data_sets["tdata"] | ||||
| train_label = data_sets["trlabel"] | train_label = data_sets["trlabel"] | ||||
| @@ -1,20 +1,16 @@ | |||||
| import torch | |||||
| from torch.autograd import Variable | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| import torch.optim as optim | |||||
| import numpy as np | |||||
| import os | import os | ||||
| from model import charLM | |||||
| from utilities import * | |||||
| from collections import namedtuple | from collections import namedtuple | ||||
| from test import test | |||||
| import numpy as np | |||||
| import torch.optim as optim | |||||
| from .model import charLM | |||||
| from .test import test | |||||
| from .utilities import * | |||||
| def preprocess(): | def preprocess(): | ||||
| word_dict, char_dict = create_word_char_dict("valid.txt", "train.txt", "test.txt") | |||||
| word_dict, char_dict = create_word_char_dict("charlm.txt", "train.txt", "tests.txt") | |||||
| num_words = len(word_dict) | num_words = len(word_dict) | ||||
| num_char = len(char_dict) | num_char = len(char_dict) | ||||
| char_dict["BOW"] = num_char+1 | char_dict["BOW"] = num_char+1 | ||||
| @@ -43,7 +39,18 @@ def to_var(x): | |||||
| def train(net, data, opt): | def train(net, data, opt): | ||||
| """ | |||||
| :param net: the pytorch model | |||||
| :param data: numpy array | |||||
| :param opt: named tuple | |||||
| 1. random seed | |||||
| 2. define local input | |||||
| 3. training settting: learning rate, loss, etc | |||||
| 4. main loop epoch | |||||
| 5. batchify | |||||
| 6. validation | |||||
| 7. save model | |||||
| """ | |||||
| torch.manual_seed(1024) | torch.manual_seed(1024) | ||||
| train_input = torch.from_numpy(data.train_input) | train_input = torch.from_numpy(data.train_input) | ||||
| @@ -125,9 +132,9 @@ def train(net, data, opt): | |||||
| ################################################## | ################################################## | ||||
| #################### Training #################### | #################### Training #################### | ||||
| net.train() | net.train() | ||||
| optimizer = optim.SGD(net.parameters(), | |||||
| lr = learning_rate, | |||||
| momentum=0.85) | |||||
| optimizer = optim.SGD(net.parameters(), | |||||
| lr = learning_rate, | |||||
| momentum=0.85) | |||||
| # split the first dim | # split the first dim | ||||
| input_generator = batch_generator(train_input, opt.lstm_batch_size) | input_generator = batch_generator(train_input, opt.lstm_batch_size) | ||||
| @@ -183,8 +190,8 @@ if __name__=="__main__": | |||||
| if os.path.exists("cache/data_sets.pt") is False: | if os.path.exists("cache/data_sets.pt") is False: | ||||
| train_text = read_data("./train.txt") | train_text = read_data("./train.txt") | ||||
| valid_text = read_data("./valid.txt") | |||||
| test_text = read_data("./test.txt") | |||||
| valid_text = read_data("./charlm.txt") | |||||
| test_text = read_data("./tests.txt") | |||||
| train_set = np.array(text2vec(train_text, char_dict, max_word_len)) | train_set = np.array(text2vec(train_text, char_dict, max_word_len)) | ||||
| valid_set = np.array(text2vec(valid_text, char_dict, max_word_len)) | valid_set = np.array(text2vec(valid_text, char_dict, max_word_len)) | ||||
| @@ -195,14 +202,14 @@ if __name__=="__main__": | |||||
| valid_label = np.array([word_dict[w] for w in valid_text[1:]] + [word_dict[valid_text[-1]]]) | valid_label = np.array([word_dict[w] for w in valid_text[1:]] + [word_dict[valid_text[-1]]]) | ||||
| test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]]) | test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]]) | ||||
| category = {"tdata":train_set, "vdata":valid_set, "test": test_set, | |||||
| category = {"tdata": train_set, "vdata": valid_set, "tests": test_set, | |||||
| "trlabel":train_label, "vlabel":valid_label, "tlabel":test_label} | "trlabel":train_label, "vlabel":valid_label, "tlabel":test_label} | ||||
| torch.save(category, "cache/data_sets.pt") | torch.save(category, "cache/data_sets.pt") | ||||
| else: | else: | ||||
| data_sets = torch.load("cache/data_sets.pt") | data_sets = torch.load("cache/data_sets.pt") | ||||
| train_set = data_sets["tdata"] | train_set = data_sets["tdata"] | ||||
| valid_set = data_sets["vdata"] | valid_set = data_sets["vdata"] | ||||
| test_set = data_sets["test"] | |||||
| test_set = data_sets["tests"] | |||||
| train_label = data_sets["trlabel"] | train_label = data_sets["trlabel"] | ||||
| valid_label = data_sets["vlabel"] | valid_label = data_sets["vlabel"] | ||||
| test_label = data_sets["tlabel"] | test_label = data_sets["tlabel"] | ||||
| @@ -0,0 +1,14 @@ | |||||
class BaseSaver(object):
    """Common parent of every saver; remembers where output should go."""

    def __init__(self, save_path):
        # destination used by concrete savers
        self.save_path = save_path

    def save_bytes(self):
        """Not implemented in the base class."""
        raise NotImplementedError

    def save_str(self):
        """Not implemented in the base class."""
        raise NotImplementedError

    def compress(self):
        """Not implemented in the base class."""
        raise NotImplementedError
| @@ -0,0 +1,12 @@ | |||||
| from saver.base_saver import BaseSaver | |||||
class Logger(BaseSaver):
    """Logging: appends records to the text file at ``save_path``."""

    def __init__(self, save_path):
        super(Logger, self).__init__(save_path)

    def log(self, string):
        """
        Append one record to the log file.

        :param string: the record; converted with ``str()`` if it is not text
        """
        record = str(string)
        # terminate each record so consecutive entries do not run together on
        # one line; records that already end with a newline are left as they are
        if not record.endswith("\n"):
            record += "\n"
        with open(self.save_path, "a", encoding="utf-8") as f:
            f.write(record)
| @@ -0,0 +1,8 @@ | |||||
| from saver.base_saver import BaseSaver | |||||
class ModelSaver(BaseSaver):
    """Saver intended for models; adds nothing to BaseSaver yet."""

    def __init__(self, save_path):
        # only records the destination, via the parent constructor
        super(ModelSaver, self).__init__(save_path)
| @@ -0,0 +1,31 @@ | |||||
| from action.tester import Tester | |||||
| from action.trainer import Trainer | |||||
| from loader.base_loader import ToyLoader0 | |||||
| from model.char_language_model import CharLM | |||||
def test_charlm():
    """Train the character language model for one epoch on the toy corpus, then evaluate it."""
    corpus_path = "./data_for_tests/charlm.txt"

    # ---- training ----
    trainer = Trainer(Trainer.TrainConfig(epochs=1, validate=True, save_when_better=True,
                                          log_per_step=10, log_validation=True, batch_size=160))
    model = CharLM(lstm_batch_size=16, lstm_seq_len=10)

    train_data = ToyLoader0("load_train", corpus_path).load()
    valid_data = ToyLoader0("load_valid", corpus_path).load()

    trainer.train(model, train_data, valid_data)
    trainer.save_model(model)

    # ---- evaluation ----
    tester = Tester(Tester.TestConfig(save_output=True, validate_in_training=True,
                                      save_dev_input=True, save_loss=True, batch_size=160))
    test_data = ToyLoader0("load_test", corpus_path).load()

    tester.test(model, test_data)


if __name__ == "__main__":
    test_charlm()
| @@ -0,0 +1,21 @@ | |||||
| from collections import namedtuple | |||||
| import numpy as np | |||||
| from action.trainer import Trainer | |||||
| from model.base_model import ToyModel | |||||
def test_trainer():
    """Smoke-test Trainer with the NumPy-based ToyModel on random data."""
    # Trainer.__init__ reads all six TrainConfig fields, so build the config
    # from Trainer.TrainConfig rather than a local three-field tuple.
    train_config = Trainer.TrainConfig(epochs=5, validate=True, save_when_better=True,
                                       log_per_step=1, log_validation=True, batch_size=4)
    trainer = Trainer(train_config)

    net = ToyModel()
    # 20 samples: 5 feature columns plus 1 label column, as ToyModel.prepare_input expects
    data = np.random.rand(20, 6)
    dev_data = np.random.rand(20, 6)
    trainer.train(net, data, dev_data)


if __name__ == "__main__":
    test_trainer()