import time

import torch
import torch.nn as nn
import torch.optim as optim

import aggregation
import dataloader
import embedding
import encoder
import predict
from fastNLP.models.base_model import BaseModel

# Model hyper-parameters.
WORD_NUM = 357361    # vocabulary size
WORD_SIZE = 100      # word-embedding dimension
HIDDEN_SIZE = 300    # LSTM hidden size (per direction; encoder is bidirectional)
D_A = 350            # self-attention internal dimension
R = 10               # number of attention hops
MLP_HIDDEN = 2000    # classifier hidden-layer width
CLASSES_NUM = 5      # number of sentiment classes


class MyNet(BaseModel):
    """fastNLP-style model: embedding -> BiLSTM -> self-attention -> MLP.

    Mirrors ``Net`` below but exposes the fastNLP ``encode`` / ``aggregate`` /
    ``decode`` hooks instead of a single ``forward``.
    """

    def __init__(self):
        super(MyNet, self).__init__()
        self.embedding = embedding.Lookuptable(WORD_NUM, WORD_SIZE)
        self.encoder = encoder.Lstm(WORD_SIZE, HIDDEN_SIZE, 1, 0.5, True)
        self.aggregation = aggregation.Selfattention(2 * HIDDEN_SIZE, D_A, R)
        self.predict = predict.MLP(R * HIDDEN_SIZE * 2, MLP_HIDDEN, CLASSES_NUM)
        # Penalization term produced by the last aggregate() call; consumed
        # by decode().
        self.penalty = None

    def encode(self, x):
        # BUG FIX: the original called self.encode(...) here, recursing
        # forever. The intent (see Net.forward) is to embed then run the
        # LSTM encoder.
        return self.encoder(self.embedding(x))

    def aggregate(self, x):
        # BUG FIX: the original called self.aggregate(x), recursing forever.
        # The intent is to apply the self-attention aggregation module,
        # which also returns the penalization term.
        x, self.penalty = self.aggregation(x)
        return x

    def decode(self, x):
        return [self.predict(x), self.penalty]


class Net(nn.Module):
    """
    A model for sentiment analysis using lstm and self-attention
    """

    def __init__(self):
        super(Net, self).__init__()
        self.embedding = embedding.Lookuptable(WORD_NUM, WORD_SIZE)
        self.encoder = encoder.Lstm(WORD_SIZE, HIDDEN_SIZE, 1, 0.5, True)
        self.aggregation = aggregation.Selfattention(2 * HIDDEN_SIZE, D_A, R)
        self.predict = predict.MLP(R * HIDDEN_SIZE * 2, MLP_HIDDEN, CLASSES_NUM)

    def forward(self, x):
        """Return (class logits, self-attention penalization term)."""
        x = self.embedding(x)
        x = self.encoder(x)
        x, penalty = self.aggregation(x)
        x = self.predict(x)
        return x, penalty


def train(model_dict=None, using_cuda=True, learning_rate=0.06,
          momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
    """
    training procedure

    Args:
    If model_dict is given (a file address), it will continue training on
    the given model. Otherwise, it would train a new model from scratch.
    If using_cuda is true, the training would be conducted on GPU.
    Learning_rate and momentum is for SGD optimizer.
    coef is the coefficent between the cross-entropy loss and the
    penalization term.
    interval is the frequncy of reporting.

    the result will be saved with a form "model_dict_+current time", which
    could be used for further training
    """
    if using_cuda:
        net = Net().cuda()
    else:
        net = Net()

    # Resume from a checkpoint when one is supplied.
    if model_dict is not None:
        net.load_state_dict(torch.load(model_dict))

    optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    dataset = dataloader.DataLoader("train_set.pkl", batch_size,
                                    using_cuda=using_cuda)

    # statistics accumulated over each reporting interval
    loss_count = 0
    prepare_time = 0
    run_time = 0
    count = 0

    for epoch in range(epochs):
        print("epoch: %d" % (epoch))
        for i, batch in enumerate(dataset):
            t1 = time.time()
            X = batch["feature"]
            y = batch["class"]
            t2 = time.time()

            y_pred, y_penl = net(X)
            # Cross-entropy plus the (batch-averaged) penalization term.
            loss = criterion(y_pred, y) + torch.sum(y_penl) / batch_size * coef
            optimizer.zero_grad()
            loss.backward()
            # FIX: clip_grad_norm was deprecated in favor of the in-place
            # clip_grad_norm_ (identical behavior, no deprecation warning).
            nn.utils.clip_grad_norm_(net.parameters(), 0.5)
            optimizer.step()
            t3 = time.time()

            # FIX: .data[0] on a 0-dim tensor is deprecated/removed in
            # modern PyTorch; .item() is the supported scalar accessor.
            loss_count += torch.sum(y_penl).item()
            prepare_time += (t2 - t1)
            run_time += (t3 - t2)
            p, idx = torch.max(y_pred.data, dim=1)
            count += torch.sum(torch.eq(idx.cpu(), y.data.cpu())).item()

            if (i + 1) % interval == 0:
                print("epoch : %d, iters: %d" % (epoch, i + 1))
                print("loss count:" + str(loss_count / (interval * batch_size)))
                # FIX: corrected "acuracy" typo in the report string.
                print("accuracy:" + str(count / (interval * batch_size)))
                print("penalty:" + str(torch.sum(y_penl).item() / batch_size))
                print("prepare time:" + str(prepare_time))
                print("run time:" + str(run_time))
                prepare_time = 0
                run_time = 0
                loss_count = 0
                count = 0

    string = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    torch.save(net.state_dict(), "model_dict_%s.dict" % (string))


def test(model_dict, using_cuda=True):
    """Evaluate a saved model on the test set and print its accuracy."""
    if using_cuda:
        net = Net().cuda()
    else:
        net = Net()
    net.load_state_dict(torch.load(model_dict))
    dataset = dataloader.DataLoader("test_set.pkl", batch_size=1,
                                    using_cuda=using_cuda)
    count = 0
    for i, batch in enumerate(dataset):
        X = batch["feature"]
        y = batch["class"]
        y_pred, _ = net(X)
        p, idx = torch.max(y_pred.data, dim=1)
        # FIX: .item() so count stays a Python number and the final division
        # yields a float (see train()).
        count += torch.sum(torch.eq(idx.cpu(), y.data.cpu())).item()
    print("accuracy: %f" % (count / dataset.num))


if __name__ == "__main__":
    train(using_cuda=torch.cuda.is_available())