
LSTMModel.py 4.6 kB

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import torch
import torch.nn as nn
from torch.autograd import *
from torch.distributions import *

from .Encoder import Encoder
from .DeepLSTM import DeepLSTM
from transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward


class SummarizationModel(nn.Module):
    def __init__(self, hps, embed):
        """
        :param hps: hyperparameters for the model
        :param embed: word embedding
        """
        super(SummarizationModel, self).__init__()

        self._hps = hps

        # sentence encoder
        self.encoder = Encoder(hps, embed)

        # Multi-layer highway LSTM
        self.num_layers = hps.n_layers
        self.sent_embedding_size = (hps.max_kernel_size - hps.min_kernel_size + 1) * hps.output_channel
        self.lstm_hidden_size = hps.lstm_hidden_size
        self.recurrent_dropout = hps.recurrent_dropout_prob
        self.deep_lstm = DeepLSTM(self.sent_embedding_size, self.lstm_hidden_size, self.num_layers,
                                  self.recurrent_dropout, hps.use_orthnormal_init, hps.fix_mask, hps.cuda)

        # Multi-head attention
        self.n_head = hps.n_head
        self.d_v = self.d_k = int(self.lstm_hidden_size / hps.n_head)
        self.d_inner = hps.ffn_inner_hidden_size
        self.slf_attn = MultiHeadAttention(hps.n_head, self.lstm_hidden_size, self.d_k, self.d_v,
                                           dropout=hps.atten_dropout_prob)
        self.pos_ffn = PositionwiseFeedForward(self.d_v, self.d_inner, dropout=hps.ffn_dropout_prob)

        # classification layer: scores each sentence as (not selected, selected)
        self.wh = nn.Linear(self.d_v, 2)

    def forward(self, input, input_len, Train):
        """
        :param input: [batch_size, N, seq_len], word idx long tensor
        :param input_len: [batch_size, N], 1 for sentence and 0 for padding
        :param Train: True for train and False for eval and test
        :return:
            p_sent: [batch_size, N, 2]
            prediction: [batch_size, N], 0/1 extraction labels
            pred_idx: indices of the selected sentences (None when hps.m == 0)
        The multi-head attention weights are kept in self.output_slf_attn,
        shape [n_head, batch_size, N, N].
        """
        # -- Sentence Encoder
        self.sent_embedding = self.encoder(input)  # [batch, N, Co * kernel_sizes]

        # -- Multi-layer highway LSTM
        input_len = input_len.float()  # [batch, N]
        self.inputs = [None] * (self.num_layers + 1)
        self.input_masks = [None] * (self.num_layers + 1)
        self.inputs[0] = self.sent_embedding.permute(1, 0, 2)  # [N, batch, Co * kernel_sizes]
        self.input_masks[0] = input_len.permute(1, 0).unsqueeze(2)
        self.lstm_output_state = self.deep_lstm(self.inputs, self.input_masks, Train)  # [batch, N, hidden_size]

        # -- Prepare masks
        batch_size, N = input_len.size()
        slf_attn_mask = input_len.eq(0.0)  # [batch, N], True for padding
        slf_attn_mask = slf_attn_mask.unsqueeze(1).expand(-1, N, -1)  # [batch, N, N]

        # -- Multi-head attention
        self.atten_output, self.output_slf_attn = self.slf_attn(self.lstm_output_state, self.lstm_output_state,
                                                                self.lstm_output_state, mask=slf_attn_mask)
        self.atten_output *= input_len.unsqueeze(2)  # [batch_size, N, lstm_hidden_size = (n_head * d_v)]
        self.multi_atten_output = self.atten_output.view(batch_size, N, self.n_head, self.d_v)  # [batch_size, N, n_head, d_v]
        # combine heads: sum of even-indexed heads minus sum of odd-indexed heads
        self.multi_atten_context = self.multi_atten_output[:, :, 0::2, :].sum(2) - self.multi_atten_output[:, :, 1::2, :].sum(2)  # [batch_size, N, d_v]

        # -- Position-wise Feed-Forward Networks
        self.output_state = self.pos_ffn(self.multi_atten_context)
        self.output_state = self.output_state * input_len.unsqueeze(2)  # [batch_size, N, d_v]

        p_sent = self.wh(self.output_state)  # [batch, N, 2]

        idx = None
        if self._hps.m == 0:
            # label each sentence independently by its higher-scoring class
            prediction = p_sent.view(-1, 2).max(1)[1]
            prediction = prediction.view(batch_size, -1)
        else:
            # select the top-m sentences by their positive-class score, ignoring padding
            mask_output = torch.exp(p_sent[:, :, 1])  # [batch, N]
            mask_output = mask_output.masked_fill(input_len.eq(0), 0)
            topk, idx = torch.topk(mask_output, self._hps.m)
            prediction = torch.zeros(batch_size, N).scatter_(1, idx.data.cpu(), 1)
            prediction = prediction.long().view(batch_size, -1)
            if self._hps.cuda:
                prediction = prediction.cuda()

        return {"p_sent": p_sent, "prediction": prediction, "pred_idx": idx}