You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

LSTMModel.py 4.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. from __future__ import absolute_import
  2. from __future__ import division
  3. from __future__ import print_function
  4. import torch
  5. import torch.nn as nn
  6. from torch.autograd import *
  7. from torch.distributions import *
  8. from .Encoder import Encoder
  9. from .DeepLSTM import DeepLSTM
  10. from transformer.SubLayers import MultiHeadAttention,PositionwiseFeedForward
  11. class SummarizationModel(nn.Module):
  12. def __init__(self, hps, embed):
  13. """
  14. :param hps: hyperparameters for the model
  15. :param embed: word embedding
  16. """
  17. super(SummarizationModel, self).__init__()
  18. self._hps = hps
  19. self.Train = (hps.mode == 'train')
  20. # sentence encoder
  21. self.encoder = Encoder(hps, embed)
  22. # Multi-layer highway lstm
  23. self.num_layers = hps.n_layers
  24. self.sent_embedding_size = (hps.max_kernel_size - hps.min_kernel_size + 1) * hps.output_channel
  25. self.lstm_hidden_size = hps.lstm_hidden_size
  26. self.recurrent_dropout = hps.recurrent_dropout_prob
  27. self.deep_lstm = DeepLSTM(self.sent_embedding_size, self.lstm_hidden_size, self.num_layers, self.recurrent_dropout,
  28. hps.use_orthnormal_init, hps.fix_mask, hps.cuda)
  29. # Multi-head attention
  30. self.n_head = hps.n_head
  31. self.d_v = self.d_k = int(self.lstm_hidden_size / hps.n_head)
  32. self.d_inner = hps.ffn_inner_hidden_size
  33. self.slf_attn = MultiHeadAttention(hps.n_head, self.lstm_hidden_size , self.d_k, self.d_v, dropout=hps.atten_dropout_prob)
  34. self.pos_ffn = PositionwiseFeedForward(self.d_v, self.d_inner, dropout = hps.ffn_dropout_prob)
  35. self.wh = nn.Linear(self.d_v, 2)
  36. def forward(self, words, seq_len):
  37. """
  38. :param input: [batch_size, N, seq_len], word idx long tensor
  39. :param input_len: [batch_size, N], 1 for sentence and 0 for padding
  40. :return:
  41. p_sent: [batch_size, N, 2]
  42. output_slf_attn: (option) [n_head, batch_size, N, N]
  43. """
  44. input = words
  45. input_len = seq_len
  46. # -- Sentence Encoder
  47. self.sent_embedding = self.encoder(input) # [batch, N, Co * kernel_sizes]
  48. # -- Multi-layer highway lstm
  49. input_len = input_len.float() # [batch, N]
  50. self.inputs = [None] * (self.num_layers + 1)
  51. self.input_masks = [None] * (self.num_layers + 1)
  52. self.inputs[0] = self.sent_embedding.permute(1, 0, 2) # [N, batch, Co * kernel_sizes]
  53. self.input_masks[0] = input_len.permute(1, 0).unsqueeze(2)
  54. self.lstm_output_state = self.deep_lstm(self.inputs, self.input_masks, Train=self.train) # [batch, N, hidden_size]
  55. # -- Prepare masks
  56. batch_size, N = input_len.size()
  57. slf_attn_mask = input_len.eq(0.0) # [batch, N], 1 for padding
  58. slf_attn_mask = slf_attn_mask.unsqueeze(1).expand(-1, N, -1) # [batch, N, N]
  59. # -- Multi-head attention
  60. self.atten_output, self.output_slf_attn = self.slf_attn(self.lstm_output_state, self.lstm_output_state, self.lstm_output_state, mask=slf_attn_mask)
  61. self.atten_output *= input_len.unsqueeze(2) # [batch_size, N, lstm_hidden_size = (n_head * d_v)]
  62. self.multi_atten_output = self.atten_output.view(batch_size, N, self.n_head, self.d_v) # [batch_size, N, n_head, d_v]
  63. self.multi_atten_context = self.multi_atten_output[:, :, 0::2, :].sum(2) - self.multi_atten_output[:, :, 1::2, :].sum(2) # [batch_size, N, d_v]
  64. # -- Position-wise Feed-Forward Networks
  65. self.output_state = self.pos_ffn(self.multi_atten_context)
  66. self.output_state = self.output_state * input_len.unsqueeze(2) # [batch_size, N, d_v]
  67. p_sent = self.wh(self.output_state) # [batch, N, 2]
  68. idx = None
  69. if self._hps.m == 0:
  70. prediction = p_sent.view(-1, 2).max(1)[1]
  71. prediction = prediction.view(batch_size, -1)
  72. else:
  73. mask_output = torch.exp(p_sent[:, :, 1]) # # [batch, N]
  74. mask_output = mask_output.masked_fill(input_len.eq(0), 0)
  75. topk, idx = torch.topk(mask_output, self._hps.m)
  76. prediction = torch.zeros(batch_size, N).scatter_(1, idx.data.cpu(), 1)
  77. prediction = prediction.long().view(batch_size, -1)
  78. if self._hps.cuda:
  79. prediction = prediction.cuda()
  80. return {"p_sent": p_sent, "prediction": prediction, "pred_idx": idx}