
NLP Series

LSTM-Based Supervised Learning of Semantic Representations

InferSent

The official InferSent code is available on GitHub.

Here we skip the data preprocessing and training steps and look only at the model definition. The model is implemented in PyTorch.


"""
Main module for Natural Language Inference
"""


class NLINet(nn.Module):
def __init__(self, config):
super(NLINet, self).__init__()

# classifier
self.nonlinear_fc = config['nonlinear_fc']
self.fc_dim = config['fc_dim']
self.n_classes = config['n_classes']
self.enc_lstm_dim = config['enc_lstm_dim']
self.encoder_type = config['encoder_type']
self.dpout_fc = config['dpout_fc']

self.encoder = eval(self.encoder_type)(config)
self.inputdim = 4*2*self.enc_lstm_dim
self.inputdim = 4*self.inputdim if self.encoder_type in \
["ConvNetEncoder", "InnerAttentionMILAEncoder"] else self.inputdim
self.inputdim = self.inputdim/2 if self.encoder_type == "LSTMEncoder" \
else self.inputdim
if self.nonlinear_fc: # 非线性的神经网络分类器
self.classifier = nn.Sequential(
nn.Dropout(p=self.dpout_fc),
nn.Linear(self.inputdim, self.fc_dim),
nn.Tanh(),
nn.Dropout(p=self.dpout_fc),
nn.Linear(self.fc_dim, self.fc_dim),
nn.Tanh(),
nn.Dropout(p=self.dpout_fc),
nn.Linear(self.fc_dim, self.n_classes),
)
else: # 线性神经网络分类器
self.classifier = nn.Sequential(
nn.Linear(self.inputdim, self.fc_dim),
nn.Linear(self.fc_dim, self.fc_dim),
nn.Linear(self.fc_dim, self.n_classes)
)

def forward(self, s1, s2):
# s1 : (s1, s1_len)
u = self.encoder(s1) # 编码句子1
v = self.encoder(s2) # 编码句子2

features = torch.cat((u, v, torch.abs(u-v), u*v), 1) # feature engineering
output = self.classifier(features) # 分类
return output

def encode(self, s1):
emb = self.encoder(s1)
return emb

"""
LSTM encoder
"""


class LSTMEncoder(nn.Module):
def __init__(self, config):
super(LSTMEncoder, self).__init__()
self.bsize = config['bsize']
self.word_emb_dim = config['word_emb_dim']
self.enc_lstm_dim = config['enc_lstm_dim']
self.pool_type = config['pool_type']
self.dpout_model = config['dpout_model']

self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, 1,
bidirectional=False, dropout=self.dpout_model)

def forward(self, sent_tuple):
# sent_len [max_len, ..., min_len] (batch)
# sent (seqlen x batch x worddim)

sent, sent_len = sent_tuple

# 按照句子的长短排序,并保留原始的idx顺序
sent_len, idx_sort = np.sort(sent_len)[::-1], np.argsort(-sent_len)
sent = sent.index_select(1, torch.cuda.LongTensor(idx_sort))

# 用pytorch自带的函数处理RNN的padding问题
sent_packed = nn.utils.rnn.pack_padded_sequence(sent, sent_len)
# LSTM编码序列
sent_output = self.enc_lstm(sent_packed)[1][0].squeeze(0) # batch x 2*nhid

# 把句子返回原来的顺序
idx_unsort = np.argsort(idx_sort)
emb = sent_output.index_select(0, torch.cuda.LongTensor(idx_unsort))

return emb
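
This encoder keeps only the last hidden state of a unidirectional LSTM. The InferSent paper reports that a bidirectional LSTM with max pooling over time works best; below is a simplified, hypothetical sketch of that pooling idea (the class name is mine, and the official BLSTMEncoder additionally sorts and packs the variable-length batch, as above).

class BLSTMMaxEncoder(nn.Module):
    """Sketch: bidirectional LSTM + max pooling over time.
    Padding positions are not masked here for simplicity; the
    official code handles variable lengths via packed sequences."""
    def __init__(self, config):
        super(BLSTMMaxEncoder, self).__init__()
        self.enc_lstm = nn.LSTM(config['word_emb_dim'], config['enc_lstm_dim'],
                                1, bidirectional=True)

    def forward(self, sent_tuple):
        sent, sent_len = sent_tuple           # seqlen x batch x worddim
        sent_output = self.enc_lstm(sent)[0]  # seqlen x batch x 2*enc_lstm_dim
        # max pooling over time: each feature keeps its maximum
        # activation across all time steps
        emb = torch.max(sent_output, 0)[0]    # batch x 2*enc_lstm_dim
        return emb

Max pooling makes the sentence vector depend on the most salient time step per feature rather than only on the final state, which the paper found to transfer better to downstream tasks.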