
NLP Series

Building a Transformer-Based Chatbot

The code is based on the Transformer-in-generating-dialogue implementation; for the chatbot version built on that code with the Xiaohuangji (小黄鸡) dialogue corpus, see transformer-chatbot.

!git clone https://github.com/dengqiqi123/transformer-chatbot.git
Cloning into 'transformer-chatbot'...
remote: Enumerating objects: 46, done.
remote: Counting objects: 100% (46/46), done.
remote: Compressing objects: 100% (46/46), done.
Unpacking objects:  21% (10/46)   
!ls transformer-chatbot
data  getData.py  main.py  model  model.py  modules.py    train.py  utils.py
%cd transformer-chatbot/
!python main.py
# %load transformer-chatbot/utils.py
import codecs
import csv
import array
import numpy as np
import tensorflow as tf
import re
import math
import random
import jieba
import logging
import os

def create_model_and_embedding(session, Model_class, path, config, is_train):
    model = Model_class(config, is_train)
    ckpt = tf.train.get_checkpoint_state(path)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        session.run(tf.global_variables_initializer())
    return model

def save_model(sess, model, path, logger):
    checkpoint_path = os.path.join(path, "chatbot.ckpt")
    model.saver.save(sess, checkpoint_path)
    logger.info("model saved")

def load_sor_vocab():
    vocab = [line.split()[0] for line in codecs.open('data/vocab.tsv', 'r', 'utf-8').read().splitlines()]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for idx, word in enumerate(vocab)}
    return word2idx, idx2word

def load_mub_vocab():
    vocab = [line.split()[0] for line in codecs.open('data/vocab.answer.tsv', 'r', 'utf-8').read().splitlines()]
    #word2idx = {word: idx for idx, word in enumerate(vocab)}
    #idx2word = {idx: word for idx, word in enumerate(vocab)}
    #return word2idx, idx2word

def load_sentences(sor_path, mub_path):
    de_sents = [line.strip().replace('\r', '') for line in codecs.open(sor_path, 'r', 'utf-8').read().split("\n")]
    en_sents = [line.strip().replace('\r', '') for line in codecs.open(mub_path, 'r', 'utf-8').read().split("\n")]
    de_sents = [' '.join([i for i in line.strip()]) for line in de_sents]
    en_sents = [' '.join([i for i in line.strip()]) for line in en_sents]
    X, Y, Sources, Targets = create_data(de_sents, en_sents)
    return X, Y

def create_data(source_sents, target_sents):
    word2id, id2word = load_sor_vocab()
    #mub2id, id2mud = load_mub_vocab()
    x_list, y_list, Sources, Targets = [], [], [], []
    for source_sent, target_sent in zip(source_sents, target_sents):
        x = [word2id.get(word, 1) for word in source_sent.split()]              # 1: OOV
        y = [word2id.get(word, 1) for word in (target_sent + " </S>").split()]  # </S>: end of text
        if max(len(x), len(y)) <= 20:
            x_list.append(np.array(x))
            y_list.append(np.array(y))
            Sources.append(source_sent)
            Targets.append(target_sent)
    return x_list, y_list, Sources, Targets

# Instantiate the logger
def get_logger(log_file):
    logger = logging.getLogger(log_file)
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler(log_file)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    ch.setFormatter(formatter)
    fh.setFormatter(formatter)
    logger.addHandler(ch)
    logger.addHandler(fh)
    return logger

def input_from_line(line, char_to_id):
    inputs = list()
    # Strip spaces from the input
    line = line.replace(" ", "")
    # Look up each character in the vocabulary; unknown characters map to <UNK>
    ids = [char_to_id[char] if char in char_to_id else char_to_id["<UNK>"] for char in line]
    #+[char_to_id['</S>']]
    inputs.append([ids])
    inputs.append([line])
    return inputs

class BatchManager(object):
    def __init__(self, sor_data, mub_data, batch_size):
        self.batch_data = self.sort_and_pad(sor_data, mub_data, batch_size)
        self.len_data = len(self.batch_data)

    def sort_and_pad(self, sor_data, mub_data, batch_size):
        alldata = []
        for ask, answer in zip(sor_data, mub_data):
            sentence = []
            sentence.append(ask)
            sentence.append(answer)
            alldata.append(sentence)
        num_batch = int(math.ceil(len(alldata) / batch_size))

        #sorted_data = sorted(sor_data, key=lambda x: len(x[0]))
        #sorted_data = sor_data

        random.shuffle(alldata)
        batch_data = []
        for i in range(num_batch):
            batch_data.append(self.pad_data(alldata[i*int(batch_size) : (i+1)*int(batch_size)]))
        return batch_data

    @staticmethod
    def pad_data(data):
        ask, answer = [], []
        max_sor = max([len(sentence[0]) for sentence in data])
        max_mub = max([len(sentence[1]) for sentence in data])
        for line in data:
            qpadding = [0] * (max_sor - len(line[0]))
            ask.append(list(line[0]) + qpadding)
            apadding = [0] * (max_mub - len(line[1]))
            answer.append(list(line[1]) + apadding)
        return [ask, answer]

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for idx in range(self.len_data):
            yield self.batch_data[idx]
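To make the padding logic concrete, here is a minimal usage sketch with toy ID sequences (not from the corpus). It assumes the repository has been cloned and its dependencies, such as TensorFlow and jieba, are installed, since importing utils pulls them in:

# Minimal sketch: BatchManager right-pads every batch with 0 to the longest sequence.
from utils import BatchManager

ask_ids = [[5, 7, 9], [4, 2]]          # two fake "ask" sequences, already mapped to IDs
answer_ids = [[8, 3], [6, 1, 2, 3]]    # two fake "answer" sequences

manager = BatchManager(ask_ids, answer_ids, batch_size=2)
for ask_batch, answer_batch in manager.iter_batch(shuffle=False):
    print(ask_batch)     # e.g. [[5, 7, 9], [4, 2, 0]]        -> padded to the longest ask (3)
    print(answer_batch)  # e.g. [[8, 3, 0, 0], [6, 1, 2, 3]]  -> padded to the longest answer (4)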
# %load transformer-chatbot/getData.py
# encoding=utf-8
import os, sys, csv
import numpy as np
import pandas as pd
import codecs
import tensorflow as tf
from modules import *

def full_to_half(s):
    """
    Convert full-width characters to half-width characters.
    """
    n = []
    for char in s:
        num = ord(char)
        if num == 0x3000:
            num = 32
        elif 0xFF01 <= num <= 0xFF5E:
            num -= 0xfee0
        char = chr(num)
        n.append(char)
    return ''.join(n)

def replace_html(s):
    s = s.replace('&quot;', '"')
    s = s.replace('&amp;', '&')
    s = s.replace('&lt;', '<')
    s = s.replace('&gt;', '>')
    s = s.replace('&nbsp;', ' ')
    s = s.replace("&ldquo;", "")
    s = s.replace("&rdquo;", "")
    s = s.replace("&mdash;", "")
    s = s.replace("\xa0", " ")
    return s

def setdata(line):
    # Strip punctuation marks and quotes
    line = line.replace('。', '')
    line = line.replace('?', '')
    line = line.replace('!', '')
    line = line.replace(',', '')
    line = line.replace('.', '')
    line = line.replace(',', '')
    line = line.replace('?', '')
    line = line.replace('!', '')
    line = line.replace('“', '')
    line = line.replace('”', '')
    return line

'''
y = tf.constant([[4,2,3,4,5,6,7,8,9]])
enc = embedding(y,
                vocab_size=20,
                num_units=8,
                scale=True,
                scope="enc_embed")

key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(enc), axis=-1)), -1)
with tf.Session() as sess:
    initall = tf.global_variables_initializer()
    sess.run(initall)
    print(sess.run(key_masks))
'''

# Count character frequencies and (re)build the vocabulary file
vocab = {line.split()[0]: int(line.split()[1]) for line in codecs.open('data/vocab.tsv', 'r', 'utf-8').read().splitlines()}
fp = codecs.open('data/train.answer.tsv', 'r', encoding='utf-8-sig').read().split('\n')
#vocab = {}
for w in fp:
    for i in w.strip():
        if i in vocab.keys():
            vocab[i] += 1
        else:
            vocab[i] = 1

with open('data/vocab.tsv', 'w', encoding='utf-8') as fa:
    for k, v in vocab.items():
        strs = k + ' ' + str(v)
        fa.write(strs + '\n')
fa.close()

'''
fp = codecs.open('data/xiaohuangji50w_nofenci.conv','r',encoding='utf-8')
i = 1
asks = []
answers = []
sentence = []
for k,w in enumerate(fp):
    w = w.strip()
    if k > 0:
        if "M" not in w and w != 'E':
            continue
        if i%3 == 0:
            sentence[1] = sentence[1].replace(' ','')
            sentence[2] = sentence[2].replace(' ','')
            if sentence[1][1:] != '' and sentence[2][1:] != '':
                asks.append(sentence[1][1:])
                answers.append(sentence[2][1:])
            sentence = []
            i = 1
            sentence.append(w)
        else:
            i += 1
            sentence.append(w)
    else:
        sentence.append(w)
asks = list(filter(None,asks))
answers = list(filter(None,answers))
'''

# Split the raw dialogue file into alternating ask/answer lines and clean them
fp = codecs.open('data/123.txt', 'r', encoding='utf-8-sig')
i = 1
asks = []
answers = []
for k, w in enumerate(fp):
    w = w.strip()
    w = full_to_half(w)
    w = replace_html(w)
    w = setdata(w)
    if k % 2 == 0:
        asks.append(w)
    else:
        answers.append(w)

with open('data/train.ask.tsv', 'w', encoding='utf-8') as fa:
    for w in asks:
        fa.write(w + '\n')
with open('data/train.answer.tsv', 'w', encoding='utf-8') as fs:
    for w in answers:
        fs.write(w + '\n')
fa.close()
fs.close()
print('ok')
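As a rough illustration of the cleaning pipeline, the sketch below chains the three helpers on one sentence. Note that importing getData also runs its top-level file-reading code, so in practice you may want to move that code under an `if __name__ == '__main__':` guard first; that refactoring is assumed here.

# Sketch only: full-width punctuation is normalised, HTML entities resolved, punctuation stripped.
from getData import full_to_half, replace_html, setdata

raw = '你好！&nbsp;今天ＯＫ吗？'
cleaned = setdata(replace_html(full_to_half(raw)))
print(cleaned)   # -> '你好 今天OK吗'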
# %load transformer-chatbot/model.py
import numpy as np
import tensorflow as tf
from utils import load_sor_vocab, load_mub_vocab
from tensorflow.contrib.layers.python.layers import initializers
from modules import *

class Model(object):
    def __init__(self, config, is_train=True):
        self.is_train = is_train
        self.config = config
        self.lr = config["learning_rate"]
        self.maxlen = config['sequence_length']
        self.dropout_rate = config['dropout_rate']
        self.hidden_units = config['hidden_units']
        self.num_blocks = config['num_blocks']
        self.num_heads = config['num_heads']

        self.global_step = tf.Variable(0, trainable=False)
        # Encoder input placeholder (question token IDs)
        self.sor_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='sorinput')
        # Target placeholder (answer token IDs)
        self.out_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='outinput')
        # Decoder input: targets shifted right by one, with the <S> token (id 2) prepended
        self.decode_input = tf.concat((tf.ones_like(self.out_inputs[:, :1])*2, self.out_inputs[:, :-1]), -1)
        word2id, id2word = load_sor_vocab()

        # Encoder
        with tf.variable_scope("encoder"):
            self.enc = embedding(self.sor_inputs, len(word2id), self.hidden_units, scale=True, scope="enc_embed")
            key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1)), -1)
            # Positional Encoding
            if False:
                self.enc += positional_encoding(self.sor_inputs, num_units=self.hidden_units, zero_pad=False, scale=False, scope="enc_pe")
            else:
                self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.sor_inputs)[1]), 0), [tf.shape(self.sor_inputs)[0], 1]),
                                      vocab_size=self.maxlen, num_units=self.hidden_units, zero_pad=False, scale=False, scope="enc_pe")

            self.enc *= key_masks
            # Dropout
            self.enc = tf.layers.dropout(self.enc, rate=self.dropout_rate, training=tf.convert_to_tensor(self.is_train))
            # Blocks
            for i in range(self.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    # Multihead Attention
                    self.enc = multihead_attention(queries=self.enc, keys=self.enc, num_units=self.hidden_units,
                                                   num_heads=self.num_heads, dropout_rate=self.dropout_rate,
                                                   is_training=self.is_train, causality=False)
                    # Feed Forward
                    self.enc = feedforward(self.enc, num_units=[4*self.hidden_units, self.hidden_units])

        # Decoder
        with tf.variable_scope("decoder"):
            # Embedding
            self.dec = embedding(self.decode_input, vocab_size=len(word2id), num_units=self.hidden_units, scale=True, scope="dec_embed")
            key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)
            # Positional Encoding
            if False:
                self.dec += positional_encoding(self.decode_input, vocab_size=self.maxlen, num_units=self.hidden_units, zero_pad=False, scale=False, scope="dec_pe")
            else:
                self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decode_input)[1]), 0), [tf.shape(self.decode_input)[0], 1]),
                                      vocab_size=self.maxlen, num_units=self.hidden_units, zero_pad=False, scale=False, scope="dec_pe")
            self.dec *= key_masks
            # Dropout
            self.dec = tf.layers.dropout(self.dec, rate=self.dropout_rate, training=tf.convert_to_tensor(self.is_train))
            # Blocks
            for i in range(self.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    # Multihead Attention (self-attention)
                    self.dec = multihead_attention(queries=self.dec, keys=self.dec, num_units=self.hidden_units,
                                                   num_heads=self.num_heads, dropout_rate=self.dropout_rate,
                                                   is_training=self.is_train, causality=True, scope="self_attention")
                    # Multihead Attention (vanilla attention over the encoder output)
                    self.dec = multihead_attention(queries=self.dec, keys=self.enc, num_units=self.hidden_units,
                                                   num_heads=self.num_heads, dropout_rate=self.dropout_rate,
                                                   is_training=self.is_train, causality=False, scope="vanilla_attention")
                    # Feed Forward
                    self.dec = feedforward(self.dec, num_units=[4*self.hidden_units, self.hidden_units])

        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(word2id))
        self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
        self.istarget = tf.to_float(tf.not_equal(self.out_inputs, 0))
        self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.out_inputs))*self.istarget) / (tf.reduce_sum(self.istarget))
        #tf.summary.scalar('acc', self.acc)

        # Loss
        self.y_smoothed = label_smoothing(tf.one_hot(self.out_inputs, depth=len(word2id)))
        self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
        self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget))

        # Optimizer
        with tf.variable_scope('optimizer'):
            self.optimizer = tf.train.AdamOptimizer(self.lr)  # , beta1=0.9, beta2=0.98, epsilon=1e-8
            grads_vars = self.optimizer.compute_gradients(self.loss)
            capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
            self.train_op = self.optimizer.apply_gradients(capped_grads_vars, self.global_step)
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

    def create_feed_dict(self, is_train, batch):
        if is_train:
            ask, answer = batch
            feed_dict = {
                self.sor_inputs: np.asarray(ask),
                self.out_inputs: np.asarray(answer)
            }
        else:
            ask, _ = batch
            feed_dict = {
                #self.sor_inputs: np.asarray(ask),
                #self.out_inputs: np.zeros((1, len(ask[0])), np.int32)
            }
        return feed_dict

    def run_step(self, sess, is_train, batch):
        feed_dict = self.create_feed_dict(is_train, batch)
        if is_train:
            global_step, y_smoothed, loss, logits, preds, _ = sess.run(
                [self.global_step, self.y_smoothed, self.mean_loss, self.logits, self.preds, self.train_op], feed_dict)
            return global_step, loss
        else:
            # Greedy decoding: fill one position of the fixed length-20 output buffer per step
            ask, _ = batch
            preds = np.ones((1, 20), np.int32)
            #preds[:,0] = 2
            #preds[:,19] = 3
            for i in range(20):
                _preds = sess.run(self.preds, {self.sor_inputs: np.asarray(ask), self.out_inputs: preds})
                preds[:, i] = _preds[:, i]
            #preds = sess.run([self.preds], feed_dict)
            return preds

    def evaluate_line(self, sess, inputs):
        probs = self.run_step(sess, False, inputs)
        return probs
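Two details of model.py are easy to miss: decode_input is the target sequence shifted right by one position with the start token id 2 (<S>) prepended, and inference in run_step decodes greedily, filling one position of the length-20 preds buffer per step while re-running the decoder each time. A small NumPy sketch of the shift, with toy IDs only:

import numpy as np

# Toy target batch: two answers of length 4, already mapped to IDs (0 = padding).
out_inputs = np.array([[7, 8, 9, 3],
                       [5, 6, 3, 0]])

# Same operation as tf.concat((tf.ones_like(out_inputs[:, :1]) * 2, out_inputs[:, :-1]), -1):
# drop the last column and prepend the <S> token id (2).
decode_input = np.concatenate([np.full((out_inputs.shape[0], 1), 2), out_inputs[:, :-1]], axis=-1)
print(decode_input)
# [[2 7 8 9]
#  [2 5 6 3]]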
# %load transformer-chatbot/modules.py
#/usr/bin/python2
'''
June 2017 by kyubyong park.
kbpark.linguist@gmail.com.
https://www.github.com/kyubyong/transformer
'''

from __future__ import print_function
import numpy as np
import tensorflow as tf

def normalize(inputs,
              epsilon=1e-8,
              scope="ln",
              reuse=None):
    '''Applies layer normalization.

    Args:
      inputs: A tensor with 2 or more dimensions, where the first dimension has
        `batch_size`.
      epsilon: A floating number. A very small number for preventing ZeroDivision Error.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A tensor with the same shape and data dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.Variable(tf.zeros(params_shape))
        gamma = tf.Variable(tf.ones(params_shape))
        normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
        outputs = gamma * normalized + beta

    return outputs

def embedding(inputs,
              vocab_size,
              num_units,
              zero_pad=True,
              scale=True,
              scope="embedding",
              reuse=None):
    '''Embeds a given tensor.

    Args:
      inputs: A `Tensor` with type `int32` or `int64` containing the ids
        to be looked up in `lookup table`.
      vocab_size: An int. Vocabulary size.
      num_units: An int. Number of embedding hidden units.
      zero_pad: A boolean. If True, all the values of the first row (id 0)
        should be constant zeros.
      scale: A boolean. If True, the outputs are multiplied by sqrt(num_units).
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A `Tensor` with one more rank than inputs's. The last dimensionality
        should be `num_units`.

    For example,

    import tensorflow as tf

    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print sess.run(outputs)
    >>
    [[[ 0.          0.        ]
      [ 0.09754146  0.67385566]
      [ 0.37864095 -0.35689294]]

     [[-1.01329422 -1.09939694]
      [ 0.7521342   0.38203377]
      [-0.04973143 -0.06210355]]]

    import tensorflow as tf

    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=False)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print sess.run(outputs)
    >>
    [[[-0.19172323 -0.39159766]
      [-0.43212751 -0.66207761]
      [ 1.03452027 -0.26704335]]

     [[-0.11634696 -0.35983452]
      [ 0.50208133  0.53509563]
      [ 1.22204471 -0.96587461]]]
    '''
    with tf.variable_scope(scope, reuse=reuse):
        lookup_table = tf.get_variable('lookup_table',
                                       dtype=tf.float32,
                                       shape=[vocab_size, num_units],
                                       initializer=tf.contrib.layers.xavier_initializer())
        if zero_pad:
            lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, inputs)

        if scale:
            outputs = outputs * (num_units ** 0.5)

    return outputs


def positional_encoding(inputs,
                        num_units,
                        zero_pad=True,
                        scale=True,
                        scope="positional_encoding",
                        reuse=None):
    '''Sinusoidal Positional_Encoding.

    Args:
      inputs: A 2d Tensor with shape of (N, T).
      num_units: Output dimensionality.
      zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero.
      scale: Boolean. If True, the output will be multiplied by sqrt(num_units) (check details in the paper).
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A 'Tensor' with one more rank than inputs's, with the last dimensionality being 'num_units'.
    '''

    N, T = inputs.get_shape().as_list()
    with tf.variable_scope(scope, reuse=reuse):
        position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])

        # First part of the PE function: sin and cos argument
        position_enc = np.array([
            [pos / np.power(10000, 2.*i/num_units) for i in range(num_units)]
            for pos in range(T)])

        # Second part, apply the cosine to even columns and sin to odds.
        position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
        position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1

        # Convert to a tensor
        lookup_table = tf.convert_to_tensor(position_enc)

        if zero_pad:
            lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
                                      lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, position_ind)

        if scale:
            outputs = outputs * num_units**0.5

    return outputs


def multihead_attention(queries,
                        keys,
                        num_units=None,
                        num_heads=8,
                        dropout_rate=0,
                        is_training=True,
                        causality=False,
                        scope="multihead_attention",
                        reuse=None):
    '''Applies multihead attention.

    Args:
      queries: A 3d tensor with shape of [N, T_q, C_q].
      keys: A 3d tensor with shape of [N, T_k, C_k].
      num_units: A scalar. Attention size.
      dropout_rate: A floating point number.
      is_training: Boolean. Controller of mechanism for dropout.
      causality: Boolean. If true, units that reference the future are masked.
      num_heads: An int. Number of heads.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A 3d tensor with shape of (N, T_q, C)
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Set the fall back option for num_units
        if num_units is None:
            num_units = queries.get_shape().as_list()[-1]

        # Linear projections
        Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu)  # (N, T_q, C)
        K = tf.layers.dense(keys, num_units, activation=tf.nn.relu)     # (N, T_k, C)
        V = tf.layers.dense(keys, num_units, activation=tf.nn.relu)     # (N, T_k, C)

        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)

        # Multiplication
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)

        # Scale
        outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)

        # Key Masking
        key_masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1))  # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)

        paddings = tf.ones_like(outputs)*(-2**32+1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Causality = Future blinding
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :])  # (T_q, T_k)
            tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()  # (T_q, T_k)
            masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1])  # (h*N, T_q, T_k)

            paddings = tf.ones_like(masks)*(-2**32+1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Activation
        outputs = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)

        # Query Masking
        query_masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1))  # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
        outputs *= query_masks  # broadcasting. (N, T_q, C)

        # Dropouts
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))

        # Weighted sum
        outputs = tf.matmul(outputs, V_)  # (h*N, T_q, C/h)

        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)

        # Residual connection
        outputs += queries

        # Normalize
        outputs = normalize(outputs)  # (N, T_q, C)

    return outputs

def feedforward(inputs,
                num_units=[2048, 512],
                scope="multihead_attention",
                reuse=None):
    '''Point-wise feed forward net.

    Args:
      inputs: A 3d tensor with shape of [N, T, C].
      num_units: A list of two integers.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A 3d tensor with the same shape and dtype as inputs
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Inner layer
        params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1,
                  "activation": tf.nn.relu, "use_bias": True}
        outputs = tf.layers.conv1d(**params)

        # Readout layer
        params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1,
                  "activation": None, "use_bias": True}
        outputs = tf.layers.conv1d(**params)

        # Residual connection
        outputs += inputs

        # Normalize
        outputs = normalize(outputs)

    return outputs

def label_smoothing(inputs, epsilon=0.1):
    '''Applies label smoothing. See https://arxiv.org/abs/1512.00567.

    Args:
      inputs: A 3d tensor with shape of [N, T, V], where V is the number of vocabulary.
      epsilon: Smoothing rate.

    For example,

    import tensorflow as tf
    inputs = tf.convert_to_tensor([[[0, 0, 1],
                                    [0, 1, 0],
                                    [1, 0, 0]],

                                   [[1, 0, 0],
                                    [1, 0, 0],
                                    [0, 1, 0]]], tf.float32)
    outputs = label_smoothing(inputs)

    with tf.Session() as sess:
        print(sess.run([outputs]))
    >>
    [array([[[ 0.03333334,  0.03333334,  0.93333334],
             [ 0.03333334,  0.93333334,  0.03333334],
             [ 0.93333334,  0.03333334,  0.03333334]],

            [[ 0.93333334,  0.03333334,  0.03333334],
             [ 0.93333334,  0.03333334,  0.03333334],
             [ 0.03333334,  0.93333334,  0.03333334]]], dtype=float32)]
    '''
    K = inputs.get_shape().as_list()[-1]  # number of channels
    return ((1-epsilon) * inputs) + (epsilon / K)
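For reference, the formulas these helpers implement, summarised in LaTeX (the positional_encoding code above uses a close variant of the sinusoidal formula, computing the 10000 exponent per dimension index before splitting into sin/cos):

\mathrm{Attention}(Q,K,V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V

PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right), \qquad PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right)

y'_k = (1-\epsilon)\,y_k + \frac{\epsilon}{K}, \qquad \epsilon = 0.1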
# %load transformer-chatbot/train.py
#/usr/bin/python2
'''
June 2017 by kyubyong park.
kbpark.linguist@gmail.com.
https://www.github.com/kyubyong/transformer
'''
from __future__ import print_function
import tensorflow as tf

from hyperparams import Hyperparams as hp
from data_load import get_batch_data, load_de_vocab, load_en_vocab
from modules import *
import os, codecs
from tqdm import tqdm

class Graph():
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
            else:  # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1])*2, self.y[:, :-1]), -1)  # 2:<S>

            # Load vocabulary
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            # Encoder
            with tf.variable_scope("encoder"):
                ## Embedding
                self.enc = embedding(self.x,
                                     vocab_size=len(de2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="enc_embed")

                key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1)), -1)

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="enc_pe")
                else:
                    self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                self.enc *= key_masks

                ## Dropout
                self.enc = tf.layers.dropout(self.enc,
                                             rate=hp.dropout_rate,
                                             training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### Multihead Attention
                        self.enc = multihead_attention(queries=self.enc,
                                                       keys=self.enc,
                                                       num_units=hp.hidden_units,
                                                       num_heads=hp.num_heads,
                                                       dropout_rate=hp.dropout_rate,
                                                       is_training=is_training,
                                                       causality=False)

                        ### Feed Forward
                        self.enc = feedforward(self.enc, num_units=[4*hp.hidden_units, hp.hidden_units])

            # Decoder
            with tf.variable_scope("decoder"):
                ## Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    vocab_size=hp.maxlen,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")
                self.dec *= key_masks

                ## Dropout
                self.dec = tf.layers.dropout(self.dec,
                                             rate=hp.dropout_rate,
                                             training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(queries=self.dec,
                                                       keys=self.dec,
                                                       num_units=hp.hidden_units,
                                                       num_heads=hp.num_heads,
                                                       dropout_rate=hp.dropout_rate,
                                                       is_training=is_training,
                                                       causality=True,
                                                       scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(queries=self.dec,
                                                       keys=self.enc,
                                                       num_units=hp.hidden_units,
                                                       num_heads=hp.num_heads,
                                                       dropout_rate=hp.dropout_rate,
                                                       is_training=is_training,
                                                       causality=False,
                                                       scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(self.dec, num_units=[4*hp.hidden_units, hp.hidden_units])

            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y))*self.istarget) / (tf.reduce_sum(self.istarget))
            tf.summary.scalar('acc', self.acc)

            if is_training:
                # Loss
                self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget))

                # Training Scheme
                self.global_step = tf.Variable(0, name='global_step', trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
                self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)

                # Summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()

if __name__ == '__main__':
    # Load vocabulary
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    # Construct graph
    g = Graph("train"); print("Graph loaded")

    # Start session
    sv = tf.train.Supervisor(graph=g.graph, logdir=hp.logdir, save_model_secs=0)
    with sv.managed_session() as sess:
        for epoch in range(1, hp.num_epochs+1):
            if sv.should_stop(): break
            for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'):
                sess.run(g.train_op)

            gs = sess.run(g.global_step)
            sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs))

    print("Done")
# %load transformer-chatbot/main.py
from __future__ import print_function
import tensorflow as tf
import os, codecs, sys
import numpy as np
import pandas as pd
from utils import load_sentences, BatchManager, create_model_and_embedding, get_logger, save_model, input_from_line, load_sor_vocab, load_mub_vocab
from model import Model
from flask import Flask, jsonify, request
from collections import OrderedDict

flags = tf.app.flags
flags.DEFINE_integer("block", 6, "layer size")
flags.DEFINE_integer("sequence_length", 20, "maximum sequence length")
flags.DEFINE_integer("steps_check", 10, "steps per checkpoint")
flags.DEFINE_integer("num_of_epoch", 100000, "epoch number")
flags.DEFINE_integer("batch_size", 64, "batch size")
flags.DEFINE_integer('hidden_units', 128, 'hidden units')
flags.DEFINE_integer('num_blocks', 6, 'number of encoder/decoder blocks')
flags.DEFINE_integer('num_heads', 8, 'number of attention heads')
flags.DEFINE_float("dropout_rate", 0.0, "dropout rate")

flags.DEFINE_string("model_path", "model/", "model directory")
flags.DEFINE_string("train_sor_path", "data/train.ask.tsv", "train file path (questions)")
flags.DEFINE_string("train_mub_path", "data/train.answer.tsv", "train file path (answers)")
flags.DEFINE_string("logger_path", "logger/train.log", "log file path")
flags.DEFINE_float("learning_rate", 0.00001, "learning rate")
flags.DEFINE_string("optimizer", "adam", "optimizer for training")
flags.DEFINE_boolean('flag', True, ' ')
FLAGS = tf.app.flags.FLAGS
app = Flask(__name__)

def config_model():
    config = OrderedDict()
    config["optimizer"] = FLAGS.optimizer
    config["layer_size"] = FLAGS.block
    config["sequence_length"] = FLAGS.sequence_length
    config["batch_size"] = FLAGS.batch_size
    config["hidden_units"] = FLAGS.hidden_units
    config["num_blocks"] = FLAGS.num_blocks
    config["num_heads"] = FLAGS.num_heads
    config["dropout_rate"] = FLAGS.dropout_rate

    config["train_sor_path"] = FLAGS.train_sor_path
    config["train_mub_path"] = FLAGS.train_mub_path
    config["model_path"] = FLAGS.model_path
    config["logger_path"] = FLAGS.logger_path
    config["learning_rate"] = FLAGS.learning_rate
    config['flag'] = FLAGS.flag
    return config

def train():
    # Load the training sentences and convert them to trainable ID sequences
    train_sor_data, train_mub_data = load_sentences(FLAGS.train_sor_path, FLAGS.train_mub_path)
    # Split the training data into batches
    train_manager = BatchManager(train_sor_data, train_mub_data, FLAGS.batch_size)
    # GPU settings
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # Collect the FLAGS into a config dict
    config = config_model()
    logger = get_logger(config["logger_path"])
    # Number of batches per epoch
    word2id, id2word = load_sor_vocab()
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model_and_embedding(sess, Model, FLAGS.model_path, config, True)
        logger.info("start training")
        loss = []
        with tf.device('/gpu:0'):
            for i in range(FLAGS.num_of_epoch):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{},chatbot loss:{:>9.6f}".format(
                            iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                        loss = []
                if i % 10 == 0:
                    save_model(sess, model, FLAGS.model_path, logger)

def predict():
    word2id, id2word = load_sor_vocab()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    config = config_model()
    logger = get_logger(config["logger_path"])
    graph = tf.Graph()
    sess = tf.Session(graph=graph, config=tf_config)
    with graph.as_default():
        sess.run(tf.global_variables_initializer())
        model = create_model_and_embedding(sess, Model, FLAGS.model_path, config, False)
        sys.stdout.write('请输入测试句子:')
        sys.stdout.flush()
        sentences = sys.stdin.readline()
        while True:
            sentences = sentences.replace('\n', '')
            rs = model.evaluate_line(sess, input_from_line(sentences, word2id))
            res = ''.join([id2word[w] for w in rs[0]]).split('</S>')[0].strip()
            print(res)
            print('请输入测试句子:', end='')
            sys.stdout.flush()
            sentences = sys.stdin.readline()
        print('ok')

def main(_):
    predict()

if __name__ == '__main__':
    tf.app.run(main)
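As written, main only calls predict(), so `python main.py` goes straight into the interactive chat loop; train() is defined but never invoked. One possible local edit (a hypothetical variant, not part of the repo) is to reuse the FLAGS.flag boolean defined above, which is stored in the config but not otherwise read, to choose between the two:

# Hypothetical variant of main(): train when FLAGS.flag is True, otherwise chat.
def main(_):
    if FLAGS.flag:
        train()
    else:
        predict()

if __name__ == '__main__':
    tf.app.run(main)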