基于Transformer的聊天机器人构建

代码大家可以参考Transformer-in-generating-dialogue的实现，基于上述代码的小黄鸡对话语料构建聊天机器人版本可以参考transformer-chatbot

1	!git clone https://github.com/dengqiqi123/transformer-chatbot.git

Cloning into 'transformer-chatbot'...
remote: Enumerating objects: 46, done.[K
remote: Counting objects: 100% (46/46), done.[K
remote: Compressing objects: 100% (46/46), done.[K
Unpacking objects:  21% (10/46)

1	!ls transformer-chatbot

data  getData.py  main.py  model  model.py  modules.py    train.py  utils.py

1 2	%cd transformer-chatbot/ !python main.py

# %load transformer-chatbot/utils.py
import codecs
import csv
import array
import numpy as np
import tensorflow as tf
import re
import math
import random
import jieba
import logging
import os
def create_model_and_embedding(session,Model_class,path,config,is_train):
    model = Model_class(config,is_train)
    ckpt = tf.train.get_checkpoint_state(path) 
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        session.run(tf.global_variables_initializer())
    return model 
def save_model(sess, model, path,logger):
    checkpoint_path = os.path.join(path, "chatbot.ckpt")
    model.saver.save(sess, checkpoint_path)
    logger.info("model saved")
def load_sor_vocab():
    vocab = [line.split()[0] for line in codecs.open('data/vocab.tsv', 'r', 'utf-8').read().splitlines()]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for idx, word in enumerate(vocab)}
    return word2idx, idx2word    
def load_mub_vocab():   
    vocab = [line.split()[0] for line in codecs.open('data/vocab.answer.tsv', 'r', 'utf-8').read().splitlines()]
    #word2idx = {word: idx for idx, word in enumerate(vocab)}
    #idx2word = {idx: word for idx, word in enumerate(vocab)}
    #return word2idx, idx2word    
def load_sentences(sor_path,mub_path):
    de_sents = [line.strip().replace('\r','') for line in codecs.open(sor_path, 'r', 'utf-8').read().split("\n")]
    en_sents = [line.strip().replace('\r','') for line in codecs.open(mub_path, 'r', 'utf-8').read().split("\n")]
    de_sents = [' '.join([i for i in line.strip()])  for line in de_sents]
    en_sents = [' '.join([i for i in line.strip()])  for line in en_sents]
    X, Y, Sources, Targets = create_data(de_sents, en_sents)
    return X, Y 
def create_data(source_sents, target_sents):
    word2id,id2word = load_sor_vocab()
    #mub2id,id2mud = load_mub_vocab()
    x_list, y_list, Sources, Targets = [], [], [], []
    for source_sent, target_sent in zip(source_sents, target_sents):
        x = [word2id.get(word, 1) for word in (source_sent).split()] # 1: OOV, </S>: End of Text
        y = [word2id.get(word, 1) for word in (target_sent+" </S>").split()] 
        if max(len(x), len(y)) <= 20:
            x_list.append(np.array(x))
            y_list.append(np.array(y))
            Sources.append(source_sent)
            Targets.append(target_sent)
    return x_list, y_list, Sources, Targets 
#实例化日志类
def get_logger(log_file):
    logger = logging.getLogger(log_file)
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler(log_file)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    ch.setFormatter(formatter)
    fh.setFormatter(formatter)
    logger.addHandler(ch)
    logger.addHandler(fh)
    return logger
def input_from_line(line, char_to_id):
    inputs = list()
    #把空格替换为$
    line = line.replace(" ", "")    
    #查字典，把输入字符中能查到字典的字符转换为ID值，查不到的字标记为<UNK>
    ids = [char_to_id[char] if char in char_to_id else char_to_id["<UNK>"] for char in line] 
    #+[char_to_id['</S>']]
    inputs.append([ids])
    inputs.append([line])
    return inputs
class BatchManager(object):
    def __init__(self, sor_data,mub_data,batch_size):
        self.batch_data = self.sort_and_pad(sor_data,mub_data,batch_size)
        self.len_data = len(self.batch_data)
    def sort_and_pad(self,sor_data,mub_data, batch_size):
        alldata = []
        for ask,answer in zip(sor_data, mub_data):
            sentence = []
            sentence.append(ask)
            sentence.append(answer)
            alldata.append(sentence)
        num_batch = int(math.ceil(len(alldata) /batch_size))
        
        #sorted_data = sorted(sor_data, key=lambda x: len(x[0]))
        #sorted_data = sor_data
               
        random.shuffle(alldata)
        batch_data = []
        for i in range(num_batch):
            batch_data.append(self.pad_data(alldata[i*int(batch_size) : (i+1)*int(batch_size)]))
        return batch_data
    @staticmethod
    def pad_data(data):
        ask,answer = [],[]
        max_sor = max([len(sentence[0]) for sentence in data])
        max_mub = max([len(sentence[1]) for sentence in data])
        for line in data:
            qpadding = [0] * (max_sor- len(line[0]))
            ask.append(list(line[0])+qpadding)
            apadding = [0] * (max_mub - len(line[1]))
            answer.append(list(line[1])+apadding)            
        return [ask,answer]
    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for idx in range(self.len_data):
            yield self.batch_data[idx]

# %load transformer-chatbot/getData.py
#enconding=utf-8
import os,sys,csv
import numpy as np
import pandas as pd
import codecs
import tensorflow as tf
from modules import *

def full_to_half(s):
    """
    将全角字符转换为半角字符 
    """
    n = []
    for char in s:
        num = ord(char)
        if num == 0x3000:
            num = 32
        elif 0xFF01 <= num <= 0xFF5E:
            num -= 0xfee0
        char = chr(num)
        n.append(char)
    return ''.join(n)

def replace_html(s):
    s = s.replace('&quot;','"')
    s = s.replace('&amp;','&')
    s = s.replace('&lt;','<')
    s = s.replace('&gt;','>')
    s = s.replace('&nbsp;',' ')
    s = s.replace("&ldquo;", "")
    s = s.replace("&rdquo;", "")
    s = s.replace("&mdash;","")
    s = s.replace("\xa0", " ")
    return(s)
def setdata(line):
    line = line.replace('。','')
    line = line.replace('？','')
    line = line.replace('！','')
    line = line.replace('，','')
    line = line.replace('.','')
    line = line.replace(',','')
    line = line.replace('?','')
    line = line.replace('!','')
    line = line.replace('“','')
    line = line.replace('”','')
    return line
'''
y = tf.constant([[4,2,3,4,5,6,7,8,9]])
enc = embedding(y, 
                         vocab_size=20, 
                                  num_units=8, 
                                  scale=True,
                                  scope="enc_embed")

key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(enc), axis=-1)), -1)
with tf.Session() as sess:
    initall = tf.global_variables_initializer()
    sess.run(initall)
    print(sess.run(key_masks))
'''
vocab = {line.split()[0]:int(line.split()[1]) for line in codecs.open('data/vocab.tsv', 'r', 'utf-8').read().splitlines()}
fp = codecs.open('data/train.answer.tsv','r',encoding='utf-8-sig').read().split('\n')
#vocab = {}
for w in fp:
    for i in w.strip():
        if i in vocab.keys():
            vocab[i] += 1
        else:
            vocab[i] = 1

with open('data/vocab.tsv','w',encoding='utf-8') as fa:
    for k,v in vocab.items():
        strs = k+' '+str(v)
        fa.write(strs+'\n')
fa.close()
'''
fp = codecs.open('data/xiaohuangji50w_nofenci.conv','r',encoding='utf-8')
i = 1
asks = []
answers = []
sentence = []
for k,w in enumerate(fp):
    w = w.strip()
    if k > 0:
        if "M" not in w and w != 'E':
            continue        
        if i%3 == 0:
            sentence[1] = sentence[1].replace(' ','')
            sentence[2] = sentence[2].replace(' ','')
            if sentence[1][1:] != '' and sentence[2][1:] != '':
                asks.append(sentence[1][1:])
                answers.append(sentence[2][1:])
            sentence = []
            i = 1
            sentence.append(w)
        else:
            i += 1
            sentence.append(w)
    else:
        sentence.append(w)
asks = list(filter(None,asks))
answers = list(filter(None,answers))
'''
fp = codecs.open('data/123.txt','r',encoding='utf-8-sig')
i = 1
asks = []
answers = []
for k,w in enumerate(fp):
    w = w.strip()
    w = full_to_half(w)
    w = replace_html(w)    
    w = setdata(w)
    if k%2 == 0:
        asks.append(w)
    else:
        answers.append(w)
with open('data/train.ask.tsv','w',encoding='utf-8') as fa:
    for w in asks:
        fa.write(w+'\n')
with open('data/train.answer.tsv','w',encoding='utf-8') as fs:
    for w in answers:
        fs.write(w+'\n')
fa.close()
fs.close()
print('ok')

# %load transformer-chatbot/model.py
import numpy as np
import tensorflow as tf
from utils import load_sor_vocab,load_mub_vocab
from tensorflow.contrib.layers.python.layers import initializers
from modules import *

class Model(object):
    def __init__(self, config,is_train=True):
        self.is_train =  is_train
        self.config = config
        self.lr = config["learning_rate"]
        self.maxlen = config['sequence_length']
        self.dropout_rate = config['dropout_rate']
        self.hidden_units = config['hidden_units']
        self.num_blocks = config['num_blocks']
        self.num_heads = config['num_heads']        
        
        self.global_step = tf.Variable(0,trainable=False)  
        #定义编码输入input
        self.sor_inputs = tf.placeholder(dtype=tf.int32,shape=[None,None],name='sorinput')
        #定义编码输入output
        self.out_inputs = tf.placeholder(dtype=tf.int32,shape=[None,None],name='outinput')
        self.decode_input = tf.concat((tf.ones_like(self.out_inputs[:, :1])*2, self.out_inputs[:, :-1]), -1)
        word2id,id2word = load_sor_vocab()
        # Encoder
        with tf.variable_scope("encoder"):
            self.enc = embedding(self.sor_inputs, len(word2id), self.hidden_units,scale=True,scope="enc_embed")
            key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1)), -1)
            # Positional Encoding
            if False:
                self.enc += positional_encoding(self.sor_inputs,num_units=self.hidden_units,zero_pad=False,scale=False,scope="enc_pe")
            else:
                self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.sor_inputs)[1]), 0), [tf.shape(self.sor_inputs)[0], 1]),vocab_size=self.maxlen, 
                                      num_units=self.hidden_units,zero_pad=False,scale=False,scope="enc_pe")

            self.enc *= key_masks
            # Dropout
            self.enc = tf.layers.dropout(self.enc,rate=self.dropout_rate,training=tf.convert_to_tensor(self.is_train))   
            # Blocks
            for i in range(self.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    # Multihead Attention
                    self.enc = multihead_attention(queries=self.enc,keys=self.enc,num_units=self.hidden_units,num_heads=self.num_heads,dropout_rate=self.dropout_rate,is_training=self.is_train,
                                                                causality=False)
                    # Feed Forward
                    self.enc = feedforward(self.enc, num_units=[4*self.hidden_units, self.hidden_units])  
        #Decode
        with tf.variable_scope("decoder"):
            # Embedding
            self.dec = embedding(self.decode_input,vocab_size=len(word2id),num_units=self.hidden_units,scale=True,scope="dec_embed") 
            key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)
            # Positional Encoding
            if False:
                self.dec += positional_encoding(self.decode_input,vocab_size=self.maxlen,num_units=self.hidden_units,zero_pad=False,scale=False,scope="dec_pe")
            else:
                self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decode_input)[1]), 0), [tf.shape(self.decode_input)[0], 1]),vocab_size=self.maxlen,num_units=self.hidden_units, 
                                              zero_pad=False, 
                                              scale=False,
                                              scope="dec_pe")
            self.dec *= key_masks 
            # Dropout
            self.dec = tf.layers.dropout(self.dec,rate=self.dropout_rate,training=tf.convert_to_tensor(self.is_train)) 
            # Blocks
            for i in range(self.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    # Multihead Attention ( self-attention)
                    self.dec = multihead_attention(queries=self.dec,keys=self.dec,num_units=self.hidden_units,num_heads=self.num_heads, dropout_rate=self.dropout_rate,is_training=self.is_train,
                                                                causality=True, 
                                                                scope="self_attention")
                    # Multihead Attention ( vanilla attention)
                    self.dec = multihead_attention(queries=self.dec,keys=self.enc,num_units=self.hidden_units,num_heads=self.num_heads,dropout_rate=self.dropout_rate,is_training=self.is_train, 
                                                                causality=False,
                                                                scope="vanilla_attention")
                    # Feed Forward
                    self.dec = feedforward(self.dec, num_units=[4*self.hidden_units, self.hidden_units]) 
        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(word2id))
        self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
        self.istarget = tf.to_float(tf.not_equal(self.out_inputs, 0))
        self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.out_inputs))*self.istarget)/ (tf.reduce_sum(self.istarget))
        #tf.summary.scalar('acc', self.acc)   
        # Loss
        self.y_smoothed = label_smoothing(tf.one_hot(self.out_inputs, depth=len(word2id)))
        self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
        self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget))
       
        # 定义优化器
        with tf.variable_scope('optimizer'):
            self.optimizer = tf.train.AdamOptimizer(self.lr)#, beta1=0.9, beta2=0.98, epsilon=1e-8
            grads_vars = self.optimizer.compute_gradients(self.loss)
            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]        
            self.train_op = self.optimizer.apply_gradients(capped_grads_vars,self.global_step)
        self.saver = tf.train.Saver(tf.global_variables(),max_to_keep=1)
    def create_feed_dict(self,is_train,batch):
        if is_train:
            ask,answer = batch
            feed_dict = {
                self.sor_inputs: np.asarray(ask),
                self.out_inputs: np.asarray(answer)
            }
        else:
            ask,_ = batch
            feed_dict = {
            #self.sor_inputs: np.asarray(ask),
            #self.out_inputs:np.zeros((1, len(ask[0])), np.int32)
            }            
        return feed_dict        
    def run_step(self,sess,is_train,batch):
        feed_dict = self.create_feed_dict(is_train,batch)
        if is_train:
            global_step,y_smoothed,loss,logits,preds,_ = sess.run([self.global_step,self.y_smoothed,self.mean_loss,self.logits,self.preds,self.train_op],feed_dict)                 
            return global_step, loss
        else:
            ask,_ = batch
            preds = np.ones((1,20), np.int32)
            #preds[:,0] = 2
            #preds[:,19] = 3
            for i in range(20):
                _preds = sess.run(self.preds, {self.sor_inputs: np.asarray(ask), self.out_inputs:preds})
                preds[:,i] = _preds[:,i]                
            #preds = sess.run([self.preds], feed_dict)
            return preds
    def evaluate_line(self, sess, inputs):
        probs = self.run_step(sess, False, inputs)
        return probs

# %load transformer-chatbot/modules.py
#/usr/bin/python2
'''
June 2017 by kyubyong park. 
kbpark.linguist@gmail.com.
https://www.github.com/kyubyong/transformer
'''

from __future__ import print_function
import tensorflow as tf

def normalize(inputs, 
              epsilon = 1e-8,
              scope="ln",
              reuse=None):
    '''Applies layer normalization.
    
    Args:
      inputs: A tensor with 2 or more dimensions, where the first dimension has
        `batch_size`.
      epsilon: A floating number. A very small number for preventing ZeroDivision Error.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
      
    Returns:
      A tensor with the same shape and data dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]
    
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta= tf.Variable(tf.zeros(params_shape))
        gamma = tf.Variable(tf.ones(params_shape))
        normalized = (inputs - mean) / ( (variance + epsilon) ** (.5) )
        outputs = gamma * normalized + beta
        
    return outputs

def embedding(inputs, 
              vocab_size, 
              num_units, 
              zero_pad=True, 
              scale=True,
              scope="embedding", 
              reuse=None):
    '''Embeds a given tensor.

    Args:
      inputs: A `Tensor` with type `int32` or `int64` containing the ids
         to be looked up in `lookup table`.
      vocab_size: An int. Vocabulary size.
      num_units: An int. Number of embedding hidden units.
      zero_pad: A boolean. If True, all the values of the fist row (id 0)
        should be constant zeros.
      scale: A boolean. If True. the outputs is multiplied by sqrt num_units.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A `Tensor` with one more rank than inputs's. The last dimensionality
        should be `num_units`.
        
    For example,

import tensorflow as tf

inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
outputs = embedding(inputs, 6, 2, zero_pad=True)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print sess.run(outputs)
>>
[[[ 0.          0.        ]
  [ 0.09754146  0.67385566]
  [ 0.37864095 -0.35689294]]

 [[-1.01329422 -1.09939694]
  [ 0.7521342   0.38203377]
  [-0.04973143 -0.06210355]]]
1
 


import tensorflow as tf

inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
outputs = embedding(inputs, 6, 2, zero_pad=False)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print sess.run(outputs)
>>
[[[-0.19172323 -0.39159766]
  [-0.43212751 -0.66207761]
  [ 1.03452027 -0.26704335]]

 [[-0.11634696 -0.35983452]
  [ 0.50208133  0.53509563]
  [ 1.22204471 -0.96587461]]]    
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
    '''
    with tf.variable_scope(scope, reuse=reuse):
        lookup_table = tf.get_variable('lookup_table',
                                       dtype=tf.float32,
                                       shape=[vocab_size, num_units],
                                       initializer=tf.contrib.layers.xavier_initializer())
        if zero_pad:
            lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, inputs)
        
        if scale:
            outputs = outputs * (num_units ** 0.5) 
            
    return outputs
    

def positional_encoding(inputs,
                        num_units,
                        zero_pad=True,
                        scale=True,
                        scope="positional_encoding",
                        reuse=None):
    '''Sinusoidal Positional_Encoding.

    Args:
      inputs: A 2d Tensor with shape of (N, T).
      num_units: Output dimensionality
      zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero
      scale: Boolean. If True, the output will be multiplied by sqrt num_units(check details from paper)
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
        A 'Tensor' with one more rank than inputs's, with the dimensionality should be 'num_units'
    '''

    N, T = inputs.get_shape().as_list()
    with tf.variable_scope(scope, reuse=reuse):
        position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])

        # First part of the PE function: sin and cos argument
        position_enc = np.array([
            [pos / np.power(10000, 2.*i/num_units) for i in range(num_units)]
            for pos in range(T)])

        # Second part, apply the cosine to even columns and sin to odds.
        position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
        position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1

        # Convert to a tensor
        lookup_table = tf.convert_to_tensor(position_enc)

        if zero_pad:
            lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
                                      lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, position_ind)

        if scale:
            outputs = outputs * num_units**0.5

        return outputs



def multihead_attention(queries, 
                        keys, 
                        num_units=None, 
                        num_heads=8, 
                        dropout_rate=0,
                        is_training=True,
                        causality=False,
                        scope="multihead_attention", 
                        reuse=None):
    '''Applies multihead attention.
    
    Args:
      queries: A 3d tensor with shape of [N, T_q, C_q].
      keys: A 3d tensor with shape of [N, T_k, C_k].
      num_units: A scalar. Attention size.
      dropout_rate: A floating point number.
      is_training: Boolean. Controller of mechanism for dropout.
      causality: Boolean. If true, units that reference the future are masked. 
      num_heads: An int. Number of heads.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
        
    Returns
      A 3d tensor with shape of (N, T_q, C)  
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Set the fall back option for num_units
        if num_units is None:
            num_units = queries.get_shape().as_list()[-1]
        
        # Linear projections
        Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C)
        K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C)
        V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C)
        
        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h) 
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 

        # Multiplication
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k)
        
        # Scale
        outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)
        
        # Key Masking
        key_masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1)) # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k)
        
        paddings = tf.ones_like(outputs)*(-2**32+1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k)
  
        # Causality = Future blinding
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k)
            tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense() # (T_q, T_k)
            masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k)
   
            paddings = tf.ones_like(masks)*(-2**32+1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k)
  
        # Activation
        outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k)
         
        # Query Masking
        query_masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1)) # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k)
        outputs *= query_masks # broadcasting. (N, T_q, C)
          
        # Dropouts
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))
               
        # Weighted sum
        outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h)
        
        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2 ) # (N, T_q, C)
              
        # Residual connection
        outputs += queries
              
        # Normalize
        outputs = normalize(outputs) # (N, T_q, C)
 
    return outputs

def feedforward(inputs, 
                num_units=[2048, 512],
                scope="multihead_attention", 
                reuse=None):
    '''Point-wise feed forward net.
    
    Args:
      inputs: A 3d tensor with shape of [N, T, C].
      num_units: A list of two integers.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
        
    Returns:
      A 3d tensor with the same shape and dtype as inputs
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Inner layer
        params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1,"activation": tf.nn.relu, "use_bias": True}
        outputs = tf.layers.conv1d(**params)
        
        # Readout layer
        params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1,"activation": None, "use_bias": True}
        outputs = tf.layers.conv1d(**params)
        
        # Residual connection
        outputs += inputs
        
        # Normalize
        outputs = normalize(outputs)
    
    return outputs

def label_smoothing(inputs, epsilon=0.1):
    '''Applies label smoothing. See https://arxiv.org/abs/1512.00567.
    
    Args:
      inputs: A 3d tensor with shape of [N, T, V], where V is the number of vocabulary.
      epsilon: Smoothing rate.
    
    For example,


import tensorflow as tf
inputs = tf.convert_to_tensor([[[0, 0, 1], 
   [0, 1, 0],
   [1, 0, 0]],

  [[1, 0, 0],
   [1, 0, 0],
   [0, 1, 0]]], tf.float32)

outputs = label_smoothing(inputs)

with tf.Session() as sess:
    print(sess.run([outputs]))

>>
[array([[[ 0.03333334,  0.03333334,  0.93333334],
    [ 0.03333334,  0.93333334,  0.03333334],
    [ 0.93333334,  0.03333334,  0.03333334]],

   [[ 0.93333334,  0.03333334,  0.03333334],
    [ 0.93333334,  0.03333334,  0.03333334],
    [ 0.03333334,  0.93333334,  0.03333334]]], dtype=float32)]   
1
2
3
'''
K = inputs.get_shape().as_list()[-1] # number of channels
return ((1-epsilon) * inputs) + (epsilon / K)

# %load transformer-chatbot/train.py
#/usr/bin/python2
'''
June 2017 by kyubyong park. 
kbpark.linguist@gmail.com.
https://www.github.com/kyubyong/transformer
'''
from __future__ import print_function
import tensorflow as tf

from hyperparams import Hyperparams as hp
from data_load import get_batch_data, load_de_vocab, load_en_vocab
from modules import *
import os, codecs
from tqdm import tqdm

class Graph():
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data() # (N, T)
            else: # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1])*2, self.y[:, :-1]), -1) # 2:<S>

            # Load vocabulary    
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()
            
            # Encoder
            with tf.variable_scope("encoder"):
                ## Embedding
                self.enc = embedding(self.x, 
                                      vocab_size=len(de2idx), 
                                      num_units=hp.hidden_units, 
                                      scale=True,
                                      scope="enc_embed")

                key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1)), -1)

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="enc_pe")
                else:
                    self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.maxlen, 
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="enc_pe")

                self.enc *= key_masks
                 
                ## Dropout
                self.enc = tf.layers.dropout(self.enc, 
                                            rate=hp.dropout_rate, 
                                            training=tf.convert_to_tensor(is_training))
                
                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### Multihead Attention
                        self.enc = multihead_attention(queries=self.enc, 
                                                        keys=self.enc, 
                                                        num_units=hp.hidden_units, 
                                                        num_heads=hp.num_heads, 
                                                        dropout_rate=hp.dropout_rate,
                                                        is_training=is_training,
                                                        causality=False)
                        
                        ### Feed Forward
                        self.enc = feedforward(self.enc, num_units=[4*hp.hidden_units, hp.hidden_units])
            
            # Decoder
            with tf.variable_scope("decoder"):
                ## Embedding
                self.dec = embedding(self.decoder_inputs, 
                                      vocab_size=len(en2idx), 
                                      num_units=hp.hidden_units,
                                      scale=True, 
                                      scope="dec_embed")

                key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                      vocab_size=hp.maxlen, 
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]),
                                      vocab_size=hp.maxlen, 
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="dec_pe")
                self.dec *= key_masks
                
                ## Dropout
                self.dec = tf.layers.dropout(self.dec, 
                                            rate=hp.dropout_rate, 
                                            training=tf.convert_to_tensor(is_training))
                
                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(queries=self.dec, 
                                                        keys=self.dec, 
                                                        num_units=hp.hidden_units, 
                                                        num_heads=hp.num_heads, 
                                                        dropout_rate=hp.dropout_rate,
                                                        is_training=is_training,
                                                        causality=True, 
                                                        scope="self_attention")
                        
                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(queries=self.dec, 
                                                        keys=self.enc, 
                                                        num_units=hp.hidden_units, 
                                                        num_heads=hp.num_heads,
                                                        dropout_rate=hp.dropout_rate,
                                                        is_training=is_training, 
                                                        causality=False,
                                                        scope="vanilla_attention")
                        
                        ## Feed Forward
                        self.dec = feedforward(self.dec, num_units=[4*hp.hidden_units, hp.hidden_units])
                
            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y))*self.istarget)/ (tf.reduce_sum(self.istarget))
            tf.summary.scalar('acc', self.acc)
                
            if is_training:  
                # Loss
                self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget))
               
                # Training Scheme
                self.global_step = tf.Variable(0, name='global_step', trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
                self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)
                # Summary 
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()

if __name__ == '__main__':                
    # Load vocabulary    
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    
    # Construct graph
    g = Graph("train"); print("Graph loaded")
    
    # Start session
    sv = tf.train.Supervisor(graph=g.graph,logdir=hp.logdir,save_model_secs=0)
    with sv.managed_session() as sess:
        for epoch in range(1, hp.num_epochs+1): 
            if sv.should_stop(): break
            for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'):
                sess.run(g.train_op)
                
            gs = sess.run(g.global_step)   
            sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs))
    
    print("Done")

# %load transformer-chatbot/main.py
from __future__ import print_function
import tensorflow as tf
import os, codecs,sys
import numpy as np
import pandas as pd
from utils import load_sentences,BatchManager,create_model_and_embedding,get_logger,save_model,input_from_line,load_sor_vocab,load_mub_vocab
from model import Model
from flask import Flask, jsonify, request
from collections import OrderedDict

flags = tf.app.flags
flags.DEFINE_integer("block",6,"layer_size")
flags.DEFINE_integer("sequence_length",20,"word vector dim")
flags.DEFINE_integer("steps_check", 10, "steps per checkpoint")
flags.DEFINE_integer("num_of_epoch", 100000, "epoch number")
flags.DEFINE_integer("batch_size",64 ,"word vector dim")
flags.DEFINE_integer('hidden_units',128,'   ')
flags.DEFINE_integer('num_blocks',6,'   ')
flags.DEFINE_integer('num_heads',8,'   ')
flags.DEFINE_float("dropout_rate", 0.0, "Learning rate")

flags.DEFINE_string("model_path","model/","vocab file path")
flags.DEFINE_string("train_sor_path","data/train.ask.tsv","train file path")
flags.DEFINE_string("train_mub_path","data/train.answer.tsv","train file path")
flags.DEFINE_string("logger_path","logger/train.log","vocab file path")
flags.DEFINE_float("learning_rate", 0.00001, "Learning rate")
flags.DEFINE_string("optimizer",    "adam",     "Optimizer for training")
flags.DEFINE_boolean('flag',True,' ')
FLAGS = tf.app.flags.FLAGS
app = Flask(__name__)
def config_model():
    config = OrderedDict()
    config["optimizer"] = FLAGS.optimizer
    config["layer_size"] = FLAGS.block
    config["sequence_length"] = FLAGS.sequence_length
    config["batch_size"] = FLAGS.batch_size
    config["hidden_units"] = FLAGS.hidden_units
    config["num_blocks"] = FLAGS.num_blocks
    config["num_heads"] = FLAGS.num_heads
    config["dropout_rate"] = FLAGS.dropout_rate    
    
    config["train_sor_path"] = FLAGS.train_sor_path
    config["train_mub_path"] = FLAGS.train_mub_path
    config["model_path"] = FLAGS.model_path
    config["logger_path"] = FLAGS.logger_path
    config["learning_rate"] = FLAGS.learning_rate
    config['flag'] = FLAGS.flag
    return config
def train():
    #加载训练数据并生成可训练数据
    train_sor_data,train_mub_data = load_sentences(FLAGS.train_sor_path,FLAGS.train_mub_path)
    #将训练数据处理成N批次数据
    train_manager = BatchManager(train_sor_data,train_mub_data, FLAGS.batch_size)
    #设置gpu参数
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    #加载FLAGS参数
    config = config_model()
    logger = get_logger(config["logger_path"])
    #计算批次数
    word2id,id2word = load_sor_vocab() 
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model_and_embedding(sess, Model, FLAGS.model_path, config,True)
        logger.info("start training")
        loss = []  
        with tf.device('/gpu:0'):
            for i in range(FLAGS.num_of_epoch):
                for batch in train_manager.iter_batch(shuffle=True):
                    step,batch_loss = model.run_step(sess,True,batch)
                    loss.append(batch_loss)
                    if step%FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{},chatbot loss:{:>9.6f}".format(iteration, step%steps_per_epoch, steps_per_epoch, np.mean(loss)))
                        loss = []
                if i%10 == 0:
                    save_model(sess, model, FLAGS.model_path,logger) 
def predict():
    word2id,id2word = load_sor_vocab()   
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    config = config_model() 
    logger = get_logger(config["logger_path"])  
    graph = tf.Graph()
    sess = tf.Session(graph=graph,config=tf_config)
    with graph.as_default():
        sess.run(tf.global_variables_initializer())
        model = create_model_and_embedding(sess, Model, FLAGS.model_path, config,False)
        sys.stdout.write('请输入测试句子：')
        sys.stdout.flush()
        sentences = sys.stdin.readline()
        while True:
            sentences = sentences.replace('\n','')        
            rs = model.evaluate_line(sess,input_from_line(sentences,word2id))
            res = ''.join([id2word[w] for w in rs[0]]).split('</S>')[0].strip()
            print(res)
            print('请输入测试句子：',end='')
            sys.stdout.flush()
            sentences = sys.stdin.readline()            
        print('ok')
def main(_):
    predict()
if __name__ == '__main__':
    tf.app.run(main)