构建于Ubuntu对话数据集上的基于检索的聊天机器人
提示:如果大家觉得计算资源有限,欢迎大家在“科学上网”后免费试用Google的Colab,那里有免费的K80 GPU供大家使用,大家只需要把课程的notebook上传即可运行。
完整的数据可以在Google Drive文件夹中找到:https://drive.google.com/open?id=1RIIbsS-vxR7Dlo2_v6FWHDFE7q1XPPgj
1) 下载 以下文件:
- glove.6B.50d.txt (Subfolder GloVe)
- training_10000.csv (Subfolder MAIN FILES)
- validation_1000.csv (Subfolder MAIN FILES)
- testing_same_structure_1000.csv (Subfolder MAIN FILES)
- testing_different_structure_100.csv (Subfolder MAIN FILES)
- saved_model_10000_gpu.pt (Subfolder SAVED MODELS)
2) 调整变量:代码中出现的 num_training_examples、num_validation_examples、embedding_dim、test_dataframe_same_structure、test_dataframe_different_structure 以及保存模型的文件名(saved model file name),都可以根据数据量的大小进行调整。
3) 调整超参数设置 :具体模型的参数大家可以自己调整,也可以参考SAVED MODELS文件夹下的内容,你可以找到模型截图 ,做和它一样的设定,大家也可以复现本notebook的结果。
# Mount Google Drive so the notebook can reach the dataset files (Colab only).
from google.colab import drive

drive.mount('/content/gdrive')
1 !ls /content/gdrive/My\ Drive/Dialogue\ Files\
GloVe 'MAIN FILES' 'Original Files' 'SAVED MODELS'
1 !pip install torch torchvision
import collections
import datetime
import operator

import numpy as np
import pandas as pd
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.utils.rnn
from torch.nn import init

np.random.seed(0)
# Helper functions that build the variables used during training and validation.


def create_dataframe(csvfile):
    """Read ``csvfile`` with ``pandas.read_csv`` and return the DataFrame."""
    return pd.read_csv(csvfile)


def shuffle_dataframe(dataframe):
    """Shuffle the rows of ``dataframe`` in place and return it.

    ``DataFrame.reindex`` returns a new object, so calling it and dropping
    the result leaves the caller's frame untouched. The columns are
    overwritten here instead, so callers that ignore the return value still
    see the shuffled rows. The index labels keep their original order; only
    the row contents move.
    """
    order = np.random.permutation(len(dataframe))
    shuffled = dataframe.iloc[order]
    # Assign column by column so every column keeps its own dtype.
    for column in dataframe.columns:
        dataframe[column] = shuffled[column].values
    return dataframe


def create_vocab(dataframe):
    """Build the vocabulary from the "Context" and "Utterance" columns.

    Words are lower-cased and ordered by descending frequency (ties keep
    first-seen order). Index 0 is reserved for the "<UNK>" token.
    """
    word_freq = collections.Counter()
    for _, row in dataframe.iterrows():
        words = str(row["Context"]).split() + str(row["Utterance"]).split()
        # Count under the lower-cased key; indexing with the raw word raised
        # KeyError for any repeated word that contained an uppercase letter.
        word_freq.update(word.lower() for word in words)
    return ["<UNK>"] + [word for word, _ in word_freq.most_common()]


def create_word_to_id(vocab):
    """Map every word of ``vocab`` to its position in the list."""
    return {word: index for index, word in enumerate(vocab)}


def create_id_to_vec(word_to_id, glovefile):
    """Build a ``{word id: FloatTensor}`` table from a GloVe text file.

    Ids whose word has no GloVe entry get a small random vector
    (standard normal * 0.01) of the same dimensionality.

    Returns:
        ``(id_to_vec, embedding_dim)`` where ``embedding_dim`` is the length
        of the vector stored for id 0.

    Raises:
        ValueError: if ``glovefile`` contains no vectors at all.
    """
    id_to_vec = {}
    vector_shape = None
    # GloVe files are UTF-8; the context manager also closes the handle.
    with open(glovefile, 'r', encoding='utf-8') as glove:
        for line in glove:
            fields = line.split()
            if not fields:
                continue
            # As before, the last line read decides the fallback shape.
            vector_shape = (len(fields) - 1,)
            word = fields[0]
            if word in word_to_id:
                vector = np.array(fields[1:], dtype='float32')
                id_to_vec[word_to_id[word]] = torch.from_numpy(vector)

    if vector_shape is None:
        raise ValueError("no word vectors found in %r" % (glovefile,))

    for word_id in word_to_id.values():
        if word_id not in id_to_vec:
            noise = (np.random.randn(*vector_shape) * 0.01).astype('float32')
            id_to_vec[word_id] = torch.from_numpy(noise)

    embedding_dim = id_to_vec[0].shape[0]
    return id_to_vec, embedding_dim


def load_ids_and_labels(row, word_to_id):
    """Convert one (Context, Utterance, Label) row into id lists and a label.

    The context is truncated to its first 160 words. Words are lower-cased
    before the lookup because ``create_vocab`` stores lower-cased words;
    anything not in ``word_to_id`` maps to id 0 ("<UNK>").

    Returns:
        ``(context_ids, response_ids, label)`` with ``label`` a float32
        NumPy scalar array.
    """
    max_context_len = 160
    # str() matches create_vocab and tolerates non-string cells such as NaN.
    context_words = str(row['Context']).split()[:max_context_len]
    response_words = str(row['Utterance']).split()

    context_ids = [word_to_id.get(word.lower(), 0) for word in context_words]
    response_ids = [word_to_id.get(word.lower(), 0) for word in response_words]
    label = np.array(row['Label']).astype(np.float32)
    return context_ids, response_ids, label
# Model definition.


class Encoder(nn.Module):
    """Embedding + single-layer LSTM that encodes a sequence of word ids.

    Args:
        emb_size: dimensionality of the word embeddings.
        hidden_size: size of the LSTM hidden state.
        vocab_size: number of rows in the embedding matrix.
        p_dropout: dropout probability applied to the final hidden state.
        pretrained_vectors: optional ``{word id: 1-D tensor}`` mapping used
            to initialise the embedding matrix. When omitted, the
            module-level ``id_to_vec`` is used.
    """

    def __init__(self, emb_size, hidden_size, vocab_size, p_dropout,
                 pretrained_vectors=None):
        super(Encoder, self).__init__()

        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.p_dropout = p_dropout

        self.embedding = nn.Embedding(self.vocab_size, self.emb_size)
        self.lstm = nn.LSTM(self.emb_size, self.hidden_size)
        self.dropout_layer = nn.Dropout(self.p_dropout)

        self.init_weights(pretrained_vectors)

    def init_weights(self, pretrained_vectors=None):
        """Initialise the LSTM weights and load the pretrained embeddings."""
        # The in-place (underscore) initialisers replace the deprecated
        # init.uniform / init.orthogonal spellings.
        init.uniform_(self.lstm.weight_ih_l0, a=-0.01, b=0.01)
        init.orthogonal_(self.lstm.weight_hh_l0)
        self.lstm.weight_ih_l0.requires_grad = True
        self.lstm.weight_hh_l0.requires_grad = True

        if pretrained_vectors is None:
            # Fall back to the module-level table built by create_id_to_vec.
            pretrained_vectors = id_to_vec

        # Start from zeros so an id missing from the table cannot end up
        # with uninitialised memory as its embedding.
        embedding_weights = torch.zeros(self.vocab_size, self.emb_size)
        for word_id, vec in pretrained_vectors.items():
            embedding_weights[word_id] = vec

        self.embedding.weight = nn.Parameter(embedding_weights, requires_grad=True)

    def forward(self, inputs):
        """Return the dropped-out final hidden state of the last LSTM layer."""
        embeddings = self.embedding(inputs)
        _, (last_hidden, _) = self.lstm(embeddings)
        last_hidden = self.dropout_layer(last_hidden[-1])
        return last_hidden


class DualEncoder(nn.Module):
    """Scores a (context, response) pair as ``context^T M response``.

    Both inputs go through the same ``encoder``. ``M`` is a learned
    ``hidden_size x hidden_size`` matrix. The returned score is a raw,
    unsquashed value of shape ``(batch, 1)``.
    """

    def __init__(self, encoder):
        super(DualEncoder, self).__init__()
        self.encoder = encoder
        self.hidden_size = self.encoder.hidden_size

        M = torch.empty(self.hidden_size, self.hidden_size)
        init.xavier_normal_(M)
        self.M = nn.Parameter(M, requires_grad=True)

    def forward(self, context_tensor, response_tensor):
        """Encode both sequences and return their bilinear matching score."""
        context_last_hidden = self.encoder(context_tensor)
        response_last_hidden = self.encoder(response_tensor)

        context = context_last_hidden.mm(self.M)
        # Reshape to (batch, 1, hidden) x (batch, hidden, 1) for a batched dot product.
        context = context.view(-1, 1, self.hidden_size)
        response = response_last_hidden.view(-1, self.hidden_size, 1)

        score = torch.bmm(context, response).view(-1, 1)
        return score
数据与变量构建 定义函数去调用所有的helper函数,以便完成各种数据和变量初始化,以及部分的预训练词向量加载等
def creating_variables(num_training_examples, num_validation_examples, embedding_dim):
    """Load the CSV data and build the vocabulary and embedding lookups.

    Reads ``training_<n>.csv``, ``validation_<n>.csv`` and
    ``glove.6B.<dim>d.txt`` from the working directory.

    Returns:
        ``(training_dataframe, vocab, word_to_id, id_to_vec, emb_dim,
        validation_dataframe)``.
    """
    def stamp():
        # Current time with the sub-second part stripped.
        return str(datetime.datetime.now()).split('.')[0]

    print(stamp(), "Creating variables for training and validation...")

    train_df = create_dataframe('training_%d.csv' % num_training_examples)
    words = create_vocab(train_df)
    lookup = create_word_to_id(words)
    vectors, dim = create_id_to_vec(lookup, 'glove.6B.%dd.txt' % embedding_dim)

    valid_df = create_dataframe('validation_%d.csv' % num_validation_examples)

    print(stamp(), "Variables created.\n")

    return train_df, words, lookup, vectors, dim, valid_df
模型构建 调用Encoder和DualEncoder去构建模型
def creating_model(hidden_size, p_dropout):
    """Build the Encoder and wrap it in a DualEncoder.

    Uses the module-level ``emb_dim`` and ``vocab`` for the embedding size
    and vocabulary size. Prints the model and returns
    ``(encoder, dual_encoder)``.
    """
    def stamp():
        # Current time with the sub-second part stripped.
        return str(datetime.datetime.now()).split('.')[0]

    print(stamp(), "Calling model...")

    encoder_kwargs = dict(
        emb_size=emb_dim,
        hidden_size=hidden_size,
        vocab_size=len(vocab),
        p_dropout=p_dropout,
    )
    enc = Encoder(**encoder_kwargs)
    model = DualEncoder(enc)

    print(stamp(), "Model created.\n")
    print(model)

    return enc, model
def increase_count(correct_count, score, label):
    """Return ``correct_count`` plus one if the prediction matches ``label``.

    ``score`` is the raw output of the dual encoder. The model is trained
    with ``BCEWithLogitsLoss``, so that output is a logit and the
    0.5-probability decision boundary sits at logit 0, not at 0.5. Only the
    first element of ``score`` and ``label`` is inspected.
    """
    logit = float(score.data[0][0])
    target = float(label.data[0][0])

    # sigmoid(logit) >= 0.5  <=>  logit >= 0
    predicted_positive = logit >= 0.0
    predicted_negative = logit < 0.0

    if (predicted_positive and target == 1.0) or (predicted_negative and target == 0.0):
        correct_count += 1
    return correct_count


def get_accuracy(correct_count, dataframe):
    """Return ``correct_count / len(dataframe)``, or 0.0 for an empty dataset."""
    total = len(dataframe)
    if total == 0:
        return 0.0
    return correct_count / total
模型训练 构建模型训练函数
def _example_to_tensors(row):
    """Turn one dataframe row into (context, response, label) model inputs."""
    context_ids, response_ids, label = load_ids_and_labels(row, word_to_id)
    # view(-1, 1): one column of word ids, i.e. a sequence with batch size 1.
    context = torch.LongTensor(context_ids).view(-1, 1)
    response = torch.LongTensor(response_ids).view(-1, 1)
    label = torch.from_numpy(np.array(label, dtype=np.float32).reshape(1, 1))
    return context, response, label


def train_model(learning_rate, l2_penalty, epochs):
    """Train the module-level ``dual_encoder`` and checkpoint the best epoch.

    Each epoch makes one pass over ``training_dataframe`` (one example per
    optimiser step, in a fresh random order) followed by one pass over
    ``validation_dataframe``. Whenever validation accuracy improves, the
    model's state dict is saved to ``saved_model_<n>_examples.pt``, where
    ``n`` is the number of training examples.

    Args:
        learning_rate: Adam learning rate.
        l2_penalty: Adam ``weight_decay``.
        epochs: number of passes over the training data.
    """
    print(str(datetime.datetime.now()).split('.')[0], "Starting training and validation...\n")
    print("====================Data and Hyperparameter Overview====================\n")
    print("Number of training examples: %d, Number of validation examples: %d" %(len(training_dataframe), len(validation_dataframe)))
    print("Learning rate: %.5f, Embedding Dimension: %d, Hidden Size: %d, Dropout: %.2f, L2:%.10f\n" %(learning_rate, emb_dim, encoder.hidden_size, encoder.p_dropout, l2_penalty))
    print("================================Results...==============================\n")

    optimizer = torch.optim.Adam(dual_encoder.parameters(), lr=learning_rate, weight_decay=l2_penalty)
    loss_func = torch.nn.BCEWithLogitsLoss()

    best_validation_accuracy = 0.0

    for epoch in range(epochs):
        # Iterate over a freshly permuted view each epoch. The previous
        # shuffle call discarded its result, so the order never changed.
        shuffled_training = training_dataframe.iloc[
            np.random.permutation(len(training_dataframe))
        ]

        sum_loss_training = 0.0
        training_correct_count = 0

        dual_encoder.train()

        for _, row in shuffled_training.iterrows():
            context, response, label = _example_to_tensors(row)

            score = dual_encoder(context, response)
            loss = loss_func(score, label)

            # .item() reads the scalar loss; indexing a 0-dim tensor with
            # [0] is an error on current PyTorch.
            sum_loss_training += loss.item()

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            training_correct_count = increase_count(training_correct_count, score, label)

        training_accuracy = get_accuracy(training_correct_count, training_dataframe)

        # Validation metrics do not depend on row order, so no shuffle here.
        validation_correct_count = 0
        sum_loss_validation = 0.0

        dual_encoder.eval()

        # No gradients are needed for evaluation.
        with torch.no_grad():
            for _, row in validation_dataframe.iterrows():
                context, response, label = _example_to_tensors(row)

                score = dual_encoder(context, response)
                loss = loss_func(score, label)

                sum_loss_validation += loss.item()

                validation_correct_count = increase_count(validation_correct_count, score, label)

        validation_accuracy = get_accuracy(validation_correct_count, validation_dataframe)

        print(str(datetime.datetime.now()).split('.')[0],
              "Epoch: %d/%d" %(epoch,epochs),
              "TrainLoss: %.3f" %(sum_loss_training/len(training_dataframe)),
              "TrainAccuracy: %.3f" %(training_accuracy),
              "ValLoss: %.3f" %(sum_loss_validation/len(validation_dataframe)),
              "ValAccuracy: %.3f" %(validation_accuracy))

        if validation_accuracy > best_validation_accuracy:
            best_validation_accuracy = validation_accuracy
            torch.save(dual_encoder.state_dict(), 'saved_model_%d_examples.pt' %(len(training_dataframe)))
            print("New best found and saved.")

    print(str(datetime.datetime.now()).split('.')[0], "Training and validation epochs finished.")
# Load the data and build the vocabulary and embedding lookups.
(training_dataframe, vocab, word_to_id,
 id_to_vec, emb_dim, validation_dataframe) = creating_variables(
    num_training_examples=10000,
    num_validation_examples=1000,
    embedding_dim=50,
)
设定hidden size和dropout概率,构建模型
encoder, dual_encoder = creating_model(hidden_size=50, p_dropout=0.85)

# Print the name of every parameter that will be updated during training.
for name, param in dual_encoder.named_parameters():
    if not param.requires_grad:
        continue
    print(name)
train_model(
    learning_rate=0.0001,
    l2_penalty=0.0001,
    epochs=100,
)

# Reload the best checkpoint written during training and switch to eval mode.
best_state = torch.load('saved_model_10000_examples.pt')
dual_encoder.load_state_dict(best_state)
dual_encoder.eval()
测试数据集与训练集、验证集有着相同的数据组织格式 (context, response, label)
Loading data:
# Test set with the same (Context, Utterance, Label) layout as the training data.
test_dataframe_same_structure = pd.read_csv(
    'testing_same_structure_1000.csv'
)
def testing_same_structure():
    """Return the accuracy of ``dual_encoder`` on ``test_dataframe_same_structure``.

    Each row is scored individually and compared with its label via
    ``increase_count``. Gradients are disabled because nothing is trained here.
    """
    test_correct_count = 0

    with torch.no_grad():
        for _, row in test_dataframe_same_structure.iterrows():
            context_ids, response_ids, label = load_ids_and_labels(row, word_to_id)

            # view(-1, 1): one column of word ids, i.e. a sequence with batch size 1.
            context = torch.LongTensor(context_ids).view(-1, 1)
            response = torch.LongTensor(response_ids).view(-1, 1)
            label = torch.from_numpy(np.array(label, dtype=np.float32).reshape(1, 1))

            score = dual_encoder(context, response)
            test_correct_count = increase_count(test_correct_count, score, label)

    return get_accuracy(test_correct_count, test_dataframe_same_structure)


test_accuracy = testing_same_structure()
print("Test accuracy for %d training examples and %d test examples: %.2f" %(len(training_dataframe),len(test_dataframe_same_structure),test_accuracy))
测试数据集和训练/验证集格式不一样 (1个问题,1个标准答案,9个干扰项错误答案)
# Test set in which every row holds a context, the true reply and the distractors.
test_dataframe_different_structure = pd.read_csv(
    'testing_different_structure_100.csv'
)
以字典形态存储对话word ids
Outer dictionary “ids_per_example_and_candidate”: keys = examples, values = inner dictionaries
Inner dictionaries “ids_per_candidate”: keys = candidate names, values = list of word IDs
def load_ids(test_dataframe_different_structure, word_to_id):
    """Convert every cell of the test dataframe into a list of word ids.

    Each cell is split on whitespace and truncated to its first 160 words.
    Words are lower-cased before the lookup because the vocabulary built by
    ``create_vocab`` holds lower-cased words; unknown words map to id 0.

    Returns:
        ``{row index: {column name: [word ids]}}``.
    """
    print(str(datetime.datetime.now()).split('.')[0], "Loading test IDs...")

    max_context_len = 160

    ids_per_example_and_candidate = {}

    for i, example in test_dataframe_different_structure.iterrows():
        # Series.items() replaces iteritems(), which pandas 2.0 removed.
        ids_per_example_and_candidate[i] = {
            column_name: [
                word_to_id.get(word.lower(), 0)
                for word in str(cell).split()[:max_context_len]
            ]
            for column_name, cell in example.items()
        }

    print(str(datetime.datetime.now()).split('.')[0], "Test IDs loaded.")

    return ids_per_example_and_candidate
# Word ids for every context and candidate response in the ranking test set.
ids_per_example_and_candidate = load_ids(
    test_dataframe_different_structure=test_dataframe_different_structure,
    word_to_id=word_to_id,
)
Outer dictionary “scores_per_example_and_candidate”: keys = examples, values = inner dictionaries
Inner dictionaries “scores_per_candidate”: keys = candidate names, values = score
def load_scores():
    """Score every candidate response against its context.

    Reads the module-level ``ids_per_example_and_candidate`` and
    ``dual_encoder``.

    Returns:
        ``{example: {"Score with <column name>": probability}}`` where the
        probability is the sigmoid of the model output, stored as a plain
        Python float.
    """
    print(str(datetime.datetime.now()).split('.')[0], "Computing test scores...")

    scores_per_example_and_candidate = {}

    # Inference only, so gradients are not needed.
    with torch.no_grad():
        for example, utterance_ids_dict in sorted(ids_per_example_and_candidate.items()):
            # The context is the same for all candidates of an example, so
            # build its tensor once.
            context = torch.LongTensor(utterance_ids_dict['Context']).view(-1, 1)

            score_per_candidate = {}

            for utterance_name, ids_list in sorted(utterance_ids_dict.items()):
                if utterance_name == 'Context':
                    continue

                candidate_response = torch.LongTensor(ids_list).view(-1, 1)
                score = torch.sigmoid(dual_encoder(context, candidate_response))

                # .item() stores a float instead of a 0-dim tensor.
                score_per_candidate["Score with " + utterance_name] = score.item()

            scores_per_example_and_candidate[example] = score_per_candidate

    print(str(datetime.datetime.now()).split('.')[0], "Test scores computed.")

    return scores_per_example_and_candidate


scores_per_example_and_candidate = load_scores()
def get_recall_at_k(k, scores=None):
    """Return the share of examples whose ground truth ranks in the top ``k``.

    Args:
        k: number of highest-scoring candidates to keep per example.
        scores: ``{example: {"Score with <name>": score}}``. Defaults to the
            module-level ``scores_per_example_and_candidate``.

    Returns:
        Recall@k as a float; 0.0 when there are no examples.
    """
    if scores is None:
        scores = scores_per_example_and_candidate

    number_of_examples = len(scores)
    if number_of_examples == 0:
        return 0.0

    count_true_hits = 0
    for score_per_candidate_dict in scores.values():
        ranked = sorted(score_per_candidate_dict.items(), key=operator.itemgetter(1), reverse=True)
        top_k_names = {name for name, _ in ranked[:k]}

        if 'Score with Ground Truth Utterance' in top_k_names:
            count_true_hits += 1

    return count_true_hits / number_of_examples
# Report recall@k for the ranking test set, from most to least lenient cut-off.
recall_at_5 = get_recall_at_k(k=5)
recall_at_2 = get_recall_at_k(k=2)
recall_at_1 = get_recall_at_k(k=1)

print("recall_at_5 =", recall_at_5)
print("recall_at_2 =", recall_at_2)
print("recall_at_1 =", recall_at_1)