1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
import gensim
from gensim.models import word2vec
from sklearn.decomposition import PCA
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Step 1: train a word2vec model on the pre-segmented corpus and save it.
# ---------------------------------------------------------------------------
# Input corpus file and the path the trained model is written to.
cut_file = './data/cnews.train_jieba.txt'
save_model_name = './data/train_word2vec.model.bin'

# Text8Corpus streams the file as whitespace-separated tokens.
sentences = word2vec.Text8Corpus(cut_file)

# 200-dimensional embeddings; every other hyper-parameter is gensim's default.
# (`word2vec.Word2Vec` is the same class as `gensim.models.Word2Vec`.)
model = word2vec.Word2Vec(sentences, size=200)
model.save(save_model_name)
# ---------------------------------------------------------------------------
# Step 2: reload the saved model and collect the neighbourhood of six
# "centre" words.
# ---------------------------------------------------------------------------
mopdelfilePath = './data/train_word2vec.model.bin'
model = word2vec.Word2Vec.load(mopdelfilePath)
# Embedding matrix of the whole vocabulary; reduced to 2-D further down.
raw_word_vec = model.wv.vectors

# Centre words (English glosses in the trailing comments).
cent_word1 = "新闻"  # news
cent_word2 = "娱乐"  # entertainment
cent_word3 = "家具"  # furniture
cent_word4 = "房产"  # real estate
cent_word5 = "教育"  # education
cent_word6 = "时尚"  # fashion


def _similar_words_with_center(center_word):
    """Return the nearest neighbours of ``center_word`` plus the word itself.

    The lookup goes through ``model.wv.most_similar`` (the model-level
    ``model.most_similar`` shim is deprecated and absent from newer gensim
    releases).  ``most_similar`` yields ``(word, similarity)`` pairs; only the
    words are kept, and ``center_word`` is appended as the last element.

    Args:
        center_word: a word present in the model's vocabulary.

    Returns:
        A 1-D numpy array of strings: the neighbours (gensim's default
        ``topn``) followed by ``center_word``.
    """
    neighbours = [word for word, _similarity in model.wv.most_similar(center_word)]
    return np.append(neighbours, center_word)


wordList1 = _similar_words_with_center(cent_word1)
wordList2 = _similar_words_with_center(cent_word2)
wordList3 = _similar_words_with_center(cent_word3)
wordList4 = _similar_words_with_center(cent_word4)
wordList5 = _similar_words_with_center(cent_word5)
wordList6 = _similar_words_with_center(cent_word6)
def get_word_index(word):
    """Return the vocabulary index of ``word`` in the loaded model.

    The word is looked up in ``model.wv.vocab`` (``model`` is the module-level
    word2vec model) and the ``index`` attribute of that entry is returned.
    The script uses this index to address both the embedding matrix and
    ``model.wv.index2word``.
    """
    return model.wv.vocab[word].index
# Translate every word of each group into its vocabulary index.
# Built as real lists rather than bare `map(...)` objects: on Python 3 a map
# object is a one-shot iterator, so a second pass over it would silently
# produce nothing.  Materialising here also surfaces any lookup failure
# before plotting starts.
index_list1 = [get_word_index(word) for word in wordList1]
index_list2 = [get_word_index(word) for word in wordList2]
index_list3 = [get_word_index(word) for word in wordList3]
index_list4 = [get_word_index(word) for word in wordList4]
index_list5 = [get_word_index(word) for word in wordList5]
index_list6 = [get_word_index(word) for word in wordList6]
# ---------------------------------------------------------------------------
# Step 3: project the embeddings to 2-D and plot each word group as coloured
# text labels.
# ---------------------------------------------------------------------------
# Fit PCA on the full embedding matrix; row i of `vec_reduced` holds the 2-D
# coordinates of the word with vocabulary index i.
vec_reduced = PCA(n_components=2).fit_transform(raw_word_vec)

# Font file used for the labels (the plotted words are Chinese).
zhfont = matplotlib.font_manager.FontProperties(fname='./data/msyh.ttf')

# Draw the diagonal y = x over [-10, 10).
# NOTE(review): text labels alone do not drive axis autoscaling, so this line
# presumably also serves to fix the visible range - confirm before removing.
x = np.arange(-10, 10, 0.1)
y = x
plt.plot(x, y)

# One colour per word group.  Groups 5 and 6 originally shared 'c', which made
# them indistinguishable; group 6 now uses magenta.
for index_list, color in (
    (index_list1, 'r'),
    (index_list2, 'b'),
    (index_list3, 'g'),
    (index_list4, 'k'),
    (index_list5, 'c'),
    (index_list6, 'm'),
):
    for i in index_list:
        plt.text(
            vec_reduced[i][0],
            vec_reduced[i][1],
            model.wv.index2word[i],
            color=color,
            fontproperties=zhfont,
        )

# Save BEFORE showing: once the window opened by plt.show() is closed the
# figure is discarded, so a later savefig() would typically write a blank
# image.
plt.savefig("./img/title.png")
plt.show()
|