用gensim doc2vec计算文本相似度,Python可以跑通的代码

Python3.7版本,转载自:https://blog.csdn.net/juanjuan1314/article/details/75124046

wangyi_title.txt文件下载地址:链接:https://pan.baidu.com/s/1uL75P13t98YHMqgv3Kx7TQ  密码:oqxt

对原文有修改,原文代码是Python2,有很多问题。

# coding:utf-8
 
import sys
import gensim
import sklearn
import numpy as np
 
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
 
TaggededDocument = gensim.models.doc2vec.TaggedDocument
 
def get_datasest():
    """Load the training corpus from ``wangyi_title.txt``.

    Every line of the file is one document whose tokens are separated by
    single spaces. The number of lines read is printed to stdout.

    Returns:
        list: one ``TaggededDocument`` per line, tagged with the line's
        zero-based index, in file order.
    """
    # Decode explicitly instead of relying on the platform default encoding,
    # which is not UTF-8 everywhere and would fail on / garble the titles.
    # NOTE(review): assumes the corpus file is UTF-8 encoded — confirm.
    with open("wangyi_title.txt", 'r', encoding='utf-8') as cf:
        docs = cf.readlines()
    print(len(docs))

    x_train = []
    for i, text in enumerate(docs):
        word_list = text.split(' ')
        # Only the final token still carries the line terminator.
        word_list[-1] = word_list[-1].strip()
        x_train.append(TaggededDocument(word_list, tags=[i]))

    return x_train
 
def getVecs(model, corpus, vector_size):
    """Stack the stored vector of every document in ``corpus`` into one array.

    Args:
        model: object whose ``docvecs`` can be indexed by a document tag.
        corpus: iterable of documents; the first entry of each ``tags``
            attribute is used as the lookup key.
        vector_size: length each looked-up vector is reshaped to.

    Returns:
        numpy.ndarray: the looked-up vectors, each reshaped to
        ``(1, vector_size)`` and concatenated along the first axis.
    """
    rows = []
    for doc in corpus:
        first_tag = doc.tags[0]
        row = model.docvecs[first_tag].reshape(1, vector_size)
        rows.append(np.array(row))
    return np.concatenate(rows)
 
def train(x_train, vector_size=200, epoch_num=1):
    """Build a Doc2Vec model from ``x_train``, train it and save it to disk.

    Args:
        x_train: the tagged documents used both to construct the model and
            for the explicit ``train`` call.
        vector_size: dimensionality passed to ``Doc2Vec``.
        epoch_num: accepted but not used; the explicit training call
            always runs with ``epochs=70``.

    Returns:
        The trained model, which is also saved as ``model_dm_wangyi``.
    """
    hyper_params = dict(
        min_count=1,
        window=3,
        vector_size=vector_size,
        sample=1e-3,
        negative=5,
        workers=4,
    )
    # NOTE(review): the corpus is passed to the constructor and then to
    # train() as well — confirm the additional training pass is intended.
    model = Doc2Vec(x_train, **hyper_params)
    model.train(x_train, total_examples=model.corpus_count, epochs=70)
    model.save('model_dm_wangyi')
    return model
 
def test():
    """Query the saved model with a fixed, pre-tokenised sample title.

    Loads the model from ``model_dm_wangyi``, infers a vector for the sample
    tokens, prints that vector, and looks up the 10 most similar stored
    document vectors.

    Returns:
        The result of ``docvecs.most_similar`` for the inferred vector
        (``topn=10``); the caller unpacks each entry as a
        ``(tag, similarity)`` pair.
    """
    model_dm = Doc2Vec.load("model_dm_wangyi")
    # Every token needs its own comma: adjacent string literals are
    # concatenated by Python, which previously merged '争霸'+'》' and
    # '十强'+'出炉' into single tokens.
    test_text = ['《', '舞林', '争霸', '》', '十强', '出炉', '复活', '舞者', '澳门', '踢馆']
    inferred_vector_dm = model_dm.infer_vector(test_text)
    print(inferred_vector_dm)
    sims = model_dm.docvecs.most_similar([inferred_vector_dm], topn=10)

    return sims
 
if __name__ == '__main__':
    # Load the corpus, train and save the model, then run the sample query.
    x_train = get_datasest()
    model_dm = train(x_train)

    sims = test()
    # Each result pairs a document tag with its similarity score; tags are
    # the documents' positions in x_train, so they index straight into it.
    for doc_tag, score in sims:
        tokens = x_train[doc_tag][0]
        # Every token is followed by a space, including the last one.
        title = ''.join(token + ' ' for token in tokens)
        print(title, score, len(tokens))

用了网易的热门娱乐新闻标题作为训练语料,输出结果如下,很相似的句子确实少。

42754
[-2.1229391e-05 -5.3489220e-04 -1.4628534e-03  1.0101878e-03
  1.9613570e-03  1.2337929e-04 -1.5623088e-03  1.4899696e-03
  1.7250431e-04  1.7861715e-03  5.4765341e-04  1.7854273e-03
  1.4752866e-04 -4.7224312e-04 -2.0143031e-03 -1.3678997e-03
  2.1347464e-03 -4.2291704e-04 -2.2612642e-03  1.9719985e-03
 -1.7474928e-03  6.7744928e-04 -1.1667489e-03  1.4224678e-03
 -4.9147848e-04  1.9250986e-03  1.5286671e-04 -1.0706087e-03
 -1.2940766e-03 -1.1336872e-03 -4.8530920e-04  1.4789804e-03
  1.7939236e-03 -1.2773223e-03 -2.4406663e-03  1.9606731e-03
  2.4594443e-03  1.5459055e-03 -9.8075520e-04  1.6827125e-03
  1.4778823e-03  2.0646905e-03 -3.4740806e-05 -1.5140681e-03
 -7.6300337e-04 -2.1761435e-03 -1.9383265e-04  6.5391039e-04
 -9.3230215e-04  3.8053558e-04  1.6529204e-05  1.5503957e-03
  5.2016345e-04  1.2898637e-03  1.7284699e-03  2.2767365e-03
 -9.5764997e-05 -8.4209896e-05 -1.5726103e-03  2.2212588e-03
 -5.9885468e-04 -2.1759607e-03 -1.9564391e-03  1.2035059e-03
 -3.9055874e-04  1.1362566e-03  1.0841021e-03 -9.0546644e-04
  2.3774474e-03  1.3961376e-03 -1.8707723e-03  1.5263865e-03
 -1.1634092e-03 -2.2435680e-03  1.8672579e-03  5.6013430e-04
  2.3103815e-03  1.2101847e-03 -2.4156671e-03 -5.1514624e-04
  2.1143679e-03  2.3558659e-03 -1.0352633e-03 -8.4526307e-04
  2.2150134e-03  5.3238236e-05 -2.3913602e-03 -1.5362124e-04
  1.5323326e-03  2.4526857e-03 -1.6107119e-04 -3.4444834e-04
  1.6401864e-03  1.0141496e-03  3.7656463e-04 -1.2738963e-04
 -1.1323770e-03 -2.0433934e-03  3.7525350e-04 -1.6017296e-04
 -3.3818476e-04  2.2791843e-03 -1.4202974e-03 -2.7641861e-04
  1.1009629e-04 -4.2639120e-04  1.8214980e-03 -1.7151656e-03
 -1.5390049e-03 -1.3191046e-03  1.7080955e-03  1.1002786e-03
  1.6142949e-03  1.8982554e-03 -7.0945674e-04 -4.6570468e-04
  9.8265568e-04 -7.4710487e-04  2.4075075e-03 -2.1547875e-03
 -2.1082300e-03 -1.8821321e-03  9.6265052e-04 -1.1552537e-03
 -1.6849015e-03 -1.2968426e-03 -1.5383511e-04 -7.5135130e-04
 -1.8727558e-03  5.2730407e-04 -2.3783895e-03  2.4225495e-03
  2.3140633e-03 -1.0093495e-03  1.5953591e-03 -1.6097585e-03
 -5.1834644e-04  5.6184967e-05  2.8760443e-04  2.0393797e-03
  1.4612459e-03  2.1953927e-03 -2.1270583e-03 -9.9687604e-04
  1.2225753e-03  2.0009447e-03  4.6715033e-04  2.1180776e-03
  2.8774102e-04 -8.8365687e-06 -1.7047256e-03 -9.7245700e-04
 -4.0429382e-04  1.9775415e-03 -2.2045472e-03  1.5636642e-03
 -1.9885909e-03  2.0202452e-03  2.1154643e-03  1.7958126e-03
  1.0514902e-03  1.9323002e-03 -1.5818867e-03  1.3666560e-03
 -9.1630412e-04  3.2067264e-04  1.7956816e-04 -2.3987342e-03
  9.4504084e-04 -2.9586093e-04 -1.6545136e-03 -9.1628381e-04
 -1.2085686e-04  8.3511556e-04  9.2640345e-04 -1.0981049e-03
 -2.6373079e-04 -1.1188543e-04 -1.0378383e-03  3.7422587e-04
 -2.0860252e-03  8.9370640e-04  1.1446123e-03 -1.3295287e-03
  1.2766315e-03  1.3684760e-03  2.1959674e-03  6.3199044e-04
 -2.7432822e-04  5.7462428e-04  2.3212784e-03  9.1525499e-04
  1.9918189e-03  1.1947503e-03 -1.1286519e-03 -6.5884611e-04
 -6.7673821e-04 -3.2887704e-04 -1.9954341e-03 -1.1857023e-04]
/usr/local/lib/python3.7/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):
《全民目击》错失上影节 导演称是为品质让路  0.26860833168029785 2
[星态度]蔡依林:在感情里我不是强势者  0.2664705514907837 1
《大魔术师》发剧照 梁朝伟刘青云等人搞笑耍宝  0.23988798260688782 2
三池崇史戛纳回应媒体反馈:恶评在意料之中  0.23537132143974304 1
《单身男女》试映爆满获好评 本周四正式上映  0.22329775989055634 2
金鸡百花节门票创“史上天价”最贵11560元  0.2214442789554596 1
《霍比特人》登杂志封面 曝彼得杰克逊工作照  0.2193169891834259 2
章子怡回应与汪峰婚期:应该快了 我也不小了  0.21707454323768616 2
《画壁》发MV 邓超:这是属于我和孙俪的爱情  0.21643859148025513 2
《生化危机5》曝战神自白预告 重现屠魔之旅  0.21323256194591522 2
[Finished in 114.1s]

1

1

猜你喜欢

转载自blog.csdn.net/BTUJACK/article/details/83897592