NLP学习——文本相似度计算

导入模块

from gensim.models import Word2Vec
import jieba
import numpy as np
FILE_PATH = "./data/wiki_tiny.txt"
MODEL_PATH = 'word_vec.model'

读取文件

def read_text(FILE_PATH, max_sentences=1000):
    """Read pre-tokenised sentences from a whitespace-separated corpus file.

    Each non-empty line is split on single spaces into a list of tokens.

    Args:
        FILE_PATH: Path to a UTF-8 text file, one space-joined sentence per line.
        max_sentences: Stop after collecting this many sentences (was a
            hard-coded 1000; default preserves the original behaviour).

    Returns:
        list[list[str]]: Up to ``max_sentences`` token lists.
    """
    sentences = []
    with open(FILE_PATH, encoding="utf-8") as f:
        # Iterate the file object directly instead of readlines(), so the
        # whole corpus is never materialised in memory at once.
        for line in f:
            if len(sentences) >= max_sentences:
                break
            stripped = line.strip()
            if stripped:  # skip blank lines
                sentences.append(stripped.split(" "))
    return sentences

模型训练

def train(sentences, MODEL_PATH):
    """Fit a skip-gram Word2Vec model on tokenised sentences and persist it.

    Args:
        sentences: Iterable of token lists (as produced by ``read_text``).
        MODEL_PATH: File path where the trained word vectors are saved.

    Returns:
        The trained word vectors (``model.wv``, a gensim KeyedVectors).
    """
    hyperparams = dict(
        sg=1,          # skip-gram architecture
        size=100,      # NOTE(review): gensim>=4 renamed this to `vector_size` — confirm the installed version
        window=5,
        min_count=5,   # ignore tokens rarer than this
        negative=3,
        hs=1,          # hierarchical softmax enabled
        workers=4,
    )
    model = Word2Vec(sentences, **hyperparams)
    model.wv.save(MODEL_PATH)
    return model.wv

词向量取平均得到句向量

def sentence2vec(sen, vecs, dim=100):
    """Embed a sentence as the mean of its words' vectors.

    The sentence is segmented with jieba; each token's vector is looked up
    in ``vecs`` and tokens missing from the vocabulary contribute zeros
    (matching the original behaviour, but now catching only KeyError
    instead of a bare ``except`` that hid real bugs).

    Args:
        sen: Raw sentence string to embed.
        vecs: Mapping from token to vector (e.g. gensim KeyedVectors);
            must raise KeyError for out-of-vocabulary tokens.
        dim: Vector dimensionality (was hard-coded 100; must match ``vecs``).

    Returns:
        np.ndarray of shape (dim,): the averaged sentence vector, or all
        zeros for a sentence that yields no tokens (the original divided
        by zero here and produced NaNs).
    """
    tokens = list(jieba.cut(sen))
    if not tokens:
        # Guard against ZeroDivisionError / NaN output on empty input.
        return np.zeros(dim)
    total = np.zeros(dim)
    for token in tokens:
        try:
            total += vecs[token]
        except KeyError:
            pass  # out-of-vocabulary token: skip it
    return total / len(tokens)

余弦相似度计算

def cosine(a, b):
    """Return the cosine similarity between two 1-D vectors ``a`` and ``b``."""
    dot = np.dot(a, b)
    norms = np.linalg.norm(a) * np.linalg.norm(b)
    return dot / norms

读取模型

def load(model_path=MODEL_PATH):
    """Load previously saved word vectors from disk.

    Fixes two defects in the original: ``KeyedVectors`` was referenced
    without ever being imported (only ``Word2Vec`` is imported at the top
    of the file, so this raised NameError), and the path was hard-coded to
    './data/word_vec.model' while ``train`` actually saves to MODEL_PATH
    ('word_vec.model').

    Args:
        model_path: Path of the saved vectors; defaults to MODEL_PATH so it
            matches where ``train`` writes them.

    Returns:
        The loaded gensim KeyedVectors.
    """
    from gensim.models import KeyedVectors  # was missing entirely in the original
    return KeyedVectors.load(model_path)

模型训练

# Build the training corpus from the corpus file, then fit and save word vectors.
corpus = read_text(FILE_PATH)
model = train(corpus, MODEL_PATH)

测试

# Smoke test: two near-synonymous banking phrases should score highly similar.
sen1 = '办理银行卡'
sen2 = '办理储蓄卡'

# Embed each sentence by averaging its word vectors.
emb_first = sentence2vec(sen1, model)
emb_second = sentence2vec(sen2, model)

# Cosine similarity between the two sentence embeddings.
sim = cosine(emb_first, emb_second)
print('sim: ', sim)
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\heitao\AppData\Local\Temp\jieba.cache
Loading model cost 0.776 seconds.
Prefix dict has been built succesfully.


sim:  0.9563070869153697
发布了160 篇原创文章 · 获赞 33 · 访问量 5万+

猜你喜欢

转载自blog.csdn.net/Heitao5200/article/details/103823189