机器学习算法Python实现:gensim里的similarities文本相似度计算

# -*- coding:utf-8 -*-
#本代码是在jupyter notebook上实现,author:huzhifei, create time:2018/8/14
#本脚本主要实现了基于python的gensim包里的similarities接口对文本做相似度的项目目的


#导入gensim与jieba包
from gensim import corpora, models, similarities
import jieba


# Load the Chinese/English stop-word list from a text file (one word per line).
def get_custom_stopwords(stop_words_file):
    """Read *stop_words_file* (UTF-8, one stop word per line) and return the
    stop words as a list.

    Fixes over the original: ``splitlines()`` instead of ``split('\n')`` so
    Windows line endings don't leave a trailing ``'\r'`` on every word, blank
    lines (including the trailing one most files end with) are dropped, and
    the pointless ``[i for i in ...]`` copy is removed.
    """
    with open(stop_words_file, encoding='utf-8') as f:
        content = f.read()
    return [word for word in content.splitlines() if word]


# Load the stop-word list once at module level and report how many were read.
stop_words_file = "stopwordsHIT.txt"  # HIT (Harbin Institute of Technology) stop-word list
stopwords = get_custom_stopwords(stop_words_file)
print(len(stopwords))


# jieba tokenizer wrapper.
def cut(sentence):
    """Segment *sentence* with jieba and return the tokens that are not in
    the module-level ``stopwords`` list."""
    return [token for token in jieba.cut(sentence) if token not in stopwords]


# Connect to SQL Server and fetch the articles whose abstracts will be compared.
import pyodbc

# NOTE(review): placeholder credentials — substitute the real database/server/user/password.
conn_str = 'DRIVER={SQL Server Native Client 10.0};DATABASE=%s;SERVER=%s;UID=%s;PWD=%s' % (
    'database', 'server', 'username', 'password')

mssql_conn = pyodbc.connect(conn_str)
cur = mssql_conn.cursor()
# Fixed: the original line was missing the closing quote (SyntaxError).
# "table" is presumably a placeholder table name — replace before running.
sql = 'select  ArticleId, 标题, 摘要, Taskid from table'

cur.execute(sql)
listl = cur.fetchall()  # list of pyodbc Row objects, consumed by the blocks below
cur.close()
# No commit needed for a read-only SELECT (the original committed after closing the cursor).
mssql_conn.close()


# Data prep: convert each fetched pyodbc Row into a plain mutable list.
# (comprehension instead of the original for/append loop)
s = [list(row) for row in listl]


# Tokenize the abstract column (index 2 = 摘要) of every row.
# t is kept at module level because the query-vector block below reuses it.
t = [row[2] for row in s]
texts = [cut(str(text)) for text in t]  # dropped the pointless t[:] copy


# Build the corpus dictionary and bag-of-words corpus from the tokenized texts.
dictionary = corpora.Dictionary(texts)
feature_cnt = len(dictionary.token2id)  # len() on the dict directly; .keys() was redundant
corpus = [dictionary.doc2bow(text) for text in texts]


# Fit a tf-idf model on the corpus and build bag-of-words query vectors for
# the same abstracts (each document is compared against the whole corpus).
tfidf = models.TfidfModel(corpus)
new_vec = [dictionary.doc2bow(cut(str(doc))) for doc in t]  # comprehension, no t[:] copy


# Compute pairwise similarity and insert qualifying pairs into the database.
import pyodbc
import datetime
import time

start = time.time()
# NOTE(review): placeholder credentials — substitute the real database/server/user/password.
conn_str = 'DRIVER={SQL Server Native Client 10.0};DATABASE=%s;SERVER=%s;UID=%s;PWD=%s' % (
    'database', 'server', 'username', 'password')

mssql_conn = pyodbc.connect(conn_str)
cur = mssql_conn.cursor()

# Parameterized insert; "table" is presumably a placeholder name — replace before running.
sql = "insert into table  values (?,?,?,?,?,?,?,?,?)"

# Sparse similarity index over the tf-idf corpus; num_best caps hits per query.
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=feature_cnt, num_best=50000)

# Ids already processed as a query — each (a, b) pair is written only once.
# The original used a dict ("insert") whose values were never read and whose
# key was re-assigned on every inner iteration; a set added-to once per query
# expresses the same de-duplication correctly.
seen = set()
for k, query_vec in enumerate(new_vec):
    seen.add(s[k][0])  # mark the current article before scanning its matches
    for doc_idx, score in index[tfidf[query_vec]]:
        # Keep 0.75 <= score <= 0.99: similar, but excludes the document
        # matching itself (score 1.0); skip articles already handled as a query.
        if 0.75 <= score <= 0.99 and s[doc_idx][0] not in seen:
            cur.execute(sql, (s[k][0], s[k][1], s[k][2],
                              s[doc_idx][0], s[doc_idx][1], s[doc_idx][2],
                              str(s[k][3]), score,
                              # '%D' formats as mm/dd/yy — presumably intended; confirm column type
                              datetime.datetime.now().strftime('%D')))
mssql_conn.commit()  # commit BEFORE closing so the inserts persist (original committed after cur.close())
cur.close()
mssql_conn.close()
end = time.time()
print(end - start)  # elapsed seconds for the similarity computation

猜你喜欢

转载自blog.csdn.net/hellozhxy/article/details/82083251