机器学习算法Python实现:tfidf 特征词提取及文本相似度分类

# coding: utf-8
#本代码主要实现了对于商品名称根据tfidf提取特征词,然后基于已经训练好的word2vec模型,对每行商品的tfidf值大于某一阈值的特征词相似度匹配已经给定的商品类别。

import jieba
import jieba.posseg as pseg
import jieba.analyse
import pymssql
import xlwt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import pandas as pd 
jieba.load_userdict('C:\\Users\\Desktop\\s_proj\\dict.txt') # Load the custom word-segmentation dictionary.

# Connect to the database and fetch the distinct values of the queried column.
# The connection is closed as soon as the rows are in memory so the script does
# not hold a server connection open during the (long) processing that follows.
conn = pymssql.connect(host='1.1.1.1',user='username',password='password',database='database',charset='utf8')
try:
    cur = conn.cursor()
    sql = 'select distinct(column) from table'
    cur.execute(sql)
    # Each fetched row is a sequence holding the selected column value(s).
    listl = cur.fetchall()
finally:
    conn.close()

# Word segmentation: every value of every fetched row is cut with jieba
# (precise mode) and stored as one space-separated string, in fetch order.
words = [
    " ".join(jieba.cut(cell, cut_all=False))
    for record in listl
    for cell in record
]

# Compute TF-IDF in two steps: raw term counts first, then TF-IDF weighting.
# NOTE: despite its name, `tfidf_vectorizer` is a CountVectorizer; the
# weighting itself is done by `transformer`.
tfidf_vectorizer = CountVectorizer(
    min_df=2,
    max_df = 0.8,
    token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b',
)
word_tfidf = tfidf_vectorizer.fit_transform(words)  # term-count matrix

transformer = TfidfTransformer()
tfidf = transformer.fit_transform(word_tfidf)  # TF-IDF weighted matrix
print(tfidf.shape)

# Collect the feature words (vocabulary terms). `features` is indexed with
# TF-IDF matrix column numbers by the code further down.
# scikit-learn 1.0 deprecated get_feature_names() and 1.2 removed it in favour
# of get_feature_names_out(); prefer the new method and fall back for old
# installs. The result is kept as a plain list in both cases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    features = list(tfidf_vectorizer.get_feature_names_out())
else:
    features = tfidf_vectorizer.get_feature_names()

# Load the pre-trained word2vec vectors (binary word2vec format).
from gensim.models import word2vec
# KeyedVectors is imported explicitly: the original referenced
# `gensim.models.KeyedVectors` without ever importing `gensim`, which raises
# NameError.
from gensim.models import KeyedVectors
import logging
logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO)
model = KeyedVectors.load_word2vec_format('model_word.bin', binary=True)

# Read the category names to match against, one per line. Undecodable bytes
# are dropped (errors='ignore'). The `with` block guarantees the file handle
# is closed, which the original never did.
with open('C:\\Users\\Desktop\\s_proj\\c.txt','r',encoding='utf-8',errors='ignore') as f4:
    # Strip the line terminator from every line; blank lines are kept as ''.
    ff = [j.replace('\n','') for j in f4]
print(len(ff))

# Locate the non-zero cells of the TF-IDF matrix as parallel arrays of
# row indices and column indices.
tfidf_tuple = tfidf.nonzero()
tfidf_rows, tfidf_columns = tfidf_tuple[0], tfidf_tuple[1]
size = len(tfidf_columns)  # number of non-zero cells
print('nonzero.size=%s' % size)

# Keep only the cells whose TF-IDF value is strictly greater than 0.4 and
# group them by matrix row index: {row: [{"key_word", "tfidf_value"}, ...]}.
product_dict = {}
for row, column in zip(tfidf_rows, tfidf_columns):
    tfidf_value = tfidf[row, column]
    if tfidf_value <= 0.4:
        continue  # at or below the threshold: not a key word for this row
    product_dict.setdefault(row, []).append(
        {"key_word": features[column], "tfidf_value": tfidf_value}
    )
print('product_dict.len=%s' % len(product_dict))

# Score every feature word that occurs in the TF-IDF matrix against every
# category name using word2vec similarity.
# Result: {feature_word: [{"category_name": ..., "similarity": ...}, ...]}.

# `model` may be a bare KeyedVectors (which has no `.wv` attribute in
# gensim >= 4) or a full model that exposes its vectors as `.wv`; support both.
vectors = getattr(model, 'wv', model)

word2vec = {}
# Word pairs that cannot be scored are recorded in gabbage.txt (tab separated).
with open('gabbage.txt','w',encoding='utf-8',errors='ignore') as f:
    for column in tfidf_columns:
        feature = features[column]
        if feature in word2vec:
            # The same feature shows up once per non-zero cell; its scores
            # do not depend on the row, so compute them only once.
            continue
        cate = word2vec.setdefault(feature, [])
        for jj in ff:
            try:
                # Convert the NumPy scalar to a plain float so the value can
                # be serialised with json later on.
                y1 = float(vectors.similarity(feature, jj))
            except KeyError:
                # One of the two words is missing from the model vocabulary.
                # Only KeyError is caught so real errors are no longer hidden.
                f.write('%s\t%s\n' % (feature, jj))
                continue
            cate.append({"category_name": jj, "similarity": y1})
print('word2vec.len=%s' % len(word2vec))

# Rank each feature word's category matches by similarity, highest first,
# and drop repeated entries while keeping that order.
import operator
by_similarity = operator.itemgetter("similarity")
for term in word2vec:
    ranked = sorted(word2vec[term], key=by_similarity, reverse=True)
    unique_ranked = []
    for entry in ranked:
        if entry in unique_ranked:
            continue  # identical entry already kept
        unique_ranked.append(entry)
    # Re-assigning an existing key is safe while iterating over the dict.
    word2vec[term] = unique_ranked

# Attach the category-similarity list to every above-threshold key word.
# Each key-word entry is looked up directly in `word2vec` instead of scanning
# all entries once per word2vec key; the resulting assignments are the same.
for key_word_entries in product_dict.values():
    for m in key_word_entries:
        term = m["key_word"]
        if term in word2vec:
            m["category_names"] = word2vec[term]
print(len(product_dict))

# Turn the row-indexed dict into a list of records, replacing each row index
# with the first column of the matching fetched row (the product name).
result = list(listl)
product_list = [
    {'product': result[row_index][0], 'key_words': product_dict[row_index]}
    for row_index in product_dict
]
print('product_list.len=%s' % len(product_list))
print(len(product_list))


# Second pass: rebuild `product_dict` so that every above-threshold key word
# (TF-IDF strictly greater than 0.4) gets one entry per category name,
# carrying the word2vec similarity. Keys are TF-IDF matrix row indices.
# NOTE(review): no later line in this file reads the rebuilt `product_dict`;
# the JSON output is produced from `product_list` - confirm this pass is
# still needed.

# `model` may be a bare KeyedVectors (which has no `.wv` attribute in
# gensim >= 4) or a full model that exposes its vectors as `.wv`; support both.
vectors = getattr(model, 'wv', model)

product_dict = {}
# Word pairs that cannot be scored are recorded in gabbage.txt (tab separated).
with open('gabbage.txt','w',encoding='utf-8',errors='ignore') as f:
    for row, column in zip(tfidf_rows, tfidf_columns):
        tfidf_value = tfidf[row, column]
        if tfidf_value <= 0.4:
            continue
        key_words = product_dict.setdefault(row, [])
        feature = features[column]
        for jj in ff:
            try:
                # Plain float keeps the value JSON-serialisable.
                y1 = float(vectors.similarity(feature, jj))
            except KeyError:
                # One of the two words is missing from the model vocabulary.
                # Only KeyError is caught so real errors are no longer hidden.
                f.write('%s\t%s\n' % (feature, jj))
                continue
            insert = {'key_word':feature,'tfidf_value':tfidf_value,'category_names':{'goods_name':jj,'word2vec':y1}}
            key_words.append(insert)

# Write the first 11 product records to a text file as JSON.
import json


def _to_builtin(value):
    """Convert a NumPy scalar to the matching Python built-in for json.

    json calls this only for objects it cannot serialise itself. Anything
    without an ``item()`` method is rejected with the usual TypeError.
    """
    if hasattr(value, 'item'):
        return value.item()
    raise TypeError('Object of type %s is not JSON serializable' % type(value).__name__)


# ensure_ascii=False emits non-ASCII text as-is. The previous
# encode('utf-8').decode('unicode_escape') round trip also un-escaped quotes,
# backslashes and newlines inside strings, which produced invalid JSON.
json_string = json.dumps(product_list[0:11], ensure_ascii=False, default=_to_builtin)
print(json_string)
with open('word2vec_result1.txt','w',encoding='utf-8',errors='ignore') as t:
    t.write(json_string)

猜你喜欢

转载自blog.csdn.net/hellozhxy/article/details/82083226