Sentiment Analysis of JD.com Reviews with jieba Segmentation, SnowNLP, and K-means

【1】The review data to be processed has already been stored in MongoDB (about 30,000 reviews in total)

import jieba
import jieba.analyse
from pymongo import MongoClient
from snownlp import SnowNLP

comments = ''
client = MongoClient()
results = client.jd.shouhuan.find({})
for result in results:
    for content in result['商品总评论']:
        comments += content[:-26]    # step 1: strip the fixed trailing "noise" (last 26 characters) from each review


# [STEP 1: read the review text from the database / file]
# Load a custom user dictionary   # Goal: let jieba recognize domain-specific new words
jieba.load_userdict("/Users/macbookair/Desktop/NLP1221/dict.txt")
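# Illustrative only: if dict.txt contained a (hypothetical) domain word such as "计步功能",
# jieba would then keep it as a single token, e.g.
#   jieba.lcut("计步功能很准")  ->  ['计步功能', '很', '准']
# instead of splitting it into smaller pieces.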


# ===START=============================
# Remove stopwords    # Goal: reduce textual noise
# ===================================

stopwords = {}.fromkeys([
    '一晃', '准功', '平理', '一大', '充好', ';', '?', '*', '**', '??????',
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '0',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '*^★*☆', '丶', 'helliphellip', ';', '*?acute╰╯`?', 'hellip', '哦', '与', '下次', '~', '!',
    "(',')", '�', '\n', '、', '~', '再', '来', '给', '有', '&', '的', '包括', '等', '是', '了', '和',
    '开始', '用', '怎么', '说', '呢', '还是', ',', ' ', '。', ':', '而且', '似乎', '都', '!', '?',
    ';', '还有', '就', '直接', '会', '第二天', '按', '之后', '一款'])

# ===END=============================
# Stopword table built
# ===================================



# ===START=============================
# Filter with the stopword table and segment the text
# ===================================

segs = jieba.cut(comments, cut_all=False)   # accurate mode (the default); full or search mode would produce more noise
final = ''
for seg in segs:
    if seg not in stopwords:   # drop stopwords to reduce noise
        final += seg           # rebuild the filtered text as one string

print(len(final))

# ===END=============================
# Stopword filtering done
# ===================================



       
        
# ==START============================
# Extract feature words (1221)
# ===================================


# tags = jieba.analyse.extract_tags(final, topK=500, withWeight=True, allowPOS=('n',))   # top 500 nouns by TF-IDF weight
# # print(tags)
# for tag in tags:
#     with open("/Users/macbookair/Desktop/NLP1221/特征词_1221.txt", 'a') as f:
#         f.write(str(tag[0]) + '  ' + str(tag[1]) + '\n')   # "word  weight", separated by two spaces
#     特征词 = tag[0]
#     权重 = tag[1]
#     特_权 = {'特征词': 特征词, '权重': 权重}

#     client = MongoClient()
#     client.jd.特征词_权重_1221.insert_one(特_权)


# ==END==============================
# Feature word extraction done
# ===================================
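# Illustrative only (values made up): with withWeight=True, extract_tags returns (word, weight) pairs, e.g.
#   jieba.analyse.extract_tags('电池续航不错,表带也舒服', topK=2, withWeight=True)
#   -> [('续航', 2.39), ('表带', 2.01)]
# so 特征词_1221.txt holds one "word<two spaces>weight" line per feature word -- the same
# two-space format that the manually curated file read in step 【6】 is split on.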



【2】Write out all segmented words

segs = jieba.lcut(comments, cut_all=False)
with open("/Users/macbookair/Desktop/NLP1221/全文分词_1221.txt", 'a') as f:
    for seg in segs:
        if seg not in stopwords and len(seg) > 1:   # drop stopwords and single characters to reduce noise
            f.write(str(seg) + '\n')                # one word per line

【3】Train a word2vec model

from gensim.models import word2vec

# ===START===========================
# Train the model and build the word vectors
# ===================================


sentence = []
with open('/Users/macbookair/Desktop/NLP1221/全文分词_1221.txt') as f:  # read the segmented words back (one word per line)
    result = f.read()
    sentence.append(result.split('\n'))   # the whole corpus is fed in as a single "sentence" of words


# CBOW algorithm (sg=0); gensim 3.x API (in gensim 4.x `size` is called `vector_size`)
model = word2vec.Word2Vec(sentence, sg=0, size=100, window=5, min_count=0, negative=3, sample=0.001, hs=1, workers=4)
model.save('/Users/macbookair/Desktop/NLP1221/wordvec-1221.model')
model.wv.save_word2vec_format('/Users/macbookair/Desktop/NLP1221/wordvec-1221.model.txt', '/Users/macbookair/Desktop/NLP1221/wordvec-1221.vocab.txt', binary=False)
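A quick way to sanity-check the trained vectors is to query a word's nearest neighbours (a minimal sketch; '舒服' is only an example query word and has to be present in the training vocabulary, otherwise a KeyError is raised):

model = word2vec.Word2Vec.load('/Users/macbookair/Desktop/NLP1221/wordvec-1221.model')
print(model.wv.most_similar('舒服', topn=5))   # the 5 words with the closest vectors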

【4】Cluster the word vectors with K-means


from gensim.models import Word2Vec
from sklearn.cluster import KMeans

# Load the word-vector model
model = Word2Vec.load('/Users/macbookair/Desktop/NLP1221/wordvec-1221.model')
keys = list(model.wv.vocab.keys())   # gensim 3.x API; keep as a list so labels can be mapped back to words

# Collect the vector of every word in the vocabulary
wordvector = []
for key in keys:
    wordvector.append(model.wv[key])


# Cluster the word vectors into 20 clusters
clf = KMeans(n_clusters=20)
s = clf.fit_predict(wordvector)   # s[j] is the cluster label of keys[j]

print(len(s))   # number of words clustered

for i in range(0, 20):
    label_i = []
    for j in range(0, len(s)):
        if s[j] == i:
            label_i.append(keys[j])

    # Save the words of this cluster to a file
    with open('/Users/macbookair/Desktop/NLP1221/全词聚类-1221-2.model.txt', 'a') as f:
        f.write('label_' + str(i) + ':' + str(label_i) + '\n')
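To check which cluster a particular word ended up in, the fitted KMeans model can be queried directly (a sketch; '续航' is only an example word that has to exist in the word2vec vocabulary):

word = '续航'                          # hypothetical example word
vec = model.wv[word].reshape(1, -1)    # KMeans.predict expects a 2-D array
print('cluster of', word, ':', clf.predict(vec)[0])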
        
        
       

【5】Split long reviews into short clauses and store them, in preparation for clause-level sentiment analysis. (Sentiment analysis on short clauses is much more accurate than on long sentences.)

from pymongo import MongoClient


client = MongoClient()
results = client.jd.shouhuan_qinggan_zhaiyao.find({})

for comment in results:
    # Replace every sentence-ending / pause punctuation mark with a comma, so the review
    # can later be split into short clauses on ','
    c = comment['评论内容'].replace('!', ',').replace('~', ',').replace('~', ',').replace('。', ',').replace('?', ',').replace(',', ',').replace('\n', ',')
    cs = {'商品ID': comment['商品ID'], '评论内容': comment['评论内容'], '评论时间': comment['评论时间'],
          '长句情感积极度': comment['情感积极度'], '短句集合': c}
    client.jd.短句集合.insert_one(cs)
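The claim above that clause-level sentiment scoring is more reliable than scoring the whole review can be illustrated directly with SnowNLP (a minimal sketch; the example review is made up):

from snownlp import SnowNLP

长句 = '外观很漂亮,但是电池太不耐用了'           # a made-up review mixing praise and complaint
print(SnowNLP(长句).sentiments)                   # a single score for the whole sentence
for 短句 in 长句.split(','):
    print(短句, SnowNLP(短句).sentiments)          # separate scores for the positive and the negative clause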

【6】Match short clauses by subject (noun) + sentiment word (adjective) and score them

from pymongo import MongoClient
from snownlp import SnowNLP


## Read the manually curated feature words ("word  weight", two-space separated)
with open('/Users/macbookair/Desktop/NLP1221/人工处理过的特征词_1221.txt') as f:
    特征词 = (f.read()).split('\n')

## Read the manually curated sentiment words ("word  weight  polarity", two-space separated)
with open('/Users/macbookair/Desktop/NLP1221/人工处理过的情感词_1221.txt') as f:
    情感词 = (f.read()).split('\n')


## Read the short-clause records from MongoDB
client = MongoClient()
短句集合 = client.jd.短句集合.find({}).skip(500).limit(10)   # small test slice; use the line below for the full collection
# 短句集合 = client.jd.短句集合.find({})


for 短句 in 短句集合:
    # Normalize the remaining separators and split the review into short clauses
    all_短句 = 短句['短句集合'].replace(' ', ',').replace(':', ',').replace('……', ',').replace('、', ',').split(',')

    for 特征 in 特征词:
        if not 特征.strip():
            continue                          # skip blank lines in the feature-word file
        词 = 特征.split('  ')[0]
        权 = 特征.split('  ')[1]
        for 情感 in 情感词:
            if not 情感.strip():
                continue                      # skip blank lines in the sentiment-word file
            情感词词 = 情感.split('  ')[0]
            情感词权 = 情感.split('  ')[1]
            情感词倾 = 情感.split('  ')[2]

            for _短句 in all_短句:
                if 词 in _短句 and 情感词词 in _短句:
                    # Score only clauses that contain both a feature word and a sentiment word
                    短句情感倾向 = SnowNLP(_短句).sentiments
                    result = {'商品ID': 短句['商品ID'], '长句': 短句['评论内容'], '长句情感倾向': 短句['长句情感积极度'],
                              '评论时间': 短句['评论时间'], '短句': _短句, '短句情感倾向': 短句情感倾向, '特征词': 词, '特征权重': 权,
                              '情感词': 情感词词, '情感词权': 情感词权, '情感词倾向': 情感词倾}
                    client.jd.特征情感短句.insert_one(result)
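Once the 特征情感短句 collection is filled, the clause-level scores can be summarized per feature word, for example by averaging 短句情感倾向 for each 特征词 with an aggregation query (a sketch of one possible follow-up query, not part of the original pipeline):

pipeline = [
    {'$group': {'_id': '$特征词',
                '平均情感': {'$avg': '$短句情感倾向'},
                '短句数': {'$sum': 1}}},
    {'$sort': {'平均情感': -1}},
]
for row in client.jd.特征情感短句.aggregate(pipeline):
    print(row['_id'], round(row['平均情感'], 3), row['短句数'])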
                    