【1】需要处理的评论数据已经存到MongoDB(评论数据总量约为3万条)
import jieba
import jieba.analyse
from pymongo import MongoClient
from snownlp import SnowNLP
# STEP 1: read all product comments from MongoDB and merge them into one
# text blob for segmentation.
# Collect the pieces in a list and join once at the end — repeated `+=` on a
# str is quadratic, and there are ~30k comments.
_parts = []
client = MongoClient()
results = client.jd.shouhuan.find({})
for result in results:
    for content in result['商品总评论']:
        # First "noise"-removal step: drop the 26 trailing boilerplate
        # characters appended to every comment.
        _parts.append(content[:-26])
comments = ''.join(_parts)
# Load a custom user dictionary so jieba can recognise domain-specific new words.
jieba.load_userdict("/Users/macbookair/Desktop/NLP1221/dict.txt")
# ===START=============================
# Stop-word table: tokens treated as noise and dropped before analysis.
# ===================================
# A frozenset gives O(1) membership tests; the original `{}.fromkeys(...)`
# built a dict with None values just to get `in`-lookups.
# The single ASCII letters are generated from the alphabet string, which also
# fixes a typo in the original list (it contained 'g' twice and was missing
# 'j' and 'l').
stopwords = frozenset('abcdefghijklmnopqrstuvwxyz') | frozenset([
    '一晃', '准功', '平理', '一大', '充好', ';', '?', '*', '**', '??????',
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '0',
    '*^★*☆', '丶', 'helliphellip', ';', '*?acute╰╯`?', 'hellip',
    '哦', '与', '下次', '~', '!', "(',')", '�', '\n', '、', '~',
    '再', '来', '给', '有', '&', '的', '包括', '等', '是', '了', '和',
    '开始', '用', '怎么', '说', '呢', '还是', ',', ' ', '。', ':',
    '而且', '似乎', '都', '!', '?', ';', '还有', '就', '直接', '会',
    '第二天', '按', '之后', '一款',
])
# ===END=============================
# ===START=============================
# Tokenise the merged comment text and remove stop words.
# ===================================
# Precise mode (cut_all=False, also the default): full/search modes emit
# extra overlapping tokens, i.e. more noise.
segs = jieba.cut(comments, cut_all=False)
# Rebuild one string from the tokens that survive the stop-word filter.
# ''.join over a generator is linear; the original `final += seg` loop was
# quadratic (and ended with a no-op bare `len(final)` expression).
final = ''.join(seg for seg in segs if seg not in stopwords)
# ===END=============================
# Stop-word filtering done.
# ===================================
# ==START============================
# 获得特征词 1221
# ===================================
# tags = jieba.analyse.extract_tags(final, topK=500, withWeight=True, allowPOS=('n'))
# # print (tags)
# for tag in tags:
# with open("/Users/macbookair/Desktop/NLP1221/特征词_1221.txt", 'a') as f:
# f.write(str(tag[0])+' '+ str(tag[1]) + '\n')
# 特征词 = tag[0]
# 权重 = tag[1]
# 特_权 = {'特征词':特征词, '权重':权重}
# client = MongoClient()
# client.jd.特征词_权重_1221.insert_one(特_权)
# ==END==============================
# 获得特征词
# ===================================
【2】获得所有分词
# Tokenise the full comment text and persist one surviving token per line,
# as training input for word2vec.
segs = jieba.lcut(comments, cut_all=False)
# Open the output file once — the original reopened it for every token,
# i.e. tens of thousands of open/close cycles.
with open("/Users/macbookair/Desktop/NLP1221/全文分词_1221.txt", 'a') as f:
    for seg in segs:
        # Keep multi-character tokens that are not stop words.
        if seg not in stopwords and len(seg) > 1:
            f.write(seg + '\n')
【3】基于word2vec 训练模型
from gensim.models import word2vec
# ===START===========================
# Train a word2vec model over the segmented-token file (one token per line).
# ===================================
with open('/Users/macbookair/Desktop/NLP1221/全文分词_1221.txt') as f:
    # Treat the whole file as one long "sentence". Filter out empty lines:
    # the original split('\n') produced a trailing '' token, which
    # min_count=0 would have admitted into the vocabulary.
    tokens = [t for t in f.read().splitlines() if t]
sentences = [tokens]
# CBOW (sg=0), 100-dim vectors, context window 5; min_count=0 keeps every
# token; hierarchical softmax plus 3 negative samples, 4 worker threads.
model = word2vec.Word2Vec(sentences, sg=0, size=100, window=5, min_count=0,
                          negative=3, sample=0.001, hs=1, workers=4)
model.save('/Users/macbookair/Desktop/NLP1221/wordvec-1221.model')
# Also export a plain-text vector file and vocabulary file.
model.wv.save_word2vec_format('/Users/macbookair/Desktop/NLP1221/wordvec-1221.model.txt',
                              '/Users/macbookair/Desktop/NLP1221/wordvec-1221.vocab.txt',
                              binary=False)
【4】基于 kmeans 聚类
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
# Load the trained word-vector model.
model = Word2Vec.load('/Users/macbookair/Desktop/NLP1221/wordvec-1221.model')
# Fix the key order so vectors and words stay aligned below.
keys = list(model.wv.vocab.keys())
# One vector per vocabulary word, in `keys` order.
wordvector = [model.wv[key] for key in keys]
# Cluster into 20 groups (the original comment said 10 but n_clusters was 20).
clf = KMeans(n_clusters=20)
s = clf.fit_predict(wordvector)
print(len(s))
# BUG FIX: s[j] labels the j-th *vocabulary* word, so the word list must be
# `keys` itself. The original re-read the segmented-token file, whose j-th
# line (duplicates included, different order) does not correspond to the
# j-th vocabulary entry — words were attached to the wrong clusters.
names = keys
# Cluster labels run 0..n_clusters-1; the original looped range(0, 21) and
# emitted a spurious, always-empty label_20.
for i in range(clf.n_clusters):
    label_i = [names[j] for j in range(len(s)) if s[j] == i]
    # Append this cluster's word list to the result file.
    with open('/Users/macbookair/Desktop/NLP1221/全词聚类-1221-2.model.txt', 'a') as f:
        f.write('label_' + str(i) + ':' + str(label_i) + '\n')
【5】将长句切成短句并存储,为短句情感分析做准备。(短句的情感分析比长句情感分析精准很多)
from pymongo import MongoClient

# Map every sentence-delimiting character to an ASCII comma in one pass.
# str.translate replaces the original chain of seven .replace() calls; the
# original `.replace('\n\n\n', ',')` was dead code, since '\n' had already
# been replaced earlier in the chain.
_DELIMS = str.maketrans({c: ',' for c in ('!', '~', '~', '。', '?', ',', '\n')})

client = MongoClient()
results = client.jd.shouhuan_qinggan_zhaiyao.find({})
for comment in results:
    # Split each long comment into comma-delimited short clauses (short
    # clauses give much more accurate sentiment scores than long sentences).
    c = comment['评论内容'].translate(_DELIMS)
    cs = {'商品ID': comment['商品ID'],
          '评论内容': comment['评论内容'],
          '评论时间': comment['评论时间'],
          '长句情感积极度': comment['情感积极度'],
          '短句集合': c}
    client.jd.短句集合.insert_one(cs)
【6】使用主语(名词)+情感词(形容词)匹配短句,并给短句打分
from pymongo import MongoClient
from snownlp import SnowNLP


def _parse_lexicon(path, min_fields):
    """Read a space-separated lexicon file into a list of field lists.

    Skips blank or malformed lines — the original indexed field [1]/[2]
    unconditionally and raised IndexError on the trailing empty line that
    split('\n') produces.
    """
    entries = []
    with open(path) as f:
        for line in f:
            fields = line.rstrip('\n').split(' ')
            if fields[0] and len(fields) >= min_fields:
                entries.append(fields)
    return entries


# Hand-curated feature (noun) words: word, weight.
特征词 = _parse_lexicon('/Users/macbookair/Desktop/NLP1221/人工处理过的特征词_1221.txt', 2)
# Hand-curated sentiment (adjective) words: word, weight, polarity.
情感词 = _parse_lexicon('/Users/macbookair/Desktop/NLP1221/人工处理过的情感词_1221.txt', 3)

client = MongoClient()
# Small development slice; drop skip/limit for the full run.
短句集合 = client.jd.短句集合.find({}).skip(500).limit(10)
for 短句 in 短句集合:
    # Normalise the remaining separators, then split into short clauses.
    all_短句 = (短句['短句集合'].replace(' ', ',').replace(':', ',')
                .replace('……', ',').replace('、', ',').split(','))
    # Lexicon lines are parsed once up front; the original re-split every
    # feature line and every sentiment line inside the nested loops.
    for 特征 in 特征词:
        词, 权 = 特征[0], 特征[1]
        for 情感 in 情感词:
            情感词词, 情感词权, 情感词倾 = 情感[0], 情感[1], 情感[2]
            for _短句 in all_短句:
                # A clause matches when it contains both a feature word
                # (subject noun) and a sentiment word (adjective).
                if 词 in _短句 and 情感词词 in _短句:
                    # Score the short clause with SnowNLP.
                    短句情感倾向 = SnowNLP(_短句).sentiments
                    result = {'商品ID': 短句['商品ID'],
                              '长句': 短句['评论内容'],
                              '长句情感倾向': 短句['长句情感积极度'],
                              '评论时间': 短句['评论时间'],
                              '短句': _短句,
                              '短句情感倾向': 短句情感倾向,
                              '特征词': 词,
                              '特征权重': 权,
                              '情感词': 情感词词,
                              '情感词权': 情感词权,
                              '情感词倾向': 情感词倾}
                    client.jd.特征情感短句.insert_one(result)