import pandas as pd
import lda  # NOTE(review): unused below — topics are extracted with sklearn's LDA instead
# Load the scraped JD.com reviews for the ASUS R424 laptop.
test = pd.read_excel('华硕r424.xlsx')
test.head()
会员 | 级别 | 评价星级 | 评价内容 | 时间 | 点赞数 | 评论数 | 追评时间 | 追评内容 | 商品属性 | 页面网址 | 页面标题 | 采集时间 | sku | 好评度 | 评价关键词 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | jd_187595nwq | NaN | star4 | 轻薄程度:很好运行速度:一般外形外观:好看屏幕效果:一般奇怪,怎么就一个c盘?????? | 2020-02-08 08:47 | 1 | 0 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#none | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:39:25.6734153 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... |
1 | 周先生还没老 | NaN | star5 | 初步感受还是很不错的,办公性价比很高,用一段时间会追加评价的 | 2020-02-07 12:56 | 0 | 0 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#none | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:39:25.8138153 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... |
2 | yly185*****037 | PLUS会员[试用] | star4 | 老品牌,值得信赖,性价比还是比较高的,刚开始用一天不好做多评价, | 2020-02-06 16:33 | 0 | 0 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#none | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:39:25.8450153 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... |
3 | 亿***下 | NaN | star5 | 噪音很大,其他的都还好 | 2020-02-06 11:39 | 0 | 0 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#none | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:39:25.8762153 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... |
4 | jd_188622nqe | NaN | star5 | 还可以的,就自己在家简单做做东西,上上网完全没问题,不过就是一台裸机,连个鼠标都没有 | 2020-02-06 11:38 | 0 | 0 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#none | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:39:25.9074153 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... |
先用LDA模型把商品评论中此商品各个主题特征提取出来,如物流,外观,性能等
# Keep only the review-text column for topic modelling.
test_1 = test['评价内容']
# Stopword list, one word per line; the nonsense 'aaa' separator forces a single column.
stoplist = list(pd.read_csv('停用词.txt', names = ['w'], sep = 'aaa',
                encoding = 'utf-8', engine='python').w)
import jieba
def m_cut(intxt):
    """Tokenize *intxt* with jieba, dropping stopwords and one-char tokens.

    Depends on the module-level ``stoplist`` loaded above.
    """
    # Build a set once per call: membership checks become O(1) instead of the
    # original O(len(stoplist)) list scan for every single token.
    stopset = set(stoplist)
    return [w for w in jieba.cut(intxt) if w not in stopset and len(w) > 1]
# Space-joined tokens per review, the format CountVectorizer expects.
cleanchap = [" ".join(m_cut(w)) for w in test_1]
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.895 seconds.
Prefix dict has been built succesfully.
# Turn the token strings into a document-term count matrix.
from sklearn.feature_extraction.text import CountVectorizer
countvec = CountVectorizer(min_df = 5)  # drop words appearing in fewer than 5 reviews
wordmtx = countvec.fit_transform(cleanchap)
wordmtx
<348x98 sparse matrix of type '<class 'numpy.int64'>'
with 1488 stored elements in Compressed Sparse Row format>
# Compute TF-IDF from the count matrix.
# NOTE(review): tfidf is never used afterwards — the LDA below fits on raw counts.
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(wordmtx)
tfidf
<348x98 sparse matrix of type '<class 'numpy.float64'>'
with 1488 stored elements in Compressed Sparse Row format>
# Configure the LDA topic model.
from sklearn.decomposition import LatentDirichletAllocation
n_topics = 8
# NOTE(review): no random_state is set, so topic assignments differ between runs.
ldamodel = LatentDirichletAllocation(n_components = n_topics)
# Fit LDA on the raw term counts.
ldamodel.fit(wordmtx)
D:\sofewore\anaconda\lib\site-packages\sklearn\decomposition\online_lda.py:536: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
DeprecationWarning)
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
evaluate_every=-1, learning_decay=0.7, learning_method=None,
learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
mean_change_tol=0.001, n_components=8, n_jobs=1,
n_topics=None, perp_tol=0.1, random_state=None,
topic_word_prior=None, total_samples=1000000.0, verbose=0)
# 拟合后模型的实质
# print(ldamodel.components_.shape)
# ldamodel.components_[:2]
# Helper: pretty-print the strongest words of each fitted LDA topic.
def print_top_words(model, feature_names, n_top_words):
    """Print the *n_top_words* highest-weighted words of every topic."""
    for idx, weights in enumerate(model.components_):
        # argsort is ascending; walk the tail backwards for the largest weights.
        top = weights.argsort()[:-n_top_words - 1:-1]
        print("Topic #%d:" % idx)
        print(" ".join(feature_names[i] for i in top))
        print()
n_top_words = 12
# Vocabulary in matrix-column order.
# NOTE(review): get_feature_names is removed in newer sklearn; use get_feature_names_out there.
tf_feature_names = countvec.get_feature_names()
print_top_words(ldamodel, tf_feature_names, n_top_words)
Topic #0:
速度 屏幕 运行 轻薄 外观 效果 散热 外形 程度 性能 好看 不错
Topic #1:
散热 轻薄 速度 屏幕 外观 效果 性能 很快 运行 超薄 好看 外形
Topic #2:
电脑 鼠标 客服 东西 价格 一个 退货 华硕 收到 真的 配置 后悔
Topic #3:
感觉 声音 开机 售后 垃圾 时间 朋友 散热 键盘 图片 不错 笔记本
Topic #4:
不错 速度 很快 评价 外观 办公 运行 内容 未填写 用户 笔记本 好看
Topic #5:
性价比 喜欢 风扇 打开 做工 售后 内存 网页 华硕 不错 客服 笔记本电脑
Topic #6:
办公 华硕 京东 评价 信赖 值得 品牌 轻薄 开机 购买 超薄 足够
Topic #7:
开机 第二天 好评 物流 特色 还好 想象 速度 第一次 京东 差评 失望
由结果可知,大致可以分为以下几类:
物流:很快 差评 信赖 垃圾 退货
外观:喜欢 轻薄 好看
性能:不错 散热 速度 轻便
散热:风扇 散热
售后:态度 客服
然后根据这几个特征用nlp库把各个特征的评论都分为积极消极两面 好评 差评
# Reload the stopword table for the tokenizing step below.
tmpdf = pd.read_csv('停用词.txt',
                    names = ['w'], sep = 'aaa', encoding = 'utf-8', engine='python')
def function(a):
    """Segment the comment text *a* with jieba and drop stopwords.

    Uses the module-level ``tmpdf`` stopword table loaded above.
    Returns a list of tokens.
    """
    # Build the lookup once: the original re-evaluated list(tmpdf.w) for every
    # token inside the comprehension condition — O(stopwords) work per word.
    stopwords = set(tmpdf.w)
    word_list = [w for w in jieba.cut(a) if w not in stopwords]
    return word_list
# Replace each review string with its stopword-filtered token list.
test['评价内容'] = test.apply(lambda x: function(x['评价内容']), axis = 1)
# Hand-picked seed vocabularies for naive Bayes: text_3 = negative, text_4 = positive words.
text_3 = ['差评','垃圾','退货','受不了','惨痛','垃圾','消极','退','很慢','差','不好','耗电','杂音','卡','奔溃',]
text_4 = ['很快','信赖','喜欢','不错','性价比','小巧', '精致','流畅','携带方便','好看', '高贵','优良','窄边']
from numpy import *  # NOTE(review): star import is discouraged; supplies ones/log/array below
# Load the training data for the sentiment classifier.
def loadDataSet():
    """Return the seed documents (negative/positive word lists) and their labels."""
    # 0 = negative seed vocabulary, 1 = positive seed vocabulary
    docs = [text_3, text_4]
    labels = [0, 1]
    return docs, labels
# Merge all documents and deduplicate with a set to get the vocabulary.
def createVocabList(dataSet):
    """Return the deduplicated vocabulary across all documents as a list."""
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)
    return list(vocab)
# Bag-of-words model (counts, +=) rather than a set-of-words model (presence, =).
def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a token list into a bag-of-words count vector over *vocabList*.

    Tokens absent from the vocabulary are silently ignored.
    """
    # Map word -> position once; the original called vocabList.index(word) for
    # every token, an O(len(vocab)) scan each time.
    position = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        i = position.get(word)
        if i is not None:
            returnVec[i] += 1
    return returnVec
# Returns, per class, ln(word count / class total) plus the class-1 prior.
def trainNB0(trainMatrix, trainCategory):
    """Train a multinomial naive Bayes model.

    trainMatrix   -- 2-D array: one bag-of-words count row per document
    trainCategory -- 1-D array of 0/1 labels
    Returns (log P(word|class0), log P(word|class1), P(class1)).
    Counts start at 1 and totals at 2 (Laplace smoothing) so no word gets
    probability zero; logs keep later products from underflowing.
    """
    n_docs = len(trainMatrix)
    n_words = len(trainMatrix[0])
    p_class1 = sum(trainCategory) / float(n_docs)  # prior of class 1
    count0, count1 = ones(n_words), ones(n_words)
    total0, total1 = 2.0, 2.0
    for row, label in zip(trainMatrix, trainCategory):
        if label == 1:
            count1 += row
            total1 += sum(row)
        else:
            count0 += row
            total0 += sum(row)
    return log(count0 / total0), log(count1 / total1), p_class1
# Compare the (log) posterior of each class (0 vs 1) and pick the larger.
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if class 1's log-posterior beats class 0's, else 0.

    The per-word vectors already hold logs, so a dot product plus the log
    prior is the unnormalized log posterior of each class.
    """
    log_post1 = sum(vec2Classify * p1Vec) + log(pClass1)
    log_post0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if log_post1 > log_post0 else 0
result = []  # one 0/1 prediction per review, filled by testingNB()

# Train on the seed vocabularies, then label every review in the dataframe.
def testingNB():
    """Fit naive Bayes on the seed word lists and classify each comment.

    Appends one 0 (negative) / 1 (positive) label per row of
    ``test['评价内容']`` to the module-level ``result`` list.
    """
    docs, labels = loadDataSet()
    vocab = createVocabList(docs)
    train_rows = [bagOfWords2VecMN(vocab, doc) for doc in docs]
    p0V, p1V, pAb = trainNB0(array(train_rows), array(labels))
    # Each review is already a token list; vectorize it and score it.
    for i in range(len(test['评价内容'])):
        features = array(bagOfWords2VecMN(vocab, test['评价内容'][i]))
        result.append(classifyNB(features, p0V, p1V, pAb))
if __name__ == "__main__":
    testingNB()
# Attach the 0/1 sentiment predictions and keep the positive reviews.
test['评价'] = result
test_1 = test.loc[test['评价'] == 1]
import codecs
import jieba
import pickle
# One-off preprocessing (already run once, hence commented out): dump the
# positive reviews, re-segment them, and pickle the joined text to text.txt.
# test_1['评价内容'].to_csv('名称.txt', sep='\t', index=False)
# fin = codecs.open('名称.txt',mode = 'r', encoding = 'utf-8')
# # # print (fin.read())
# # # On the first run, store the segmented words to a file.
# # text = ''
# # with open('名称.txt',encoding = 'utf-8') as fin:
# #     for line in fin.readlines():
# #         line = line.strip('\n')
# #         text += ' '.join(jieba.cut(line))
# #         text += ' '
# # fout = open('text.txt','wb')
# # pickle.dump(text,fout)
# # fout.close()
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
# Word cloud of the positive reviews, shaped and colored by good.jpg.
# NOTE(review): fr is never closed — a with-block would be safer.
fr = open('text.txt','rb')
text = pickle.load(fr)
# print(text)
backgroud_Image = plt.imread('good.jpg')
wc = WordCloud( background_color = 'white', # background color
    mask = backgroud_Image,                 # shape mask image
    max_words = 200,                        # cap on the number of words shown
    stopwords = STOPWORDS,                  # stopword set
    font_path = 'simfang.ttf',              # CJK-capable font; Chinese will not render without it
    max_font_size = 200,                    # largest font size
    random_state = 8,                       # number of random color schemes
    )
wc.generate(text)
image_colors = ImageColorGenerator(backgroud_Image)
wc.recolor(color_func = image_colors)
plt.figure(figsize=(10,10))
plt.imshow(wc)
plt.axis('off')
plt.show()
# Now the negative reviews, for the contrasting word cloud.
test_1 = test.loc[test['评价'] == 0]
import codecs
import jieba
import pickle
# One-off preprocessing (already run once, hence commented out):
# test_1['评价内容'].to_csv('名称.txt', sep='\t', index=False)
# fin = codecs.open('名称.txt',mode = 'r', encoding = 'utf-8')
# # print (fin.read())
# # On the first run, store the segmented words to a file.
# text = ''
# with open('名称.txt',encoding = 'utf-8') as fin:
#     for line in fin.readlines():
#         line = line.strip('\n')
#         text += ' '.join(jieba.cut(line))
#         text += ' '
# fout = open('text1.txt','wb')
# pickle.dump(text,fout)
# fout.close()
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
# Word cloud of the negative reviews, shaped and colored by nogood.jpg.
# NOTE(review): fr is never closed — a with-block would be safer.
fr = open('text1.txt','rb')
text = pickle.load(fr)
# print(text)
backgroud_Image = plt.imread('nogood.jpg')
wc = WordCloud( background_color = 'white', # background color
    mask = backgroud_Image,                 # shape mask image
    max_words = 200,                        # cap on the number of words shown
    stopwords = STOPWORDS,                  # stopword set
    font_path = 'simfang.ttf',              # CJK-capable font; Chinese will not render without it
    max_font_size = 200,                    # largest font size
    random_state = 8,                       # number of random color schemes
    )
wc.generate(text)
image_colors = ImageColorGenerator(backgroud_Image)
wc.recolor(color_func = image_colors)
plt.figure(figsize=(10,10))
plt.imshow(wc)
plt.axis('off')
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-FChz7FYs-1581849296894)(output_19_0.png)]
对比好评差评得出好评率,如物流好评率多少,外观好评率多少
物流
[‘物流’,‘快’,‘好快’,‘慢’,‘退货’]
外观
[‘喜欢’,‘轻薄’,‘好看’,‘外形’,‘外观’,‘漂亮’]
性能:
[不错 散热 速度 轻便]
散热:
风扇 散热
售后:
态度 客服
def function(x):
    """Assign a review (token list or string) to its first matching aspect.

    Categories are checked in priority order; the first keyword hit wins.
    Returns 1=物流(logistics) 2=外观(appearance) 3=性能(performance)
    4=散热(cooling) 5=售后(service), or None when nothing matches.
    """
    # Table-driven rewrite of five near-identical keyword loops; the
    # commented-out leftover branches were removed.
    categories = [
        (1, ['物流', '快', '好快', '慢', '退货']),
        (2, ['喜欢', '轻薄', '好看', '外形', '外观', '漂亮']),
        (3, ['不错', '速度', '轻便']),
        (4, ['风扇', '散热']),
        (5, ['态度', '客服']),
    ]
    for label, keywords in categories:
        if any(kw in x for kw in keywords):
            return label
    return None
# Tag each review with its aspect category (1=物流 2=外观 3=性能 4=散热 5=售后).
test['定义'] = test.apply(lambda x: function(x['评价内容']), axis = 1)
test_1 = test.loc[test['定义'] == 1]
# Logistics positive-review rate.
print('华硕r424物流好评率为{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
华硕r424物流好评率为36.21%
test_1 = test.loc[test['定义'] == 2]
# Appearance positive-review rate.
# Fixed label: the original printed '华硕r424观好评率' — the 外 of 外观 was missing.
print('华硕r424外观好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
华硕r424观好评率68.83%
test_1 = test.loc[test['定义'] == 3]
# Performance positive-review rate (the original comment said 外观 — a copy-paste slip).
print('华硕r424性能好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
华硕r424性能好评率75.93%
test_1 = test.loc[test['定义'] == 4]
# Cooling positive-review rate (the original comment said 外观 — a copy-paste slip).
print('华硕r424散热好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
test_1.head()
华硕r424散热好评率0.00%
会员 | 级别 | 评价星级 | 评价内容 | 时间 | 点赞数 | 评论数 | 追评时间 | 追评内容 | 商品属性 | 页面网址 | 页面标题 | 采集时间 | sku | 好评度 | 评价关键词 | 评价 | 定义 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
21 | 大金狗 | NaN | star5 | [散热, 性能] | 2020-01-22 23:20 | 0 | 0 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#comment | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:40:02.7474156 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... | 0 | 4.0 |
25 | j***E | NaN | star3 | [下午, 收到, 货, 晚上, 开机, 几分钟, 发现, 杂音, 散热, 声音, 滋, 滋,... | 2020-01-20 22:38 | 0 | 1 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#comment | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:40:02.8722156 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... | 0 | 4.0 |
40 | 宏***0 | NaN | star5 | [散热, 性能, 一段时间, 评价, 物超所值] | 2020-01-10 21:53 | 0 | 0 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#comment | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:40:39.7646158 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... | 0 | 4.0 |
69 | jd_183086ioo | NaN | star3 | [散热, 声音, 太, 屏幕, 高清, 卡] | 2019-12-31 23:33 | 0 | 1 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#comment | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:41:17.2208160 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... | 0 | 4.0 |
179 | jd_130661ach | NaN | star5 | [散热, 不好] | 2019-11-17 11:59 | 0 | 0 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#comment | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:44:41.0726166 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... | 0 | 4.0 |
test_1 = test.loc[test['定义'] == 5]
# After-sales/service positive-review rate (the original comment said 外观 — a copy-paste slip).
print('华硕r424客服好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
test_1.head()
华硕r424客服好评率0.00%
会员 | 级别 | 评价星级 | 评价内容 | 时间 | 点赞数 | 评论数 | 追评时间 | 追评内容 | 商品属性 | 页面网址 | 页面标题 | 采集时间 | sku | 好评度 | 评价关键词 | 评价 | 定义 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | jd189251xrl | PLUS会员 | star1 | [垃圾, 买, 回, 不到, 一个月, 开, 机, 售后, 麻烦, 现场, 售后, 点, 态... | 2020-01-08 12:10 | 0 | 1 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#comment | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:40:40.0620158 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... | 0 | 5.0 |
141 | jd_184826hro | NaN | star1 | [产品质量, 反馈, 没, 解决, 收到, 货有, 裂痕, 客服, 未, 解决] | 2019-11-27 18:50 | 0 | 1 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#comment | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:43:45.2106165 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... | 0 | 5.0 |
144 | 羣既有謀又有膽 | PLUS会员 | star5 | [京东, 自营, 产品, 买, 本本, 一条, 刮花, 痕迹, 影响, 功能, 客服, 态度] | 2019-11-26 22:39 | 0 | 0 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#comment | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:43:45.3198165 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... | 0 | 5.0 |
149 | j***7 | NaN | star5 | [电脑, 顺手, 买, 出, 点, 客服, 解决, 态度, ?, ?, ?] | 2019-11-24 14:51 | 0 | 1 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#comment | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:43:45.5070165 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... | 0 | 5.0 |
313 | jd189251xrl | PLUS会员 | star1 | [垃圾, 买, 回, 不到, 一个月, 开, 机, 售后, 麻烦, 现场, 售后, 点, 态... | 2020-01-08 12:10 | 0 | 1 | NaN | NaN | 【爆款】i3 4G 256G固态 银 ... | https://item.jd.com/100007507848.html#comment | 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... | 2020-02-08 16:49:00.1768175 | 100007507848 | 92% | 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... | 0 | 5.0 |
再对比同一商品不同品牌各个特征好评率得出优劣,如华硕运行速度好于联想
# Same pipeline, repeated for the Lenovo IdeaPad reviews for comparison.
import pandas as pd
import lda  # NOTE(review): unused — sklearn's LDA is used below
test = pd.read_excel('联想IdeaPad.xlsx')
test.head()
# ## First use LDA to extract the aspect topics (logistics, appearance, performance, ...)
test_1 = test['评价内容']
stoplist = list(pd.read_csv('停用词.txt', names = ['w'], sep = 'aaa',
                encoding = 'utf-8', engine='python').w)
import jieba
def m_cut(intxt):
    """Tokenize *intxt* with jieba, dropping stopwords and one-char tokens.

    Depends on the module-level ``stoplist`` loaded above.
    """
    # Set lookup is O(1); the original scanned the stopword list per token.
    stopset = set(stoplist)
    return [w for w in jieba.cut(intxt) if w not in stopset and len(w) > 1]
# Space-joined tokens per review.
cleanchap = [" ".join(m_cut(w)) for w in test_1]
# Document-term count matrix.
from sklearn.feature_extraction.text import CountVectorizer
countvec = CountVectorizer(min_df = 5)
wordmtx = countvec.fit_transform(cleanchap)
wordmtx
# TF-IDF from the counts (NOTE(review): unused downstream — LDA fits on raw counts).
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(wordmtx)
tfidf
# Configure the LDA topic model (no random_state: runs are not reproducible).
from sklearn.decomposition import LatentDirichletAllocation
n_topics = 8
ldamodel = LatentDirichletAllocation(n_components = n_topics)
# Fit LDA on the raw term counts.
ldamodel.fit(wordmtx)
# Helper: pretty-print the strongest words of each fitted LDA topic.
def print_top_words(model, feature_names, n_top_words):
    """Print the *n_top_words* highest-weighted words of every topic."""
    for idx, weights in enumerate(model.components_):
        # argsort is ascending; walk the tail backwards for the largest weights.
        top = weights.argsort()[:-n_top_words - 1:-1]
        print("Topic #%d:" % idx)
        print(" ".join(feature_names[i] for i in top))
        print()
n_top_words = 12
# Vocabulary in matrix-column order (deprecated in newer sklearn; use get_feature_names_out there).
tf_feature_names = countvec.get_feature_names()
print_top_words(ldamodel, tf_feature_names, n_top_words)
D:\sofewore\anaconda\lib\site-packages\sklearn\decomposition\online_lda.py:536: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
DeprecationWarning)
Topic #0:
不错 电脑 办公 质量 东西 喜欢 收到 物流 很快 不卡 买来 外观
Topic #1:
性能 散热 一块 实惠 流畅 买来 配置 效果 屏幕 轻薄 好看 外观
Topic #2:
电脑 速度 喜欢 很快 运行 满意 客服 开机 物流 收到 颜色 好看
Topic #3:
鼠标 电脑 联想 垃圾 笔记本 开机 东西 电脑包 品牌 卡顿 好评 漂亮
Topic #4:
评价 用户 未填写 内容 电脑 不好 购买 满意 第一次 店家 差评 服务
Topic #5:
流畅 办公 电脑 好看 系统 一个 同事 值得 外观 游戏 稍微 网页
Topic #6:
东西 京东 客服 服务态度 第一次 开机 小时 一个 两天 宝贝 收到 包装
Topic #7:
感觉 开机 速度 屏幕 外观 轻薄 挺快 运行 性价比 赠品 适合 散热
由结果可知,大致可以分为以下几类:
物流:很快 差评 信赖 垃圾 退货
外观:喜欢 轻薄 好看
性能:不错 散热 速度 轻便
散热:风扇 散热
售后:态度 客服
然后根据这几个特征用nlp库把各个特征的评论都分为积极消极两面 好评 差评
# In[13]:
# Stopword table for the tokenizer below.
tmpdf = pd.read_csv('停用词.txt',
                    names = ['w'], sep = 'aaa', encoding = 'utf-8', engine='python')
def function(a):
    """Segment the comment text *a* with jieba and drop stopwords.

    Uses the module-level ``tmpdf`` stopword table loaded above.
    Returns a list of tokens.
    """
    # Build the lookup once: the original re-evaluated list(tmpdf.w) for every
    # token inside the comprehension condition — O(stopwords) work per word.
    stopwords = set(tmpdf.w)
    word_list = [w for w in jieba.cut(a) if w not in stopwords]
    return word_list
# Replace each review string with its stopword-filtered token list.
test['评价内容'] = test.apply(lambda x: function(x['评价内容']), axis = 1)
# In[14]:
# Seed vocabularies for naive Bayes: text_3 = negative, text_4 = positive words.
text_3 = ['差评','垃圾','退货','受不了','惨痛','垃圾','消极','退','很慢','差','不好','耗电','杂音','卡','奔溃',]
text_4 = ['很快','信赖','喜欢','不错','性价比','小巧', '精致','流畅','携带方便','好看', '高贵','优良','窄边']
from numpy import *  # NOTE(review): star import is discouraged; supplies ones/log/array below
# Load the training data for the sentiment classifier.
def loadDataSet():
    """Return the seed documents (negative/positive word lists) and their labels."""
    # 0 = negative seed vocabulary, 1 = positive seed vocabulary
    docs = [text_3, text_4]
    labels = [0, 1]
    return docs, labels
# Merge all documents and deduplicate with a set to get the vocabulary.
def createVocabList(dataSet):
    """Return the deduplicated vocabulary across all documents as a list."""
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)
    return list(vocab)
# Bag-of-words model (counts, +=) rather than a set-of-words model (presence, =).
def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a token list into a bag-of-words count vector over *vocabList*.

    Tokens absent from the vocabulary are silently ignored.
    """
    # Map word -> position once; the original called vocabList.index(word) for
    # every token, an O(len(vocab)) scan each time.
    position = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        i = position.get(word)
        if i is not None:
            returnVec[i] += 1
    return returnVec
# Returns, per class, ln(word count / class total) plus the class-1 prior.
def trainNB0(trainMatrix, trainCategory):
    """Train a multinomial naive Bayes model.

    trainMatrix   -- 2-D array: one bag-of-words count row per document
    trainCategory -- 1-D array of 0/1 labels
    Returns (log P(word|class0), log P(word|class1), P(class1)).
    Counts start at 1 and totals at 2 (Laplace smoothing) so no word gets
    probability zero; logs keep later products from underflowing.
    """
    n_docs = len(trainMatrix)
    n_words = len(trainMatrix[0])
    p_class1 = sum(trainCategory) / float(n_docs)  # prior of class 1
    count0, count1 = ones(n_words), ones(n_words)
    total0, total1 = 2.0, 2.0
    for row, label in zip(trainMatrix, trainCategory):
        if label == 1:
            count1 += row
            total1 += sum(row)
        else:
            count0 += row
            total0 += sum(row)
    return log(count0 / total0), log(count1 / total1), p_class1
# Compare the (log) posterior of each class (0 vs 1) and pick the larger.
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if class 1's log-posterior beats class 0's, else 0.

    The per-word vectors already hold logs, so a dot product plus the log
    prior is the unnormalized log posterior of each class.
    """
    log_post1 = sum(vec2Classify * p1Vec) + log(pClass1)
    log_post0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if log_post1 > log_post0 else 0
result = []  # one 0/1 prediction per review, filled by testingNB()

# Train on the seed vocabularies, then label every review in the dataframe.
def testingNB():
    """Fit naive Bayes on the seed word lists and classify each comment.

    Appends one 0 (negative) / 1 (positive) label per row of
    ``test['评价内容']`` to the module-level ``result`` list.
    """
    docs, labels = loadDataSet()
    vocab = createVocabList(docs)
    train_rows = [bagOfWords2VecMN(vocab, doc) for doc in docs]
    p0V, p1V, pAb = trainNB0(array(train_rows), array(labels))
    # Each review is already a token list; vectorize it and score it.
    for i in range(len(test['评价内容'])):
        features = array(bagOfWords2VecMN(vocab, test['评价内容'][i]))
        result.append(classifyNB(features, p0V, p1V, pAb))
if __name__ == "__main__":
    testingNB()
# Attach the 0/1 sentiment predictions to the Lenovo dataframe.
test['评价'] = result
# Keep the negative (0) reviews; dump, re-segment and pickle them for the word cloud.
test_1 = test.loc[test['评价'] == 0]
import codecs
import jieba
import pickle
test_1['评价内容'].to_csv('名称.txt', sep='\t', index=False)
# NOTE(review): the original also did `fin = codecs.open('名称.txt', ...)` here,
# never used the handle and never closed it — that leaked file handle is removed.
text = ''
with open('名称.txt', encoding = 'utf-8') as fin:
    for line in fin.readlines():
        line = line.strip('\n')
        text += ' '.join(jieba.cut(line))
        text += ' '
# Context manager guarantees the pickle file is flushed and closed.
with open('联想text.txt', 'wb') as fout:
    pickle.dump(text, fout)
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
# Word cloud of the negative Lenovo reviews, shaped and colored by nogood.jpg.
# NOTE(review): fr is never closed — a with-block would be safer.
fr = open('联想text.txt','rb')
text = pickle.load(fr)
# print(text)
backgroud_Image = plt.imread('nogood.jpg')
wc = WordCloud( background_color = 'white', # background color
    mask = backgroud_Image,                 # shape mask image
    max_words = 200,                        # cap on the number of words shown
    stopwords = STOPWORDS,                  # stopword set
    font_path = 'simfang.ttf',              # CJK-capable font; Chinese will not render without it
    max_font_size = 200,                    # largest font size
    random_state = 8,                       # number of random color schemes
    )
wc.generate(text)
image_colors = ImageColorGenerator(backgroud_Image)
wc.recolor(color_func = image_colors)
plt.figure(figsize=(10,10))
plt.imshow(wc)
plt.axis('off')
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-iRuVUm5c-1581849296896)(output_33_0.png)]
# Positive (1) Lenovo reviews for the contrasting word cloud.
test_1 = test.loc[test['评价'] == 1]
import codecs
import jieba
import pickle
# One-off preprocessing (already run once, hence commented out):
# test_1['评价内容'].to_csv('名称.txt', sep='\t', index=False)
# fin = codecs.open('名称.txt',mode = 'r', encoding = 'utf-8')
# # print (fin.read())
# # On the first run, store the segmented words to a file.
# text = ''
# with open('名称.txt',encoding = 'utf-8') as fin:
#     for line in fin.readlines():
#         line = line.strip('\n')
#         text += ' '.join(jieba.cut(line))
#         text += ' '
# fout = open('联想text2.txt','wb')
# pickle.dump(text,fout)
# fout.close()
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
# NOTE(review): fr is never closed — a with-block would be safer.
fr = open('联想text2.txt','rb')
text = pickle.load(fr)
# print(text)
backgroud_Image = plt.imread('good.jpg')
wc = WordCloud( background_color = 'white', # background color
    mask = backgroud_Image,                 # shape mask image
    max_words = 200,                        # cap on the number of words shown
    stopwords = STOPWORDS,                  # stopword set
    font_path = 'simfang.ttf',              # CJK-capable font; Chinese will not render without it
    max_font_size = 200,                    # largest font size
    random_state = 8,                       # number of random color schemes
    )
wc.generate(text)
image_colors = ImageColorGenerator(backgroud_Image)
wc.recolor(color_func = image_colors)
plt.figure(figsize=(10,10))
plt.imshow(wc)
plt.axis('off')
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-WINeOFrJ-1581849296897)(output_34_0.png)]
# ## 对比好评差评得出好评率,如物流好评率多少,外观好评率多少
# 物流
#
# ['物流','快','好快','慢','退货']
#
# 外观
#
# ['喜欢','轻薄','好看','外形','外观','漂亮']
# In[26]:
def function(x):
    """Map a review (token list or string) to its first matching aspect.

    Returns 1=物流(logistics) 2=外观(appearance) 3=性能(performance)
    4=散热(cooling) 5=售后(service), or None when no keyword matches.
    """
    aspect_words = (
        (1, ('物流', '快', '好快', '慢', '退货')),
        (2, ('喜欢', '轻薄', '好看', '外形', '外观', '漂亮')),
        (3, ('不错', '速度', '轻便')),
        (4, ('风扇', '散热')),
        (5, ('态度', '客服', '服务')),
    )
    # Categories are tried in priority order; the first keyword hit wins.
    for label, words in aspect_words:
        for word in words:
            if word in x:
                return label
# Tag each Lenovo review with its aspect category (1=物流 2=外观 3=性能 4=散热 5=售后).
test['定义'] = test.apply(lambda x: function(x['评价内容']), axis = 1)
test_1 = test.loc[test['定义'] == 1]
# Logistics positive-review rate.
print('联想IdeaPad物流好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
联想IdeaPad物流好评率58.82%
test_1 = test.loc[test['定义'] == 2]
# Appearance positive-review rate.
print('联想IdeaPad外观好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
联想IdeaPad外观好评率83.33%
test_1 = test.loc[test['定义'] == 3]
# Performance positive-review rate (the original comment said 外观 — a copy-paste slip).
print('联想IdeaPad性能好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
联想IdeaPad性能好评率90.00%
test_1 = test.loc[test['定义'] == 4]
# Cooling positive-review rate (the original comment said 外观 — a copy-paste slip).
print('联想IdeaPad散热好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
联想IdeaPad散热好评率0.00%
test_1 = test.loc[test['定义'] == 5]
# After-sales positive-review rate (the original comment said 外观 — a copy-paste slip).
print('联想IdeaPad售后好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
联想IdeaPad售后好评率0.00%