20200207_情感分析

在这里插入图片描述

import pandas as pd
import lda 
test=pd.read_excel('华硕r424.xlsx')
test.head()
会员 级别 评价星级 评价内容 时间 点赞数 评论数 追评时间 追评内容 商品属性 页面网址 页面标题 采集时间 sku 好评度 评价关键词
0 jd_187595nwq NaN star4 轻薄程度:很好运行速度:一般外形外观:好看屏幕效果:一般奇怪,怎么就一个c盘?????? 2020-02-08 08:47 1 0 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#none 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:39:25.6734153 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分...
1 周先生还没老 NaN star5 初步感受还是很不错的,办公性价比很高,用一段时间会追加评价的 2020-02-07 12:56 0 0 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#none 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:39:25.8138153 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分...
2 yly185*****037 PLUS会员[试用] star4 老品牌,值得信赖,性价比还是比较高的,刚开始用一天不好做多评价, 2020-02-06 16:33 0 0 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#none 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:39:25.8450153 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分...
3 亿***下 NaN star5 噪音很大,其他的都还好 2020-02-06 11:39 0 0 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#none 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:39:25.8762153 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分...
4 jd_188622nqe NaN star5 还可以的,就自己在家简单做做东西,上上网完全没问题,不过就是一台裸机,连个鼠标都没有 2020-02-06 11:38 0 0 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#none 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:39:25.9074153 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分...

先用LDA模型把商品评论中此商品各个主题特征提取出来,如物流,外观,性能等

test_1=test['评价内容']
stoplist = list(pd.read_csv('停用词.txt', names = ['w'], sep = 'aaa', 
                            encoding = 'utf-8', engine='python').w)
import jieba 
def m_cut(intxt):
    """Tokenize *intxt* with jieba, dropping stopwords and single-char tokens."""
    tokens = jieba.cut(intxt)
    return [tok for tok in tokens if tok not in stoplist and len(tok) > 1]
# 生成分词清理后章节文本
cleanchap = [ " ".join(m_cut(w)) for w in test_1] 
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.895 seconds.
Prefix dict has been built succesfully.
# 将文本中的词语转换为词频矩阵  
from sklearn.feature_extraction.text import CountVectorizer
countvec = CountVectorizer(min_df = 5) 
wordmtx = countvec.fit_transform(cleanchap) 
wordmtx
<348x98 sparse matrix of type '<class 'numpy.int64'>'
	with 1488 stored elements in Compressed Sparse Row format>
#基于词频矩阵X计算TF-IDF值  
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer()  
tfidf = transformer.fit_transform(wordmtx)  
tfidf
<348x98 sparse matrix of type '<class 'numpy.float64'>'
	with 1488 stored elements in Compressed Sparse Row format>
# 设定LDA模型
from sklearn.decomposition import LatentDirichletAllocation
n_topics = 8
ldamodel = LatentDirichletAllocation(n_components = n_topics)
# 拟合LDA模型
ldamodel.fit(wordmtx)
D:\sofewore\anaconda\lib\site-packages\sklearn\decomposition\online_lda.py:536: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)





LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=8, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)
# 拟合后模型的实质
# print(ldamodel.components_.shape)
# ldamodel.components_[:2]
# 主题词打印函数
def print_top_words(model, feature_names, n_top_words):
    """Print the *n_top_words* highest-weighted terms of every topic in a fitted LDA model."""
    for idx, weights in enumerate(model.components_):
        print("Topic #%d:" % idx)
        # argsort is ascending, so take the last n_top_words indices in reverse
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        print(" ".join(feature_names[i] for i in top_indices))
    print()
n_top_words = 12
tf_feature_names = countvec.get_feature_names()
print_top_words(ldamodel, tf_feature_names, n_top_words)
Topic #0:
速度 屏幕 运行 轻薄 外观 效果 散热 外形 程度 性能 好看 不错
Topic #1:
散热 轻薄 速度 屏幕 外观 效果 性能 很快 运行 超薄 好看 外形
Topic #2:
电脑 鼠标 客服 东西 价格 一个 退货 华硕 收到 真的 配置 后悔
Topic #3:
感觉 声音 开机 售后 垃圾 时间 朋友 散热 键盘 图片 不错 笔记本
Topic #4:
不错 速度 很快 评价 外观 办公 运行 内容 未填写 用户 笔记本 好看
Topic #5:
性价比 喜欢 风扇 打开 做工 售后 内存 网页 华硕 不错 客服 笔记本电脑
Topic #6:
办公 华硕 京东 评价 信赖 值得 品牌 轻薄 开机 购买 超薄 足够
Topic #7:
开机 第二天 好评 物流 特色 还好 想象 速度 第一次 京东 差评 失望

由结果可知,主题大致可以归为以下几类:

物流:很快 差评 信赖 垃圾 退货

外观:喜欢 轻薄 好看

性能:不错 散热 速度 轻便

散热:风扇 散热

售后:态度 客服

然后根据这几个特征用nlp库把各个特征的评论都分为积极消极两面 好评 差评

tmpdf = pd.read_csv('停用词.txt',
                    names = ['w'], sep = 'aaa', encoding = 'utf-8',engine='python')
def function(a):
    """Tokenize the review string *a* with jieba and drop stopwords.

    The stopword list comes from the module-level DataFrame ``tmpdf``
    (column ``w``). Returns the remaining tokens as a list.
    """
    # Build the set once per call: the original re-materialized list(tmpdf.w)
    # and did an O(n) list scan for every single token.
    stop_words = set(tmpdf.w)
    return [w for w in jieba.cut(a) if w not in stop_words]
test['评价内容'] = test.apply(lambda x: function(x['评价内容']), axis = 1)
text_3=['差评','垃圾','退货','受不了','惨痛','垃圾','消极','退','很慢','差','不好','耗电','杂音','卡','奔溃',]
text_4=['很快','信赖','喜欢','不错','性价比','小巧', '精致','流畅','携带方便','好看', '高贵','优良','窄边']
from numpy import *
#加载数据
def loadDataSet():
    """Return the seed training documents and their sentiment labels.

    Class 0 is the negative seed word list, class 1 the positive one.
    """
    return [text_3, text_4], [0, 1]
#合并所有单词,利用set来去重,得到所有单词的唯一列表
def createVocabList(dataSet):
    """Return the deduplicated vocabulary of all documents as a list."""
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)
    return list(vocab)
#优化词集模型= 为 词袋模型+=,将单词列表变为数字向量列表
def bagOfWords2VecMN(vocabList, inputSet):
    """Bag-of-words vector: the count of each vocabulary word in *inputSet*.

    Words absent from the vocabulary are silently ignored.
    """
    counts = [0] * len(vocabList)
    for token in inputSet:
        try:
            counts[vocabList.index(token)] += 1
        except ValueError:  # token not in vocabulary
            pass
    return counts
 
# 返回的是0、1各自两个分类中每个单词数量除以该分类单词总量再取对数ln 以及0、1两类的比例
def trainNB0(trainMatrix, trainCategory):
    """Train a multinomial naive-Bayes model on bag-of-words rows.

    Returns ``(log P(word|class0), log P(word|class1), P(class1))``.
    Laplace smoothing: word counts start at 1 and denominators at 2 so an
    unseen word never produces log(0).
    """
    n_docs = len(trainMatrix)
    p_class1 = sum(trainCategory) / float(n_docs)  # prior of class 1
    n_words = len(trainMatrix[0])
    num = [ones(n_words), ones(n_words)]  # smoothed per-class word counts
    denom = [2.0, 2.0]                    # smoothed per-class totals
    for doc, label in zip(trainMatrix, trainCategory):
        cls = 1 if label == 1 else 0      # anything other than 1 counts as class 0
        num[cls] += doc
        denom[cls] += sum(doc)
    p0_vec = log(num[0] / denom[0])
    p1_vec = log(num[1] / denom[1])
    return p0_vec, p1_vec, p_class1
 
#P(X|C)判断各类别的概率大小(这里是0、1)
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if class 1 is more likely for this bag-of-words vector, else 0."""
    # log P(C) + sum of count * log P(w|C); everything stays in log space
    log_p1 = log(pClass1) + sum(vec2Classify * p1Vec)
    log_p0 = log(1.0 - pClass1) + sum(vec2Classify * p0Vec)
    return 1 if log_p1 > log_p0 else 0
# Collects the predicted label (0 = negative, 1 = positive) for every review.
result=[]
# Driver: train on the seed word lists, then classify every review.
def testingNB():
    """Train the naive-Bayes model on the seed lists and classify each review.

    Reads the module-level DataFrame ``test`` (column '评价内容', already
    tokenized) and appends one 0/1 prediction per row to the global ``result``.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bagOfWords2VecMN(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    # p0V/p1V: per-class log word probabilities; pAb: prior P(class 1)

    # Classify each review. NOTE(review): positional access test['评价内容'][i]
    # assumes a default RangeIndex on ``test`` -- confirm if rows get filtered.
    for i in range(len(test['评价内容'])):
        testEntry = test['评价内容'][i]
        thisDoc = array(bagOfWords2VecMN(myVocabList, testEntry))  # bag-of-words vector
        result.append(classifyNB(thisDoc, p0V, p1V, pAb))
if __name__=="__main__":
    testingNB()
test['评价']=result
test_1=test.loc[test['评价']==1]
import codecs
import jieba
import pickle
# test_1['评价内容'].to_csv('名称.txt', sep='\t', index=False)
# fin = codecs.open('名称.txt',mode = 'r', encoding = 'utf-8')
# # # print (fin.read())
# # #第一次运行程序时将分好的词存入文件
# # text = ''
# # with open('名称.txt',encoding = 'utf-8') as fin:
# #     for line in fin.readlines():
# #         line = line.strip('\n')
# #         text += ' '.join(jieba.cut(line))
# #         text += ' '
# # fout = open('text.txt','wb')
# # pickle.dump(text,fout)
# # fout.close()
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
fr = open('text.txt','rb')
text = pickle.load(fr)
# print(text)
backgroud_Image = plt.imread('good.jpg')
wc = WordCloud( background_color = 'white',    # background colour
                mask = backgroud_Image,        # shape mask from the loaded image
                max_words = 200,            # maximum number of words shown
                stopwords = STOPWORDS,        # stopword set
                font_path = 'simfang.ttf',# font file; required to render Chinese glyphs
                max_font_size = 200,            # largest font size
                random_state = 8,            # fixed seed / number of colour layouts
                )
wc.generate(text)
image_colors = ImageColorGenerator(backgroud_Image)
wc.recolor(color_func = image_colors)
plt.figure(figsize=(10,10))
plt.imshow(wc)
plt.axis('off')
plt.show()

在这里插入图片描述

test_1=test.loc[test['评价']==0]
import codecs
import jieba
import pickle
# test_1['评价内容'].to_csv('名称.txt', sep='\t', index=False)
# fin = codecs.open('名称.txt',mode = 'r', encoding = 'utf-8')
# # print (fin.read())
# #第一次运行程序时将分好的词存入文件
# text = ''
# with open('名称.txt',encoding = 'utf-8') as fin:
#     for line in fin.readlines():
#         line = line.strip('\n')
#         text += ' '.join(jieba.cut(line))
#         text += ' '
# fout = open('text1.txt','wb')
# pickle.dump(text,fout)
# fout.close()
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
fr = open('text1.txt','rb')
text = pickle.load(fr)
# print(text)
backgroud_Image = plt.imread('nogood.jpg')
wc = WordCloud( background_color = 'white',    # 设置背景颜色
                mask = backgroud_Image,        # 设置背景图片
                max_words = 200,            # 设置最大现实的字数
                stopwords = STOPWORDS,        # 设置停用词
                font_path = 'simfang.ttf',# 设置字体格式,如不设置显示不了中文
                max_font_size = 200,            # 设置字体最大值
                random_state = 8,            # 设置有多少种随机生成状态,即有多少种配色方案
                )
wc.generate(text)
image_colors = ImageColorGenerator(backgroud_Image)
wc.recolor(color_func = image_colors)
plt.figure(figsize=(10,10))
plt.imshow(wc)
plt.axis('off')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-FChz7FYs-1581849296894)(output_19_0.png)]

对比好评差评得出好评率,如物流好评率多少,外观好评率多少

物流

[‘物流’,‘快’,‘好快’,‘慢’,‘退货’]

外观

[‘喜欢’,‘轻薄’,‘好看’,‘外形’,‘外观’,‘漂亮’]

性能:

[不错 散热 速度 轻便]

散热:

风扇 散热

售后:

态度 客服

def function(x):
    """Map a tokenized review to an aspect id.

    1 = logistics, 2 = appearance, 3 = performance, 4 = cooling,
    5 = after-sales service; ``None`` (NaN in the DataFrame) when no
    keyword matches. Earlier groups take priority.
    """
    keyword_groups = [
        (1, ['物流', '快', '好快', '慢', '退货']),
        (2, ['喜欢', '轻薄', '好看', '外形', '外观', '漂亮']),
        (3, ['不错', '速度', '轻便']),
        (4, ['风扇', '散热']),
        (5, ['态度', '客服']),
    ]
    for label, words in keyword_groups:
        if any(w in x for w in words):
            return label
test['定义'] = test.apply(lambda x: function(x['评价内容']), axis = 1)
test_1=test.loc[test['定义']==1]
# logistics positive-review rate: share of aspect-1 reviews classified positive
print('华硕r424物流好评率为{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
华硕r424物流好评率为36.21%
test_1=test.loc[test['定义']==2]
# appearance positive-review rate
# Fixed user-facing message: '华硕r424观好评率' was missing the character 外.
print('华硕r424外观好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
华硕r424观好评率68.83%
test_1=test.loc[test['定义']==3]
# performance positive-review rate (comment was previously mislabeled as appearance)
print('华硕r424性能好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
华硕r424性能好评率75.93%
test_1=test.loc[test['定义']==4]
# cooling positive-review rate (comment was previously mislabeled as appearance)
print('华硕r424散热好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
test_1.head()
华硕r424散热好评率0.00%
会员 级别 评价星级 评价内容 时间 点赞数 评论数 追评时间 追评内容 商品属性 页面网址 页面标题 采集时间 sku 好评度 评价关键词 评价 定义
21 大金狗 NaN star5 [散热, 性能] 2020-01-22 23:20 0 0 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#comment 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:40:02.7474156 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... 0 4.0
25 j***E NaN star3 [下午, 收到, 货, 晚上, 开机, 几分钟, 发现, 杂音, 散热, 声音, 滋, 滋,... 2020-01-20 22:38 0 1 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#comment 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:40:02.8722156 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... 0 4.0
40 宏***0 NaN star5 [散热, 性能, 一段时间, 评价, 物超所值] 2020-01-10 21:53 0 0 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#comment 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:40:39.7646158 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... 0 4.0
69 jd_183086ioo NaN star3 [散热, 声音, 太, 屏幕, 高清, 卡] 2019-12-31 23:33 0 1 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#comment 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:41:17.2208160 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... 0 4.0
179 jd_130661ach NaN star5 [散热, 不好] 2019-11-17 11:59 0 0 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#comment 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:44:41.0726166 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... 0 4.0
test_1=test.loc[test['定义']==5]
# after-sales service positive-review rate (comment was previously mislabeled as appearance)
print('华硕r424客服好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
test_1.head()
华硕r424客服好评率0.00%
会员 级别 评价星级 评价内容 时间 点赞数 评论数 追评时间 追评内容 商品属性 页面网址 页面标题 采集时间 sku 好评度 评价关键词 评价 定义
48 jd189251xrl PLUS会员 star1 [垃圾, 买, 回, 不到, 一个月, 开, 机, 售后, 麻烦, 现场, 售后, 点, 态... 2020-01-08 12:10 0 1 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#comment 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:40:40.0620158 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... 0 5.0
141 jd_184826hro NaN star1 [产品质量, 反馈, 没, 解决, 收到, 货有, 裂痕, 客服, 未, 解决] 2019-11-27 18:50 0 1 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#comment 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:43:45.2106165 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... 0 5.0
144 羣既有謀又有膽 PLUS会员 star5 [京东, 自营, 产品, 买, 本本, 一条, 刮花, 痕迹, 影响, 功能, 客服, 态度] 2019-11-26 22:39 0 0 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#comment 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:43:45.3198165 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... 0 5.0
149 j***7 NaN star5 [电脑, 顺手, 买, 出, 点, 客服, 解决, 态度, ?, ?, ?] 2019-11-24 14:51 0 1 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#comment 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:43:45.5070165 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... 0 5.0
313 jd189251xrl PLUS会员 star1 [垃圾, 买, 回, 不到, 一个月, 开, 机, 售后, 麻烦, 现场, 售后, 点, 态... 2020-01-08 12:10 0 1 NaN NaN 【爆款】i3 4G 256G固态 银 ... https://item.jd.com/100007507848.html#comment 【华硕R424】华硕顽石(ASUS) R424 英特尔酷睿i3 14英寸窄边轻薄笔记本电脑(... 2020-02-08 16:49:00.1768175 100007507848 92% 轻薄精巧(19)/\n运行超快(16)/\n方便快捷(15)/\n简单方便(13)/\n十分... 0 5.0

再对比同一商品不同品牌各个特征好评率得出优劣,如华硕运行速度好于联想

import pandas as pd
import lda 
test=pd.read_excel('联想IdeaPad.xlsx')
test.head()
# ## 先用LDA模型把商品评论中此商品各个主题特征提取出来,如物流,外观,性能等
test_1=test['评价内容']
stoplist = list(pd.read_csv('停用词.txt', names = ['w'], sep = 'aaa', 
                            encoding = 'utf-8', engine='python').w)
import jieba 
def m_cut(intxt):
    """Tokenize *intxt* with jieba, dropping stopwords and single-char tokens."""
    tokens = jieba.cut(intxt)
    return [tok for tok in tokens if tok not in stoplist and len(tok) > 1]
# 生成分词清理后章节文本
cleanchap = [ " ".join(m_cut(w)) for w in test_1] 
# 将文本中的词语转换为词频矩阵  
from sklearn.feature_extraction.text import CountVectorizer
countvec = CountVectorizer(min_df = 5) 
wordmtx = countvec.fit_transform(cleanchap) 
wordmtx
#基于词频矩阵X计算TF-IDF值  
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()  
tfidf = transformer.fit_transform(wordmtx)  
tfidf
# 设定LDA模型
from sklearn.decomposition import LatentDirichletAllocation
n_topics = 8
ldamodel = LatentDirichletAllocation(n_components = n_topics)
# 拟合LDA模型
ldamodel.fit(wordmtx)
# 主题词打印函数
def print_top_words(model, feature_names, n_top_words):
    """Print the *n_top_words* highest-weighted terms of every topic in a fitted LDA model."""
    for idx, weights in enumerate(model.components_):
        print("Topic #%d:" % idx)
        # argsort is ascending, so take the last n_top_words indices in reverse
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        print(" ".join(feature_names[i] for i in top_indices))
    print()
n_top_words = 12
tf_feature_names = countvec.get_feature_names()
print_top_words(ldamodel, tf_feature_names, n_top_words)
D:\sofewore\anaconda\lib\site-packages\sklearn\decomposition\online_lda.py:536: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)


Topic #0:
不错 电脑 办公 质量 东西 喜欢 收到 物流 很快 不卡 买来 外观
Topic #1:
性能 散热 一块 实惠 流畅 买来 配置 效果 屏幕 轻薄 好看 外观
Topic #2:
电脑 速度 喜欢 很快 运行 满意 客服 开机 物流 收到 颜色 好看
Topic #3:
鼠标 电脑 联想 垃圾 笔记本 开机 东西 电脑包 品牌 卡顿 好评 漂亮
Topic #4:
评价 用户 未填写 内容 电脑 不好 购买 满意 第一次 店家 差评 服务
Topic #5:
流畅 办公 电脑 好看 系统 一个 同事 值得 外观 游戏 稍微 网页
Topic #6:
东西 京东 客服 服务态度 第一次 开机 小时 一个 两天 宝贝 收到 包装
Topic #7:
感觉 开机 速度 屏幕 外观 轻薄 挺快 运行 性价比 赠品 适合 散热

由结果可知,主题大致可以归为以下几类:

物流:很快 差评 信赖 垃圾 退货

外观:喜欢 轻薄 好看

性能:不错 散热 速度 轻便

散热:风扇 散热

售后:态度 客服

然后根据这几个特征用nlp库把各个特征的评论都分为积极消极两面 好评 差评


# In[13]:
tmpdf = pd.read_csv('停用词.txt',
                    names = ['w'], sep = 'aaa', encoding = 'utf-8',engine='python')
def function(a):
    """Tokenize the review string *a* with jieba and drop stopwords.

    The stopword list comes from the module-level DataFrame ``tmpdf``
    (column ``w``). Returns the remaining tokens as a list.
    """
    # Build the set once per call: the original re-materialized list(tmpdf.w)
    # and did an O(n) list scan for every single token.
    stop_words = set(tmpdf.w)
    return [w for w in jieba.cut(a) if w not in stop_words]
test['评价内容'] = test.apply(lambda x: function(x['评价内容']), axis = 1)
# In[14]:
text_3=['差评','垃圾','退货','受不了','惨痛','垃圾','消极','退','很慢','差','不好','耗电','杂音','卡','奔溃',]
text_4=['很快','信赖','喜欢','不错','性价比','小巧', '精致','流畅','携带方便','好看', '高贵','优良','窄边']
from numpy import *
#加载数据
def loadDataSet():
    """Return the seed training documents and their sentiment labels.

    Class 0 is the negative seed word list, class 1 the positive one.
    """
    return [text_3, text_4], [0, 1]
#合并所有单词,利用set来去重,得到所有单词的唯一列表
def createVocabList(dataSet):
    """Return the deduplicated vocabulary of all documents as a list."""
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)
    return list(vocab)
#优化词集模型= 为 词袋模型+=,将单词列表变为数字向量列表
def bagOfWords2VecMN(vocabList, inputSet):
    """Bag-of-words vector: the count of each vocabulary word in *inputSet*.

    Words absent from the vocabulary are silently ignored.
    """
    counts = [0] * len(vocabList)
    for token in inputSet:
        try:
            counts[vocabList.index(token)] += 1
        except ValueError:  # token not in vocabulary
            pass
    return counts
 
# 返回的是0、1各自两个分类中每个单词数量除以该分类单词总量再取对数ln 以及0、1两类的比例
def trainNB0(trainMatrix, trainCategory):
    """Train a multinomial naive-Bayes model on bag-of-words rows.

    Returns ``(log P(word|class0), log P(word|class1), P(class1))``.
    Laplace smoothing: word counts start at 1 and denominators at 2 so an
    unseen word never produces log(0).
    """
    n_docs = len(trainMatrix)
    p_class1 = sum(trainCategory) / float(n_docs)  # prior of class 1
    n_words = len(trainMatrix[0])
    num = [ones(n_words), ones(n_words)]  # smoothed per-class word counts
    denom = [2.0, 2.0]                    # smoothed per-class totals
    for doc, label in zip(trainMatrix, trainCategory):
        cls = 1 if label == 1 else 0      # anything other than 1 counts as class 0
        num[cls] += doc
        denom[cls] += sum(doc)
    p0_vec = log(num[0] / denom[0])
    p1_vec = log(num[1] / denom[1])
    return p0_vec, p1_vec, p_class1
 
#P(X|C)判断各类别的概率大小(这里是0、1)
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if class 1 is more likely for this bag-of-words vector, else 0."""
    # log P(C) + sum of count * log P(w|C); everything stays in log space
    log_p1 = log(pClass1) + sum(vec2Classify * p1Vec)
    log_p0 = log(1.0 - pClass1) + sum(vec2Classify * p0Vec)
    return 1 if log_p1 > log_p0 else 0
# Collects the predicted label (0 = negative, 1 = positive) for every review.
result=[]
# Driver: train on the seed word lists, then classify every review.
def testingNB():
    """Train the naive-Bayes model on the seed lists and classify each review.

    Reads the module-level DataFrame ``test`` (column '评价内容', already
    tokenized) and appends one 0/1 prediction per row to the global ``result``.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bagOfWords2VecMN(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    # p0V/p1V: per-class log word probabilities; pAb: prior P(class 1)

    # Classify each review. NOTE(review): positional access test['评价内容'][i]
    # assumes a default RangeIndex on ``test`` -- confirm if rows get filtered.
    for i in range(len(test['评价内容'])):
        testEntry = test['评价内容'][i]
        thisDoc = array(bagOfWords2VecMN(myVocabList, testEntry))  # bag-of-words vector
        result.append(classifyNB(thisDoc, p0V, p1V, pAb))
if __name__=="__main__":
    testingNB()
test['评价']=result
test_1=test.loc[test['评价']==0]
import codecs
import jieba
import pickle
# Dump the selected reviews to a text file, re-segment each line with jieba,
# and pickle the space-joined text for the word cloud below.
test_1['评价内容'].to_csv('名称.txt', sep='\t', index=False)
# (Removed a leaked `codecs.open` handle that was opened, never closed,
#  and immediately shadowed by the `with` block below.)
text = ''
with open('名称.txt', encoding='utf-8') as fin:
    for line in fin:
        line = line.strip('\n')
        text += ' '.join(jieba.cut(line))
        text += ' '
# Context manager guarantees the pickle file is flushed and closed.
with open('联想text.txt', 'wb') as fout:
    pickle.dump(text, fout)
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
fr = open('联想text.txt','rb')
text = pickle.load(fr)
# print(text)
backgroud_Image = plt.imread('nogood.jpg')
wc = WordCloud( background_color = 'white',    # background colour
                mask = backgroud_Image,        # shape mask from the loaded image
                max_words = 200,            # maximum number of words shown
                stopwords = STOPWORDS,        # stopword set
                font_path = 'simfang.ttf',# font file; required to render Chinese glyphs
                max_font_size = 200,            # largest font size
                random_state = 8,            # fixed seed / number of colour layouts
                )
wc.generate(text)
image_colors = ImageColorGenerator(backgroud_Image)
wc.recolor(color_func = image_colors)
plt.figure(figsize=(10,10))
plt.imshow(wc)
plt.axis('off')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-iRuVUm5c-1581849296896)(output_33_0.png)]

test_1=test.loc[test['评价']==1]
import codecs
import jieba
import pickle
# test_1['评价内容'].to_csv('名称.txt', sep='\t', index=False)
# fin = codecs.open('名称.txt',mode = 'r', encoding = 'utf-8')
# # print (fin.read())
# #第一次运行程序时将分好的词存入文件
# text = ''
# with open('名称.txt',encoding = 'utf-8') as fin:
#     for line in fin.readlines():
#         line = line.strip('\n')
#         text += ' '.join(jieba.cut(line))
#         text += ' '
# fout = open('联想text2.txt','wb')
# pickle.dump(text,fout)
# fout.close()
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
fr = open('联想text2.txt','rb')
text = pickle.load(fr)
# print(text)
backgroud_Image = plt.imread('good.jpg')
wc = WordCloud( background_color = 'white',    # background colour
                mask = backgroud_Image,        # shape mask from the loaded image
                max_words = 200,            # maximum number of words shown
                stopwords = STOPWORDS,        # stopword set
                font_path = 'simfang.ttf',# font file; required to render Chinese glyphs
                max_font_size = 200,            # largest font size
                random_state = 8,            # fixed seed / number of colour layouts
                )
wc.generate(text)
image_colors = ImageColorGenerator(backgroud_Image)
wc.recolor(color_func = image_colors)
plt.figure(figsize=(10,10))
plt.imshow(wc)
plt.axis('off')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-WINeOFrJ-1581849296897)(output_34_0.png)]

# ## 对比好评差评得出好评率,如物流好评率多少,外观好评率多少

# 物流
# 
# ['物流','快','好快','慢','退货']
# 
# 外观
# 
# ['喜欢','轻薄','好看','外形','外观','漂亮']

# In[26]:

def function(x):
    """Map a tokenized review to an aspect id.

    1 = logistics, 2 = appearance, 3 = performance, 4 = cooling,
    5 = after-sales service; ``None`` (NaN in the DataFrame) when no
    keyword matches. Earlier groups take priority.
    """
    keyword_groups = [
        (1, ['物流', '快', '好快', '慢', '退货']),
        (2, ['喜欢', '轻薄', '好看', '外形', '外观', '漂亮']),
        (3, ['不错', '速度', '轻便']),
        (4, ['风扇', '散热']),
        (5, ['态度', '客服', '服务']),
    ]
    for label, words in keyword_groups:
        if any(w in x for w in words):
            return label
test['定义'] = test.apply(lambda x: function(x['评价内容']), axis = 1)
test_1=test.loc[test['定义']==1]
#物流好评率
print('联想IdeaPad物流好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
联想IdeaPad物流好评率58.82%
test_1=test.loc[test['定义']==2]
#外观好评率
print('联想IdeaPad外观好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
联想IdeaPad外观好评率83.33%
test_1=test.loc[test['定义']==3]
# performance positive-review rate (comment was previously mislabeled as appearance)
print('联想IdeaPad性能好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
联想IdeaPad性能好评率90.00%
test_1=test.loc[test['定义']==4]
# cooling positive-review rate (comment was previously mislabeled as appearance)
print('联想IdeaPad散热好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
联想IdeaPad散热好评率0.00%
test_1=test.loc[test['定义']==5]
# after-sales service positive-review rate (comment was previously mislabeled as appearance)
print('联想IdeaPad售后好评率{:.2%}'.format(len(test_1.loc[test_1['评价']==1])/len(test_1)))
联想IdeaPad售后好评率0.00%
发布了76 篇原创文章 · 获赞 23 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/qq_39309652/article/details/104346628
今日推荐