WordCloud: Text Word Clouds
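This script pulls article content from a local MongoDB database (Taoguba, collections List1 through List5), strips punctuation, digits, and stopwords, segments the text with jieba, extracts the ten highest-weighted keywords per article and per collection, and renders a horizontal bar chart and a word cloud for each collection.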

import re
import jieba
import jieba.analyse                        # keyword extraction
from pymongo import MongoClient
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
from wordcloud import WordCloud

mpl.rcParams['font.sans-serif'] = ['SimHei']   # use a CJK font so Chinese labels render correctly
mpl.rcParams['axes.unicode_minus'] = False     # render the minus sign correctly with this font

def load_file():
    '''
        Load the external user dictionary, then for each collection strip
        punctuation and digits from every article, remove stopwords, segment
        the plain text, and produce keyword charts and word clouds.
    '''
    jieba.load_userdict("C:/Lib/dict_lzf.txt")   # load an external user-defined dictionary
    client = MongoClient('localhost', 27017)     # connect to the local MongoDB instance
    db = client['Taoguba']
    name = 'List'
    for i in range(5):
        db_name = name + str(i + 1)
        collection = db[db_name]
        news = collection.find()
        C_new = []
        for doc in news:                         # use a fresh name here; reusing `i` would shadow the outer loop variable
            new = doc["Content"]
            r = '[’@#~%……&*() ——+|}{“?》《,。、、】【!"#$%&\'()*+,-./:; <=>?@[\\]^_`{|}~]+'
            news1 = re.sub(r, '', new)
            news1 = re.sub('[a-zA-Z0-9]', '', news1)   # drop Latin letters and digits
            stop_new = stop_dict(news1)
            cut = cut_package(stop_new)
            C_new.append(cut)
            print("---- keyword weights for this article ----")
            find_(cut)
        print("---- keyword weights across all articles ----")
        label, x = find_(' '.join(C_new))        # join the segmented articles; str(C_new) would add brackets and quotes
        make_pic(db_name, label, x)
        w_cloud(db_name, ' '.join(C_new))

def stop_dict(news):
    '''
        Remove stopword characters. Note this filters character by character;
        word-level filtering is usually done after segmentation (see the sketch below).
    '''
    with open("C:/Anaconda/stopworld.txt", 'r', encoding='utf-8') as f:
        stopwords = set(f.read().split())        # a set of stopword entries, not one big string
    return ''.join(ch for ch in news if ch not in stopwords)
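Because stop_dict filters character by character, multi-character stopwords are never removed as whole words. A minimal sketch of the more common word-level approach, assuming the same stopword file and input that cut_package has already segmented (the function name is hypothetical):

def stop_dict_after_cut(seg):
    '''Hypothetical word-level variant: drop stopwords from space-separated segmented text.'''
    with open("C:/Anaconda/stopworld.txt", 'r', encoding='utf-8') as f:
        stopwords = set(f.read().split())
    return ' '.join(w for w in seg.split() if w not in stopwords)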

def cut_package(news):
    '''
       Segment the text; jieba supports several cutting modes.
    '''
    seg_list = jieba.lcut(news, cut_all=False)   # precise mode (the default)
    return ' '.join(seg_list)

    # seg_list = jieba.cut(news, cut_all=True)         # full mode
    # print("Full Mode:", ' '.join(seg_list))

    # seg_list = jieba.cut_for_search(news)            # search-engine mode
    # print("Search Mode:", ' '.join(seg_list))

def find_(item):
    '''
        Keyword extraction via TF-IDF.
        withWeight: whether to return each keyword's weight as well (default False).
        allowPOS: restrict results to the given parts of speech (default empty, i.e. no filtering).
    '''
    tags = jieba.analyse.extract_tags(item, topK=10, withWeight=True)  # the 10 highest-weighted keywords
    words = []
    weights = []
    for word, weight in tags:
        words.append(word)
        weights.append(weight)
        print("weight of %s: %s" % (word, weight))
    return words, weights
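The docstring mentions allowPOS, but the call above leaves it empty. A minimal sketch of part-of-speech filtering, using jieba's standard tag set ('ns' place name, 'n' noun, 'vn' verbal noun, 'v' verb) on a hypothetical sample string:

tags = jieba.analyse.extract_tags("我爱北京天安门", topK=5,
                                  withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))
for word, weight in tags:
    print(word, weight)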

def make_pic(name, label, x):
    '''
        Draw a horizontal bar chart of the top keywords and their weights.
    '''
    plt.figure()                               # start a fresh figure on every call
    idx = np.arange(len(x))
    color = cm.jet(np.array(x) / max(x))
    plt.barh(idx, x, color=color)
    plt.yticks(idx, label)                     # barh centers bars on idx by default, so no +0.4 offset is needed
    plt.grid(axis='x')
    plt.xlabel('Weight (TF-IDF)')
    plt.ylabel('Keyword')
    plt.title('Top 10 keywords in the documents')
    # plt.show()
    plt.savefig("C:/python/web/%s.png" % name)  # no stray space in the filename
    plt.close()

def w_cloud(name, seg):
    '''
        Generate a word cloud from the segmented text, save it, and display it.
    '''
    # back_coloring = imread("G:/Anaconda/work/data/LZF.png")   # optional mask image (see the sketch below)
    # ima_graph = np.array(ima)
    wc = WordCloud(font_path="C:/Anaconda/font/GB2312.ttf",     # a CJK font is required for Chinese text
                   background_color="black", width=2300, height=1000, max_font_size=1000)
    wc.generate(seg)
    # imacolor = ImageColorGenerator(ima_graph)
    # wc.recolor(color_func=imacolor)
    wc.to_file("C:/python/%s.jpg" % name)       # no stray space in the filename
    plt.figure("Word Cloud")
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
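The commented-out lines above hint at recoloring the cloud from a mask image. A minimal sketch of that idea with wordcloud's ImageColorGenerator, assuming Pillow is installed and the mask path from the commented code exists (the output path is hypothetical):

from PIL import Image
from wordcloud import ImageColorGenerator

mask = np.array(Image.open("G:/Anaconda/work/data/LZF.png"))   # mask path from the commented-out code
wc = WordCloud(font_path="C:/Anaconda/font/GB2312.ttf",
               background_color="black", mask=mask)
wc.generate(seg)                                               # `seg` is the segmented text, as in w_cloud
wc.recolor(color_func=ImageColorGenerator(mask))
wc.to_file("C:/python/masked.jpg")                             # hypothetical output path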

def emo_pic(name, data):
    '''
        Draw a pie chart of negative vs. positive sentiment proportions.
    '''
    labels = 'Negative', 'Positive'
    fracs = data
    colors = ['yellow', 'red']
    explode = (0, 0.08)     # offset of the highlighted wedge
    plt.subplot(aspect=1)
    plt.pie(fracs, explode=explode, labels=labels, colors=colors, autopct='%.0f%%', shadow=True, radius=1)
    #                                   %.0f%% integer, %1.1f%% one decimal place
    plt.legend()
    plt.axis('equal')
    plt.savefig("C:/python/%s.png" % name)
    plt.show()
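Note that emo_pic is never called from main(). A hypothetical invocation, assuming negative/positive percentages computed elsewhere:

emo_pic("List1_emotion", [40, 60])   # hypothetical collection name and sentiment percentages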

def main():
    load_file()

if __name__ == '__main__':
    main()


Reposted from blog.csdn.net/luzaofa/article/details/79712341