import re

import jieba
import jieba.analyse  # TF-IDF keyword extraction (extract_tags)
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from pymongo import MongoClient
from wordcloud import WordCloud

# Configure matplotlib so Chinese labels and minus signs render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Compiled once at module level so they are not rebuilt per document.
# Strips full-width/half-width punctuation, then ASCII letters and digits.
_PUNCT_RE = re.compile('[’!@#~¥%……&*() ——+|}{“:”?》《,。、‘;’、】【!"#$%&\'()*+,-./:; <=>?@[\\]^_`{|}~]+')
_ALNUM_RE = re.compile('[a-zA-Z0-9]')


def load_file():
    '''
    Load the custom jieba dictionary, then for each of the five Taoguba
    collections (List1..List5): strip punctuation/ASCII alphanumerics from
    every document's "Content", remove stopword characters, segment with
    jieba, print per-document and per-collection keyword frequencies, and
    save a bar chart and a word cloud for the collection.
    '''
    jieba.load_userdict("C:/Lib/dict_lzf.txt")  # external user dictionary
    client = MongoClient('localhost', 27017)  # connect to MongoDB
    db = client['Taoguba']
    name = 'List'
    for col_idx in range(5):  # distinct name: inner loop no longer shadows it
        db_name = name + str(col_idx + 1)
        collection = db[db_name]
        segmented_docs = []  # space-joined segmentation of every document
        for doc in collection.find():
            content = doc["Content"]
            cleaned = _PUNCT_RE.sub('', content)
            cleaned = _ALNUM_RE.sub('', cleaned)
            filtered = stop_dict(cleaned)
            cut = cut_package(filtered)
            segmented_docs.append(cut)
            print("----每一篇词频发布如下-----")
            find_(str(cut))
        print("---------所有文章词频发布如下---------")
        lable, x = find_(str(segmented_docs))
        make_pic(db_name, lable, x)
        w_cloud(db_name, str(segmented_docs))


def stop_dict(news):
    '''
    Remove stopword characters from *news* and return the filtered text.

    NOTE(review): membership is tested per-CHARACTER against the raw file
    text, so any character occurring anywhere in the stopword file is
    stripped (not whole-word stopword removal). Preserved as-is to keep
    the original output; confirm whether word-level filtering was intended.
    '''
    # `with` guarantees the file handle is closed (was leaked before).
    with open("C:/Anaconda/stopworld.txt", 'r', encoding='utf-8') as f:
        stopwords = f.read()
    # join() avoids the quadratic `outstr += word` loop.
    return ''.join(ch for ch in news if ch not in stopwords)


def cut_package(news):
    '''
    Segment *news* with jieba in precise mode (the default) and return the
    tokens joined by single spaces.
    '''
    return ' '.join(jieba.cut(news, cut_all=False))
    # Alternative modes, kept for reference:
    #   jieba.cut(news, cut_all=True)   -> full mode
    #   jieba.cut_for_search(news)      -> search-engine mode


def find_(item):
    '''
    Extract the top-10 keywords of *item* via TF-IDF and print each
    keyword with its weight.

    withWeight=True returns (word, weight) pairs; allowPOS is left at its
    default (no part-of-speech filtering).

    Returns:
        (words, weights): two parallel lists of the 10 keywords and their
        TF-IDF weights.
    '''
    pairs = jieba.analyse.extract_tags(item, topK=10, withWeight=True)
    world = []
    weight = []
    for word, w in pairs:
        world.append(word)
        weight.append(w)
        print("%s 的词频为:%s" % (word, w))
    return world, weight


def make_pic(name, lable, x):
    '''
    Draw a horizontal bar chart of the top-10 word frequencies and save it
    as "C:/python/web/ <name>.png".

    Args:
        name: collection name used in the output filename.
        lable: y-axis labels (the keywords).
        x: matching frequencies/weights.
    '''
    idx = np.arange(len(x))
    # Color each bar by its relative frequency.
    color = cm.jet(np.array(x) / max(x))
    plt.barh(idx, x, color=color)
    plt.yticks(idx + 0.4, lable)
    plt.grid(axis='x')
    plt.xlabel('出现频率')
    plt.ylabel('标签')
    plt.title('文档中出现频率最高的前十个词')
    # plt.show()
    plt.savefig("C:/python/web/ %s.png" % name)  # was bare pylab savefig()


def w_cloud(name, seg):
    '''
    Render a word cloud from the space-separated segmentation *seg*, save
    it as "C:/python/ <name>.jpg", and display it.
    '''
    # Image-mask recoloring kept for reference:
    # back_coloring = imread("G:/Anaconda/work/data/LZF.png")
    # ima_graph = np.array(ima)
    wc = WordCloud(font_path="C:/Anaconda/font/GB2312.ttf",
                   background_color="black",
                   width=2300, height=1000, max_font_size=1000)
    wc.generate(seg)
    # imacolor = ImageColorGenerator(ima_graph)
    # wc.recolor(color_func=imacolor)
    wc.to_file(r"C:/python/ %s.jpg" % name)
    plt.figure("词云图")
    plt.imshow(wc)
    plt.axis("off")
    plt.show()


def emo_pic(name, data):
    '''
    Draw a negative/positive sentiment pie chart from *data* (a pair of
    fractions), save it as "C:/python/ <name>.png", and display it.
    '''
    labels = '消极', '积极'
    fracs = data
    colors = ['yellow', 'red']
    explode = (0, 0.08)  # offset of the second (positive) wedge
    plt.subplot(aspect=1)
    # '%.0f%%' -> integer percent; '%1.1f%%' would give one decimal place.
    plt.pie(fracs, explode=explode, labels=labels, colors=colors,
            autopct='%.0f%%', shadow=True, radius=1)
    plt.legend()
    plt.axis('equal')
    plt.savefig("C:/python/ %s.png" % name)  # was bare pylab savefig()
    plt.show()


def main():
    load_file()


if __name__ == '__main__':
    main()
WordCloud文本词云图
猜你喜欢
转载自blog.csdn.net/luzaofa/article/details/79712341
今日推荐
周排行