程序如下:
import codecs
import os
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud

try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread is gone from current SciPy releases; fall back to
    # matplotlib's reader, which also returns the image as an array.
    from matplotlib.pyplot import imread

os.chdir("/Users/Zhaohaibo/Desktop")


class Hlm(object):
    """Character-frequency, word-frequency and word-cloud helpers for a text.

    Relative paths ('stoplist.txt', 'aixin.jpg', 'ciyun.jpg' and the
    documents passed in) are resolved against the working directory set
    by the module-level ``os.chdir`` call.
    """

    def Zipin(self, readdoc, writedoc):
        """Print the 100 most frequent characters and write them to a file.

        Punctuation listed in ``exclude_str`` and all whitespace characters
        are left out of the count.

        Args:
            readdoc: Path of the text file to analyse (read as UTF-8).
            writedoc: Path of the output file; one ``char, count`` line is
                written per character, most frequent first.
        """
        exclude_str = ",。!?、()【】<>《》=:+-*—“”…"
        counts = Counter()
        with open(readdoc, "r", encoding="utf-8") as file_in:
            # Count line by line instead of first copying every character
            # of the file into a list.
            for line in file_in:
                counts.update(
                    char
                    for char in line
                    if char not in exclude_str and not char.isspace()
                )

        # Sorted by descending frequency; ties keep first-seen order.
        top_chars = counts.most_common(100)

        print('字符\t字频')
        print('=============')
        with open(writedoc, "w", encoding="utf-8") as file_out:
            for entry in top_chars:
                print('%s\t%d' % entry)
                file_out.write('%s, %d\n' % entry)

    def Cipin(self, doc):
        """Build a word-frequency table for *doc*.

        The text is segmented with ``jieba.cut``; every word listed in
        'stoplist.txt' (plus the single-space token) is then dropped.

        Args:
            doc: Path of the text file to analyse (read as UTF-8).

        Returns:
            A DataFrame with the columns '词' (word) and '词频' (count).
        """
        wdict = Counter()
        with open(doc, "r", encoding="utf-8") as text_file:
            for line in text_file:
                wdict.update(jieba.cut(line))

        # Load the stop-word list. `sep` is a string expected never to occur
        # in the list, so that every line is read as a single column.
        stop = pd.read_csv(
            'stoplist.txt',
            encoding='utf-8',
            sep='zhao',
            header=None,
            engine='python',
        )
        stop.columns = ['word']
        # A lone space is not picked up when the list is read, but it still
        # has to be removed from the counts, so it is added explicitly.
        for word in [' '] + list(stop.word):
            wdict.pop(word, None)

        return pd.DataFrame({
            '词': list(wdict.keys()),
            '词频': list(wdict.values()),
        })

    def Ciyun(self, doc):
        """Render a word cloud of *doc*, save it as 'ciyun.jpg' and show it.

        Args:
            doc: Path of the text file to visualise (read as UTF-8).
        """
        with open(doc, "r", encoding="utf-8") as text_file:
            text = text_file.read()

        # NOTE(review): the raw text is handed to WordCloud without jieba
        # segmentation - confirm this is intended for Chinese input.
        back_pic = imread("aixin.jpg")  # image used as the cloud mask
        wc = WordCloud(
            font_path='/System/Library/Fonts/STHeiti Medium.ttc',  # font file
            background_color="white",
            max_words=2000,  # maximum number of words drawn
            mask=back_pic,
            max_font_size=200,
            random_state=42,
        ).generate(text)

        plt.figure(figsize=(64, 32))
        plt.imshow(wc)
        plt.axis('off')
        plt.savefig("ciyun.jpg")
        plt.show()

    def main(self, readdoc):
        """Return the word-frequency DataFrame for *readdoc*.

        Only ``Cipin`` is run here; ``Zipin`` and ``Ciyun`` have to be
        called separately.
        """
        return self.Cipin(readdoc)


if __name__ == '__main__':
    hlm = Hlm()
    hlm.Zipin("红楼梦.txt", "红楼梦字频.txt")
    df_hlm1 = hlm.main("红楼梦.txt")
效果图如下: