# WeChat friends' signature word-cloud project
# 2018-01-28 19:57:38 January Sunday the 04 week, the 028 day SZ SSMR
#
# Flow of the script:
#   1. log in to web WeChat and fetch the friend list,
#   2. print the gender ratio of the friends,
#   3. dump nickname / sex / province / city / signature to a CSV file,
#   4. clean and segment all signatures, drop stop words, count word
#      frequencies and render them as a word cloud.

from urllib import request  # library for fetching web page data
from bs4 import BeautifulSoup as bs  # HTML parsing
import re  # regular expressions
import jieba  # Chinese word segmentation
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import matplotlib
import numpy  # numerical package
from wordcloud import WordCloud
import itchat

# ---------------- Fetch all WeChat friend information ----------------
# Log in to the web version of WeChat.
itchat.login()
friends = itchat.get_friends(update=True)[0:]
# print("friends are:", friends[2])  # inspect one record to see its layout


# ---------------- Gender ratio of friends ----------------
def _percent(count, whole):
    """Return *count* as a percentage of *whole*; 0.0 when *whole* is 0."""
    return float(count) / whole * 100 if whole else 0.0


male = female = other = 0
# friends[0] is the logged-in user's own record, so start at friends[1].
for friend in friends[1:]:
    sex = friend["Sex"]
    if sex == 1:
        male += 1
    elif sex == 2:  # 2 means female
        female += 1
    else:  # anything else: sex not filled in
        other += 1

# Total number of friends (own record excluded).
total = len(friends[1:])
# _percent() guards against ZeroDivisionError for an account without friends.
print(
    "小哥哥比例:%.2f%%" % _percent(male, total)
    + "\n"
    + "小仙女比例:%.2f%%" % _percent(female, total)
    + "\n"
    + "未知比例:%.2f%%" % _percent(other, total)
)


# ---------------- Per-friend attributes -> CSV ----------------
def get_var(var):
    """Return the value stored under key *var* for every entry of ``friends``.

    The module-level ``friends`` list is read as-is, so the first element
    (the logged-in user's own record) is included.

    Raises:
        KeyError: if a record has no key *var*.
    """
    return [friend[var] for friend in friends]


# Collect the individual columns and save them as a CSV file.
NickName = get_var("NickName")
Sex = get_var("Sex")
Province = get_var("Province")
City = get_var("City")
Signature = get_var("Signature")

data = {
    'NickName': NickName,
    'Sex': Sex,
    'Province': Province,
    'City': City,
    'Signature': Signature,
}
frame = DataFrame(data)
frame.to_csv('D:/ST/Python_work/data.csv', index=True)


# ---------------- Word cloud of friends' signatures ----------------
# Compiled once, outside the loop. Matches the "1f…" emoji code points left
# over from the emoji markup plus the markup characters < > / =.
# (The previous pattern contained literal spaces and never matched the codes.)
rep = re.compile(r"1f\d+\w*|[<>/=]")

siglist = []
for friend in friends:
    # Trim surrounding whitespace and drop the words "span", "class" and
    # "emoji" that appear in the emoji markup embedded in signatures.
    signature = (
        friend["Signature"]
        .strip()
        .replace("span", "")
        .replace("class", "")
        .replace("emoji", "")
    )
    signature = rep.sub("", signature)
    siglist.append(signature)

# One long string holding every cleaned signature.
text = "".join(siglist)

# Segment the text with jieba (full mode). The tokens are joined with a space
# so that the word boundaries found by jieba survive in the resulting string.
wordlist = jieba.cut(text, cut_all=True)
word_space_split = " ".join(wordlist)

# lcut() returns the segmentation as a list; whitespace-only tokens (the
# separators inserted above) carry no meaning and are dropped.
segment = [word for word in jieba.lcut(word_space_split) if word.strip()]

# One row per token, ready for frequency counting.
words_df = pd.DataFrame({'segment': segment})

# Remove stop words (very frequent words without analytical value) listed
# one per line in the stop-word file.
stopwords = pd.read_csv(
    "D:\\ST\\Python_work\\stopwords.txt",
    index_col=False,
    quoting=3,
    sep="\t",
    names=['stopword'],
    encoding='utf-8',
)
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

# Word frequencies, most frequent first. size() + reset_index(name=...) is
# used instead of agg({"计数": numpy.size}): renaming through a dict in
# SeriesGroupBy.agg raises SpecificationError on current pandas versions.
words_stat = words_df.groupby(by=['segment'])['segment'].size().reset_index(name="计数")
words_stat = words_stat.sort_values(by=["计数"], ascending=False)

# Render the cloud; simhei.ttf is a font that contains Chinese glyphs.
wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)
# fit_words() takes a {word: frequency} dict; use the 1000 most frequent words.
word_frequence = {word: count for word, count in words_stat.head(1000).values}
wordcloud = wordcloud.fit_words(word_frequence)

plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# NOTE: the original script ended with a disabled (string-quoted) variant that
# built the cloud straight from the segmented text via
#   WordCloud(background_color="white", max_words=2000, max_font_size=60,
#             random_state=42, scale=2,
#             font_path="D:/ST/Python_work/SimHei.ttf").generate(word_space_split)
# optionally recoloured from an image mask (ImageColorGenerator), followed by
# a copy of the fit_words() rendering above with the same absolute font path.
# Python实现微信朋友签名云图项目 (WeChat friends' signature word cloud, implemented in Python)
# 猜你喜欢
# 转载自 blog.csdn.net/btujack/article/details/80717268
# 今日推荐
# 周排行