# Python实现微信朋友签名云图项目

#微信朋友签名云图项目
#2018-01-28 19:57:38 January Sunday the 04 week, the 028 day SZ SSMR
################获取微信朋友所有信息#################
#登陆网页版微信


from urllib import request				#request 是抓取网页数据的库
from bs4 import BeautifulSoup as bs	#beautifulsoup库对html代码进行解析
import re  							#引入正则表达式
import jieba						#分词包库jieba,可以将中文语句拆解成一个个的词汇。
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib
import numpy    #numpy计算包

from wordcloud import WordCloud 	


import itchat
# Log in to web WeChat and fetch the complete contact list.
itchat.login()
friends = itchat.get_friends(update=True)[0:]
# print("friends are:", friends[2])  # inspect the record format before processing

# Tally the gender split among friends.
# friends[0] is the logged-in account itself, so counting starts at friends[1].
male = female = other = 0
for i in friends[1:]:
    sex = i["Sex"]
    if sex == 1:  # 1 = male
        male += 1
    elif sex == 2:  # 2 = female
        female += 1
    else:  # any other value: gender not filled in
        other += 1

# Total number of friends (self excluded).
total = len(friends[1:])
# An account without any friends gives total == 0, which would raise
# ZeroDivisionError below; divide by 1 instead so every share prints as 0.00%.
denominator = total if total else 1
print("小哥哥比例:%.2f%%"%(float(male)/denominator * 100) + "\n" + "小仙女比例:%.2f%%"%(float(female)/denominator * 100) + "\n" + "未知比例:%.2f%%"%(float(other)/denominator * 100))

		
##########分析好友城市分布###################
#定义函数,爬取不同变量
def get_var(var):
    """Collect the value stored under key ``var`` from every entry in ``friends``.

    Args:
        var: Key to look up in each friend record (for example "NickName").

    Returns:
        A list holding one value per entry of the module-level ``friends``
        list, in the same order.

    Raises:
        KeyError: If an entry has no ``var`` key.
    """
    return [friend[var] for friend in friends]

#调用上面函数得到不同的变量信息,并且保存到CSV文件里,保存到桌面
# Pull each profile field out of the friend records via get_var() and write
# them side by side into one CSV file.
from pandas import DataFrame

data = {
    field: get_var(field)
    for field in ('NickName', 'Sex', 'Province', 'City', 'Signature')
}
# Keep the per-field lists available under their own module-level names.
NickName = data['NickName']
Sex = data['Sex']
Province = data['Province']
City = data['City']
Signature = data['Signature']

frame = DataFrame(data)
frame.to_csv('D:/ST/Python_work/data.csv', index=True)




###############微信好友个性签名的自定义云图#################
import re  # regular-expression module

# Removes emoji codes of the form "1f" + digits + word characters (e.g. "1f60a")
# and the characters < > / =.
# The previous pattern was "1f\d + \w*": the literal spaces meant it only
# matched when spaces followed the digit, so the emoji codes were never removed.
# Compiled once here instead of on every loop iteration.
rep = re.compile(r"1f\d+\w*|[<>/=]")

siglist = []
for i in friends:
    # Take each signature, trim surrounding whitespace and drop the words
    # "span", "class" and "emoji" that come with embedded emoji markup.
    signature = i["Signature"].strip().replace("span", "").replace("class", "").replace("emoji", "")
    signature = rep.sub("", signature)
    siglist.append(signature)  # collect every cleaned signature
text = "".join(siglist)  # merge the list into one string

# Feed the merged text to the jieba segmenter (full mode).
import jieba
wordlist = jieba.cut(text, cut_all=True)
# NOTE(review): despite the name, the tokens are joined without a separator;
# the result is segmented again further down, so this is left unchanged.
word_space_split = "".join(wordlist)


#清除数据中的标点符号。正则表达式。短小精悍的一个模式[\u4e00-\u9fa5]+即可匹配。将非中文字符彻底清理
#pattern = re.compile(r'[\u4e00-\u9fa5]+')
#filterdata = re.findall(pattern, comments)
#cleaned_comments = ''.join(filterdata) #放入一个字符串中,成为一个字符串
#print(cleaned_comments)


#这里是用的是lcut()方法,能将中文字符串拆解成一个列表,每项都是一个词。
# lcut() splits the string into a list in which every item is one word.
segment = jieba.lcut(word_space_split)
# print(segment)

# Put the words into a DataFrame so they can be filtered and counted.
words_df = pd.DataFrame({'segment': segment})
# print(words_df.head())

# Drop stop words (frequent function words without analytical value) using a
# stop-word file that lists one word per line.
# quoting=3 is csv.QUOTE_NONE, so quote characters in the file are kept literally.
stopwords = pd.read_csv("D:\\ST\\Python_work\\stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf-8')
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
# print(words_df)

# Count how often each word occurs.
# The earlier .agg({"计数": numpy.size}) relied on dict-based renaming, which
# pandas 1.0 removed (it raises SpecificationError). size() followed by
# reset_index(name=...) yields the same two columns: 'segment' and '计数'.
words_stat = words_df.groupby(by=['segment']).size().reset_index(name="计数")
# Most frequent words first.
words_stat = words_stat.sort_values(by=["计数"], ascending=False)
# print(words_stat)
#print(words_stat)

#第三阶段:用词云显示效果,simhei.ttf字符格式,类似宋体之类的
# Configure the renderer; simhei.ttf is the font file used to draw the words.
cloud_renderer = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)

# word_frequence maps word -> count for the first 1000 rows of words_stat;
# a dict like this can be handed straight to fit_words().
word_frequence = {}
for row in words_stat.head(1000).values:
    word_frequence[row[0]] = row[1]

wordcloud = cloud_renderer.fit_words(word_frequence)

# Show the generated cloud without axes.
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

'''
###########画云图########################
import matplotlib.pyplot as plt 
from wordcloud import WordCloud, ImageColorGenerator
import numpy as np 
import PIL.Image as Image 

#coloring = np.array(Image.open("D:\\ST\\Python_work\\liu.jpg"))
my_wordcloud = WordCloud(background_color = "white", max_words = 2000,  max_font_size = 60,random_state = 42,
				scale = 2, font_path = "D:/ST/Python_work/SimHei.ttf").generate(word_space_split) #mask = coloring,
#image_colors = ImageColorGenerator(coloring)
#plt.imshow(my_wordcloud.recolor(color_func = image_colors))
plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()



#第三阶段:用词云显示效果,simhei.ttf字符格式,类似宋体之类的
wordcloud = WordCloud(font_path = "D:/ST/Python_work/SimHei.ttf", background_color = "white", max_font_size = 80)		#设置字体属性
#word_frequence 为字典类型,可以直接传入wordcloud.fit_words()
word_frequence = {x[0]:x[1] for x in words_stat.head(1000).values}

wordcloud = wordcloud.fit_words(word_frequence)

plt.imshow(wordcloud)

plt.axis("off")
plt.show()
'''

# 猜你喜欢
#
# 转载自blog.csdn.net/btujack/article/details/80717268