Python数据挖掘之用图挖掘同样喜欢梅西的人

前言:
本次同样在twitter上进行数据挖掘,主题是构建图,计算相似度,找到同样喜欢梅西的人,之后可以进行好友推荐,或者用户建群等。
编辑工具:anaconda 的jupyter notebook,所以会直接在代码后面跟上输出。
数据集:如果不想麻烦的弄数据的话,可以跳过下面获取数据步骤,直接使用文末数据,导入即可。

获取Twitter好友数据

写入授权令牌

import twitter
# OAuth credentials for the Twitter API; replace every placeholder with your own value.
consumer_key = "填入自己的授权令牌"
consumer_secret = "填入自己的授权令牌"
access_token = "填入自己的授权令牌"
access_token_secret = "填入自己的授权令牌"
# Passed in this order: access token, access token secret, consumer key, consumer secret.
authorization = twitter.OAuth(access_token,access_token_secret,consumer_key,consumer_secret)
# API client object; retry=True presumably makes the client retry failed calls -- confirm in the library docs.
t = twitter.Twitter(auth=authorization,retry=True)

获取用户数据

import os
# Raw string: the Windows path contains backslash pairs (\D, \P, \d, \c) that
# are invalid escape sequences in an ordinary string literal.
data_folder = r"E:\DataMining\Project\dataming_with_python\chapter_7用图挖掘到感兴趣的人"
output_filename = os.path.join(data_folder, "python_tweets.json")
import json
original_users = []   # screen names of the authors of the matching tweets
tweets = []           # tweet texts
user_ids = {}         # screen name -> numeric user id
search_result = t.search.tweets(q="Lionel Messi", count=60)['statuses']
for tweet in search_result:
    # Only keep statuses that actually carry a text field.
    if 'text' in tweet:
        author = tweet['user']
        original_users.append(author['screen_name'])
        user_ids[author['screen_name']] = author['id']
        tweets.append(tweet['text'])
len(tweets)
60

获取Twitter好友信息,这里假设发过含"Lionel Messi"推文的用户都是喜欢梅西的

import time
#创建函数
def get_friends(t, user_id, pause=60, rate_limit_wait=5 * 60, max_friends=10000):
    """Return the ids of the accounts that ``user_id`` follows.

    Pages through ``t.friends.ids`` with a cursor, starting at -1 and stopping
    when the returned ``next_cursor`` is 0.

    Args:
        t: API client exposing ``friends.ids(user_id=..., cursor=..., count=...)``.
        user_id: id of the user whose friends are fetched.
        pause: seconds to sleep after every request.
        rate_limit_wait: extra seconds to sleep when the call returns ``None``,
            which is treated as "rate limit reached".
        max_friends: stop paging once more than this many ids are collected.

    Returns:
        List of friend ids; partial if an HTTP error ended paging early.
    """
    friends = []
    cursor = -1
    while cursor != 0:
        try:
            results = t.friends.ids(user_id=user_id, cursor=cursor, count=5000)
            if results is None:
                # Checked explicitly instead of catching the TypeError that
                # subscripting None would raise; flush=True replaces the
                # sys.stdout.flush() call, which needed an import of sys
                # that this file never performs.
                print('API次数限制,等待五分钟', flush=True)
                time.sleep(rate_limit_wait)
                continue  # retry the same cursor (the finally pause still runs)
            friends.extend(results['ids'])
            cursor = results['next_cursor']
            if len(friends) > max_friends:
                break
        except twitter.TwitterHTTPError:
            # The request failed; return whatever was collected so far.
            break
        finally:
            # Runs after every request, including the last one and on break.
            time.sleep(pause)
    return friends

构建网络

# Fetch the friend list of every author found by the search and store it in a
# dict keyed by the numeric user id looked up in user_ids.
friends = {}
for screen_name in original_users:
    user_id = user_ids[screen_name]
    if user_id in friends:
        # The same author can appear in several tweets; every fetch sleeps for
        # at least a minute, so do not repeat it.
        continue
    print("Obtaining friends for user {}".format(screen_name))
    friends[user_id] = get_friends(t,user_id)
# Drop users for whom no friends were returned.
friends = {user_id: friend_list for user_id, friend_list in friends.items() if len(friend_list) > 0}
Obtaining friends for user chukwudip6
Obtaining friends for user masrinez
Obtaining friends for user DaviidCrowe
Obtaining friends for user ViscaParedes
Obtaining friends for user olimpisima
Obtaining friends for user BumpyOth
Obtaining friends for user MaxAriguznaga
Obtaining friends for user bro_arsenal
Obtaining friends for user raul_victor7
Obtaining friends for user BRAHIANDANIEL7
Obtaining friends for user verobrunati
Obtaining friends for user RikrdoBjarano
Obtaining friends for user alfred_sinclair
Obtaining friends for user LEOWHOELSE
Obtaining friends for user Ferraro_Mariano
Obtaining friends for user Elian_x
Obtaining friends for user Josuemanuel1995
Obtaining friends for user Roberto_Leigh
Obtaining friends for user Warra_Kwena
Obtaining friends for user danielbrunel_90
Obtaining friends for user mati_silva9
Obtaining friends for user Brenndaduran
Obtaining friends for user LadySunshine81
Obtaining friends for user sgygustavo
Obtaining friends for user mdgma
Obtaining friends for user Mademba_dio
Obtaining friends for user iz_BEBA
Obtaining friends for user darioabm_
Obtaining friends for user danielkstlo
Obtaining friends for user sinyakitagawa
Obtaining friends for user beautymariel099
Obtaining friends for user kikesol_sol
Obtaining friends for user ktee_boi
Obtaining friends for user HamadouJr
Obtaining friends for user Xuscruz
Obtaining friends for user maraco44
Obtaining friends for user elpollo_35
Obtaining friends for user nico11torales
Obtaining friends for user febcrubre
Obtaining friends for user aarroyo0715
Obtaining friends for user BencherkiMeryem
Obtaining friends for user aguilar_d26
Obtaining friends for user JavierZuniga_
Obtaining friends for user GeorgeNjideka
Obtaining friends for user robertyedrarmcf
Obtaining friends for user fedepardo98
Obtaining friends for user 01_Unstopable
Obtaining friends for user lautarocejas22
Obtaining friends for user Romary2
Obtaining friends for user ALXNG7
Obtaining friends for user gudade_zaid
Obtaining friends for user TomiiHuracan7
Obtaining friends for user camilofabre
Obtaining friends for user CampeonHuracan
Obtaining friends for user PapiMiyagui
Obtaining friends for user maidenrokko
Obtaining friends for user SharjeelDrawing
Obtaining friends for user Carpio_Marca
Obtaining friends for user matiasmarcoz
Obtaining friends for user DerlisMontiel
len(friends)
60

遍历用户好友列表,统计每个好友的出现次数

from collections import defaultdict
def count_friends(friends):
    """Count how many of the given users follow each account.

    ``friends`` maps a user to an iterable of the ids that user follows.
    Returns a ``defaultdict(int)`` mapping each followed id to the number of
    lists it appears in (counting repeats within a list).
    """
    tally = defaultdict(int)
    for owner in friends:
        for followed in friends[owner]:
            tally[followed] = tally[followed] + 1
    return tally
#排序
friend_count = count_friends(friends)
len(friend_count)
52144
from operator import itemgetter
best_friends = sorted(friend_count.items(),key=itemgetter(1),reverse=True)
len(best_friends)
52144
# Expand the network: walk the accounts from most to least frequently followed
# and fetch their friend lists until friends holds 150 users.
for user_id, count in best_friends:
    # ">=" instead of "==": if friends already holds more than 150 users (for
    # example when this cell is run again) an equality test never fires and
    # the loop keeps fetching. Checked first so no extra fetch is made.
    if len(friends) >= 150:
        break
    if user_id in friends:
        continue
    friends[user_id] = get_friends(t, user_id)
    print("Received {} friends".format(len(friends[user_id])))
    print("We now have the friends of {} users".format(len(friends)))
    # Keep the popularity counts up to date with the newly fetched list.
    for friend in friends[user_id]:
        friend_count[friend] += 1
best_friends = sorted(friend_count.items(), key=itemgetter(1), reverse=True)
Received 961 friends
We now have the friends of 240 users



---------------------------------------------------------------------------

KeyboardInterrupt                         Traceback (most recent call last)

<ipython-input-24-02eef337e079> in <module>()
      2     if user_id in friends:
      3         continue
----> 4     friends[user_id] = get_friends(t,user_id)
      5     print("Received {} friends".format(len(friends[user_id])))
      6     print("We now have the friends of {} users".format(len(friends)))


<ipython-input-4-c79a8739fbda> in get_friends(t, user_id)
     22             break
     23         finally:
---> 24             time.sleep(60)
     25     return friends


KeyboardInterrupt: 
len(friends)
240

保存到json文件

import json
# Write the user -> friend-list mapping to python_friends.json in data_folder.
friends_filename = os.path.join(data_folder, "python_friends.json")
with open(friends_filename, 'w') as outf:
    outf.write(json.dumps(friends))

创建图

import networkx as nx
# Directed graph whose nodes are the core users, i.e. the keys of friends.
G = nx.DiGraph()
main_users = friends.keys()
G.add_nodes_from(main_users)
# One edge user -> friend for every followed account that is itself a core user.
G.add_edges_from(
    (user_id, friend)
    for user_id in friends
    for friend in friends[user_id]
    if friend in main_users
)
#将图绘制成图像
%matplotlib inline
nx.draw(G)

这里写图片描述

创建用户相似图

如果两个用户有共同的好友,那么这两个用户相似度很高。边的权重就是相似度(共同好友数除以两人好友并集的大小),权重越大,相似度越高

import os
import json
# Raw string: the Windows path contains backslash pairs (\D, \P, \d, \c) that
# are invalid escape sequences in an ordinary string literal.
data_folder = r"E:\DataMining\Project\dataming_with_python\chapter_7用图挖掘到感兴趣的人"
friends_filename = os.path.join(data_folder, "python_friends.json")
with open(friends_filename) as inf:
    friends = json.load(inf)
import networkx as nx
# Turn every friend list into a set so the similarity function can use set
# intersection and union.
friends = {user: set(friends[user]) for user in friends}
#函数计算相似度
def compute_similarity(friends1, friends2):
    """Return the Jaccard similarity of two friend sets, a value in [0, 1].

    The result is ``len(intersection) / len(union)``. Two empty sets share no
    friends, so 0.0 is returned instead of dividing by zero.
    """
    union = friends1 | friends2
    if not union:
        return 0.0
    return len(friends1 & friends2) / len(union)

# Build the weighted user-similarity graph.
def create_graph(followers, threshold=0):
    """Return an undirected graph linking users with similar friend sets.

    Args:
        followers: mapping of user -> set of friend ids. This argument is now
            actually used; previously the body read the global ``friends``
            and ignored whatever was passed in.
        threshold: minimum similarity for an edge to be added.

    Returns:
        ``nx.Graph`` containing an edge (with a ``weight`` attribute holding
        the similarity) for every pair whose similarity is >= ``threshold``.
        Users with no qualifying pair are not in the graph.
    """
    G = nx.Graph()
    users = list(followers)
    # The graph is undirected, so each unordered pair is visited only once.
    for idx, user1 in enumerate(users):
        for user2 in users[idx + 1:]:
            weight = compute_similarity(followers[user1], followers[user2])
            if weight >= threshold:
                # add_edge also creates both end nodes when they are missing.
                G.add_edge(user1, user2, weight=weight)
    return G
%matplotlib inline
#生成图
import matplotlib.pyplot as plt
G = create_graph(friends)
# Enlarge the figure.
plt.figure(figsize=(20, 20))
# Position the nodes with networkx's spring layout.
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos)
# Use each edge's similarity weight as its line width.
edgewidth = [d['weight'] for (u, v, d) in G.edges(data=True)]
nx.draw_networkx_edges(G, pos, width=edgewidth)
# The former trailing nx.draw(G) was dropped: it was called without pos, so it
# drew the whole graph a second time with a different layout on the same axes.

这里写图片描述

分析:
240个用户中,大部分的相似度还是很高的,只有少部分几个游离。

寻找子图,聚类分析

找到图中的连通分支:同一连通分支内的任意两点之间,至少存在一条路径

G = create_graph(friends, 0.1)       # keep only edges with similarity >= 0.1
# Connected components as subgraphs. nx.connected_component_subgraphs was
# removed from NetworkX, so build the subgraphs from connected_components.
sub_graphs = (G.subgraph(component) for component in nx.connected_components(G))
# Print the size of every component.
for i, sub_graph in enumerate(sub_graphs):
    n_nodes = len(sub_graph.nodes())
    print("Subgraph {0} has {1} nodes".format(i, n_nodes))
Subgraph 0 has 62 nodes
Subgraph 1 has 3 nodes
Subgraph 2 has 9 nodes
Subgraph 3 has 4 nodes
Subgraph 4 has 2 nodes
Subgraph 5 has 2 nodes
Subgraph 6 has 2 nodes
Subgraph 7 has 4 nodes
Subgraph 8 has 2 nodes
Subgraph 9 has 2 nodes
Subgraph 10 has 3 nodes
Subgraph 11 has 2 nodes
Subgraph 12 has 2 nodes
Subgraph 13 has 2 nodes
Subgraph 14 has 2 nodes
# Connected components (as a generator of subgraphs) and their count.
# nx.connected_component_subgraphs was removed from NetworkX, so build the
# subgraphs from connected_components.
sub_graphs = (G.subgraph(component) for component in nx.connected_components(G))
n_subgraphs = nx.number_connected_components(G)
n_cols = 3
# Round up: int(n_subgraphs / 3) rows gives too few subplot slots whenever the
# count is not a multiple of 3, and add_subplot then fails.
n_rows = (n_subgraphs + n_cols - 1) // n_cols
fig = plt.figure(figsize=(20, (n_subgraphs * 3)))
# Lay out the whole graph once and reuse the positions in every panel.
pos = nx.spring_layout(G)
for i, sub_graph in enumerate(sub_graphs):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    # Draw only this component's nodes and edges.
    nx.draw_networkx_nodes(G, pos, nodelist=list(sub_graph.nodes()), ax=ax, node_size=500)
    nx.draw_networkx_edges(G, pos, edgelist=list(sub_graph.edges()), ax=ax)

这里写图片描述

优化参数选取准则,选择合适的最低权重

import numpy as np
from sklearn.metrics import silhouette_score 
from scipy.optimize import minimize
def compute_silhouette(threshold, friends):
    """Score the clustering induced by ``threshold`` with the silhouette coefficient.

    A similarity graph is built with ``create_graph``; its connected
    components are the clusters.

    Args:
        threshold: minimum similarity for an edge in the graph.
        friends: mapping of user -> set of friend ids.

    Returns:
        The silhouette score, or -99 when the graph is empty or the number of
        components is outside ``2 <= k < n_nodes - 1``.
    """
    G = create_graph(friends, threshold=threshold)
    n_nodes = len(G.nodes())
    if n_nodes == 0:
        return -99
    # connected_components replaces connected_component_subgraphs, which was
    # removed from NetworkX; only the node sets are needed for labelling.
    components = list(nx.connected_components(G))
    if not (2 <= len(components) < n_nodes - 1):
        return -99
    label_dict = {
        node: i
        for i, component in enumerate(components)
        for node in component
    }
    labels = np.array([label_dict[node] for node in G.nodes()])
    # Dense similarity matrix in G.nodes() order; missing edges are 0.
    # to_numpy_array replaces to_scipy_sparse_matrix(G).todense().
    X = np.asarray(nx.to_numpy_array(G), dtype=float)
    # Turn similarity into distance: similarity 1 -> distance 0.
    X = 1 - X
    # A node is at distance 0 from itself. The adjacency diagonal is 0, so
    # "1 - X" left 1 there, which a precomputed distance matrix must not have.
    np.fill_diagonal(X, 0)
    return silhouette_score(X, labels, metric='precomputed')
print(compute_silhouette(0.1, friends))
-0.1099057806366541

打分函数,值越大效果越好;损失函数,值越小效果越好

#轮廓系数取反,打分函数变为损失函数
def invert(func):
    """Wrap ``func`` so that the wrapper returns the negated result.

    Used to turn a score (higher is better) into a loss (lower is better).
    All positional and keyword arguments are forwarded unchanged.
    """
    def inverted_function(*positional, **named):
        outcome = func(*positional, **named)
        return -outcome
    return inverted_function
#调用minimize()优化操作
result = minimize(invert(compute_silhouette), 0.1, method='nelder-mead', args=(friends,), options={'maxiter':10, })
print(result)
 final_simplex: (array([[0.11125   ],
       [0.11132813]]), array([0.08167987, 0.08167987]))
           fun: 0.08167987420556581
       message: 'Maximum number of iterations has been exceeded.'
          nfev: 22
           nit: 10
        status: 2
       success: False
             x: array([0.11125])

使用上面的最优参数

# Rebuild the graph with the threshold found by the optimiser.
G = create_graph(friends, threshold=0.11125)
# nx.connected_component_subgraphs was removed from NetworkX, so build the
# subgraphs from connected_components.
sub_graphs = (G.subgraph(component) for component in nx.connected_components(G))

for i, sub_graph in enumerate(sub_graphs):
    n_nodes = len(sub_graph.nodes())
    print("Subgraph {0} has {1} nodes".format(i, n_nodes))
Subgraph 0 has 7 nodes
Subgraph 1 has 4 nodes
Subgraph 2 has 54 nodes
Subgraph 3 has 2 nodes
Subgraph 4 has 2 nodes
Subgraph 5 has 4 nodes
Subgraph 6 has 2 nodes
Subgraph 7 has 2 nodes
Subgraph 8 has 2 nodes
Subgraph 9 has 2 nodes

最后:
有收获的话点个赞呗

链接:https://pan.baidu.com/s/1-BCilBZGTCirK02soK0DhQ 密码:jfmb

———关注我的公众号,一起学数据挖掘————
这里写图片描述

猜你喜欢

转载自blog.csdn.net/crozonkdd/article/details/80503485