The Only Python Tutorial on the Web for Scraping Bilibili, Weibo, Xiaohongshu, and Douyin Follower Counts!

First, let me introduce BusinessTool.exe, a scraping tool I wrote myself.

  • Usage:
  • Simply set the matching type and url in config.txt (a sample config follows this list).
  • Only these two fields need to be changed.
  • url is the homepage URL of the Weibo blogger you want to scrape, or a Douyin, Xiaohongshu, or Bilibili link.
  • type is the matching link type: 1: Weibo, 2: Douyin, 3: Kuaishou, 4: Bilibili, 5: Xiaohongshu.
  • Once configured, run BusinessTool.exe and it will scrape the corresponding follower counts.
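Judging from the ReadFromConfig function in the sample code below, each file in the config/ folder should hold a list of entries with url, type, and uid keys. A minimal sample config (the URLs and uid here are placeholders, not real accounts):

[
    {"url": "https://weibo.com/u/1234567890", "type": "1", "uid": "130"},
    {"url": "https://space.bilibili.com/123456/", "type": "4", "uid": "130"}
]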

Sample code

Here is the code I spent a whole day writing, given away to everyone for free.

import requests
from bs4 import BeautifulSoup
import json
from fontTools.ttLib import TTFont
import re
import os


# Global constants for the supported platform types
G_WEIBO = '1'
G_DOUYIN = '2'
G_KUAISHOU = '3'
G_BZHAN = '4'
G_XIAOHONGSHU = '5'

LOGIN_COOKIES = 'SINAGLOBAL=522519899039.0867.1574345484645; UM_distinctid=16f5c2b0fd5388-06657092a336d8-6701b35-1fa400-16f5c2b0fd621e; SUHB=02Meot-9jOoPJy; ALF=1621327072; SUB=_2AkMploOff8NxqwJRmP4Qzm3ja4p3zA_EieKfynJEJRMxHRl-yT9kqhAJtRB6AhatcJAakTfQ4KKZBahMLkPKCqGmy6qa; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WhZH_vEymdzQfyYRooMOfCs; UOR=,,news.ifeng.com; YF-Page-G0=913e50d6fa3a3406e80cc7f737d4352f|1590646466|1590646466; _s_tentry=-; Apache=7843672000587.247.1590646468304; ULV=1590646468380:10:6:3:7843672000587.247.1590646468304:1590636352909; YF-V5-G0=4e19e5a0c5563f06026c6591dbc8029f'
# Parse the logged-in Weibo session cookie string above into a dict;
# strip whitespace and split on the first '=' only, since values may contain '='
cookies2 = dict(map(lambda x: x.strip().split('=', 1), LOGIN_COOKIES.split(";")))

# Scrape Douyin: the page obfuscates digits with a custom webfont
# (saved locally here as 111.woff), so load the font and build the
# glyph mappings needed to decode the numbers
ttfont = TTFont('111.woff')

best_cmap = ttfont['cmap'].getBestCmap()


def get_best_cmap():
    '''
    Build the codepoint-to-glyph-name mapping table.
    :return: dict mapping hex codepoint strings (e.g. '0xe602') to glyph names
    '''
    new_best_cmap = {}
    for key, value in best_cmap.items():
        new_best_cmap[hex(key)] = value
    return new_best_cmap

def get_num_cmap():
    # Glyph-name-to-real-digit table for the obfuscation font
    num_map = {
        "x": "", "num_": "1", "num_1": "0",
        "num_2": "3", "num_3": "2",
        "num_4": "4", "num_5": "5",
        "num_6": "6", "num_7": "9",
        "num_8": "7", "num_9": "8",
    }
    return num_map


def get_html(url):
    # Fetch the raw HTML for a URL with a desktop browser User-Agent
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
    response = requests.get(url=url, headers=headers).text
    return response

def replace_num_and_cmap(result, response):
    # Rewrite every obfuscated entity (e.g. '&#xe602;') in the HTML to its real digit
    for key, value in result.items():
        if key in response:
            response = response.replace(key, value)
    return response

# Save the decoded page to disk (useful for debugging)
def save_to_file(response):
    with open('douyin.html', 'w', encoding='utf-8') as fp:
        fp.write(response)


def map_cmap_num(get_best_cmap, get_num_cmap):
    # Compose the two tables: hex codepoint -> glyph name -> real digit,
    # rewriting '0xe602'-style keys into the '&#xe602;' entity form found in the HTML
    result = {}
    for key, value in get_best_cmap().items():
        key = re.sub('0', '&#', key, count=1) + ';'
        result[key] = get_num_cmap()[value]
    return result
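
# A minimal sketch of the decode pipeline, assuming the font maps
# codepoint 0xe602 to the glyph named 'num_3' (values are illustrative):
#   get_best_cmap()                           -> {'0xe602': 'num_3'}
#   map_cmap_num(get_best_cmap, get_num_cmap) -> {'&#xe602;': '2'}
#   replace_num_and_cmap then turns every '&#xe602;' in the page into '2'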

def ScrapyDouYin(url, uid):
    '''
    Scrape a Douyin follower count.
    :param url: profile URL
    :param uid: account id passed through to PostReq
    :return:
    '''
    result = map_cmap_num(get_best_cmap, get_num_cmap)
    response = get_html(url)
    response = replace_num_and_cmap(result, response)
    bs_ = BeautifulSoup(response, 'lxml')
    span_ = bs_.find('span', class_='follower block')  # matches the exact class string "follower block"
    follower_num = span_.text  # text of the follower counter
    fans_numbers = follower_num.replace(" ", "")
    fans_number = fans_numbers.replace("粉丝", '')  # strip the Chinese label "粉丝" (followers)
    print('Follower count: ' + fans_number)
    print('Sending request')
    PostReq(url, '2', fans_number, uid)



def fans(mid, name=-1):
    '''
    Query the Bilibili relation API for an account's follower count.
    :param mid: Bilibili UID
    :param name: optional display name (defaults to the UID; currently unused)
    :return: follower count as an int
    '''
    mid = str(mid)
    if name == -1:  # check before converting, so the default is actually detected
        name = mid
    name = str(name)
    url = "https://api.bilibili.com/x/relation/stat?vmid=" + mid + "&jsonp=jsonp"
    resp = requests.get(url)  # fetch the JSON payload for this UID
    info = resp.json()  # parse the JSON properly instead of eval()
    fans_number = info['data']['follower']
    return fans_number
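
# Example usage (the UID below is a placeholder, not a real account):
#   fans('123456')  # returns the follower count as an int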

def PostReq(url, type, fans, uid='130'):
    '''
    POST the scraped result to the backend so it can be stored.
    :param url: profile URL
    :param type: link type: 1: Weibo, 2: Douyin, 3: Kuaishou, 4: Bilibili, 5: Xiaohongshu
    :param fans: follower count
    :param uid: defaults to '130'
    :return:
    '''
    url_ = 'http://xmk.oywblog.com/service4/user/buyer/User_fans/update_user_social'
    json_ = {'type': type, 'url': url, 'fans': fans, "uid": uid}
    res = requests.post(url_, data=json_)  # sent as form data, despite the json_ name
    print(res.status_code)
    print(res.content)
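
# Example: report 5000 Weibo followers for the default uid (placeholder URL;
# note this posts to the author's backend at xmk.oywblog.com):
#   PostReq('https://weibo.com/u/1234567890', '1', '5000')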



def ScrapyWB(url, uid):
    '''
    Scrape a Weibo follower count.
    :param url: blogger homepage URL
    :param uid: account id passed through to PostReq
    :return:
    '''
    headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    res = requests.get(url, headers=headers_, cookies=cookies2)  # follow redirects to the final profile URL
    print(res.url)
    req = requests.get(res.url, headers=headers_, cookies=cookies2)
    res_str = req.text
    # The page contains the literal text '粉丝(<count>)' ("followers(<count>)");
    # slice the count out from between the full-width parentheses
    pos_ = res_str.find('粉丝(')
    num_str = res_str[pos_:pos_ + 15]
    pos_left = num_str.find('(')
    pos_right = num_str.find(')')
    fans_number = num_str[pos_left + 1:pos_right]
    print('Follower count: ' + fans_number)
    print('Sending request')
    PostReq(url, '1', fans_number, uid)

def ScrapyXiaoHongShu(url, uid):
    '''
    Scrape a Xiaohongshu follower count.
    :param url: profile URL
    :param uid: account id passed through to PostReq
    :return:
    '''
    headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    res = requests.get(url, headers=headers_)
    bs_ = BeautifulSoup(res.text, 'lxml')
    div_ = bs_.find('div', class_='card-info')
    span_ = div_.find_all('span', class_='info-number')
    fans_number = span_[1].text.strip()  # the second info-number span holds the follower count
    print('Follower count: ' + fans_number)
    print('Sending request')
    PostReq(url, '5', fans_number, uid)

def ScrapyKuaiShou(url):
    '''
    Scrape a Kuaishou follower count (not implemented yet).
    :param url: profile URL
    :return:
    '''
    headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    res = requests.get(url, headers=headers_)
    # TODO: the page is fetched but never parsed, and no result is posted to the backend

def ScrapyBZhan(url, uid):
    '''
    Scrape a Bilibili follower count.
    :param url: space URL; must contain a '/' after the UID (e.g. a trailing slash)
    :param uid: account id passed through to PostReq
    :return:
    '''
    # Extract the numeric Bilibili UID between 'com/' and the next '/'
    url_pos_left = url.find('com/')
    pos_ = url.find('/', url_pos_left + 4)
    uid_num = url[url_pos_left + 4:pos_]
    fans_number = fans(uid_num)  # look up the follower count via the relation API
    print("Follower count: " + str(fans_number))
    print('Sending request to store the data')
    PostReq(url, '4', str(fans_number), uid)
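
# Example of the UID extraction above (placeholder ID):
#   'https://space.bilibili.com/123456/' -> uid_num == '123456'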

def ScrapyData(url, type_, uid):
    '''
    Dispatch to the right scraper based on the configured type.
    :param url: profile URL
    :param type_: platform type ('1' to '5')
    :param uid: account id
    :return:
    '''
    print('url: ' + url)
    print('type: ' + type_)
    print('uid: ' + uid)
    print('Config entry loaded')
    if type_ == G_WEIBO:
        print('Scraping Weibo follower count')
        ScrapyWB(url, uid)
    elif type_ == G_BZHAN:
        print('Scraping Bilibili follower count')
        ScrapyBZhan(url, uid)
    elif type_ == G_DOUYIN:
        print('Scraping Douyin follower count')
        ScrapyDouYin(url, uid)
    elif type_ == G_KUAISHOU:
        print('Scraping Kuaishou follower count')
        ScrapyKuaiShou(url)
    elif type_ == G_XIAOHONGSHU:
        print('Scraping Xiaohongshu follower count')
        ScrapyXiaoHongShu(url, uid)

def ReadFromConfig(file_name):
    '''
    Read one config file and scrape every entry in it.
    :return:
    '''
    # Each config file holds a list of dicts with 'url', 'type' and 'uid' keys
    with open(file_name, 'r', encoding='utf-8') as f:
        rs_ = f.read()
    json_list = json.loads(rs_)  # parse as JSON rather than eval(), which would execute arbitrary code
    for json_str in json_list:
        url_ = json_str['url']
        type_ = json_str['type']
        uid_ = json_str['uid']
        ScrapyData(url_, type_, uid_)
    return

if __name__ == '__main__':
    print('Reading config files from the config/ folder')
    # Process every config file in the folder
    files = os.listdir('config/')
    for filename in files:
        print(filename)
        ReadFromConfig('config/' + filename)

For the full source code and example cases, join QQ group: 850591259


Reposted from blog.csdn.net/weixin_43881394/article/details/107689977