首先介绍一下我自己写的一个爬取工具BusinessTool.exe
- 使用方法:
- 只要在 config 文件夹下的配置文件中配置相应的 type 和 url 即可(程序启动时会读取 config/ 目录下的全部配置文件)。
- 修改这两部分即可
- 其中url指的是需要爬取的微博博主的主页url、或者抖音、小红书、b站的链接
- type 指的是对应的链接类型:1:微博,2:抖音,3:快手,4:B站,5:小红书
- 配置结束后,运行BusinessTool.exe即可爬取到相应的粉丝数
示例代码
这个是将我写了一天的代码大放送,免费赠给大家
# -*- coding: utf-8 -*-
"""Fan-count scraper for Weibo / Douyin / Kuaishou / Bilibili / Xiaohongshu.

Reads every file in the ``config/`` directory.  Each file contains a
Python-literal list of ``{'url': ..., 'type': ..., 'uid': ...}`` dicts;
for each entry the matching platform scraper extracts the follower count
and posts it to the backend via :func:`PostReq`.
"""
import ast
import json  # kept from the original file (unused here, other chunks may rely on it)
import os
import re
from time import strftime, localtime  # kept from the original file

import requests
from bs4 import BeautifulSoup
from fontTools.ttLib import TTFont

# Platform type codes as written in the config files.
G_WEIBO = '1'
G_DOUYIN = '2'
G_KUAISHOU = '3'
G_BZHAN = '4'
G_XIAOHONGSHU = '5'

# NOTE(review): hard-coded Weibo session cookie; it expires, so replace it
# with a fresh one when Weibo scraping stops returning follower counts.
LOGIN_COOKIES = 'SINAGLOBAL=522519899039.0867.1574345484645; UM_distinctid=16f5c2b0fd5388-06657092a336d8-6701b35-1fa400-16f5c2b0fd621e; SUHB=02Meot-9jOoPJy; ALF=1621327072; SUB=_2AkMploOff8NxqwJRmP4Qzm3ja4p3zA_EieKfynJEJRMxHRl-yT9kqhAJtRB6AhatcJAakTfQ4KKZBahMLkPKCqGmy6qa; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WhZH_vEymdzQfyYRooMOfCs; UOR=,,news.ifeng.com; YF-Page-G0=913e50d6fa3a3406e80cc7f737d4352f|1590646466|1590646466; _s_tentry=-; Apache=7843672000587.247.1590646468304; ULV=1590646468380:10:6:3:7843672000587.247.1590646468304:1590636352909; YF-V5-G0=4e19e5a0c5563f06026c6591dbc8029f'

# Parse the cookie header into a dict.  split('=', 1) keeps values that
# themselves contain '=' intact, and strip() removes the space that
# follows each ';' (the original split left a leading space on every key
# after the first).
cookies2 = dict(
    pair.strip().split('=', 1) for pair in LOGIN_COOKIES.split(';')
)

# Douyin obfuscates digits with a custom web font; the cmap of 111.woff
# maps the fake code points back to glyph names such as 'num_3'.
# NOTE(review): the font file must sit next to the executable, otherwise
# this module fails at import time.
ttfont = TTFont('111.woff')
best_cmap = ttfont['cmap'].getBestCmap()


def get_best_cmap():
    """Return the font cmap as a dict of hex code-point string -> glyph name."""
    new_best_cmap = {}
    for key, value in best_cmap.items():
        new_best_cmap[hex(key)] = value
    return new_best_cmap


def get_num_cmap():
    """Return the glyph-name -> real digit mapping for Douyin's number font."""
    num_map = {
        "x": "", "num_": "1", "num_1": "0",
        "num_2": "3", "num_3": "2",
        "num_4": "4", "num_5": "5",
        "num_6": "6", "num_7": "9",
        "num_8": "7", "num_9": "8",
    }
    return num_map


def get_html(url):
    """Fetch *url* with a desktop Chrome User-Agent and return the HTML text."""
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
    response = requests.get(url=url, headers=headers).text
    return response


def replace_num_and_cmap(result, response):
    """Replace every obfuscated HTML entity in *response* using *result*.

    :param result: entity -> real digit mapping (from :func:`map_cmap_num`)
    :param response: raw HTML text
    :return: HTML text with real digits substituted in
    """
    for key, value in result.items():
        if key in response:
            response = re.sub(key, value, response)
    return response


def save_to_file(response):
    """Save the (de-obfuscated) Douyin HTML to douyin.html for debugging."""
    with open('douyin.html', 'w', encoding='utf-8') as fp:
        fp.write(response)


def map_cmap_num(get_best_cmap, get_num_cmap):
    """Build the HTML-entity -> real-digit table.

    The font cmap keys look like ``0xe603``; rewriting the leading ``0``
    to ``&#`` and appending ``;`` turns them into the HTML entities
    (``&#xe603;``) that actually appear in the page source.

    :param get_best_cmap: callable returning hex-code -> glyph-name map
    :param get_num_cmap: callable returning glyph-name -> digit map
    :return: dict of HTML entity -> real digit string
    """
    result = {}
    for key, value in get_best_cmap().items():
        key = re.sub('0', '&#', key, count=1) + ';'
        result[key] = get_num_cmap()[value]
    return result


def ScrapyDouYin(url, uid):
    """Scrape the Douyin follower count from *url* and post it (type '2').

    :param url: Douyin profile page URL
    :param uid: user id forwarded to the backend
    """
    result = map_cmap_num(get_best_cmap, get_num_cmap)
    response = get_html(url)
    response = replace_num_and_cmap(result, response)
    bs_ = BeautifulSoup(response, 'lxml')
    div_ = bs_.find('span', class_='follower block')
    follower_num = div_.text  # e.g. "粉丝 12345"
    fans_numbers = follower_num.replace(" ", "")
    fans_number = fans_numbers.replace("粉丝", '')
    print('粉丝数目:' + fans_number)
    print('发送请求')
    PostReq(url, '2', fans_number, uid)


def fans(mid, name=-1):
    """Return the Bilibili follower count for member id *mid*.

    :param mid: Bilibili member id (int or str)
    :param name: optional display name; defaults to *mid* when omitted
    :return: follower count (int)
    """
    mid = str(mid)
    # BUGFIX: the original converted name to str() first, so the
    # ``name == -1`` default check could never be true.  Check the
    # sentinel before converting.
    if name == -1:
        name = mid
    else:
        name = str(name)
    url = "https://api.bilibili.com/x/relation/stat?vmid=" + mid + "&jsonp=jsonp"
    resp = requests.get(url)
    # BUGFIX: the original used eval() on the response body, which would
    # execute arbitrary code from the network; parse it as JSON instead.
    info = resp.json()
    fans_number = info['data']['follower']
    return fans_number


def PostReq(url, type, fans, uid='130'):
    """POST the scraped follower count to the backend.

    :param url: the scraped profile URL
    :param type: 类型 1:微博,2抖音,3快手,4.b站 5.小红书
        (parameter name kept for caller compatibility although it
        shadows the ``type`` builtin)
    :param fans: follower count (string)
    :param uid: backend user id, defaults to '130'
    """
    url_ = 'http://xmk.oywblog.com/service4/user/buyer/User_fans/update_user_social'
    json_ = {'type': type, 'url': url, 'fans': fans, "uid": uid}
    res = requests.post(url_, data=json_)
    print(res.status_code)
    print(res.content)


def ScrapyWB(url, uid):
    """Scrape the Weibo follower count from *url* and post it (type '1').

    Follows the redirect from the short URL, then extracts the number
    between the parentheses of the "粉丝(N)" text in the page source.
    """
    headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    res = requests.get(url, headers=headers_, cookies=cookies2)
    print(res.url)
    req = requests.get(res.url, headers=headers_, cookies=cookies2)
    res_str = req.text
    pos_ = res_str.find('粉丝(')
    num_str = res_str[pos_:pos_ + 15]  # 15 chars is enough to cover the count
    pos_left = num_str.find('(')
    pos_right = num_str.find(')')
    fans_number = num_str[pos_left + 1:pos_right]
    print('粉丝数目:' + fans_number)
    print('开始发送请求')
    PostReq(url, '1', fans_number, uid)


def ScrapyXiaoHongShu(url, uid):
    """Scrape the Xiaohongshu follower count from *url* and post it (type '5').

    The count is the second ``span.info-number`` inside ``div.card-info``.
    """
    headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    res = requests.get(url, headers=headers_)
    bs_ = BeautifulSoup(res.text, 'lxml')
    div_ = bs_.find('div', class_='card-info')
    span_ = div_.find_all('span', class_='info-number')
    fans_num = span_[1].text
    print('粉丝数目:' + fans_num)
    fans_number = fans_num.strip()
    print('开始发送请求')
    PostReq(url, '5', fans_number, uid)


def ScrapyKuaiShou(url):
    """Scrape the Kuaishou follower count from *url*.

    TODO(review): this scraper is unfinished -- it fetches the page but
    never parses the follower count or calls :func:`PostReq`.
    """
    headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    res = requests.get(url, headers=headers_)


def ScrapyBZhan(url, uid):
    """Scrape the Bilibili follower count and post it (type '4').

    Extracts the member id from the profile URL (the path segment after
    ``com/``) and queries the relation-stat API via :func:`fans`.
    """
    url_pos_left = url.find('com/')
    pors_ = url.find('/', url_pos_left + 4)
    uid_num = url[url_pos_left + 4:pors_]  # member id between 'com/' and next '/'
    headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    fans_number = fans(uid_num)
    print("粉丝数目:" + str(fans_number))
    print('发送请求写入数据')
    PostReq(url, '4', str(fans_number), uid)


def ScrapyData(url, type_, uid):
    """Dispatch one config entry to the scraper for its platform type.

    :param url: profile URL to scrape
    :param type_: platform code ('1'..'5', see the G_* constants)
    :param uid: backend user id
    """
    print('url信息:' + url)
    print('type信息:' + type_)
    print('uid信息:' + uid)
    print('读取完毕')
    if type_ == G_WEIBO:
        print('微博粉丝数爬取')
        ScrapyWB(url, uid)
    elif type_ == G_BZHAN:
        print('B站粉丝数爬取')
        ScrapyBZhan(url, uid)
    elif type_ == G_DOUYIN:
        print('抖音粉丝数爬取')
        ScrapyDouYin(url, uid)
    elif type_ == G_KUAISHOU:
        print('快手粉丝数爬取')
        ScrapyKuaiShou(url)
    elif type_ == G_XIAOHONGSHU:
        print('小红书粉丝数爬取')
        ScrapyXiaoHongShu(url, uid)


def ReadFromConfig(file_name):
    """Read one config file and scrape every entry it lists.

    The file contains a Python-literal list of dicts with the keys
    'url', 'type' and 'uid'.

    :param file_name: path to the config file
    :return: None
    """
    # BUGFIX: use a context manager (the original leaked the handle) and
    # read the whole file at once instead of quadratic string +=.
    # Encoding is left at the platform default, matching the original.
    with open(file_name, 'r') as f:
        raw = f.read()
    # BUGFIX: ast.literal_eval accepts the same Python-literal config
    # syntax as the original eval() but cannot execute arbitrary code.
    json_list = list(ast.literal_eval(raw))
    for json_str in json_list:
        ScrapyData(json_str['url'], json_str['type'], json_str['uid'])
    return


if __name__ == '__main__':
    print('初始化读取配置信息config文件夹')
    # Process every config file found in the config/ directory.
    files = os.listdir('config/')
    for filename in files:
        print(filename)
        config_info = ReadFromConfig('config/' + filename)
源码案例代码加群:850591259