import datetime
import functools
import time
import warnings

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
#忽略警告提示
warnings.filterwarnings("ignore")
#b站视频链接/地址
title_url = 'https://www.bilibili.com/video/av{}'
#b站api b站的av号就是aid
mode_url = 'https://api.bilibili.com/x/web-interface/archive/stat?aid={}'
#请求chrome
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/49.0.2623.112 Safari/537.36'}
#获取所需的信息列表
def get_info(t_url,m_url):
msg_list = []
#需要做异常处理
try:
#获取静态信息
#获取并解析视频信息
video_html = requests.get(t_url, headers=headers)
soup = BeautifulSoup(video_html.text, 'lxml')
#获取视频标题
title=soup.title.string
msg_list.append(title)
#获取发布时间
pubilc_time=soup.find('time').get_text()
msg_list.append(pubilc_time)
#获取作者昵称
author_name=soup.find('a', {'class': 'name is-vip'}).get_text()
msg_list.append(author_name)
#获取视频标签
html_video_tags=soup.find_all('li',{'class':'tag'})
video_tags = []
for each in html_video_tags:
e=each.find('a',{'target':'_blank'}).get_text()
video_tags.append(e)
msg_list.append(video_tags)
#获取动态信息
response1 = requests.get(m_url, headers=headers, verify=False, timeout=10)
print(response1.status_code)
if response1.status_code == 200:
j1 = response1.json()['data']
#获取av号
av = 'av' + str(j1['aid'])
#获取播放量--判断是否为空
view=j1['view']
#获取弹幕数
danmaku=j1['danmaku']
#获取评论量
reply=j1['reply']
msg_list.extend([av, view, danmaku, reply])
except Exception as e:
print(e)
pass
return msg_list
#计时装饰器
def timer(func):
def time_count(*args):
start_time = datetime.datetime.now()
func(*args)
end_time = datetime.datetime.now()
day = (end_time - start_time).days
times = (end_time - start_time).seconds
hour = times / 3600
h = times % 3600
minute = h / 60
m = h % 60
second = m
print('爬取完成')
print('一共用时%s天%s时%s分%s秒' % (day, hour, minute, second))
return time_count
#将数据存储到mongodb
def mongodb_save(my_list):
#建立数据库连接
client = MongoClient('localhost', 27017)
#获取数据库
db = client.bili
#获取表
collection = db.video
#插入数据
try:
v=dict(title =my_list[0],public_time=my_list[1],author_name=my_list[2],
video_tags=my_list[3],av=my_list[4],view=my_list[5],danmaku=my_list[6],
reply=my_list[7])
collection.insert(v)
except Exception as e:
print(e)
#主函数
@timer
def main(i, n):
print('开始爬取...')
t = 0
count = 0
while t < n:
t += 1
if count == 150:
time.sleep(60)
count = 0
else:
count += 1
t_url = title_url.format(i)
m_url = mode_url.format(i)
msg_list = get_info(t_url, m_url)
print(len(msg_list))
if len(msg_list) == 8:
#存到数据库
mongodb_save(msg_list)
print('爬取第%s个成功'%t)
else:
print('爬取第%s个失败' % t)
i+=1 #i+1的位置应该在循环内,判断外
if __name__ == '__main__':
num1 = input("起始视频编号:")
num11=int(num1)
print("---------------------")
num2 = input("需要爬取数量:")
num22=int(num2)
print("---------------------")
main(num11, num22)
# Adapted from: blog.csdn.net/The_Legend_of_1900/article/details/86418132