方法一(只提供思路)
因为初次加载页面的时候已经都加载完毕了,直接取BILIBILI这个页面的数据就好了,在这个页面的JS里,你用正则取出来,都是JS的对象,想办法转成字典就OK了。
方法二
# -*- coding: utf-8 -*-
import scrapy
import json
from pabz.items import PabzItem
import re
import time
import requests
from selenium import webdriver
from time import sleep
class BzSpider(scrapy.Spider):
    """Crawl Bilibili's front-page "hot videos" widget and collect per-video stats.

    Callback chain (each hop passes the API urls and the partially-filled
    item through ``request.meta``):
        parse -> title -> online -> dianzan -> dm
    ``dm`` finally emits a ``PabzItem`` and clicks the widget's "next page"
    arrow in the Selenium-driven browser.
    """
    name = 'bz'
    # allowed_domains = ['www.com']
    start_urls = ['https://www.bilibili.com/']
    # NOTE(review): creating the driver in the class body launches Chrome as
    # soon as the module is imported, even if the spider never runs.
    bro = webdriver.Chrome(executable_path=r'F:\爬虫包\通用爬虫selenium\chromedriver.exe')
    page = 1  # current "hot videos" page, advanced in dm()

    def parse(self, response):
        """Collect the first 8 hot-video links and build the three API urls
        (danmaku, stats, online-count) for each video."""
        self.bro.get(response.url)
        sleep(2)  # let the JS-rendered page finish loading
        while self.page < 3:
            # Hot-video entries of the ranking widget (first 8 only).
            hot_url_list = response.xpath('//*[@id="reportFirst1"]/div[2]/div')[0:8]
            for url in hot_url_list:
                item = {}
                dic = {}  # carries the API urls between callbacks
                need_url = ''.join(url.xpath('./div/a/@href').extract())
                whole_url = 'https:' + need_url  # full video url
                aid_number = need_url.split('av')[-1]
                # Fetch the video page synchronously just to scrape its cid,
                # which the danmaku / player APIs key on.
                page_text = requests.get(url=whole_url).text
                need_cid = ''.join(re.findall('","cid":(.*?),"', page_text, re.M))
                # Danmaku (bullet comments) XML feed.
                dm_api = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + need_cid
                # Like / coin / favorite counters.
                zan_api = 'https://api.bilibili.com/x/web-interface/archive/' + 'stat?aid=' + aid_number
                # Current viewer count.
                online_url = 'https://api.bilibili.com/x/player.so?id=cid%3A' + need_cid + '&aid=' + aid_number + '&buvid=D7512C54-9EB9-4D8A-ADF9-040A66C06A6C190950infoc'
                dic['online_url'] = online_url
                dic['dm_api'] = dm_api
                dic['zan_api'] = zan_api
                yield scrapy.Request(whole_url, callback=self.title, meta={'dic': dic, 'item': item})
        self.bro.quit()

    def title(self, response):
        """Scrape the video title, then request the online-count API."""
        item = response.meta['item']
        dic = response.meta['dic']
        item['title'] = response.xpath('//*[@id="viewbox_report"]/h1/span/text()').extract()
        # The backend requires this header on the request.  The original code
        # had stray spaces inside both the key and the value, which produced
        # an invalid header name the server could never match; fixed here.
        header = {
            'Access-Control-Allow-Origin': 'https://www.bilibili.com'
        }
        yield scrapy.Request(dic['online_url'], callback=self.online, headers=header,
                             meta={'dic': dic, 'item': item})

    def online(self, response):
        """Extract the current-viewer count from the player API's XML body."""
        dic = response.meta['dic']
        item = response.meta['item']
        body = response.body.decode()
        online = re.findall('<online_count>(.*?)</online_count>', body, re.M)
        item['online_people'] = ''.join(online)
        yield scrapy.Request(dic['zan_api'], callback=self.dianzan, meta={'dic': dic, 'item': item})

    def dianzan(self, response):
        """Read like / coin / favorite counters from the JSON stats API."""
        dic = response.meta['dic']
        item = response.meta['item']
        all_data = json.loads(response.text)
        detail_data = all_data.get('data')
        item['dian_zan'] = detail_data.get('like')      # likes
        item['coins'] = detail_data.get('coin')         # coins
        item['collect'] = detail_data.get('favorite')   # favorites
        yield scrapy.Request(dic['dm_api'], callback=self.dm, meta={'dic': dic, 'item': item})

    def dm(self, response):
        """Parse the danmaku XML, prefix each comment with its send time,
        emit the finished item, then click to the next hot-videos page."""
        dic = response.meta['dic']
        item = response.meta['item']
        d_list = response.xpath('/i/d')
        # Collect all comments first so none overwrite each other.
        dm_lines = []
        for d in d_list:
            content = ''.join(d.xpath('./text()').extract())
            # The 'p' attribute is a comma-separated tuple; field index 4 is
            # the comment's unix send-time.
            str_time_base = ''.join(d.xpath('./@p').extract())
            unix_time = int(str_time_base.split(',')[4])
            send_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(unix_time))
            dm_lines.append(str(send_time) + content)
        item['dm'] = ''.join(dm_lines)
        yield PabzItem(
            title=item['title'],
            zan=item['dian_zan'],
            coins=item['coins'],
            collect=item['collect'],
            screen_shoot=item['dm'],
            online_people=item['online_people'])
        # Advance the selenium browser to the next batch of hot videos.
        self.page += 1
        To_clik = self.bro.find_element_by_xpath('//*[@id="reportFirst1"]/div[2]/div[10]/i')
        To_clik.click()
        sleep(1)
我的item
class PabzItem(scrapy.Item):
    # Item holding everything scraped for one Bilibili hot video.
    title =scrapy.Field()          # video title (list of text nodes)
    zan = scrapy.Field()           # like count
    coins =scrapy.Field()          # coin count
    collect = scrapy.Field()       # favorite (bookmark) count
    screen_shoot=scrapy.Field()    # concatenated timestamped danmaku comments
    online_people=scrapy.Field()   # current viewer count
我的pipeline
class PabzPipeline(object):
    """Default no-op pipeline: forwards every item unchanged so the
    lower-priority pipelines still receive it."""

    def process_item(self, item, spider):
        # Nothing to transform here.
        return item
from pymongo import MongoClient # 使用MongoClient连接mongo
from pabz.settings import Mongoip,MongoDBname,MongoPort,MongoItem #从settings.py导入第一步配置的连接信息
class CrawldataToMongoPipline(object):
    """Persist every scraped item into a MongoDB collection.

    Connection parameters (Mongoip / MongoPort / MongoDBname / MongoItem)
    come from the project's settings module, imported at file level.
    """

    def __init__(self):
        host = Mongoip          # MongoDB node ip
        port = MongoPort        # usually 27017
        dbName = MongoDBname    # database name, e.g. 'mylove1'
        client = MongoClient(host=host, port=port)  # connection object
        db = client[dbName]
        # Collection named after the item class, e.g. MongoItem='PabzItem'.
        self.post = db[MongoItem]

    def process_item(self, item, spider):
        """Insert the item as a plain dict, then pass it on unchanged."""
        dl_info = dict(item)  # convert the item to dict form for Mongo
        # insert() has been deprecated since pymongo 3 and removed in
        # pymongo 4; insert_one() is the supported replacement.
        self.post.insert_one(dl_info)
        return item
我的settings
# MongoDB connection settings consumed by CrawldataToMongoPipline.
Mongoip='127.0.0.1' # MongoDB node ip: 127.0.0.1 works locally (or look yours up with ifconfig/ipconfig), provided your GUI client (e.g. Robo 3T) can reach the local mongod
MongoPort = 27017 # port, almost always 27017
MongoDBname='mylove1' # database name
MongoItem='PabzItem' # collection name (named after the item class)
最后要打开 item pipeline
# Enable both pipelines; lower number runs first.
ITEM_PIPELINES = {
'pabz.pipelines.PabzPipeline': 300,
'pabz.pipelines.CrawldataToMongoPipline': 301, # writes items into MongoDB
}