Scraping all of Bilibili's hot videos and storing them in MongoDB

Method 1 (outline only)

All the data is already present when the page first loads, so you can just fetch the BILIBILI page directly. The hot-video data sits inside the page's JS as JS object literals; pull it out with a regex, convert it to a Python dict, and you are done.
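A minimal sketch of that idea, assuming the page embeds its data in a window.__INITIAL_STATE__ variable (the exact variable name may differ, so inspect the page source first):

import json
import re

import requests

# Fetch the page source; a browser-like User-Agent avoids trivial blocks.
html = requests.get('https://www.bilibili.com/',
                    headers={'User-Agent': 'Mozilla/5.0'}).text

# Grab the embedded JS object with a quick-and-dirty regex and parse it.
# If the object literal is not strict JSON (e.g. unquoted keys), json.loads
# will fail and a more tolerant parser is needed.
match = re.search(r'window\.__INITIAL_STATE__\s*=\s*(\{.*?\});', html, re.S)
if match:
    data = json.loads(match.group(1))
    print(list(data.keys()))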

Method 2

# -*- coding: utf-8 -*-
import scrapy
import json
from pabz.items import PabzItem
import re
import time
import requests
from selenium import webdriver
from time import sleep


class BzSpider(scrapy.Spider):
    name = 'bz'
    # allowed_domains = ['www.com']
    start_urls = ['https://www.bilibili.com/']
    # Selenium renders the homepage and clicks through the hot-video pager
    bro = webdriver.Chrome(executable_path=r'F:\爬虫包\通用爬虫selenium\chromedriver.exe')
    page = 1

    def parse(self, response):
        self.bro.get(response.url)
        sleep(2)

        while self.page < 3:
            # re-read the Selenium-rendered page on each pass, so the click
            # to the next batch (done in dm()) is picked up here
            sel = scrapy.Selector(text=self.bro.page_source)
            # the hot-video cards
            hot_url_list = sel.xpath('//*[@id="reportFirst1"]/div[2]/div')[0:8]
            for url in hot_url_list:
                item = {}
                # dic carries the API URLs through the chain of callbacks
                dic = {}
                need_url = ''.join(url.xpath('./div/a/@href').extract())
                whole_url = 'https:' + need_url  # full video URL
                aid_number = need_url.split('av')[-1]
                # fetch the video page and pull its cid out of the source
                page_html = requests.get(url=whole_url).text
                need_cid = re.findall('","cid":(.*?),"', page_html, re.M)
                need_cid = ''.join(need_cid)
                dm_api = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + need_cid  # danmaku (bullet comments)
                zan_api = 'https://api.bilibili.com/x/web-interface/archive/' + 'stat?aid=' + aid_number  # likes, coins, favorites
                # concurrent viewer count
                online_url = 'https://api.bilibili.com/x/player.so?id=cid%3A' + need_cid + '&aid=' + aid_number + '&buvid=D7512C54-9EB9-4D8A-ADF9-040A66C06A6C190950infoc'
                dic['online_url'] = online_url
                dic['dm_api'] = dm_api
                dic['zan_api'] = zan_api

                yield scrapy.Request(whole_url, callback=self.title, meta={'dic': dic, 'item': item})
        self.bro.quit()


    # video title
    def title(self, response):
        item = response.meta['item']
        dic = response.meta['dic']
        title = response.xpath('//*[@id="viewbox_report"]/h1/span/text()').extract()
        item['title'] = title
        # the backend requires this header to be present
        header = {
            'Access-Control-Allow-Origin': 'https://www.bilibili.com'
        }
        yield scrapy.Request(dic['online_url'], callback=self.online, headers=header, meta={'dic': dic, 'item': item})
    # concurrent viewer count
    def online(self, response):
        dic = response.meta['dic']
        item = response.meta['item']
        body = response.body.decode()
        online = re.findall('<online_count>(.*?)</online_count>', body, re.M)
        online_people = ''.join(online)
        item['online_people'] = online_people
        yield scrapy.Request(dic['zan_api'], callback=self.dianzan, meta={'dic': dic, 'item': item})
    # likes, coins, favorites
    def dianzan(self, response):
        dic = response.meta['dic']
        item = response.meta['item']
        all_data = json.loads(response.text)
        detail_data = all_data.get('data')
        # likes
        dian_zan = detail_data.get('like')
        item['dian_zan'] = dian_zan
        # coins
        coins = detail_data.get('coin')
        item['coins'] = coins
        # favorites
        favorite = detail_data.get('favorite')
        item['collect'] = favorite
        yield scrapy.Request(dic['dm_api'], callback=self.dm, meta={'dic': dic, 'item': item})

    # danmaku: the API returns XML, one <d> element per comment
    def dm(self, response):
        dic = response.meta['dic']
        item = response.meta['item']
        d_list = response.xpath('/i/d')
        all_dm = []  # collect every comment, then join into the item, to avoid overwriting
        for d in d_list:
            content = ''.join(d.xpath('./text()').extract())
            # the "p" attribute is comma-separated; index 4 is the Unix
            # timestamp of when the comment was posted
            str_time_base = ''.join(d.xpath('./@p').extract())
            unix_time = int(str_time_base.split(',')[4])
            x = time.localtime(unix_time)
            end_finish_time = time.strftime('%Y-%m-%d %H:%M:%S', x)
            all_dm.append(end_finish_time + content)
        item['dm'] = ''.join(all_dm)

        yield PabzItem(
            title=item['title'],
            zan=item['dian_zan'],
            coins=item['coins'],
            collect=item['collect'],
            screen_shoot=item['dm'],
            online_people=item['online_people'])
        self.page += 1
        # click through to the next batch of hot videos
        to_click = self.bro.find_element_by_xpath('//*[@id="reportFirst1"]/div[2]/div[10]/i')
        to_click.click()
        sleep(1)
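For reference, here is a standalone sketch of what dm() consumes (the XML below is a hand-made sample, not real data). Each <d> element's p attribute is a comma-separated list (playback time, mode, font size, color, Unix send timestamp, ...), which is why the code splits on commas and takes index 4:

import time
from scrapy import Selector

sample = '<i><d p="12.34,1,25,16777215,1581234567,0,abc,123">some comment</d></i>'
for d in Selector(text=sample, type='xml').xpath('/i/d'):
    # index 4 of the "p" attribute is the Unix send timestamp
    ts = int(d.xpath('./@p').get().split(',')[4])
    posted = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))
    print(posted, d.xpath('./text()').get())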

My item

class PabzItem(scrapy.Item):
    title = scrapy.Field()
    zan = scrapy.Field()
    coins = scrapy.Field()
    collect = scrapy.Field()
    screen_shoot = scrapy.Field()  # holds the joined danmaku text
    online_people = scrapy.Field()

My pipeline

class PabzPipeline(object):
    def process_item(self, item, spider):
        return item




from pymongo import MongoClient  # MongoClient connects to MongoDB
from pabz.settings import Mongoip, MongoDBname, MongoPort, MongoItem  # connection info configured in settings.py

class CrawldataToMongoPipeline(object):
    def __init__(self):
        host = Mongoip        # host
        port = MongoPort      # port
        dbName = MongoDBname  # database name
        client = MongoClient(host=host, port=port)  # create the client
        db = client[dbName]                         # use database MongoDBname='mylove1'
        self.post = db[MongoItem]                   # use collection MongoItem='PabzItem'

    def process_item(self, item, spider):
        dl_info = dict(item)            # convert the item to a plain dict
        self.post.insert_one(dl_info)   # write it to MongoDB
        return item

My settings

Mongoip = '127.0.0.1'    # MongoDB host IP; 127.0.0.1 for a local instance (verify that a GUI client such as Robo 3T can connect to it)
MongoPort = 27017        # port, usually 27017
MongoDBname = 'mylove1'  # database name
MongoItem = 'PabzItem'   # collection name
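To confirm the writes actually landed, here is a quick pymongo check against the values above (a sketch, assuming a local MongoDB):

from pymongo import MongoClient

# connect with the same settings the pipeline uses
client = MongoClient(host='127.0.0.1', port=27017)
collection = client['mylove1']['PabzItem']
print(collection.count_documents({}))  # how many items were stored
print(collection.find_one())           # peek at one stored document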

Finally, enable the item pipelines in settings.py:

ITEM_PIPELINES = {
   'pabz.pipelines.PabzPipeline': 300,
   'pabz.pipelines.CrawldataToMongoPipeline': 301,  # writes items to MongoDB
}
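With the pipelines enabled, run the spider from the project root with scrapy crawl bz (the spider's name as defined above); each hot video should then show up as one document in the PabzItem collection.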
