爬取B站小黑屋信息
由于b站更新了反爬虫策略,现在爬取B站可以采用模拟浏览器操作进行爬取。需要安装以下python模块:
pip3 install selenium
pip3 install bs4
使用selenium模拟浏览器操作,对小黑屋进行模拟下拉操作,可以设置下拉次数(这里要注意每次下拉后要sleep一段时间,否则网页会加载不完)。等获取到足够的页面后再进行数据清洗。
import json
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
class BSpider():
    """Scrape ban records from the bilibili "blackroom" page with headless Firefox."""

    def __init__(self):
        # Run Firefox without a visible window.
        firefox_options = webdriver.FirefoxOptions()
        firefox_options.add_argument('--headless')
        self.browser = webdriver.Firefox(options=firefox_options)
        # Landing page that lists banned accounts.
        self.blackroom_page = 'https://www.bilibili.com/blackroom/ban'
        self.count = 0
# 获取页面
def get_page(self):
self.browser.get(self.blackroom_page)
# 只获取弹幕内容
self.browser.find_element_by_xpath('//*[@id="app"]/div/div/div/div[2]/div[1]/div[2]/div[1]/i').click()
time.sleep(0.5)
self.browser.find_element_by_xpath('//*[@id="app"]/div/div/div/div[2]/div[1]/div[2]/div[2]/p[3]').click()
time.sleep(0.5)
# 下拉页面, 下拉300次
index, max_count = 0, 300
while index < max_count:
print("scroll down: %d ..." % (index))
self.browser.execute_script(
'window.scrollTo(0,document.body.scrollHeight)'
)
time.sleep(0.8)
index = index + 1
# 字符串找中文字符
def find_chinese(self, article):
pattern = re.compile(r'[^\u4e00-\u9fa5]')
chinese = re.sub(pattern, '', article)
return chinese
# 删除星号*
def delete_star(self, article):
pattern = re.compile(r'[*]')
no_star = re.sub(pattern, '', article)
return no_star
# 解析页面,对数据进行清洗 在这里只获取账号封禁时间(永久/15天/7天......)和发的弹幕
def paser_page(self):
html = BeautifulSoup(self.browser.page_source)
output_data = []
for dl in html.find_all('dl'):
sub_output_data = {
}
black_cube = dl.parent
try:
temp_type = (black_cube.find(class_='jc').get_text())
first_p_text = self.delete_star(dl.dt.p.text)
# first_p_text = dl.dt.p.text
except Exception as e:
print(e)
# sub_output_data["reason"] = temp_reson
sub_output_data["type"] = temp_type
sub_output_data['article'] = first_p_text
if first_p_text != '':
output_data.append(sub_output_data)
# print(output_data)
# 存储数据
print('dump to json file ...')
with open(r'2020\ML\ML_action\3.NaiveBayes\data\blackroom.json', 'w', encoding='utf-8') as f:
json.dump(output_data, f, ensure_ascii=False,sort_keys=False, indent=4)
print('dump file done.')
if __name__ == '__main__':
    # Guard so importing this module does not launch a browser session.
    b = BSpider()
    print("init....")
    try:
        b.get_page()
        b.paser_page()
    finally:
        # Always close the headless Firefox; otherwise the driver process
        # is leaked when scraping fails partway through.
        b.browser.quit()