爬取B站小黑屋信息
由于b站更新了反爬虫策略,现在爬取B站可以采用模拟浏览器操作进行爬取。需要安装以下python模块:
pip3 install selenium
pip3 install bs4
使用selenium模拟浏览器操作,对小黑屋进行模拟下拉操作,可以设置下拉次数(这里要注意每次下拉后要sleep一段时间,否则网页会加载不完)。等获取到足够的页面后再进行数据清洗。
import json
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
class BSpider():
    """Scrape ban records from the bilibili "blackroom" page with headless Firefox."""

    def __init__(self):
        # Run Firefox without a visible window.
        firefox_options = webdriver.FirefoxOptions()
        firefox_options.add_argument('--headless')
        self.browser = webdriver.Firefox(options=firefox_options)
        # Landing page that lists banned accounts.
        self.blackroom_page = 'https://www.bilibili.com/blackroom/ban'
        self.count = 0
# 获取页面
def get_page(self):
self.browser.get(self.blackroom_page)
# 只获取弹幕内容
self.browser.find_element_by_xpath('//*[@id="app"]/div/div/div/div[2]/div[1]/div[2]/div[1]/i').click()
time.sleep(0.5)
self.browser.find_element_by_xpath('//*[@id="app"]/div/div/div/div[2]/div[1]/div[2]/div[2]/p[3]').click()
time.sleep(0.5)
# 下拉页面, 下拉300次
index, max_count = 0, 300
while index < max_count:
print("scroll down: %d ..." % (index))
self.browser.execute_script(
'window.scrollTo(0,document.body.scrollHeight)'
)
time.sleep(0.8)
index = index + 1
# 字符串找中文字符
def find_chinese(self, article):
pattern = re.compile(r'[^\u4e00-\u9fa5]')
chinese = re.sub(pattern, '', article)
return chinese
# 删除星号*
def delete_star(self, article):
pattern = re.compile(r'[*]')
no_star = re.sub(pattern, '', article)
return no_star
# 解析页面,对数据进行清洗 在这里只获取账号封禁时间(永久/15天/7天......)和发的弹幕
def paser_page(self):
html = BeautifulSoup(self.browser.page_source)
output_data = []
for dl in html.find_all('dl'):
sub_output_data = {
}
black_cube = dl.parent
try:
temp_type = (black_cube.find(class_='jc').get_text())
first_p_text = self.delete_star(dl.dt.p.text)
# first_p_text = dl.dt.p.text
except Exception as e:
print(e)
# sub_output_data["reason"] = temp_reson
sub_output_data["type"] = temp_type
sub_output_data['article'] = first_p_text
if first_p_text != '':
output_data.append(sub_output_data)
# print(output_data)
# 存储数据
print('dump to json file ...')
with open(r'2020\ML\ML_action\3.NaiveBayes\data\blackroom.json', 'w', encoding='utf-8') as f:
json.dump(output_data, f, ensure_ascii=False,sort_keys=False, indent=4)
print('dump file done.')
if __name__ == '__main__':
    # Guard so importing this module does not launch a browser session.
    b = BSpider()
    print("init....")
    try:
        b.get_page()
        b.paser_page()
    finally:
        # Always close the headless Firefox; otherwise the driver process
        # is leaked when scraping fails partway through.
        b.browser.quit()