"""
爬取蓝桥杯历届试题
tips:
1. 提前登录账号并加入历届试题课程:https://www.lanqiao.cn/courses/2786/learning
2. 本代码还有待改善,因为蓝桥网站反扒太强,到处都是动态加载和请求,导致很多转圈圈的gif动图
3. 使用无头浏览器需要先把登录二维码截取下来并弹窗展示
4. 这里只提供一个思路
4. 有时间和能力再改进
"""
import os
from time import sleep
from urllib import request

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
# Headless Chrome options. They are built but deliberately NOT passed to the
# driver below: per the module notes, running headless requires capturing the
# login QR code and showing it in a pop-up first. To go headless, pass
# ``chrome_options=chrome_optios`` to ``webdriver.Chrome``.
chrome_optios = Options()
chrome_optios.add_argument('--headless')
chrome_optios.add_argument('--disable-gpu')

# Root output directory; one sub-directory per courseware is created below it.
dirName = 'TestLibs'
os.makedirs(dirName, exist_ok=True)

url = 'https://www.lanqiao.cn/courses/2786/learning/'
LOGIN_URL = 'https://www.lanqiao.cn/login/'

# Time given to the user to log in manually in the opened browser window.
LOGIN_WAIT_SECONDS = 20
# Pause before reading each page: going too fast yields only the spinning
# "loading" GIFs because the images are lazy-loaded.
PAGE_SETTLE_SECONDS = 10

# Button that is clicked once per iteration to move on; presumably the
# "next courseware" button (when it cannot be found the crawl stops).
NEXT_BUTTON_XPATH = '//*[@id="__layout"]/div/div[2]/div[1]/div[4]/div/div/div[2]/div/button'
# Containers whose direct <img> child holds one page image each.
SLIDE_XPATH = '//*[@id="__layout"]/div/div[2]/div[1]/div[4]/div/div/div[1]/div/div/div[2]/div/div/div/div'


def _next_button_present(driver):
    """Return True if the advance button can be located within 10 seconds.

    Only the wait's own ``TimeoutException`` is treated as "button absent";
    any other failure (driver crash, Ctrl-C, ...) propagates to the caller
    instead of being silently mistaken for the end of the course.
    """
    try:
        WebDriverWait(driver, 10, 0.5).until(
            lambda d: d.find_element(By.XPATH, NEXT_BUTTON_XPATH))
    except TimeoutException:
        return False
    return True


def _save_slide_images(page_text, courseware_number):
    """Download all images found in *page_text* into the courseware folder.

    Files are written as ``<dirName>/课件<courseware_number>/<k>.png`` with
    ``k`` counting from 1. Containers without an ``<img src>`` are skipped.
    The folder is only created when there is at least one image to save.
    """
    tree = etree.HTML(page_text)
    image_urls = [
        src
        for slide in tree.xpath(SLIDE_XPATH)
        # At most the first <img> per container; none -> container skipped.
        for src in slide.xpath('./img/@src')[:1]
    ]
    if not image_urls:
        return

    title = '课件' + str(courseware_number)
    target_dir = dirName + '/' + title
    os.makedirs(target_dir, exist_ok=True)
    for n, img_url in enumerate(image_urls, start=1):
        imgPath = target_dir + '/' + str(n) + '.png'
        print(imgPath)
        print(img_url)
        request.urlretrieve(img_url, filename=imgPath)


# NOTE(review): ``executable_path`` is kept for compatibility with the
# Selenium version this script was written for; recent Selenium 4 releases
# expect a ``Service`` object instead -- confirm against the installed version.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get(LOGIN_URL)
    # Log in to your account manually during this pause.
    sleep(LOGIN_WAIT_SECONDS)
    # Open the course page that hosts the exam papers.
    bro.get(url)

    num = 1
    flag = True
    while flag:
        sleep(PAGE_SETTLE_SECONDS)
        # When the button is missing, still save the current page's images
        # (last courseware) and then leave the loop.
        flag = _next_button_present(bro)
        _save_slide_images(bro.page_source, num)
        if flag:
            bro.find_element(By.XPATH, NEXT_BUTTON_XPATH).click()
            num += 1
            print(num)
finally:
    # Always shut the browser down, even if scraping or a download failed.
    bro.quit()
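# Article title: 利用selenium批量获取蓝桥杯历年真题(仅供参考) ["Batch-fetching past Lanqiao Cup exam papers with Selenium (for reference only)"]
# (page widget label: 猜你喜欢 / "You may also like")
# Reposted from blog.csdn.net/qq_31910669/article/details/114292828
# (page widget label: 今日推荐 / "Today's picks")
# (page widget label: 周排行 / "Weekly ranking")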