# 利用selenium爬取知乎图片 (scrape the images of a Zhihu question page with Selenium)

import os
import time
from urllib import request

from lxml import etree
from selenium import webdriver
# Scrape the full-size images embedded in the answers of one Zhihu question.
#
# Flow: open the question in Chrome, scroll to the bottom repeatedly so the
# dynamically loaded answers are fetched, parse the resulting HTML with lxml,
# then download every <figure> image that carries a ``data-original`` URL
# into ``dirName`` as 1.jpg, 2.jpg, ...

dirName = 'imgLibs'  # folder the downloaded images are written to

CHROMEDRIVER_PATH = r'C:\Users\Administrator.DESKTOP-9DN4SRE\Downloads\Compressed\chromedriver_win32\chromedriver.exe'
QUESTION_URL = "https://www.zhihu.com/question/40273344"  # page to crawl
# Close button in the top-right corner of the login pop-up that shows up after
# the first scroll. Simulating a Zhihu login is complicated, so the pop-up is
# simply dismissed instead.
LOGIN_CLOSE_XPATH = '/html/body/div[3]/div/div/div/div[2]/button'
SCROLL_TO_BOTTOM_JS = "window.scrollTo(0, document.body.scrollHeight);"
SCROLL_PAUSE_SECONDS = 6  # wait for the next batch to load; tune to the network
EXTRA_SCROLLS = 12  # one scroll loads only a little content, so repeat it


def _scroll_to_bottom(browser):
    """Scroll *browser* to the bottom of the page and wait for new content."""
    browser.execute_script(SCROLL_TO_BOTTOM_JS)
    time.sleep(SCROLL_PAUSE_SECONDS)


driver = webdriver.Chrome(CHROMEDRIVER_PATH)
try:
    driver.get(QUESTION_URL)

    # The first scroll triggers the login pop-up.
    _scroll_to_bottom(driver)

    # "xpath" is the locator-strategy string behind By.XPATH; the
    # find_element_by_xpath shortcut no longer exists in Selenium >= 4.3.
    # find_elements returns an empty list when the pop-up did not appear, so
    # a missing pop-up no longer aborts the crawl.
    close_buttons = driver.find_elements("xpath", LOGIN_CLOSE_XPATH)
    if close_buttons:
        close_buttons[0].click()

    for _ in range(EXTRA_SCROLLS):
        _scroll_to_bottom(driver)

    page_text = driver.page_source
finally:
    # Always shut the browser down, even when a step above failed; otherwise
    # the Chrome and chromedriver processes stay alive after the script ends.
    driver.quit()

tree = etree.HTML(page_text)
img_list = tree.xpath('//figure')

# A <figure> can also hold other pictures (e.g. small emoticons); only those
# exposing a data-original attribute are wanted, the rest are skipped.
img_urls = []
for img in img_list:
    originals = img.xpath('.//@data-original')
    if originals:
        img_urls.append(originals[0])

# urlretrieve does not create missing directories, so make sure the target
# folder exists before the first download.
os.makedirs(dirName, exist_ok=True)

for i, img_src in enumerate(img_urls, start=1):
    img_name = str(i) + '.jpg'
    filePath = os.path.join(dirName, img_name)
    request.urlretrieve(img_src, filename=filePath)
    print('xiazaichengg')  # progress marker, printed once per saved image

# 猜你喜欢

# 转载自 www.cnblogs.com/cuirenlao/p/12503055.html