基于 Selenium 爬取图片并转存到百度网盘

初学 Python,花了一天时间鼓捣了一个爬虫。
#coding=utf-8
import requests
from bs4 import BeautifulSoup
import re
import string
from selenium import webdriver
import time

# URL prefix of the listing pages: webcrawler() requests it verbatim for page 1
# and appends '440_<page>.html' to it for every later page.
# NOTE(review): the host looks masked ('******') -- presumably the real site
# name has to be filled in before the script can run; confirm.
urlprelix = 'http://www.******.com/'

def webcrawler(max_page, driver_path='/Applications/chromedriver', login_wait=30):
    """Crawl listing pages 1..max_page and save each resource to Baidu Netdisk.

    A Chrome window is opened on https://pan.baidu.com/ first and left alone
    for ``login_wait`` seconds so the user can log in by hand; the cookies of
    that session are then injected into every browser opened afterwards so
    the login does not have to be repeated.

    For every listing page, each ``<a class="title yt-uix-sessionlink">`` is
    followed to its detail page, and each anchor there whose ``onclick``
    contains ``javascript:openwindow`` is opened in a fresh browser where the
    '保存到网盘' and '确定' links are clicked.

    Args:
        max_page: Number of the last listing page to crawl (pages start at 1).
            Values below 1 crawl nothing (the login step still runs).
        driver_path: Filesystem path of the chromedriver executable.
        login_wait: Seconds to wait for the manual Baidu Netdisk login.

    Returns:
        None.

    NOTE(review): the ``find_elements_by_*`` calls and the positional driver
    path are the Selenium 3 API the file was written against; they are kept
    as-is -- confirm the installed Selenium version.
    """
    cookies_list = _collect_login_cookies(driver_path, login_wait)

    # Browser-like User-Agent so the site is less likely to reject the
    # crawler; it is sent with every requests call.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

    for page in range(1, max_page + 1):
        if page == 1:
            url = urlprelix
        else:
            url = urlprelix + '440_' + str(page) + '.html'  # full URL of this page

        soup = _fetch_soup(url, headers)
        for link in soup.findAll('a', {'class': 'title yt-uix-sessionlink'}):
            href = link.get('href')
            if not href:
                # An anchor without href cannot be followed.
                continue
            # NOTE(review): this host differs from the (masked) one in
            # urlprelix; kept exactly as written -- confirm which is right.
            fulllink = 'http://www.16xx8.com' + href
            print(fulllink)

            subsoup = _fetch_soup(fulllink, headers)
            opener_pattern = re.compile('javascript:openwindow')
            for anchor in subsoup.findAll("a", {'onclick': opener_pattern}):
                path = _extract_download_path(anchor.get('onclick'))
                if not path:
                    # Nothing matched: do not open a browser on the bare host.
                    continue
                bdlink = 'http://www.******.com/' + path
                _save_to_netdisk(bdlink, cookies_list, driver_path)


def _collect_login_cookies(driver_path, login_wait):
    """Open Baidu Netdisk, wait for a manual login and return the cookies."""
    driver = webdriver.Chrome(driver_path)
    try:
        driver.get('https://pan.baidu.com/')
        time.sleep(login_wait)
        return driver.get_cookies()
    finally:
        # quit() (not close()) also terminates the chromedriver process.
        driver.quit()


def _fetch_soup(url, headers):
    """Download *url* with the given headers and parse it as GBK HTML."""
    response = requests.get(url, headers=headers)
    response.encoding = 'gbk'  # pages are decoded as GBK before parsing
    return BeautifulSoup(response.text, "lxml")


def _extract_download_path(onclick):
    """Return the text between ``openwindow`` and ``400`` in *onclick*.

    ``amp;`` is removed first, and surrounding punctuation and whitespace
    are stripped from the result. Returns '' when the pattern is not found.
    """
    match = re.search("openwindow(.+?)400", onclick.replace('amp;', ''))
    if match is None:
        return ''
    return match.group(1).strip(string.punctuation).strip()


def _click_until_window_changes(driver, element, attempts):
    """Click *element* once per second until the current window changes.

    Stops after *attempts* clicks, or as soon as Selenium reports an error
    (for example when the element is no longer clickable); the error is
    printed rather than raised because the click loop is best-effort.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from selenium.common.exceptions import WebDriverException

    start_handle = driver.current_window_handle
    try:
        for _ in range(attempts):
            if driver.current_window_handle != start_handle:
                break
            element.click()
            time.sleep(1)
    except WebDriverException as exc:
        print('click loop stopped: %s' % exc)


def _save_to_netdisk(bdlink, cookies_list, driver_path):
    """Open *bdlink* in a new browser and click through the save dialog."""
    driver = webdriver.Chrome(driver_path)
    try:
        driver.get(bdlink)

        # Import the cookies of the logged-in session into this browser.
        for saved_cookie in cookies_list:
            cookie = dict(saved_cookie)  # copy: leave the shared list untouched
            if not cookie['domain'].startswith('.'):
                cookie['domain'] = '.' + cookie['domain']
            driver.add_cookie(cookie)
        driver.refresh()

        anchors = driver.find_elements_by_tag_name("a")
        if not anchors:
            print('no link found on %s, skipping' % bdlink)
            return
        print(str(anchors[0]))
        anchors[0].click()
        time.sleep(1)

        # Move to the window opened by the click (the last handle that is
        # not the current one).
        opener_handle = driver.current_window_handle
        handles = driver.window_handles
        print(str(len(handles)))
        for handle in handles:
            if handle != opener_handle:
                driver.switch_to.window(handle)

        # Click the "save to netdisk" link.
        save_links = driver.find_elements_by_link_text('保存到网盘')
        if not save_links:
            print('no save link found for %s, skipping' % bdlink)
            return
        _click_until_window_changes(driver, save_links[0], 1000)

        # Confirm the save dialog.
        confirm_links = driver.find_elements_by_link_text("确定")
        if not confirm_links:
            print('no confirm link found for %s, skipping' % bdlink)
            return
        _click_until_window_changes(driver, confirm_links[0], 10)

        time.sleep(2)
    finally:
        # Always shut the browser down, including every window it opened.
        driver.quit()



webcrawler(4)#call the function to crawl the resources on the first four pages

猜你喜欢

转载自blog.csdn.net/whueratsjtuer/article/details/78680056
今日推荐