I'm new to Python and spent a day hacking together a crawler.
# coding=utf-8
import re
import string
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

urlprelix = 'http://www.******.com/'

def webcrawler(max_page):
    page = 1
    # Log in to Baidu Pan once by hand and harvest the cookies, so the
    # drivers opened later can reuse the session instead of logging in again.
    driverold = webdriver.Chrome('/Applications/chromedriver')  # path to chromedriver
    driverold.get('https://pan.baidu.com/')
    time.sleep(30)  # time window for logging in manually
    cookies_list = driverold.get_cookies()
    driverold.close()

    while page <= max_page:
        if page == 1:
            url = urlprelix
        else:
            url = urlprelix + '440_' + str(page) + '.html'  # full URL of each listing page
        # Send a browser-like User-Agent to get past simple anti-crawler checks
        headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) '
                                 'Gecko/20091201 Firefox/3.5.6'}
        source_code = requests.get(url, headers=headers)
        source_code.encoding = 'gbk'  # the site is GBK-encoded
        plaintxt = source_code.text
        soup = BeautifulSoup(plaintxt, "lxml")

        for link in soup.findAll('a', {'class': 'title yt-uix-sessionlink'}):
            href = link.get('href')
            fulllink = 'http://www.16xx8.com' + href
            print(fulllink)
            subsrc_code = requests.get(fulllink, headers=headers)
            subsrc_code.encoding = 'gbk'
            subplaintxt = subsrc_code.text
            subsoup = BeautifulSoup(subplaintxt, "lxml")

            for sublink in subsoup.findAll("a", {'onclick': re.compile('javascript:openwindow')}):
                # Pull the Baidu Pan share link out of the onclick handler
                magnet = sublink.get('onclick').replace('amp;', '')
                magnet1 = re.findall("openwindow(.+?)400", magnet)
                magnet2 = str(magnet1).strip(string.punctuation).strip()
                bdlink = 'http://www.******.com/' + magnet2

                driver = webdriver.Chrome('/Applications/chromedriver')
                driver.get(bdlink)
                for cookiedold in cookies_list:
                    if cookiedold['domain'][0] != '.':
                        cookiedold['domain'] = '.' + cookiedold['domain']
                    driver.add_cookie(cookiedold)  # import the saved cookies into the new page
                driver.refresh()

                element = driver.find_elements_by_tag_name("a")
                print(str(element[0]))
                element[0].click()  # opens the share page in a new window
                time.sleep(1)

                curhandle = driver.current_window_handle
                allhandle = driver.window_handles
                print(str(len(allhandle)))
                for handle in allhandle:
                    if handle != curhandle:
                        driver.switch_to.window(handle)

                # Click the "保存到网盘" (save to my Pan) button
                curhandle = driver.current_window_handle
                ele1 = driver.find_elements_by_link_text('保存到网盘')
                try:
                    for _ in range(1000):
                        if curhandle == driver.current_window_handle:
                            ele1[0].click()
                            time.sleep(1)
                        else:
                            break
                except Exception:
                    pass

                # Confirm the save by clicking the 确定 (OK) button
                curhandle = driver.current_window_handle
                ele2 = driver.find_element_by_link_text("确定")
                try:
                    for _ in range(10):
                        if curhandle == driver.current_window_handle:
                            ele2.click()
                            time.sleep(1)
                        else:
                            break
                except Exception:
                    pass

                time.sleep(2)
                driver.close()
        page += 1

webcrawler(4)  # call the function to crawl the resources on the first four pages
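The part that took the most fiddling is the cookie hand-off: log in once in a throwaway browser, export its cookies, then inject them into each fresh driver so Baidu Pan doesn't ask for a login again. Below is a minimal sketch of just that pattern, separate from the crawler above; it assumes chromedriver is on your PATH, and the 30-second manual-login window is just a placeholder for however long the login actually takes.

import time
from selenium import webdriver

# One-time login: open a browser, log in by hand, then harvest the cookies.
login_driver = webdriver.Chrome()           # assumes chromedriver is on PATH
login_driver.get('https://pan.baidu.com/')  # log in manually in this window
time.sleep(30)                              # rough wait for the manual login
cookies = login_driver.get_cookies()
login_driver.close()

# Reuse the session: cookies can only be added for the domain currently loaded,
# so visit the target site first, add the cookies, then refresh.
driver = webdriver.Chrome()
driver.get('https://pan.baidu.com/')        # must be on the cookies' domain before add_cookie
for cookie in cookies:
    driver.add_cookie(cookie)
driver.refresh()                            # the page now loads with the logged-in session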