Selenium模拟登录拉勾网
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os, json, time
from urllib import parse
from lxml import etree
from fake_useragent import UserAgent
from pwd import username,password
from pymongo import MongoClient
# fake_useragent generator instance created once at import time.
ua = UserAgent()

# Keywords to search for.
keywords_ls = [
    'python',
    'java',
    'web',
    'c',
]

# Popular cities to search in.
citys_ls = [
    '北京', '上海', '深圳', '广州', '杭州',
    '成都', '南京', '武汉', '西安', '厦门',
    '长沙', '苏州', '天津',
]
class LaGouselenium():
    """Crawl Lagou job listings with Selenium and store them in MongoDB.

    The crawler logs in through a Chrome window, walks every
    (keyword, city) combination, stores one document per listing in the
    ``selenium.LGW`` collection and finally opens each stored job page to
    add its description.  Finished (keyword, city) pairs are written to a
    JSON file so that a later run can skip them.
    """

    # Progress file holding the (keyword, city) pairs that are already done.
    CRAWLED_CITY_PATH = './lagou_crawled_city.json'
    # Host the browser ends up on when the site wants a captcha solved.
    CAPTCHA_HOST = 'sec.lagou.com'

    def __init__(self, keywords_ls=keywords_ls, citys_ls=citys_ls):
        """Load resume state, connect to MongoDB and start Chrome.

        Args:
            keywords_ls: search keywords to crawl.
            citys_ls: city names (as shown on the site) to crawl per keyword.
        """
        self.keywords_ls = keywords_ls
        self.citys_ls = citys_ls
        # (keyword, city) pairs that were already crawled; restored from the
        # progress file so crawl() can skip them.
        self.crawl_city = self._load_crawl_city()
        self.col = MongoClient()['selenium']['LGW']
        # Number of documents stored so far.
        print('已抓取', self.col.count_documents({}))
        # Raw string: the path contains backslashes that must stay literal.
        self.chrome = webdriver.Chrome(r'D:\data\chromedriver\chromedriver.exe')
        # Implicit wait of three seconds for element lookups.
        self.chrome.implicitly_wait(3)

    def _load_crawl_city(self):
        """Return the finished (keyword, city) pairs saved by an earlier run."""
        path = self.CRAWLED_CITY_PATH
        if not os.path.exists(path):
            return []
        try:
            with open(path, 'r', encoding='utf8') as f:
                saved = json.load(f)
        except ValueError:
            # json.JSONDecodeError is a ValueError; an unreadable progress
            # file should not prevent the crawler from starting.
            print('断点文件无法解析,重新开始:', path)
            return []
        # JSON has no tuple type, so the pairs come back as lists.  Convert
        # them so that `(k, city) in self.crawl_city` can actually match.
        return [tuple(pair) for pair in saved]

    def _save_crawl_city(self):
        """Write the finished (keyword, city) pairs to the progress file."""
        with open(self.CRAWLED_CITY_PATH, 'w', encoding='utf8') as f:
            json.dump(self.crawl_city, f, ensure_ascii=False)

    def _wait_if_captcha(self):
        """Block until the user confirms when the browser is on the captcha host."""
        if self.CAPTCHA_HOST in self.chrome.current_url:
            input('欢迎进入验证码页面!手动处理完成后回车')

    def login(self):
        """Open the login page, submit the credentials and wait for the user.

        The credentials are the ``username`` / ``password`` names imported at
        the top of the file.  After submitting, the method blocks on
        ``input`` so a captcha can be solved by hand.
        """
        loginUrl = 'https://passport.lagou.com/login/login.html'
        self.chrome.get(loginUrl)
        # Account, password, submit.
        self.chrome.find_element_by_xpath('//form[@class="active"]/div[@data-propertyname="username"]/input').send_keys(username)
        self.chrome.find_element_by_xpath('//form[@class="active"]/div[@data-propertyname="password"]/input').send_keys(password)
        self.chrome.find_element_by_xpath('//form[@class="active"]/div[@data-propertyname="submit"]/input').click()
        # Pause so a captcha, if any, can be handled manually.
        input('如果出现验证码 手动验证后 回车, 否则直接回车')

    def crawl(self):
        """Crawl every (keyword, city) combination, then fill in job details.

        Pairs already present in ``self.crawl_city`` are skipped.  After each
        finished pair the progress file is rewritten.  Once all keywords are
        done, ``checkDetailPage`` is called.
        """
        # The red-packet popup button is clicked only on the first page load.
        first_visit = True
        for k in self.keywords_ls:
            # Keyword currently being crawled.
            print(k)
            # Example: https://www.lagou.com/jobs/list_java/p-city_2?px=default#filterBox
            # px=new: sorted by most recently published.
            url = 'https://www.lagou.com/jobs/list_{}/p-city_0?px=new&#filterBox'.format(k)
            self.chrome.get(url)
            self._wait_if_captcha()
            if first_visit:
                self.chrome.find_element_by_class_name('body-btn').click()
                first_visit = False
            for city in self.citys_ls:
                print(city)
                # Resume support: skip pairs finished in an earlier run.
                if (k, city) in self.crawl_city:
                    continue
                # Select the city by its link text.
                self.chrome.find_element_by_link_text(city).click()
                self._wait_if_captcha()
                self._crawl_all_pages(k, city)
                # Refresh, otherwise the city link cannot be located in the
                # next iteration.
                self.chrome.refresh()
                # Record this pair as done and persist the progress.
                self.crawl_city.append((k, city))
                self._save_crawl_city()
                print('已爬+++++++++++++', city, k)
        self.checkDetailPage()

    def _crawl_all_pages(self, keyword, city):
        """Parse the current result list and keep clicking "next" until the last page."""
        while True:
            # Total number of pages.
            totalpage = int(self.chrome.find_element_by_class_name('totalNum').text.strip())
            # Current page number.
            curpage = int(self.chrome.find_element_by_class_name('curNum').text.strip())
            print('翻页', curpage, '/', totalpage)
            self.parseListPage(keyword, city)
            # `>=` rather than `==` so the loop also ends if the current
            # page number is ever reported above the total.
            if curpage >= totalpage:
                break
            self.chrome.find_element_by_class_name('next').click()
            self._wait_if_captcha()

    def handleTime(self, timestr):
        """Placeholder for publish-time parsing; not implemented, returns None."""
        pass

    def parseListPage(self, keyword, city):
        """Store every new listing of the currently displayed result page.

        Args:
            keyword: search keyword the page belongs to (saved on each item).
            city: city the page belongs to (saved on each item).

        Processing of the page stops at the first listing whose ``jd_id`` is
        already in the collection, and also when a listing's link cannot be
        read.
        """
        WebDriverWait(self.chrome, 3).until(EC.presence_of_all_elements_located(('class name', 'con_list_item')))
        time.sleep(1)
        for ele in self.chrome.find_elements_by_class_name('con_list_item'):
            item = {"keyword": keyword, 'city': city}
            try:
                url = ele.find_element_by_xpath('./div/div/div/a').get_attribute('href')
                # The job id is the file name of the detail URL without ".html".
                jd_id = url.split('.html')[0].split('/')[-1].strip()
            except Exception as exc:
                # Best effort as before: give up on the rest of this page,
                # but no longer swallow KeyboardInterrupt / SystemExit and
                # say what went wrong.
                print('解析列表项失败,跳过本页剩余内容:', exc)
                return
            print(jd_id)
            if self.col.find_one({'jd_id': jd_id}):
                print('重复', jd_id)
                # Use `continue` for a first full crawl; `break` suits later
                # incremental crawls.
                break
            item['jd_id'] = jd_id
            text = ele.find_element_by_xpath('./div').text
            ls = text.split('\n')
            if len(ls) < 4:
                # Title, address and salary are read from lines 0, 1 and 3;
                # skip cards that do not have them instead of crashing.
                print('字段不完整,跳过', jd_id)
                continue
            item['title'] = ls[0]
            item['addr'] = ls[1]
            # pubtime = self.handleTime(ls[2])
            # Further fields can be added here.
            # First space-separated token of line 3, split on "-".
            item['salaryRange'] = ls[3].split(' ')[0].split('-')
            print(item, '***********')
            self.col.insert_one(item)

    def checkDetailPage(self):
        """Visit the detail page of stored jobs that lack a description.

        For each such document the job page is opened and the text of the
        ``job-detail`` element is saved as ``job_description``.
        """
        # Documents written by parseListPage have no `job_description` field
        # at all, which `{'job_description': False}` alone would never match.
        missing = {'$or': [{'job_description': {'$exists': False}},
                           {'job_description': False}]}
        # Snapshot the work list first so the database cursor is not held
        # open while the browser visits each page.
        pending = list(self.col.find(missing))
        for item in pending:
            url = 'https://www.lagou.com/jobs/%s.html' % item['jd_id'].strip()
            self.chrome.get(url)
            self._wait_if_captcha()
            des = self.chrome.find_element_by_class_name('job-detail').text.strip()
            self.col.update_one({'jd_id': item['jd_id']}, {'$set': {'job_description': des}})
if __name__ == "__main__":
    # Script entry point: log in, then crawl all keyword/city combinations.
    lagou = LaGouselenium(keywords_ls, citys_ls)
    try:
        lagou.login()
        lagou.crawl()
    finally:
        # Always shut the browser and its driver process down, even when the
        # crawl stops with an error or is interrupted.
        lagou.chrome.quit()
效果演示