首先需要在招聘狗网站注册一个账号。招聘狗是面向企业 HR 的网站,注册后通常会要求进行企业验证,这里我们直接跳过企业验证即可。下面是完整的实现过程,代码中附有详细注释:
-
import json
-
import os
-
import random
-
import re
-
import sys
-
import traceback
-
import time
-
from PIL import Image
-
from lxml import html as lxml_html
-
import selenium
-
from selenium import webdriver
-
from selenium.common.exceptions import NoSuchElementException
-
from selenium.webdriver import ActionChains
-
import requests
-
import base64
-
from requests.exceptions import ConnectionError
-
import http.cookiejar
-
import logging
-
from dama2_API import Dama2API
-
# Third-party library that supplies randomly chosen User-Agent strings
-
from fake_useragent import UserAgent
-
# Shared fake_useragent instance; RTC_zhaopingou.search() reads ua.firefox
# to swap the session's User-Agent when the site returns a short response.
ua = UserAgent()
-
class RTC_zhaopingou(object):
    """Scraper client for qiye.zhaopingou.com.

    Logs in through a Firefox webdriver (solving the click CAPTCHA with the
    Dama2 service), copies the browser cookies into a ``requests`` session and
    then uses that session to fetch resume list pages and resume details.
    """

    BASE_URL = 'http://qiye.zhaopingou.com/'
    # search()/open_resume() give up and return '' once this many requests
    # have been attempted.
    MAX_REQUESTS = 10
    # Responses shorter than this are treated as error payloads.
    MIN_VALID_LENGTH = 2000

    def __init__(self, account: dict, debug=False, visible=-1, last_try=False):
        """Create the HTTP session and the CAPTCHA client.

        :param account: dict with non-empty 'user_id' and 'password'; login()
            additionally reads 'username'.
        :param debug: accepted for compatibility; not used by this class.
        :param visible: accepted for compatibility; not used by this class.
        :param last_try: accepted for compatibility; not used by this class.
        :raises KeyError: if 'user_id' or 'password' is missing.
        :raises AssertionError: if 'user_id' or 'password' is empty.
        """
        assert account['user_id']
        assert account['password']
        # login() and _login_process_captcha() read the credentials from here.
        self.account = account
        # Filled by login() from the 'hrkeepToken' cookie.
        self.token = None
        logging.info('Change webdriver to FireFox')
        # Session used for the list and detail requests.
        self.session = requests.Session()
        self.session.headers = {
            'Host': "qiye.zhaopingou.com",
            "Origin": "http://qiye.zhaopingou.com",
            "Referer": "http://qiye.zhaopingou.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
        }
        # Dama2 client; the dama2_API module has to be obtained separately.
        self.dama2 = Dama2API()

    def login(self):
        """Log in with Firefox and copy the resulting cookies to the session.

        The webdriver is always closed before returning, whether the login
        succeeded or not, so repeated re-logins do not pile up browsers.

        :return: True when the post-login URL is the expected home page,
            False when the CAPTCHA could not be solved or the URL differs.
        """
        l = logging
        l.info("Processing Login...")
        self.driver = webdriver.Firefox()
        driver = self.driver
        try:
            driver.set_window_size(1920, 1080)
            driver.implicitly_wait(10)
            driver.get(self.BASE_URL)
            # A city chooser is shown right after the page opens.
            driver.find_element_by_xpath('//div[@class="city-now citys"]').click()
            # Type the credentials one character at a time with random pauses
            # to imitate manual input.
            for ch in self.account['username']:
                driver.find_element_by_xpath('//input[@placeholder="请输入手机号/邮箱/狗狗号"]').send_keys(ch)
                time.sleep(random.uniform(0.2, 0.8))
            for ch in self.account['password']:
                driver.find_element_by_xpath('//input[@placeholder="请输入密码"]').send_keys(ch)
                time.sleep(random.uniform(0.2, 0.8))
            # The button that opens the CAPTCHA lives inside the first iframe,
            # so it cannot be located from the top-level document.
            driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))
            driver.find_element_by_xpath('//span[@class="captcha-widget-text"]').click()
            # Give the CAPTCHA time to load, then go back to the main document.
            time.sleep(5)
            driver.switch_to.default_content()
            # Clicking the button adds a second, sibling iframe that holds the
            # CAPTCHA panel.
            driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[1])
            captcha_xpath = '//div[@class="lc-panel"]'
            if not self._login_process_captcha(captcha_xpath):
                l.info('login failed due to CAPTCHA, submit_count')
                return False

            driver.switch_to.default_content()
            driver.find_element_by_id('form_login').click()
            time.sleep(3)
            # Success is judged by the URL the browser ends up on.
            if driver.current_url != self.BASE_URL:
                l.info('login failed: unexpected url after submit')
                return False

            l.info('login sucess!!!')
            # Copy the browser cookies into the requests session so that the
            # list and detail requests are authenticated.
            cookie = dict()
            for item in driver.get_cookies():
                cookie[item['name']] = item['value']
                if item['name'] == 'hrkeepToken':
                    self.token = item['value']
            self.session.cookies = requests.utils.cookiejar_from_dict(cookie)
            l.info("get cookie: {}".format(cookie))
            return True
        finally:
            # The browser is not needed after login.
            driver.quit()

    def _login_process_captcha(self, captcha_xpath):
        """Solve the click CAPTCHA shown in the currently selected iframe.

        Screenshots the CAPTCHA element, sends the image to Dama2, clicks the
        returned coordinates and checks the widget text in the first iframe.
        The loop runs while ``attempt <= maximum``.

        :param captcha_xpath: XPath of the CAPTCHA panel element.
        :return: True once the widget text contains '验证成功', False when all
            attempts are used up.
        """
        l = logging
        driver = self.driver
        captcha_element = driver.find_element_by_xpath(captcha_xpath)
        l.debug('offset: %s', captcha_element.location)
        dama2 = self.dama2
        # Directory where the CAPTCHA screenshot is stored.
        shm_dir = r'/tmp/zhaopingou/'
        os.makedirs(shm_dir, exist_ok=True)
        captcha_img_path = os.path.join(
            shm_dir, 'captcha_img_{user_id}.png'.format(user_id=self.account['user_id']))
        maximum = 20
        attempt = 0
        while attempt <= maximum:
            l.info(f'Trying to decode CAPTCHA: {attempt}/{maximum}')
            # Re-locate the element on every attempt and screenshot it.
            captcha_element = driver.find_element_by_xpath(captcha_xpath)
            captcha_element.screenshot(captcha_img_path)
            try:
                # Dama2 returns the coordinates to click for this image.
                captcha_id, coordinate_list = dama2.decode_captcha(
                    captcha_type=6137, file_path=captcha_img_path)
                l.info(f'coordinate_list:{coordinate_list}')
            except Exception as err:
                err_str = str(err)
                tb = traceback.format_exc()
                msg = f'Exception occurred when decode CAPTCHA, err: {err_str}, tb:\n{tb}'
                l.warning(msg)
                attempt += 1
                continue
            # Move the mouse to each returned coordinate and click it.
            for xy in coordinate_list:
                action = ActionChains(driver)
                action.move_to_element_with_offset(captcha_element, xy[0], xy[1]).click()
                action.perform()
                time.sleep(random.uniform(0.5, 2))
            # The verification result is shown on the button in the first
            # iframe, so leave the CAPTCHA iframe to read it.
            driver.switch_to.default_content()
            driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[0])
            text = driver.find_element_by_xpath('//span[@class="captcha-widget-text"]').text
            if '验证成功' in text:
                l.info('验证码验证成功!')
                time.sleep(random.uniform(1, 2))
                return True
            # Failed: go back to the CAPTCHA iframe and try a fresh image.
            driver.switch_to.default_content()
            driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[1])
            l.info('fail,and try it again')
            attempt += 1
            time.sleep(2)
        return False

    def search(self, keyword, page_to_go):
        """Search resumes by keyword and return one list page as JSON text.

        :param keyword: non-empty search keyword.
        :param page_to_go: page value sent to the endpoint; also stored in the
            result under 'current_page'.
        :return: JSON string of the response with 'current_page' added, or ''
            once MAX_REQUESTS requests have been attempted.
        """
        l = logging
        assert keyword
        self.keyword = keyword
        # POST parameters as captured from the browser.
        # NOTE(review): 'pageSize' carries the page index and 'pageNo' is a
        # fixed "25"; the names look swapped — confirm against the endpoint.
        params = {
            "pageSize": page_to_go,
            "pageNo": "25",
            "keyStr": keyword,
            "companyName": "",
            "schoolName": "",
            "keyStrPostion": "",
            "postionStr": "",
            "startDegrees": "-1",
            "endDegress": "-1",
            "startAge": "0",
            "endAge": "0",
            "gender": "-1",
            "region": "",
            "timeType": "-1",
            "startWorkYear": "-1",
            "endWorkYear": "-1",
            "beginTime": "",
            "endTime": "",
            "isMember": "-1",
            "hopeAdressStr": "",
            "cityId": "-1",
            "updateTime": "",
            "tradeId": "",
            "clientNo": "",
            "userToken": self.token,
            "clientType": "2"
        }
        retry = 0
        while True:
            # The real endpoint takes a millisecond timestamp as query string.
            search_url = "http://qiye.zhaopingou.com/zhaopingou_interface/find_warehouse_by_position_new?timestamp=" + str(int(time.time() * 1000))
            l.info('search_url:{}'.format(search_url))
            self.current_url = search_url
            # A re-login replaces the token, so refresh it on every attempt.
            params["userToken"] = self.token
            l.debug(f'Open search page. url,params,keyword,userToken: {search_url},{params},{keyword},{self.token}')
            retry += 1
            if retry == self.MAX_REQUESTS + 1:
                return ''
            try:
                res = self.session.post(search_url, data=params)
            except ConnectionError:
                l.info("ConnectionError! Sleep 5 minutes and retry...")
                time.sleep(300)
                self.current_url = search_url
                continue

            l.info('current url is:{}'.format(res.url))
            if res.url != search_url:
                # A different final URL is treated as an expired session.
                if self.login():
                    continue
                l.warning("Login failed!")
                sys.exit('login failed')

            if not res.text:
                l.info("Service is busy. Wait 5 minutes and retry...")
                time.sleep(300)
                l.info('Continue Searching...')
                continue

            body = str(res.text)
            if len(body) < self.MIN_VALID_LENGTH:
                # Short bodies are error payloads rather than result lists.
                if '请您登录后查看简历' in body:
                    self.login()
                    continue
                # Switch to another User-Agent before backing off.
                self.session.headers['User-Agent'] = ua.firefox
                l.info(f'errorcode msg:{body}')
                l.info('Too frequent operation, please try again in a minute')
                time.sleep(random.randint(61, 100))
                continue

            try:
                resume_list = json.loads(res.text)
                # Record which page this list belongs to.
                resume_list["current_page"] = page_to_go
            except (ValueError, TypeError):
                l.warning(res.text)
                l.warning("something wrong!sleep 5 minutes and retry...")
                time.sleep(300)
                continue
            result = json.dumps(resume_list, ensure_ascii=False)
            l.info(f'search_resume_list_info:{result}')
            return result

    def open_resume(self, url):
        """Fetch the detail data of one resume and return it as JSON text.

        :param url: resume URL; the text after the first '=' is sent as
            'resumeHtmlId'.
        :return: JSON string of the response, or '' once MAX_REQUESTS
            requests have been attempted.
        :raises IndexError: if ``url`` contains no '='.
        """
        l = logging
        l.debug(f'Open a resume: request_url: {url}')
        resumeHtmlId = (url.split("="))[1]
        # POST parameters as captured from the browser.
        open_resume_data = {
            "resumeHtmlId": resumeHtmlId,
            "keyStr": "",
            "keyPositionName": "",
            "tradeId": "",
            "postionStr": "",
            "jobId": "0",
            "companyName": "",
            "schoolName": "",
            "clientNo": "",
            "userToken": self.token,
            "clientType": "2"
        }
        retry = 0
        while True:
            # The real endpoint takes a millisecond timestamp as query string.
            openresumeurl = "http://qiye.zhaopingou.com/zhaopingou_interface/zpg_find_resume_html_details?timestamp=" + str(int(time.time() * 1000))
            l.info('resume_url:{}'.format(openresumeurl))
            # A re-login replaces the token, so refresh it on every attempt.
            open_resume_data["userToken"] = self.token
            retry += 1
            if retry == self.MAX_REQUESTS + 1:
                return ''
            try:
                res = self.session.post(url=openresumeurl, data=open_resume_data)
            except ConnectionError:
                l.info("ConnectionError! Sleep 5 minutes and retry...")
                time.sleep(300)
                continue

            l.info('current url is:{}'.format(res.url))
            if res.url != openresumeurl:
                l.info("cookie is invalid. Login with webdriver")
                if self.login():
                    continue
                l.warning("Login failed!")
                sys.exit('login failed')

            if not res.text:
                l.info("Service is busy. Wait 5 minutes and retry...")
                time.sleep(300)
                continue

            body = str(res.text)
            if len(body) < self.MIN_VALID_LENGTH:
                # Short bodies are error payloads rather than resume data.
                print('errorcode:', res.text)
                l.info(f'errorcode msg:{body}')
                l.info('Too frequent operation, please try again in a minute')
                time.sleep(random.randint(61, 100))
                continue

            page_len = len(res.text)
            self.current_url = openresumeurl
            l.info(f'Downloaded a resume, len: {page_len:,d}, current_url: {url}')
            try:
                resp_json = json.loads(res.text)
            except ValueError:
                l.warning(res.text)
                l.warning("something wrong! sleep 5 minutes and retry...")
                time.sleep(300)
                continue
            # Re-serialise so non-ASCII text is kept readable.
            return json.dumps(resp_json, ensure_ascii=False)
-
if __name__ == '__main__':
    # The credentials below are placeholders; fill in your own account.
    rtc_zhaopingou = RTC_zhaopingou(
        account={'user_id': '-701', 'username': '13419696888', 'password': '123'},
        debug=False,
        visible=1,
        last_try=False,
    )
    # Without a successful login there is no token or cookie to crawl with.
    if not rtc_zhaopingou.login():
        sys.exit('login failed')
    keyword_list = ['python', '大数据', '人工智能', 'java']
    # NOTE(review): the loop nesting was reconstructed from a paste that lost
    # its indentation — confirm that open_resume belongs inside the page loop.
    for kw in keyword_list:
        for i in range(1, 200):
            search_result = rtc_zhaopingou.search(kw, i)
            print('****************************************************************')
            res = rtc_zhaopingou.open_resume('http://qiye.zhaopingou.com/resume/detail?resumeId=5761920')
            print(res)
打码兔平台的客户端代码(即上面导入的 dama2_API 模块)需要自行下载,并放在与本脚本相同的目录下,之后就可以运行了。
希望能帮助大家!
欢迎大家关注我的博客:https://home.cnblogs.com/u/Python1234/
欢迎大家加入万人交流答疑群: