selenium和Python3.6实现招聘狗网站自动识别验证码登录!附源码

首先需要注册一个招聘狗账号。招聘狗网站面向企业HR,注册时一般会要求企业验证,这里可以直接跳过这一步。下面是完整的实现过程,代码中有详细注释:

  1. import json

  2. import os

  3. import random

  4. import re

  5. import sys

  6. import traceback

  7. import time

  8. from PIL import Image

  9. from lxml import html as lxml_html

  10. import selenium

  11. from selenium import webdriver

  12. from selenium.common.exceptions import NoSuchElementException

  13. from selenium.webdriver import ActionChains

  14. import requests

  15. import base64

  16. from requests.exceptions import ConnectionError

  17. import http.cookiejar

  18. import logging

  19. from dama2_API import Dama2API

  20. #随机获取useragent的第三方库

  21. from fake_useragent import UserAgent

  22. ua = UserAgent()

  23. class RTC_zhaopingou(object):

  24. def __init__(self, account: dict, debug=False, visible=-1, last_try=False):

  25. assert account['user_id']

  26. assert account['password']

  27. logging.info('Change webdriver to FireFox')

  28. #创建seeion对象,爬取列表页和详情页使用

  29. self.session = requests.Session()

  30. self.session.headers = {

  31. 'Host': "qiye.zhaopingou.com",

  32. "Origin":"http://qiye.zhaopingou.com",

  33. "Referer":"http://qiye.zhaopingou.com",

  34. "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",

  35. }

  36. #需要注册打码兔账号,从打码兔平台下载代码

  37. self.dama2 = Dama2API()

  38. def login(self):

  39. l = logging

  40. l.info("Processing Login...")

  41. self.driver = webdriver.Firefox()

  42. self.driver.set_window_size(1920, 1080)

  43. self.driver.implicitly_wait(10)

  44. driver = self.driver

  45. # login_url = 'http://qiye.zhaopingou.com/zhaopingou_interface/security_login?timestamp='+str(int(time.time()*1000))

  46. login_url = 'http://qiye.zhaopingou.com/'

  47. driver.get(login_url)

  48. #打开页面后出现的需要选择城市

  49. driver.find_element_by_xpath('//div[@class="city-now citys"]').click()

  50. #找到用户名和密码元素,模仿人手动输入

  51. for i inself.account['username']:

  52. driver.find_element_by_xpath('//input[@placeholder="请输入手机号/邮箱/狗狗号"]').send_keys(i)

  53. time.sleep(random.uniform(0.2,0.8))

  54. for j inself.account['password']:

  55. driver.find_element_by_xpath('//input[@placeholder="请输入密码"]').send_keys(j)

  56. time.sleep(random.uniform(0.2, 0.8))

  57. # 获取弹出验证码的按钮元素,这里有一个坑,按钮元素在iframe节点中,不能直接获取,需要通过driver.find_element_by_tag_name("iframe")切入到第一个iframe中,然后在通过xpath获取按钮元素

  58. # iframe = driver.find_element_by_id('captcha_widget_aiwaylekc')

  59. driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))

  60. # driver.switch_to.frame('captcha_widget_aiwaylekc')

  61. driver.find_element_by_xpath('//span[@class="captcha-widget-text"]').click()

  62. #等待5秒,避免出现有时候还未加载出来的情况,通过driver.switch_to.default_content()从iframe切换到主html页面

  63. time.sleep(5)

  64. driver.switch_to.default_content()

  65. #点击弹出验证码按钮后出现一个新的iframe,此时有两个iframe,并列的,从这页面切入到第二个iframe

  66. driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[1])

  67. # 验证码区域

  68. captcha_xpath = '//div[@class="lc-panel"]'

  69. # captcha_xpath = '#l-captcha-float_aiwaylekc'

  70. re = self._login_process_captcha(captcha_xpath)

  71. #登录成功

  72. if re:

  73. driver.switch_to.default_content()

  74. driver.find_element_by_id('form_login').click()

  75. time.sleep(3)

  76. current_url = driver.current_url

  77. #判断登录后的url是否是期望值

  78. expect_url = 'http://qiye.zhaopingou.com/'

  79. if current_url==expect_url:

  80. l.info('login sucess!!!')

  81. #获取cookie,并将cookie保存到session中,以便爬虫列表页和详情页使用

  82. cookie = dict()

  83. print(driver.get_cookies())

  84. for item in driver.get_cookies():

  85. # cookie += "; {}={}".format(item['name'], item["value"])

  86. cookie[item['name']] = item['value']

  87. if item['name'] == 'hrkeepToken':

  88. self.token = item['value']

  89. # 存储cookie

  90. self.session.cookies = requests.utils.cookiejar_from_dict(cookie, self.cookiejar)

  91. l.info("get cookie: {}".format(cookie))

  92. #登录成功,退出driver,后面不使用了

  93. self.driver.quit()

  94. returnTrue

  95. else:

  96. l.info('login failed due to CAPTCHA, submit_count')

  97. returnFalse

  98. def _login_process_captcha(self,captcha_xpath):

  99. l = logging

  100. driver = self.driver

  101. captcha_element = driver.find_element_by_xpath(captcha_xpath)

  102. #验证码坐标和大小

  103. offset = captcha_element.location

  104. print('offset:',offset)

  105. size = captcha_element.size

  106. # 验证码接口

  107. dama2 = self.dama2

  108. #保存验证码图片

  109. shm_dir = r'/tmp/zhaopingou/'

  110. if os.path.exists(shm_dir) isFalse:

  111. os.makedirs(shm_dir)

  112. captcha_img_path = os.path.join(shm_dir, 'captcha_img_{user_id}.png'.format(user_id=self.account['user_id']))

  113. maximum = 20

  114. attempt = 0

  115. while attempt<=maximum:

  116. l.info(f'Trying to decode CAPTCHA: {attempt}/{maximum}')

  117. #验证码元素

  118. captcha_element = driver.find_element_by_xpath(captcha_xpath)

  119. #截取验证码图片保存到captcha_img_path

  120. captcha_element.screenshot(captcha_img_path)

  121. try:

  122. #调用打码兔接口,传入验证码类型,验证码图片文件,返回坐标值coordinate_list

  123. captcha_id, coordinate_list = dama2.decode_captcha(captcha_type=6137, file_path=captcha_img_path)

  124. l.info(f'coordinate_list:{coordinate_list}')

  125. except Exception as err:

  126. err_str = str(err)

  127. tb = traceback.format_exc()

  128. msg = f'Exception occurred when decode CAPTCHA, err: {err_str}, tb:\n{tb}'

  129. l.warning(msg)

  130. attempt+=1

  131. # 发生异常时先返回主页面

  132. continue

  133. #将鼠标移动到返回的坐标位置并点击

  134. for xy in coordinate_list:

  135. action = ActionChains(driver)

  136. action.move_to_element_with_offset(captcha_element, xy[0], xy[1]).click()

  137. action.perform()

  138. time.sleep(random.uniform(0.5,2))

  139. #先切回到主html,再切到第一个iframe,获取之前的弹出验证按钮,判断内容是否是验证成功

  140. driver.switch_to.default_content()

  141. driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[0])

  142. text = driver.find_element_by_xpath('//span[@class="captcha-widget-text"]').text

  143. if text.find('验证成功')!=-1:

  144. l.info('验证码验证成功!')

  145. time.sleep(random.uniform(1,2))

  146. returnTrue

  147. else: #失败则再切回到第二个iframe,从新获取验证码

  148. driver.switch_to.default_content()

  149. driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[1])

  150. l.info('fail,and try it again')

  151. attempt+=1

  152. time.sleep(2)

  153. continue

  154. returnFalse

  155. #通过搜索关键字获取列表页面,并定位到某一页

  156. def search(self, keyword, page_to_go):

  157. '''''搜索简历,得到列表页面,数据为json格式'''

  158. l = logging

  159. assert keyword

  160. self.keyword = keyword

  161. # 使用firefox浏览器抓取post请求参数

  162. params = {

  163. "pageSize":page_to_go,

  164. "pageNo":"25",

  165. "keyStr":keyword,

  166. "companyName":"",

  167. "schoolName":"",

  168. "keyStrPostion":"",

  169. "postionStr":"",

  170. "startDegrees":"-1",

  171. "endDegress":"-1",

  172. "startAge":"0",

  173. "endAge":"0",

  174. "gender":"-1",

  175. "region":"",

  176. "timeType":"-1",

  177. "startWorkYear":"-1",

  178. "endWorkYear":"-1",

  179. "beginTime":"",

  180. "endTime":"",

  181. "isMember":"-1",

  182. "hopeAdressStr":"",

  183. "cityId":"-1",

  184. "updateTime":"",

  185. "tradeId":"",

  186. "clientNo":"",

  187. "userToken":self.token,

  188. "clientType":"2"

  189. }

  190. retry = 0

  191. whileTrue:

  192. #抓包获取请求的真实URL,后面是随机的数字字符串

  193. search_url = "http://qiye.zhaopingou.com/zhaopingou_interface/find_warehouse_by_position_new?timestamp=" + str(int(time.time() * 1000))

  194. l.info('search_url:{}'.format(search_url))

  195. self.current_url = search_url

  196. l.debug(f'Open search page. url,params,keyword,userToken: {search_url},{params},{keyword},{self.token}')

  197. retry += 1

  198. if retry == 11:

  199. return''

  200. try:

  201. #使用session请求

  202. res = self.session.post(search_url, data=params)

  203. except ConnectionError:

  204. l.info("ConnectionError! Sleep 5 minutes and retry...")

  205. time.sleep(300)

  206. self.current_url = search_url

  207. continue

  208. else:

  209. l.info('current url is:{}'.format(res.url))

  210. if res.url != search_url:

  211. login_result = self.login(load=False)

  212. if login_result:

  213. continue

  214. else:

  215. l.warning("Login failed!")

  216. sys.exit('login failed')

  217. elifnot res.text:

  218. l.info("Service is busy. Wait 5 minutes and retry...")

  219. time.sleep(300)

  220. l.info('Continue Searching...')

  221. continue

  222. #返回的数据异常,内容很少

  223. elif len(str(res.text))<2000:

  224. #若返回‘请您登录后查看简历’,则重新登录后在爬取

  225. if'请您登录后查看简历'in str(res.text):

  226. self.login(load=False)

  227. continue

  228. result = str(res.text)

  229. #更换useragent

  230. self.session.headers['User-Agent'] = ua.firefox

  231. l.info(f'errorcode msg:{result}')

  232. l.info('Too frequent operation, please try again in a minute')

  233. time.sleep(random.randint(61,100))

  234. continue

  235. else:

  236. try:

  237. #返回的正常数据,通过json.dumps()获取json数据

  238. resume_list = json.loads(res.text)

  239. resume_list["current_page"]=page_to_go

  240. # 在列表页面加入搜索页面

  241. res = json.dumps(resume_list,ensure_ascii=False)

  242. l.info(f'search_resume_list_info:{res}')

  243. return res

  244. except:

  245. l.warning(res.text)

  246. l.warning("something wrong!sleep 5 minutes and retry...")

  247. time.sleep(300)

  248. continue

  249. def open_resume(self, url):

  250. '''''

  251. 打开简历,得到详情页面

  252. url可通过base64加密的用户id构造

  253. '''

  254. l = logging

  255. l.debug(f'Open a resume: request_url: {url}')

  256. resumeHtmlId=(url.split("="))[1]

  257. # 设置前链

  258. #self.session.headers['Referer'] = "http://qiye.zhaopingou.com/resume?key="+self.keyword

  259. # 抓包获取简历详情页的请求参数

  260. open_resume_data={

  261. "resumeHtmlId": resumeHtmlId,

  262. "keyStr":"",

  263. "keyPositionName":"",

  264. "tradeId":"",

  265. "postionStr":"",

  266. "jobId":"0",

  267. "companyName":"",

  268. "schoolName":"",

  269. "clientNo":"",

  270. "userToken":self.token,

  271. "clientType":"2"

  272. }

  273. retry = 0

  274. whileTrue:

  275. #抓包获取详情页真实url

  276. openresumeurl = "http://qiye.zhaopingou.com/zhaopingou_interface/zpg_find_resume_html_details?timestamp=" + str(int(time.time() * 1000))

  277. l.info('resume_url:{}'.format(openresumeurl))

  278. retry += 1

  279. if retry == 11:

  280. return''

  281. try:

  282. res = self.session.post(url=openresumeurl,data=open_resume_data)

  283. except ConnectionError:

  284. l.info("ConnectionError! Sleep 5 minutes and retry...")

  285. time.sleep(300)

  286. continue

  287. else:

  288. # 返回的html页面

  289. l.info('current url is:{}'.format(res.url))

  290. if res.url != openresumeurl:

  291. l.info("cookie is invalid. Login with webdriver")

  292. login_result = self.login(load=False)

  293. if login_result:

  294. continue

  295. else:

  296. l.warning("Login failed!")

  297. sys.exit('login failed')

  298. ifnot res.text:

  299. l.info("Service is busy. Wait 5 minutes and retry...")

  300. time.sleep(300)

  301. continue

  302. elif len(str(res.text))<2000:

  303. print('errorcode:',res.text)

  304. result = str(res.text)

  305. l.info(f'errorcode msg:{result}')

  306. l.info('Too frequent operation, please try again in a minute')

  307. time.sleep(random.randint(61, 100))

  308. continue

  309. else:

  310. try:

  311. page_len = len(res.text)

  312. self.current_url = openresumeurl

  313. l.info(f'Downloaded a resume, len: {page_len:,d}, current_url: {url}')

  314. resp_json=json.loads(res.text)

  315. res_utf=json.dumps(resp_json,ensure_ascii=False)

  316. return res_utf

  317. except:

  318. l.warning(res.text)

  319. l.warning("something wrong! sleep 5 minutes and retry...")

  320. time.sleep(300)

  321. continue

  322. if __name__ == '__main__':

  323. #账号密码是假的,大家填写自己的账号密码

  324. rtc_zhaopingou = RTC_zhaopingou(account={'user_id': '-701', 'username': '13419696888', 'password': '123'},

  325. debug=False,

  326. visible=1, last_try=False)

  327. rtc_zhaopingou.login()

  328. keyword_list = ['python','大数据','人工智能','java']

  329. for kw in keyword_list:

  330. for i in range(1,200):

  331. search_result = rtc_zhaopingou.search(kw, i)

  332. print('****************************************************************')

  333. res = rtc_zhaopingou.open_resume(' http://qiye.zhaopingou.com/resume/detail?resumeId=5761920')

  334. print(res)

打码兔平台的接口代码(即上面导入的 dama2_API 模块)需要自己从打码兔平台下载,放在与本脚本同级的目录下之后即可运行。

希望能帮助大家!

欢迎大家关注我的博客:https://home.cnblogs.com/u/Python1234/

欢迎大家加入万人交流答疑群:

猜你喜欢

转载自www.cnblogs.com/Python1234/p/9063353.html