文章目录
验证码问题
我们的目标是用程序自动识别滑动验证码的缺口位置,并完成滑动验证。
查看页面的 HTML 可以看到验证码由两张图片组成(背景图和滑块图),我们可以把这两张图片下载下来,然后用模板匹配获得缺口位置。
准备工作
本次我们使用 selenium 模拟浏览器,浏览器为 Chrome;使用 requests 下载图片,使用 cv2(opencv-python)实现模板匹配。请确保这些库以及与 Chrome 版本匹配的 chromedriver 均已正确安装。
程序流程
-
使用 selenium 定位两张图片,并用 requests 下载到本地
# Background image: read its URL from the page and save it locally.
bg = self.driver.find_element_by_class_name("img-bg").get_attribute('src')
with open('bg.jpg', mode='wb') as f:
    f.write(requests.get(bg).content)
# Slider (puzzle piece) image: same procedure.
block = self.driver.find_element_by_class_name('img-block').get_attribute('src')
with open('block.png', mode='wb') as f:
    f.write(requests.get(block).content)
-
使用 cv2 把 block.png 图片的透明部分裁剪掉(不裁剪会影响模板匹配准确率)
-
使用 cv2 进行模板匹配得到图片位置
def template_matching(self, bg, block):
    """Locate the slider piece inside the background image.

    :param bg: path of the background (target) image
    :param block: path of the cropped slider (template) image
    :return: (x, y) of the top-left corner of the best match in the target
    :raises IOError: if either image cannot be read
    """
    # Read the target image and the template image.
    target = cv2.imread(bg)
    template = cv2.imread(block)
    # cv2.imread returns None instead of raising when a file is unreadable.
    if target is None or template is None:
        raise IOError('cannot read image(s): %r, %r' % (bg, block))
    # Run template matching with the TM_SQDIFF_NORMED method.
    result = cv2.matchTemplate(target, template, cv2.TM_SQDIFF_NORMED)
    # Rescale the scores into [0, 1]; the position of the minimum is unaffected.
    cv2.normalize(result, result, 0, 1, cv2.NORM_MINMAX, -1)
    # For the squared-difference methods the best match is the minimum.
    _min_val, _max_val, min_loc, _max_loc = cv2.minMaxLoc(result)
    return min_loc[0], min_loc[1]
-
根据图片位置模拟拖动滑块
def move_to_gap(self, tracks):
    """Drag the slider handle along the given horizontal offsets.

    :param tracks: iterable of x offsets (pixels), applied one after another
    """
    # Press and hold the slider handle.
    drop = self.driver.find_element_by_class_name("verifyicon")
    ActionChains(self.driver).click_and_hold(drop).perform()
    # Replay every step of the track as a relative horizontal move.
    for x in tracks:
        ActionChains(self.driver).move_by_offset(xoffset=x, yoffset=0).perform()
    # Pause briefly before letting go of the handle.
    time.sleep(0.5)
    ActionChains(self.driver).release().perform()
selenium 防识别
房天下会检测 selenium,因此创建 driver 之后一定要加上下面这段代码,把 navigator.webdriver 设为 undefined(针对最新版本的 Chrome)
options = webdriver.ChromeOptions()
# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(options=options)
# Register a script that runs before any page script on every new document
# and makes `navigator.webdriver` report `undefined`.
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
完整代码
已经把程序封装成类了,使用时直接导入该类,并把验证码页面的 url 传给 run 方法,具体用法见最后两行代码。
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver import ActionChains
import time
import requests
import cv2
import numpy as np
class Captcha():
    """Solve a slider captcha with Selenium and OpenCV template matching.

    Workflow (see :meth:`run`): open the captcha page, download the
    background and slider images, crop the transparent border off the slider
    image, locate the slider inside the background with template matching and
    drag the slider handle by the resulting x offset.

    NOTE(review): the class uses Selenium 3 style calls
    (``find_element_by_*``, ``executable_path=``); confirm the installed
    Selenium version still provides them.
    """

    def __init__(self, executable_path='/usr/local/driver/chromedriver', headless=False):
        """Start Chrome and hide the ``navigator.webdriver`` flag.

        :param executable_path: path of the chromedriver binary
        :param headless: when true, add the flags the original author
            recommends for running on a CentOS server
        """
        options = webdriver.ChromeOptions()
        if headless:
            # '--no-sandbox' is flagged as the important one by the original author.
            for flag in ('--headless', '--disable-gpu', '--no-sandbox'):
                options.add_argument(flag)
        # `options=` replaces the deprecated `chrome_options=` keyword.
        self.driver = webdriver.Chrome(options=options, executable_path=executable_path)
        # Runs before any page script on every new document so the site
        # cannot read `navigator.webdriver`. The script text is kept verbatim.
        self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
        })

    def _download(self, url, path, timeout=10):
        """Download ``url`` to ``path``; raise on HTTP errors or timeout."""
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        with open(path, mode='wb') as f:
            f.write(response.content)

    def get_captcha(self):
        """Download both captcha images and crop the slider image.

        Writes ``bg.jpg`` (background) and ``block.png`` (slider) to the
        current directory; ``block.png`` is then overwritten with the version
        whose transparent border has been cropped away.

        :raises IOError: if the downloaded slider image cannot be decoded
        """
        # Background image
        bg = self.driver.find_element_by_class_name("img-bg").get_attribute('src')
        self._download(bg, 'bg.jpg')
        # Slider image
        block = self.driver.find_element_by_class_name('img-block').get_attribute('src')
        self._download(block, 'block.png')
        # IMREAD_UNCHANGED keeps the alpha channel needed for cropping.
        image = cv2.imread('block.png', cv2.IMREAD_UNCHANGED)
        if image is None:
            raise IOError("cannot decode 'block.png'")
        # Save the cropped slider image over the downloaded one.
        box = self.get_transparency_location(image)
        result = self.cv2_crop(image, box)
        cv2.imwrite('block.png', result)

    def move_to_gap(self, tracks):
        """Drag the slider handle along the given horizontal offsets.

        :param tracks: iterable of x offsets (pixels), applied one after another
        """
        drop = self.driver.find_element_by_class_name("verifyicon")
        ActionChains(self.driver).click_and_hold(drop).perform()
        for x in tracks:
            ActionChains(self.driver).move_by_offset(xoffset=x, yoffset=0).perform()
        # Pause briefly before letting go of the handle.
        time.sleep(0.5)
        ActionChains(self.driver).release().perform()

    def cv2_crop(self, im, box):
        """Crop an image the way ``PIL.Image.crop`` does.

        :param im: image array as loaded by cv2
        :param box: ``(left, upper, right, lower)``; right/lower are exclusive
        :return: an independent copy of the selected region
        """
        left, upper, right, lower = box
        return im[upper:lower, left:right].copy()

    def get_transparency_location(self, image):
        """Return the bounding box of the non-transparent pixels.

        :param image: BGRA image array as loaded by cv2 (4 channels)
        :return: ``(left, upper, right, lower)`` with exclusive right/lower,
            directly usable by :meth:`cv2_crop`
        :raises ValueError: if the image has no alpha channel or is fully
            transparent
        """
        if image is None or image.ndim != 3 or image.shape[2] != 4:
            raise ValueError('image must have an alpha channel (4 channels)')
        opaque = image[:, :, 3] != 0
        rows = np.flatnonzero(opaque.any(axis=1))  # rows holding opaque pixels
        cols = np.flatnonzero(opaque.any(axis=0))  # columns holding opaque pixels
        if rows.size == 0:
            raise ValueError('image is fully transparent')
        # +1 turns the index of the last opaque row/column into an exclusive
        # bound, so cropping does not cut off the final row and column.
        return int(cols[0]), int(rows[0]), int(cols[-1]) + 1, int(rows[-1]) + 1

    def template_matching(self, bg, block):
        """Locate the slider piece inside the background image.

        :param bg: path of the background (target) image
        :param block: path of the cropped slider (template) image
        :return: (x, y) of the top-left corner of the best match in the target
        :raises IOError: if either image cannot be read
        """
        target = cv2.imread(bg)
        template = cv2.imread(block)
        # cv2.imread returns None instead of raising when a file is unreadable.
        if target is None or template is None:
            raise IOError('cannot read image(s): %r, %r' % (bg, block))
        result = cv2.matchTemplate(target, template, cv2.TM_SQDIFF_NORMED)
        # Rescale the scores into [0, 1]; the position of the minimum is unaffected.
        cv2.normalize(result, result, 0, 1, cv2.NORM_MINMAX, -1)
        # For the squared-difference methods the best match is the minimum.
        _min_val, _max_val, min_loc, _max_loc = cv2.minMaxLoc(result)
        return min_loc[0], min_loc[1]

    def get_track(self, distance):
        """Build a list of per-step offsets that add up to ``distance``.

        The slider accelerates until 4/5 of the distance, then decelerates,
        deliberately overshoots by 10 px and finally moves back by 10 px.

        Formulas used per step::

            s = v0 * t + 0.5 * a * t * t
            v = v0 + a * t

        :param distance: total offset in pixels (rounded to an integer)
        :return: list of integer offsets; their sum equals ``distance``
            whenever ``distance > -10``
        """
        overshoot = 10           # slide past the gap, then come back
        distance = int(round(distance))
        v = 0                    # current speed
        t = 2                    # duration of one step
        tracks = []              # offset travelled in each step
        current = 0              # distance travelled so far
        mid = distance * 4 / 5   # start decelerating from here
        target = distance + overshoot
        while current < target:
            # A smaller acceleration gives shorter steps, hence a finer track.
            a = 2 if current < mid else -3
            v0 = v
            step = int(round(v0 * t + 0.5 * a * t ** 2))
            # Never step past the target, and always advance at least 1 px so
            # the deceleration phase can neither go backwards nor stall.
            step = max(1, min(step, target - current))
            current += step
            tracks.append(step)
            v = max(v0 + a * t, 0)
        # Move back by exactly the overshoot (3 * 2 + 4 * 1 = 10).
        tracks.extend([-2] * 3 + [-1] * 4)
        print(tracks)
        return tracks

    def run(self, url, max_attempts=None):
        """Open the captcha page, solve the slider and submit.

        The browser is always shut down when this method returns or raises.

        :param url: address of the captcha page
        :param max_attempts: give up after this many failed drags;
            ``None`` (default) retries without limit
        :raises RuntimeError: if ``max_attempts`` is exhausted
        """
        try:
            self.driver.get(url)
            self.driver.implicitly_wait(10)  # implicit wait for element lookups
            time.sleep(5)
            verification_code = self.driver.find_element_by_css_selector('.info p').text
            if verification_code == "请拖动滑块进行验证:":
                sucess = self.driver.find_element_by_css_selector('.drag-text').text
                print(sucess)
                attempts = 0
                while sucess != '验证通过啦!':
                    if max_attempts is not None and attempts >= max_attempts:
                        raise RuntimeError(
                            'captcha not solved after %d attempts' % attempts)
                    attempts += 1
                    self.get_captcha()  # download and crop the captcha images
                    x, y = self.template_matching('bg.jpg', 'block.png')
                    # Single jump to the gap; self.get_track(x) can be passed
                    # instead for a multi-step track.
                    self.move_to_gap([x])
                    print(x, y)
                    time.sleep(4)
                    sucess = self.driver.find_element_by_css_selector('.drag-text').text
                    print(sucess)
                self.driver.find_element_by_id("captcha_submit_btn").click()  # submit
                time.sleep(2)
        finally:
            # quit() also ends the chromedriver process; close() would only
            # close the window and leak the driver on errors.
            self.driver.quit()
if __name__ == "__main__":
    # Captcha page to solve, passed to Captcha.run().
    captcha_url = 'http://search.fang.com/captcha-40a1371b84c30293be/?t=1620803648.855&h=aHR0cHM6Ly9lc2YuZmFuZy5jb20vaG91c2UvaTMxNT9yZnNzPTItM2ExZjhlMDU0ZmJhMDE5MTQ5LTBi&c=cmE6MTEyLjEyLjE2LjIyO3hyaTo7eGZmOg%3D%3D#'
    captcha = Captcha()
    captcha.run(captcha_url)