利用Python爬取简书

1.不要频繁运行程序模拟登录

频繁模拟登录并识别验证码后,验证码会越来越模糊,直至难以识别;并且识别后点击"确认"按钮也无法登录(或者说登录按钮失效)。失效的位置如图所示:

sure_button

2.超级鹰

超级鹰打码平台的打码效率可以达到90%以上。在平台上注册并绑定微信后会赠送1000积分,基本够用了。如图是我的积分情况:

jifen

3.超级鹰软件ID和验证码类型

软件ID相当于工作牌(或护照),每次打码都必须携带;验证码类型需要你去超级鹰平台确认。例如,该项目的验证码类型为 9004(坐标多选,返回1~4个坐标)。

4.识别思路(简要)

首先,获取验证码位置并获取网页截图;然后,裁剪获取验证码图像并以字节流的格式发送给超级鹰打码平台;最后,转化识别结果并使用Selenium点击登录。

from io import BytesIO
from time import sleep

from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from utils.chaojiying import Chaojiying_Client
from utils.config import *
'''
想要学习Python?Python学习交流群:984632579满足你的需求,资料都已经上传群文件,可以自行下载!
'''
class Crack_Jianshu(object):
    """Log in to Jianshu with Selenium, solving the click captcha via Chaojiying.

    Flow of one login attempt: open the login page and fill in the
    credentials, submit the form, screenshot the page and crop the captcha
    image, send the image bytes to the Chaojiying recognition service, click
    the returned coordinates, confirm, and finally wait for the page logo.
    """

    def __init__(self):
        """Start a Chrome session and create the Chaojiying client."""
        self.url = URL
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, TIME_OUT)
        # Jianshu account e-mail and password.
        self.email = EMAIL
        self.password = PASSWORD
        # Chaojiying client. The constant names (including the "CHAIJIYING"
        # spelling) are used exactly as this file references them; they are
        # presumably provided by the star import of the config module.
        self.chaojiying = Chaojiying_Client(CHAIJIYING_USERNAME, CHAOJIYING_PASSWORD, CHAIJIYING_SOFT_ID)

    def __del__(self):
        """Shut the browser down when the object is garbage-collected.

        ``quit()`` is used instead of ``close()`` so the whole WebDriver
        session is ended rather than only the current window. The method is
        safe to call when ``__init__`` failed before ``self.browser`` was
        assigned, and safe to call more than once.
        """
        browser = getattr(self, 'browser', None)
        if browser is None:
            return
        # Drop the reference first so a second call becomes a no-op.
        self.browser = None
        browser.quit()

    def open(self):
        """Open the Jianshu login page and type in the e-mail and password.

        :return: None
        """
        self.browser.get(self.url)
        # E-mail input field.
        email = self.wait.until(EC.presence_of_element_located((By.ID, 'session_email_or_mobile_number')))
        # Password input field.
        password = self.wait.until(EC.presence_of_element_located((By.ID, 'session_password')))
        # Fill in the e-mail.
        email.clear()
        email.send_keys(self.email)
        sleep(2)
        # Fill in the password.
        password.clear()
        password.send_keys(self.password)
        sleep(2)

    def get_submit_btn(self):
        """Wait until the sign-in button is clickable and return it.

        :return: the sign-in button element
        """
        return self.wait.until(EC.element_to_be_clickable((By.ID, 'sign-in-form-submit-btn')))

    def get_touclick_element(self):
        """Wait for the captcha image element and return it.

        :return: the captcha image element
        """
        return self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'geetest_item_img')))

    def get_code_position(self):
        """Compute the bounding box of the captcha image on the page.

        :return: ``[left, top, right, bottom]`` built from the element's
            ``location`` and ``size``
        """
        element = self.get_touclick_element()
        # Fixed pause before reading the geometry (kept from the original flow).
        sleep(3)
        location = element.location
        size = element.size
        left, top = location['x'], location['y']
        right, bottom = left + size['width'], top + size['height']
        # Top-left and bottom-right corners of the captcha.
        return [left, top, right, bottom]

    def get_screenshot(self):
        """Take a screenshot of the current page.

        :return: the screenshot as a PIL ``Image``
        """
        png_bytes = self.browser.get_screenshot_as_png()
        return Image.open(BytesIO(png_bytes))

    def get_touclick_image(self, name='captcha.png'):
        """Crop the captcha out of a page screenshot and save it to disk.

        :param name: path the cropped captcha image is saved to
        :return: the cropped captcha as a PIL ``Image``
        """
        left, top, right, bottom = self.get_code_position()
        print('验证码位置:', left, top, right, bottom)
        # Full-page screenshot, then crop it down to the captcha area.
        screenshot = self.get_screenshot()
        jianshu_code = screenshot.crop((left, top, right, bottom))
        jianshu_code.save(name)
        return jianshu_code

    def get_points(self, captcha_result):
        """Convert a Chaojiying recognition result into integer coordinates.

        The ``pic_str`` value is expected to look like ``"x1,y1|x2,y2"``.

        :param captcha_result: mapping returned by the recognition service
        :return: list of ``[x, y]`` integer pairs; an empty list when the
            result is missing or carries no ``pic_str`` coordinates
        :raises ValueError: if a non-empty coordinate is not an integer
        """
        pic_str = (captcha_result or {}).get('pic_str') or ''
        # Skip empty groups so a failed recognition yields [] instead of
        # crashing on int('').
        return [
            [int(number) for number in group.split(',')]
            for group in pic_str.split('|')
            if group.strip()
        ]

    def touch_click_words(self, locations):
        """Click every given point on the captcha image.

        :param locations: iterable of ``[x, y]`` offsets to click
        :return: None
        """
        for cnt, location in enumerate(locations, 1):
            print('坐标点{}: {}'.format(cnt, location))
            # The element is looked up again for every click.
            # NOTE(review): the origin used by move_to_element_with_offset
            # differs between Selenium releases (top-left corner vs. element
            # centre) - confirm against the installed Selenium version.
            ActionChains(self.browser).move_to_element_with_offset(
                self.get_touclick_element(), location[0], location[1]
            ).click().perform()
            sleep(1)

    def get_verifi_button(self):
        """Locate the captcha confirm button and click it.

        :return: None
        """
        submit = self.wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="geetest_commit_tip"]')))
        submit.click()

    def get_article_info(self):
        """Placeholder; not implemented yet."""
        pass

    def connect_db(self):
        """Placeholder; not implemented yet."""
        pass

    def save_to_db(self):
        """Placeholder; not implemented yet."""
        pass

    def _attempt_login(self):
        """Run a single login attempt.

        :return: True when the page logo was found after confirming the
            captcha, False when the recognition service returned no
            coordinates to click
        :raises TimeoutException: when any awaited element does not show up
        """
        # Open the login page and submit the credentials.
        self.open()
        self.get_submit_btn().click()
        # Grab the captcha and serialise it to PNG bytes.
        image = self.get_touclick_image()
        bytes_array = BytesIO()
        image.save(bytes_array, format='PNG')
        # Ask Chaojiying to recognise the captcha.
        result = self.chaojiying.PostPic(bytes_array.getvalue(), CHAOJIYING_KIND)
        print("\n超级鹰识别结果:{}\n".format(result))
        locations = self.get_points(result)
        if not locations:
            # Nothing to click, so this attempt cannot succeed.
            return False
        self.touch_click_words(locations)
        sleep(3)
        # Confirm the captcha.
        self.get_verifi_button()
        # The logo element is used as the success marker.
        sleep(5)
        self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'logo')))
        return True

    def crack_login(self, max_retries=3):
        """Log in, retrying the whole flow when an attempt fails.

        ``WebDriverWait.until`` raises ``TimeoutException`` instead of
        returning a falsy value, so a failed attempt is detected by catching
        that exception. Retries are bounded so a permanently failing login
        cannot loop forever.

        :param max_retries: number of extra attempts after the first one
        :return: None
        :raises TimeoutException: when every attempt failed
        """
        attempts = max(0, max_retries) + 1
        last_error = None
        for attempt in range(attempts):
            if attempt:
                # Visual separator between attempts.
                print("-" * 50)
            try:
                if self._attempt_login():
                    print('\nSuccessful login!\n')
                    return
            except TimeoutException as exc:
                last_error = exc
        raise TimeoutException(
            'login failed after {} attempt(s)'.format(attempts)
        ) from last_error

if __name__ == '__main__':
    # Script entry point: build the crawler and run the login flow.
    # The instance stays bound to a module-level name, so it is not
    # garbage-collected (and its __del__ not run) right after the call returns.
    crack = Crack_Jianshu()
    crack.crack_login()
扫描二维码关注公众号,回复: 5602755 查看本文章

猜你喜欢

转载自blog.csdn.net/fei347795790/article/details/88691641