Python uses pytesseract for captcha image recognition

Table of contents

Image reading mainly uses two libraries, and different libraries are different objects:

This image recognition test requires the following two conditions:

1. Obtain verification code

2. Login to the website

3. Image processing

4. Captcha recognition test

Test instruction

test code

Test Results

5. Re-identification test of successful samples

Test instruction

test code

Test Results

Test Notes

6. Integrate the fusion voting model and use the multi-process mechanism to run the program

Test instruction

test code

Test Results

Results of running the program in a single process

Effects and results when running programs in parallel

7. Re-identification of failed examples

8. Other

Images are mainly read with two libraries (PIL and OpenCV), and each library returns a different kind of object:

# Demo: basic image handling with PIL.
# Both plt.imread and PIL.Image.open read pixels in RGB channel order.
from PIL import Image
img = Image.open('xxxx.png')  # read the file into an Image object
img.save('xxx.png')
# The bare string below is an inert note (it is never printed). It lists the
# common values of img.mode: '1' black-and-white, 'L' grayscale, 'P' (listed
# without a description), 'RGB' colour, 'RGBA' colour plus an alpha channel.
'''
print(img.mode)  # 有'1', 'L', 'P', 'RGB', 'RGBA'等
'1': 表示黑白模式照片
'L': 表示灰度模式照片
'RGB': 表示RGB通道模式的彩色照片
'RGBA': 表示RGB通道及Alpha通道的照片
'''
img.show() # display the image
# NOTE: the results of the next two calls are not assigned here, so `img`
# itself stays unchanged; assign the return value to keep the new image.
img.convert('L')  # convert to 'L' (grayscale) mode
img.crop((20,30,300,200))  # crop to the box (20, 30, 300, 200)
# Image.eval(img, function)  # apply a function to every pixel/channel


import cv2
# Demo: basic image handling with OpenCV.
# cv2.imread reads pixels in BGR channel order.
# flags=0 selects grayscale mode; flags=1 is the default colour mode.
# im = cv2.imread('xxxx.png', flags=0) # read the image as an array object
im = cv2.imread("imgCode_grey200.jpg", flags=cv2.IMREAD_GRAYSCALE)
cv2.imwrite('imgCode_grey200.jpg', im)
# NOTE(review): `plt` is presumably matplotlib.pyplot; its import is not
# shown in this snippet - confirm before running.
plt.imshow(im) # display the image
# plt.show()
# plt.close()
# cv2.imshow('im', im)  # display the image with OpenCV instead


## Comparing PIL.Image.open with cv2.imread, and converting between them
# Author's observation: for PNG files both readers give the same pixel values;
# for JPG files the values differ.
# This is possibly because Image.open and cv2.imread decode JPEG slightly
# differently.

# Simple conversion
# im = np.array(img, np.uint8)  # copy=True
# im = np.asarray(img, np.uint8)  # copy=False
# Without an explicit numeric dtype the resulting array may hold booleans,
# e.g. for an image that has been binarized.
im = np.asarray(img) 
# img = Image.fromarray(np.uint8(im))
img = Image.fromarray(im)

# 标准转换
def PILImageToCV(imagePath):
    """Load an image with PIL and convert it to an OpenCV-style BGR array.

    The image is drawn with matplotlib before and after the conversion and
    the figure is then shown.

    Args:
        imagePath: Path of the image file to open.

    Returns:
        The converted BGR pixel array (the original version returned nothing,
        so the conversion result could not be used by callers).
    """
    # PIL Image -> OpenCV format
    img = Image.open(imagePath)
    plt.imshow(img)
    # Normalise to three RGB channels first; COLOR_RGB2BGR rejects the 2-D
    # arrays produced by 'L', 'P' or '1' mode images.
    rgb = np.asarray(img.convert('RGB'))
    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    # The BGR array is drawn as-is, exactly as the original code did.
    plt.imshow(bgr)
    plt.show()
    return bgr
 
def CVImageToPIL(imagePath):
    """Load an image with OpenCV and convert it to a PIL image in RGB order.

    The image is drawn with matplotlib before and after the conversion and
    the figure is then shown.

    Args:
        imagePath: Path of the image file to read.

    Returns:
        The converted PIL image (the original version returned nothing).

    Raises:
        OSError: If cv2.imread could not read the file.
    """
    # OpenCV image -> PIL image
    img = cv2.imread(imagePath)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f'cv2.imread could not read image: {imagePath!r}')
    plt.imshow(img)
    img2 = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.imshow(img2)
    plt.show()
    return img2

This image recognition test requires the following two conditions:

OCR software: tesseract.exe, to recognize characters through command line calls.

Python interface of OCR software: pytesseract, the kernel is OCR software tesseract

OCR stands for Optical Character Recognition.

Remarks: Another interface is PyOCR, the kernel can include tesseract or others, but the OCR software must be installed in advance.

import pytesseract

def get_result_by_imgCode_recognition(img):
    """Run OCR on a captcha image and return only its letters and digits.

    Args:
        img: Image handed straight to ``pytesseract.image_to_string``.

    Returns:
        The recognized text with every non-alphanumeric character
        (spaces, newlines, punctuation, ...) removed.
    """
    raw_text = pytesseract.image_to_string(img)  # the interface returns a string
    alnum_chars = [ch for ch in raw_text if ch.isalnum()]
    return ''.join(alnum_chars)
    

def pass_counter(img, img_value):
    """Return 1 when OCR on ``img`` yields exactly ``img_value``, else 0.

    The 0/1 result is meant to be added to a running pass counter.
    """
    recognized = get_result_by_imgCode_recognition(img)
    return 1 if recognized == img_value else 0
        
        
def most_frequent(lst):
    """Return the most frequent element of ``lst`` (used for ensemble voting).

    Ties are resolved in favour of the element that appears first in
    ``lst``, so the result is the same on every run. The previous
    ``max(set(lst), key=lst.count)`` depended on set iteration order, which
    varies between interpreter runs for strings, and rescanned the list once
    per distinct element.

    Args:
        lst: Non-empty list of hashable items, e.g. recognition results.

    Returns:
        The item with the highest count.

    Raises:
        ValueError: If ``lst`` is empty.
    """
    from collections import Counter

    if not lst:
        raise ValueError('most_frequent() arg is an empty sequence')
    # Counter keeps first-seen order, and most_common(1) returns the first
    # element reaching the maximum count.
    return Counter(lst).most_common(1)[0][0]

1. Obtain verification code

Through the developer tools of the browser, it is found that the verification code image is a base64 encoded file, which is decoded and written into the file.

 

 

def fetch_imgCode(timeout=10):
    """Request a new captcha and save its image under ./imgCode_png_raw.

    The endpoint answers with JSON such as::

        {"data": {"image_buf_str": "/9j/4AAQ....KAP/9k=",
                  "image_code": "16501881494161"},
         "error_code": 0,
         "msg": {"en-us": "Success", "zh-CN": "\\u6210\\u529f"},
         "request": "POST /public/verificationCode/imgCode"}

    ``image_buf_str`` is the base64-encoded image and ``image_code`` is the
    identifier that must be sent back when logging in.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the test loop forever.

    Returns:
        The ``image_code`` of the captcha; the image is written to
        ``./imgCode_png_raw/imgCode_<image_code>.png``.

    Raises:
        requests.RequestException: On a network failure, timeout or
            non-success HTTP status.
        KeyError: If the response JSON lacks the expected fields.
    """
    import os

    # Fetch the captcha
    url_imgCode = 'xxxx'
    response = requests.post(url_imgCode, timeout=timeout)
    # Fail on HTTP errors here rather than with a confusing JSON/KeyError below.
    response.raise_for_status()
    payload = response.json()
    image_buf_str = payload['data']['image_buf_str']
    image_code = payload['data']['image_code']

    # Save the base64-encoded picture as an image file, creating the
    # target folder on first use.
    out_dir = './imgCode_png_raw'
    os.makedirs(out_dir, exist_ok=True)
    with open(f'{out_dir}/imgCode_{image_code}.png', 'wb') as f:
        f.write(base64.b64decode(image_buf_str))
    return image_code

2. Login to the website

By initiating a post request to the website, you can log in to the website. In general:

Enter the correct verification code value image_value corresponding to image_code to log in successfully.

Conversely, if the login is successful, it also means that the verification code value image_value we identified is correct.


# HTTP headers sent with the login request; the request body is JSON.
HEADERS_PORTAL = {
    'User-Agent': 'xxxx',
    "Content-Type": "application/json",
}
def login(image_code, image_value, timeout=10):
    """Try to log in with a recognized captcha value.

    A successful login doubles as proof that ``image_value`` is the correct
    reading of the captcha identified by ``image_code``.

    Args:
        image_code: Captcha identifier returned by the captcha endpoint.
        image_value: Recognized captcha text to submit.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the response JSON has a non-empty ``data.token``, else False.

    Raises:
        requests.RequestException: On a network failure or timeout.
    """
    url_login = 'xxxx'
    # NOTE(review): demo credentials are hard-coded; real credentials should
    # come from configuration or the environment, not from source code.
    data_login = {"account": "DEMO_Tong",
                  "password": "9xdsaGcy",
                  "image_code": image_code,
                  "captcha": image_value,
                  "nickname": "DEMO_Tong", "client_type": 100}
    response = requests.post(url_login, data=json.dumps(data_login),
                             headers=HEADERS_PORTAL, timeout=timeout)
    payload = response.json()
    # Guard each level: a failed login may carry no "data", or a "data"
    # value that is not a dict.
    data = payload.get('data') if isinstance(payload, dict) else None
    return isinstance(data, dict) and bool(data.get('token'))

3. Image processing

Grayscale processing, binary processing, denoising, dilation and erosion, tilt correction, character cutting, normalization, etc.

# Grayscale conversion followed by fixed-threshold binarization.
# Example table for threshold 200: [0 if i < 200 else 1 for i in range(256)]
def gray_processing(img, threshold=127):
    """Convert ``img`` to grayscale and binarize it with a fixed threshold.

    Args:
        img: Image object providing ``convert`` and ``point`` (a PIL image).
        threshold: Gray levels below this value map to 0 (black) in the
            lookup table; all other levels map to 1 (white). The default
            127 is roughly half of the 256 levels an 8-bit pixel can take.

    Returns:
        The binarized image in mode '1'.
    """
    # One table entry per possible 8-bit gray level.
    binarize_table = [0 if level < threshold else 1 for level in range(256)]
    return img.convert('L').point(binarize_table, '1')
    
 # Morphological clean-up (erosion; an optional dilation step is sketched below)
def erode_dilate(im, threshold=2):
    """Apply one pass of ``cv2.erode`` with a square kernel.

    ``cv2.erode`` shrinks bright regions. On a binarized captcha with dark
    characters on a light background this thickens the dark strokes, which
    is presumably why the original comments called this step "dilation".

    Args:
        im: Image array, e.g. the 0/1 array of a binarized captcha.
        threshold: Side length of the square kernel (despite its name this
            is a kernel size, not a threshold).

    Returns:
        The eroded array.
    """
    # (threshold, threshold) is the size of the structuring element.
    kernel = np.ones((threshold, threshold), np.uint8)
    erosion = cv2.erode(im, kernel, iterations=1)
    # Optional follow-up, kept disabled as in the original: dilate the eroded
    # result with the same kernel.
    # eroded = cv2.dilate(erosion, kernel, iterations=1)
    return erosion

 

4. Captcha recognition test

Test instruction

According to different image processing methods, conduct verification code recognition tests, accumulate successful recognition examples, and observe the recognition effects of different processing methods. The verification codes obtained in the test are random, some are easy to recognize, and some are not easy to recognize, but objectively belong to the verification codes of the same difficulty.

This identification test will be divided into 3 groups, 10,000 pieces will be identified each time, and the identification will be verified by simulating login to the website.

        The first group directly identifies the original image file, and the label is "raw"

        The second group identifies image objects after grayscale processing and binary processing with a threshold of 200, labeled as "gray"

        The third group identifies image objects after grayscale, binary and dilation processing, labeled "erosion"

The recognition results are placed in different folders according to the image processing method and whether the recognition is correct or not, and the recognition results are also appended to the file name:

        imgCode_png_raw: stores the original images saved from the website

        imgCode_png_raw_pass: stores the original images that the raw test recognized correctly

        imgCode_png_raw_fail: stores the original images that the raw test failed to recognize

        imgCode_png_raw_gray_pass: stores the original images that the gray test recognized correctly

        imgCode_png_raw_gray_fail: stores the processed images that the gray test failed to recognize

        imgCode_png_raw_gray_erosion_pass: stores the original images that the erosion test recognized correctly

        imgCode_png_raw_gray_erosion_fail: stores the processed images that the erosion test failed to recognize

 

 

 

 Note: It can be found through the development tools of the browser that the font used for the verification code should be element-icons.535877f5.woff

test code

from tqdm import tqdm, trange 
from tqdm.contrib import tzip # tqdm provides progress bars, which makes it easy to watch the processing progress
TEST_TOTAL = 10000  # number of captchas fetched per test run (10,000)

def test_raw():
    """Recognize TEST_TOTAL fresh captchas from the unprocessed image.

    Each captcha is fetched, OCR'd as-is and verified by attempting a login.
    The original image is saved to ./imgCode_png_raw_pass or
    ./imgCode_png_raw_fail with the OCR result appended to the file name,
    and the pass rate (in percent) is printed at the end. An iteration that
    raises is reported and counted as a failure.
    """
    import os

    print('raw: ')
    pass_dir = './imgCode_png_raw_pass'
    fail_dir = './imgCode_png_raw_fail'
    # Create the result folders up front so saving cannot fail on a fresh checkout.
    for out_dir in (pass_dir, fail_dir):
        os.makedirs(out_dir, exist_ok=True)

    pass_count = 0
    for _ in trange(TEST_TOTAL):
        try:
            image_code = fetch_imgCode()
            with Image.open(f'./imgCode_png_raw/imgCode_{image_code}.png') as img:
                result = get_result_by_imgCode_recognition(img)
                if login(image_code, result):
                    img.save(f'{pass_dir}/imgCode_{image_code}_{result}.png')
                    pass_count += 1
                else:
                    img.save(f'{fail_dir}/imgCode_{image_code}_{result}.png')
        except Exception as exc:
            # Catch Exception, not everything: a bare except would also
            # swallow KeyboardInterrupt, making the long run unstoppable.
            print(f'{type(exc).__name__}: {exc}')
    print(f'pass_rate: {pass_count/TEST_TOTAL*100}')

def test_gray():
    """Recognize TEST_TOTAL fresh captchas after grayscale + binarization.

    Each captcha is fetched, binarized with threshold 200, OCR'd and
    verified by attempting a login. On success the original image is saved
    to ./imgCode_png_raw_gray_pass; on failure the processed image is saved
    to ./imgCode_png_raw_gray_fail. The OCR result is appended to the file
    name, and the pass rate (in percent) is printed at the end. An iteration
    that raises is reported and counted as a failure.
    """
    import os

    print('gray: ')
    pass_dir = './imgCode_png_raw_gray_pass'
    fail_dir = './imgCode_png_raw_gray_fail'
    # Create the result folders up front so saving cannot fail on a fresh checkout.
    for out_dir in (pass_dir, fail_dir):
        os.makedirs(out_dir, exist_ok=True)

    pass_count = 0
    for _ in trange(TEST_TOTAL):
        try:
            image_code = fetch_imgCode()
            with Image.open(f'./imgCode_png_raw/imgCode_{image_code}.png') as img:
                img_gray = gray_processing(img, threshold=200)

                result = get_result_by_imgCode_recognition(img_gray)
                if login(image_code, result):
                    img.save(f'{pass_dir}/imgCode_{image_code}_{result}.png')
                    pass_count += 1
                else:
                    img_gray.save(f'{fail_dir}/imgCode_{image_code}_{result}.png')
        except Exception as exc:
            # Catch Exception, not everything: a bare except would also
            # swallow KeyboardInterrupt, making the long run unstoppable.
            print(f'{type(exc).__name__}: {exc}')
    print(f'pass_rate: {pass_count/TEST_TOTAL*100}')

def test_erosion():
    """Recognize TEST_TOTAL fresh captchas after binarization + erosion.

    Each captcha is fetched, binarized with threshold 200, eroded with a
    2x2 kernel, OCR'd and verified by attempting a login. On success the
    original image is saved to ./imgCode_png_raw_gray_erosion_pass; on
    failure the processed image is saved to
    ./imgCode_png_raw_gray_erosion_fail. The OCR result is appended to the
    file name, and the pass rate (in percent) is printed at the end. An
    iteration that raises is reported and counted as a failure.
    """
    import os

    print('erosion: ')
    pass_dir = './imgCode_png_raw_gray_erosion_pass'
    fail_dir = './imgCode_png_raw_gray_erosion_fail'
    # Create the result folders up front so saving cannot fail on a fresh checkout.
    for out_dir in (pass_dir, fail_dir):
        os.makedirs(out_dir, exist_ok=True)

    pass_count = 0
    for _ in trange(TEST_TOTAL):
        try:
            image_code = fetch_imgCode()
            with Image.open(f'./imgCode_png_raw/imgCode_{image_code}.png') as img:
                img_gray = gray_processing(img, threshold=200)

                # As an array the binarized image holds only 0 and 1.
                im = np.asarray(img_gray, np.uint8)
                erosion = erode_dilate(im, threshold=2)
                # Scale 0/1 up to 0/255; left at 0..1 the whole picture
                # would be black.
                img1 = Image.fromarray(erosion*255)

                result = get_result_by_imgCode_recognition(img1)  # the array would work here too
                if login(image_code, result):
                    img.save(f'{pass_dir}/imgCode_{image_code}_{result}.png')
                    pass_count += 1
                else:
                    img1.save(f'{fail_dir}/imgCode_{image_code}_{result}.png')
        except Exception as exc:
            # Catch Exception, not everything: a bare except would also
            # swallow KeyboardInterrupt, making the long run unstoppable.
            print(f'{type(exc).__name__}: {exc}')
    print(f'pass_rate: {pass_count/TEST_TOTAL*100}')

Test Results

 

5. Re-identification test of successful samples

Test instruction

Copy the correct examples of raw, gray, and erosion recognition tests to the imgCode_pass folder according to the sample ratio of 1:1:1. At this time, the verification code samples all have correct recognition results, and the number must be balanced with the sample ratio. Three processing methods are used for re-identification, and the recognition effects of the three processing methods are compared.

The sample ratio of this re-identification test is 1:1:1, 8844 pieces each, 26532 pieces in total.

test code


def test_pass_raw():
    """Re-run raw OCR on every sample in ./imgCode_pass and print the pass rate.

    The expected captcha value is read from each file name, which follows
    ``imgCode_<image_code>_<value>.png``: the text after the last underscore
    with the extension removed. For 4-character values this equals the
    previous ``[-8:-4]`` slice. An empty folder is reported instead of
    failing with a division by zero.
    """
    pass_list = os.listdir('./imgCode_pass')
    pass_value_list = [os.path.splitext(img_file)[0].rsplit('_', 1)[-1]
                       for img_file in pass_list]
    pass_amt = len(pass_list)
    print(f'pass_amt: {pass_amt}')
    if pass_amt == 0:
        print('raw: \nno samples found in ./imgCode_pass')
        return

    pass_cnt1 = 0
    for img_file, img_value in tzip(pass_list, pass_value_list):
        # raw
        with Image.open(f'./imgCode_pass/{img_file}') as img:
            pass_cnt1 += pass_counter(img, img_value)
    print(f'raw: \npass_rate:{pass_cnt1 / pass_amt * 100}')

def test_pass_gray():
    """Re-run OCR on binarized ./imgCode_pass samples and print the pass rate.

    Every sample is binarized with threshold 200 before recognition. The
    expected captcha value is read from each file name, which follows
    ``imgCode_<image_code>_<value>.png``: the text after the last underscore
    with the extension removed. For 4-character values this equals the
    previous ``[-8:-4]`` slice. An empty folder is reported instead of
    failing with a division by zero.
    """
    pass_list = os.listdir('./imgCode_pass')
    pass_value_list = [os.path.splitext(img_file)[0].rsplit('_', 1)[-1]
                       for img_file in pass_list]
    pass_amt = len(pass_list)
    print(f'pass_amt: {pass_amt}')
    if pass_amt == 0:
        print('raw + grey200: \nno samples found in ./imgCode_pass')
        return

    pass_cnt2 = 0
    for img_file, img_value in tzip(pass_list, pass_value_list):
        # raw
        with Image.open(f'./imgCode_pass/{img_file}') as img:
            # raw + grey200
            img_gray = gray_processing(img, threshold=200)
            pass_cnt2 += pass_counter(img_gray, img_value)
    print(f'raw + grey200: \npass_rate:{pass_cnt2/pass_amt*100}')

def test_pass_erosion():
    """Re-run OCR on binarized + eroded ./imgCode_pass samples; print the pass rate.

    Every sample is binarized with threshold 200 and eroded with a 2x2
    kernel before recognition. The expected captcha value is read from each
    file name, which follows ``imgCode_<image_code>_<value>.png``: the text
    after the last underscore with the extension removed. For 4-character
    values this equals the previous ``[-8:-4]`` slice. An empty folder is
    reported instead of failing with a division by zero.
    """
    pass_list = os.listdir('./imgCode_pass')
    pass_value_list = [os.path.splitext(img_file)[0].rsplit('_', 1)[-1]
                       for img_file in pass_list]
    pass_amt = len(pass_list)
    print(f'pass_amt: {pass_amt}')
    if pass_amt == 0:
        print('raw + grey200 + erosion(Image): \nno samples found in ./imgCode_pass')
        return

    pass_cnt3 = 0
    for img_file, img_value in tzip(pass_list, pass_value_list):
        # raw
        with Image.open(f'./imgCode_pass/{img_file}') as img:
            # raw + grey200
            img_gray = gray_processing(img, threshold=200)
            # raw + grey200 + erosion
            # As an array the binarized image holds only 0 and 1.
            im = np.asarray(img_gray, np.uint8)
            erosion = erode_dilate(im, threshold=2)
            # Scale 0/1 up to 0/255; left at 0..1 the whole picture would be black.
            img1 = Image.fromarray(erosion*255)
            pass_cnt3 += pass_counter(img1, img_value)
    print(f'raw + grey200 + erosion(Image): \npass_rate:{pass_cnt3/pass_amt*100}')

Test Results

 

Test Notes

In this test, special attention should be paid to the sample ratio. If all the samples come from examples that the raw recognition test identified correctly, re-identification with the raw method will be 100% correct. The figure below shows the results of re-identification when most of the samples are successful examples of raw recognition. The recognition ability of the models with different processing methods shows a downward trend: the closer a model is to the raw recognition model, the better its accuracy, and the further away, the worse.

6. Integrate the fusion voting model and use the multi-process mechanism to run the program

Test instruction

Different models have different recognition effects. Borrowing model fusion from ensemble learning, we use the voting method: the three models raw, gray, and erosion each make a prediction, and the recognition result with the most votes is taken as the result of the ensemble voting model and used for login verification.

Based on the integrated fusion voting model, it is necessary to identify the same verification code example three times, which is time-consuming. Therefore, a multi-process mechanism is used to run the program in parallel to reduce the time consumed by the program.

test code

def test_ensemble_vote(kwargs):
    """Fetch one captcha, let three OCR variants vote, and try to log in.

    The variants are the raw image, the image binarized with threshold 200,
    and the binarized image eroded with a 2x2 kernel. The most frequent
    result is submitted; when all three disagree the raw result is used,
    because ``max`` returns the first maximal element.

    Args:
        kwargs: Unused. ``Pool.map`` passes the task index here.

    Returns:
        True if the login succeeded, False otherwise. A task that raises is
        reported and counted as a failure, so that one bad request does not
        abort the whole pool run and lose every other result.
    """
    try:
        image_code = fetch_imgCode()
        with Image.open(f'./imgCode_png_raw/imgCode_{image_code}.png') as img:
            result_list = [get_result_by_imgCode_recognition(img)]

            img_gray = gray_processing(img, threshold=200)
            result_list.append(get_result_by_imgCode_recognition(img_gray))

            # As an array the binarized image holds only 0 and 1.
            im = np.asarray(img_gray, np.uint8)
            erosion = erode_dilate(im, threshold=2)
            # Scale 0/1 up to 0/255; left at 0..1 the whole picture would be black.
            img1 = Image.fromarray(erosion*255)
            result_list.append(get_result_by_imgCode_recognition(img1))

        result = max(result_list, key=result_list.count)
        return login(image_code, result)
    except Exception as exc:
        print(f'{type(exc).__name__}: {exc}')
        return False

def test_ensemble_vote_multi():
    """Run the ensemble-vote test TEST_TOTAL times in a process pool.

    Results are consumed as tasks finish, so the progress bar tracks real
    progress. Passing ``trange(...)`` to ``pool.map`` exhausted the bar
    immediately, because ``map`` turns its iterable into a list before any
    work starts. Result order is irrelevant: only the number of True values
    is counted.

    NOTE: on platforms that start worker processes with "spawn" (e.g.
    Windows) this must be called from under ``if __name__ == '__main__':``.
    """
    print('test_ensemble_vote_multi: ')
    from multiprocessing import Pool

    # The context manager shuts the pool down; no manual
    # close()/terminate()/join() sequence is needed once all results are in.
    with Pool() as pool:
        outcomes = pool.imap_unordered(test_ensemble_vote, range(TEST_TOTAL))
        pool_result_list = list(tqdm(outcomes, total=TEST_TOTAL))

    pass_count = pool_result_list.count(True)
    print(f'pass_rate: {pass_count/TEST_TOTAL*100}')

Test Results

Results of running the program in a single process

 Effects and results when running programs in parallel

7. Re-identification of failed examples

Use a voting ensemble built from different binarization thresholds to re-identify the examples that the base models (raw, gray, or erosion) failed to recognize.


def test_fail():
    """Re-recognize failed samples by voting across binarization thresholds.

    Every image in ./imgCode_fail is binarized at thresholds 120, 130, ...,
    190 and OCR'd once per threshold. After all images are processed, the
    most frequent result for each file is printed. The aim is to look for
    the best threshold.

    A single-image variant was used during exploration: sweep thresholds
    120..199 in steps of 1 for one file, show each binarized image, and
    print the most frequent result.
    """
    fail_list = os.listdir('./imgCode_fail')

    # One list of per-threshold OCR results for each failed image.
    votes_per_image = []
    for img_file in fail_list:
        img = Image.open(f'./imgCode_fail/{img_file}')
        votes = [
            get_result_by_imgCode_recognition(gray_processing(img, threshold=level))
            for level in trange(120, 200, 10)
        ]
        votes_per_image.append(votes)

    for img_file, votes in zip(fail_list, votes_per_image):
        print(f'{img_file}, most_frequent(lst): {most_frequent(votes)}')

8. Other

 

 

 

 

 

 

Guess you like

Origin blog.csdn.net/Cameback_Tang/article/details/124247948