# Alibaba Cloud OCR table recognition code

# -*- coding: utf-8 -*-

import base64
import json
import os
import urllib.request

ENCODING = 'utf-8'


def get_img_base64(img_file):
    """Return the contents of ``img_file`` as base64 text.

    The file is opened in binary mode and read in full; the bytes are
    base64-encoded and the result is decoded to ``str`` using the
    module-level ``ENCODING``.
    """
    with open(img_file, 'rb') as image:
        raw_bytes = image.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode(ENCODING)


def predict(url, appcode, img_base64, kv_configure=None, timeout=10):
    """Send a base64-encoded image to ``url`` and return the raw reply.

    The request body is a JSON object with an ``image`` field and, when
    ``kv_configure`` is given, a ``configure`` field holding the options
    serialised as a JSON *string* (i.e. JSON nested inside JSON).

    Args:
        url: Endpoint the request is sent to.
        appcode: Credential sent as ``Authorization: APPCODE <appcode>``.
        img_base64: Base64 text of the image.
        kv_configure: Optional JSON-serialisable options; omitted from the
            body when ``None``.
        timeout: Seconds passed to ``urlopen``. Defaults to 10, the value
            that used to be hard-coded.

    Returns:
        A ``(status_code, headers, body_bytes)`` tuple. An ``HTTPError`` is
        not raised to the caller; its code, headers and body are returned
        in the same tuple shape.

    Raises:
        Any exception from ``urlopen`` other than ``HTTPError`` is not
        caught here and propagates to the caller.
    """
    param = {'image': img_base64}
    if kv_configure is not None:
        param['configure'] = json.dumps(kv_configure)
    data = json.dumps(param).encode('utf-8')

    headers = {
        'Authorization': 'APPCODE %s' % appcode,
        'Content-Type': 'application/json; charset=UTF-8',
    }
    request = urllib.request.Request(url=url, headers=headers, data=data)
    try:
        # The context manager closes the response (and its connection)
        # once the body has been read.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.code, response.headers, response.read()
    except urllib.request.HTTPError as e:
        # HTTPError doubles as a response object, so it holds an open
        # connection too; release it after reading the error body.
        try:
            return e.code, e.headers, e.read()
        finally:
            e.close()

def demo(img_file, output_format='xlsx'):
    """Run table OCR on one image file and print the service's reply.

    The image is base64-encoded with ``get_img_base64`` and posted with
    ``predict``. If the status is not 200, the status, the
    ``x-ca-error-message`` header (when present) and the body are printed
    and the process exits. When ``output_format`` is ``'xlsx'`` the
    ``tables`` field of the JSON reply is base64-decoded and written next
    to the image with an ``.xlsx`` extension. The raw reply is always
    printed at the end.

    Args:
        img_file: Path of the image to recognise.
        output_format: Value sent as the ``format`` option. This used to be
            read from a module global named ``format``, which only existed
            when the file was run as a script; it is now an explicit
            parameter defaulting to ``'xlsx'``.

    Raises:
        SystemExit: With status 1 when the HTTP status is not 200, so a
            failed request is reported as a failure to the calling shell.
    """
    appcode = '***************************'
    url = "https://form.market.alicloudapi.com/api/predict/ocr_table_parse"

    # Options understood in the "configure" field, per the original sample:
    #   1. format: output format -- html / json / xlsx
    #   2. dir_assure: whether the image is known to be upright -- true / false
    #   3. line_less: whether the table has no ruling lines -- true (no lines,
    #      or only horizontal lines without vertical ones) / false (has lines)
    #   4. skip_detection: whether to skip table detection; can be set to true
    #      when no table was detected
    # NOTE(review): 'side' is not among the options listed above; presumably
    # left over from another OCR sample -- confirm before removing.
    configure = {'format': output_format, 'side': 'face'}

    img_base64data = get_img_base64(img_file)
    stat, header, content = predict(url, appcode, img_base64data, configure)
    if stat != 200:
        print('Http status code: ', stat)
        print('Error msg in header: ', header.get('x-ca-error-message', ''))
        print('Error msg in body: ', content)
        # SystemExit is raised directly: the bare exit() helper comes from
        # the site module and reported success (status 0) on failure.
        raise SystemExit(1)

    if output_format == 'xlsx':
        res_obj = json.loads(content)
        # splitext only strips an extension from the final path component,
        # so a dot in a directory name no longer truncates the path.
        xlsx_path = os.path.splitext(img_file)[0] + '.xlsx'
        with open(xlsx_path, 'wb') as fout:
            fout.write(base64.b64decode(res_obj['tables']))
        print('已生成表格！')
    print(content.decode(ENCODING))


if __name__ == '__main__':
    demo('***.png', output_format='xlsx')
# Alibaba Cloud OCR table recognition code

# (Scraped page footer: "You may also like")