扫描文件表格识别
1.识别结构
situation1 有明确表格结构
1.纠正表格偏移角度(获取最大轮廓,计算最小的矩形,变换坐标截取矩形)
获取面积最大轮廓
# cv2.findContours returns (image, contours, hierarchy) on OpenCV 3.x but only
# (contours, hierarchy) on OpenCV 4.x; slicing the last two items works on both.
contours, HIERARCHY = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2:]
# Order the candidate contours by area, largest first.
# NOTE(review): candidate_table is presumably the filtered contour list built by
# the area filter shown later in these notes — confirm the intended order of steps.
candidate_table = sorted(candidate_table, key=cv2.contourArea, reverse=True)
计算包含该轮廓的最小的矩形
# Minimum-area (possibly rotated) rectangle enclosing the largest candidate contour.
rect = cv2.minAreaRect(candidate_table[0])
box = cv2.boxPoints(rect)  # the four corner points of that rectangle
# np.int0 was removed in NumPy 2.0; np.intp is the type it aliased, so the
# float -> integer conversion of the corner coordinates is unchanged.
box = box.astype(np.intp)
- 去除面积不符合要求的表格
# Keep only the contours whose area is strictly greater than min_table_area.
candidate_table = list(filter(lambda cnt: cv2.contourArea(cnt) > min_table_area, contours))
- 去除长宽比不符合要求的表格
# Compute the minimum-area rectangle once instead of three times; index 1 holds
# its two side lengths and index 2 its rotation angle.
min_rect = cv2.minAreaRect(cnt)
box_width, box_height = min_rect[1][0], min_rect[1][1]  # side lengths
if min_rect[2] > 45:
    # For angles above 45 degrees the two sides are swapped.
    box_width, box_height = box_height, box_width
坐标截取矩形
def get_sorted_rect(rect):
    """Order four corner points for use in a perspective transform.

    The two points with the smallest y are treated as the top edge and the
    other two as the bottom edge; each pair is then ordered by x (ties by y).

    The previous implementation split the points with strict ``<`` / ``>``
    comparisons against the vertical mid-line, so any corner lying exactly on
    that line (e.g. a square rotated by 45 degrees) was dropped and the
    function failed with IndexError. Splitting a sorted list avoids that.

    @param rect: sequence of four (x, y) points in any order.
    @return: tuple (top-left, top-right, bottom-right, bottom-left).
    """
    # Sort by y first (x breaks ties) so the first two points form the top edge.
    by_row = sorted(rect, key=lambda p: (p[1], p[0]))
    top = sorted(by_row[:2], key=lambda p: (p[0], p[1]))
    bottom = sorted(by_row[2:], key=lambda p: (p[0], p[1]))
    # Clockwise starting at the top-left corner.
    sorted_rect = top[0], top[1], bottom[1], bottom[0]
    return sorted_rect
def perTran(image, rect):
    """Warp the quadrilateral ``rect`` of ``image`` onto an upright rectangle.

    @param image: source image passed to cv2.warpPerspective.
    @param rect: four corner points ordered top-left, top-right,
        bottom-right, bottom-left.
    @return: the warped image; its width/height are the longer of the two
        opposite edges, truncated to integers.
    """
    tl, tr, br, bl = rect  # top-left, top-right, bottom-right, bottom-left

    def edge_length(p, q):
        # Euclidean distance between two corner points.
        return np.sqrt(((p[0] - q[0]) ** 2) + ((p[1] - q[1]) ** 2))

    # Output width: longer of the bottom and top edges.
    maxWidth = max(int(edge_length(br, bl)), int(edge_length(tr, tl)))
    # Output height: longer of the right and left edges.
    maxHeight = max(int(edge_length(tr, br)), int(edge_length(tl, bl)))

    # Corner positions in the output image, in the same order as ``rect``.
    target = np.array(
        [
            [0, 0],
            [maxWidth - 1, 0],
            [maxWidth - 1, maxHeight - 1],
            [0, maxHeight - 1],
        ],
        dtype=np.float32,
    )
    corners = np.array(rect, dtype=np.float32)

    # Perspective matrix mapping the source corners onto the target corners.
    M = cv2.getPerspectiveTransform(corners, target)
    return cv2.warpPerspective(image, M, (maxWidth, maxHeight))
2. 计算每一条横线或纵线的坐标
def drop_duplicated_row_points(pos, max_span):
    """Collapse nearby coordinates into one representative per line.

    The unique coordinates are sorted and scanned in ascending order; a
    coordinate is kept only when it lies more than ``max_span`` beyond the
    last kept one.

    The previous loop used ``range(1, len(sort_point) - 1)`` and therefore
    never examined the largest coordinate, so the last line was lost. Empty
    input raised IndexError; it now yields an empty list.

    Args:
        pos: iterable of hashable coordinates (e.g. the row indices returned
            by ``np.where``).
        max_span: minimum gap between two kept coordinates.
    Returns:
        list of kept coordinates in ascending order.
    """
    sort_point = np.sort(list(set(pos)))
    if len(sort_point) == 0:
        return []
    point_arr = [sort_point[0]]  # the smallest coordinate is always kept
    for point in sort_point[1:]:
        if point - point_arr[-1] > max_span:
            point_arr.append(point)
    return point_arr
def dilate_line(binary, type='vertical', x_scale=10, y_scale=5):
    """Keep only long horizontal or vertical strokes of a binary image.

    The image is eroded and then dilated with the same 1-pixel-thick
    rectangular kernel (a morphological opening), so only strokes at least as
    long as the kernel survive.

    @param binary: 2-D image; its shape is unpacked as (rows, cols).
    @param type: 'horizontal' uses a kernel ``cols // x_scale`` wide; any
        other value uses a kernel ``rows // y_scale`` tall. The name shadows
        the builtin but is kept because callers pass it by keyword.
    @param x_scale: divisor applied to the image width for horizontal lines.
    @param y_scale: divisor applied to the image height for vertical lines.
    @return: the opened image.
    """
    rows_z, cols_z = binary.shape
    # Clamp to 1: for images smaller than the scale the floor division gives
    # 0, which is not a valid structuring-element size.
    if type == 'horizontal':
        size = (max(cols_z // x_scale, 1), 1)
    else:
        size = (1, max(rows_z // y_scale, 1))
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, size)
    eroded = cv2.erode(binary, kernel, iterations=1)  # remove everything shorter than the kernel
    dilated = cv2.dilate(eroded, kernel, iterations=1)  # restore the surviving strokes
    return dilated
def get_page_rows_y_array(binary, row_height, x_scale=20):
    """Return the y coordinates of the horizontal lines found in ``binary``.

    :param binary: 2-D binary image handed to ``dilate_line``.
    :param row_height: minimum vertical gap between two reported lines
        (passed on as ``max_span``).
    :param x_scale: width divisor for the horizontal kernel used by
        ``dilate_line``.
    :return: list of de-duplicated row coordinates in ascending order.
    """
    horizontal_lines = dilate_line(binary, type='horizontal', x_scale=x_scale)
    # Row indices of every non-zero pixel left after isolating horizontal lines.
    line_rows = np.where(horizontal_lines > 0)[0]
    return drop_duplicated_row_points(line_rows, max_span=row_height)
situation2 无明确表格结构
只有横线/纵线
1.纠正表格偏移角度( 消除除线条外的其他信息,计算横线或者纵线的起始点和结束点,计算角度,纠正角度后,再通过坐标截取矩形 )
2. 回到situation1 第二步
计算坐标
# Line segment detector handed to get_pos_by_horizontal_line as its first argument.
# NOTE(review): the positional arguments are presumably refine=0 (no refinement)
# and scale=1 — confirm against the OpenCV documentation.
lsd = cv2.createLineSegmentDetector(0, 1)
def get_pos_by_horizontal_line(lsd, binary):
    """Return the two extreme end points among all detected line segments.

    Every segment contributes both of its end points. The start point is the
    one with the smallest x (smallest y on ties) and the end point is the one
    with the largest x (largest y on ties).

    The extremes were previously recomputed inside list comprehensions for
    every point (quadratic in the number of segments); a single ``min`` /
    ``max`` with a tuple key selects the same points in one pass.

    @param lsd: detector whose ``detect(binary)[0]`` yields segments indexed
        as ``seg[0][0..3]`` = x1, y1, x2, y2.
    @param binary: image passed to ``lsd.detect``.
    @return: (x1, y1, x2, y2) as ints.
    """
    dlines = lsd.detect(binary)[0]
    starts = [[seg[0][0], seg[0][1]] for seg in dlines]
    ends = [[seg[0][2], seg[0][3]] for seg in dlines]
    pos = starts + ends
    # Smallest x, then smallest y.
    x1, y1 = min(pos, key=lambda p: (p[0], p[1]))
    # Largest x, then largest y.
    x2, y2 = max(pos, key=lambda p: (p[0], p[1]))
    return int(x1), int(y1), int(x2), int(y2)
计算角度
def cal_angle(x1, y1, x2, y2, is_vertical=True):
    """Return the tilt angle, in degrees, of the line (x1, y1) -> (x2, y2).

    The slope is negated before taking the arctangent (the original notes
    mark this as counter-clockwise). With ``is_vertical`` the angle is shifted
    by 90 degrees so it is measured relative to the vertical direction. The
    computed angle is printed and then returned rounded to three decimals.
    """
    dx = x2 - x1
    dy = y2 - y1
    if dx == 0:
        angle = 90
    elif dy == 0:
        angle = 0
    else:
        # Negated slope -> arctangent -> radians converted to degrees.
        angle = np.arctan(-dy / dx) * 57.29577

    if not is_vertical:
        print("通过横线计算得到直线倾斜角度为:{} 度".format(angle))
        return round(angle, 3)

    # Re-express the angle relative to the vertical direction.
    if angle < 0:
        angle += 90
    elif angle == 90:
        angle = 0
    else:
        angle -= 90
    print("通过竖线计算得到直线倾斜角度为:{} 度".format(angle))
    return round(angle, 3)
纠正角度
def rotate_image(image, angle):
    """Rotate ``image`` by ``angle`` about its centre, keeping the original size.

    The output canvas has the same width and height as the input.
    """
    rows, cols = image.shape[:2]
    # Rotation pivot: the centre of the image, given as (x, y).
    pivot = (cols / 2, rows / 2)
    matrix = cv2.getRotationMatrix2D(center=pivot, angle=angle, scale=1)
    return cv2.warpAffine(src=image, M=matrix, dsize=(cols, rows))
无横线/纵线
通过扫描时设置白边处理: 图片转灰度图后做反色二值化(255的像素值转成0,非255的像素值转成255),然后回到situation1。注意: 扫描时如果有黑边会影响计算,需要考察扫描仪选项。
# Inverse binarisation in place: pure-white pixels (255) become 0 and every
# pixel <= 254 becomes 255. Both masks are taken before anything is written,
# so no temporary marker value is needed.
dark_mask = binary <= 254
white_mask = binary == 255
binary[white_mask] = 0
binary[dark_mask] = 255
tips
图片预处理,通过中值滤波和色差
def clean_gray(gray, ksize=3, difference=50):
    """Whiten near-white pixels of a grayscale image, then median-blur it.

    Works on a copy, so ``gray`` itself is not modified.

    @param gray: grayscale image array.
    @param ksize: aperture size passed to cv2.medianBlur.
    @param difference: pixels with value >= 255 - difference are set to 255.
    @return: the cleaned copy.
    """
    whiten_from = 255 - difference
    cleaned = gray.copy()
    cleaned[cleaned >= whiten_from] = 255
    return cv2.medianBlur(cleaned, ksize)
如果线条有断裂的地方,可以先膨胀再腐蚀,把断裂线条补齐
# The fourth positional parameter of cv2.morphologyEx is dst, not iterations,
# so the iteration count has to be passed by keyword.
# NOTE(review): MORPH_OPEN erodes and then dilates; if the goal is to bridge
# broken lines (dilate, then erode), cv2.MORPH_CLOSE is presumably what is wanted — confirm.
opening = cv2.morphologyEx(binary_dilate, cv2.MORPH_OPEN, kernel, iterations=1)
2.paddle ocr 识别文字
排序
# Run OCR on the padded cell image.
ocr_result = ocr.ocr(padding_cell_child )
# Sort
ocr_result = sorted(ocr_result, key=lambda x: (x[0][0][0]))# sort by x coordinate
# NOTE(review): the next line sorts cell_result rather than ocr_result and
# overwrites the result of the sort above — confirm which list is intended.
ocr_result = sorted(cell_result, key=lambda x: (x[0][3][1], x[0][3][0])) # sort by y, then by x
数字识别不到
检测到数字轮廓 ,截取数字,paddle det设置为False
长文本识别不到
描述:文本超过25个字时,paddle可能会识别不出来
- 改变图片比例为1:10
# Integer width-to-height ratio of the cell image.
ratio = cell.shape[1] // cell.shape[0]
# When the cell is taller than wide the ratio is 0 and the scale factor below
# would raise ZeroDivisionError; such cells are left unscaled.
if ratio > 0:
    # Scale only the width by about 10 / ratio so the cell ends up close to 1:10.
    cell = cv2.resize(cell, (0, 0), fx=round(1 / (ratio / 10), 2), fy=1)
- 图片加padding (此处上下加5),少部分概率下会导致识别错误
# Add 5 rows of white (255) above and below the cell; columns are left untouched.
cell = np.pad(cell, pad_width=((5, 5), (0, 0)), mode='constant', constant_values=255)