拆箱 opencv 矩形

问题描述

我正在对一批 PDF 文件进行 OCR。整体效果不错,但部分 PDF 被涂黑处理过(redacted)。实际上,这些涂黑区域并不是真正的黑线,而是“内部带有一些文字的矩形”。即使使用单词列表来匹配 '(10)(2e)' 的各种组合,这些文本仍然会干扰我的 OCR 结果。

我使用的是 .jpg 格式的图像,它们由 PDF 转换而来;这些 PDF 同时包含机器可读文本和(内含文字的)图像。这是一个示例:

enter image description here

由于 '(10)(2e)' 的许多变体弄乱了我的 OCR,我的目标是找到所有矩形 - 最有可能包含 '(10)(2e)' 并填充它们。 为了找到矩形,我遵循了来自 nathancy 的这个很好的答案:How to detect all rectangular boxes python opencv without missing anything

但是 - 正如您在上面的绿色矩形中看到的那样 - 有时绿色矩形会与我需要的部分数据重叠。在本例中,“@leiden.nl”和“@”在第二行。

我已经尝试了多种组合:(a) 其他图像处理设置(腐蚀/膨胀/模糊/阈值化 threshold),以及 (b) nathancy 的答案中建议调整的其他设置(卷积核 kernel 大小/迭代次数)。

寻找较小矩形的最佳做法是什么?

仅供参考:我查找矩形的代码或多或少与 Nathancy 的回答相似:

# https://stackoverflow.com/questions/59979760/how-to-detect-all-rectangular-boxes-python-opencv-without-missing-anything
import cv2

import os
path = os.getcwd()
print(path+'/test_ocr3/_stuff_IN/')

# Load image, grayscale, adaptive threshold.
# Other inputs tried during experimentation (all under _stuff_OUT/):
#   1.png, page_1.jpg, page_1_A_erode_551.jpg, page_1_B_dilate_551.jpg,
#   page_1_D_threshold_177255.jpg
image_file = path+'/test_ocr3/_stuff_OUT/'+'page_1_opt.jpg'
image = cv2.imread(image_file)
if image is None:
    # cv2.imread does not raise on a missing/unreadable file; it returns None.
    raise FileNotFoundError('cannot read image: ' + image_file)
result = image.copy()  # untouched copy of the input; not used further below
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# THRESH_BINARY_INV: dark pixels (ink, redaction boxes) become white foreground.
thresh = cv2.adaptiveThreshold(
    gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 51, 9
)

# Fill rectangular contours.
# TODO: check other contour retrieval settings to exclude the outer contour?
# https://docs.opencv.org/master/d9/d8b/tutorial_py_contours_hierarchy.html
# https://medium.com/analytics-vidhya/opencv-findcontours-detailed-guide-692ee19eeb18
# Alternative tried: cv2.RETR_EXTERNAL instead of cv2.RETR_CCOMP.
cnts = cv2.findContours(thresh, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
# findContours returns 2 values in OpenCV 2.x/4.x and 3 values in 3.x.
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    # Thickness -1 fills the contour interior with white.
    cv2.drawContours(thresh, [c], -1, (255, 255, 255), -1)

# Morph open: with a wide, flat kernel this is meant to erase thin text
# strokes while keeping the large filled boxes.
# Alternative tried: cv2.MORPH_CLOSE instead of cv2.MORPH_OPEN.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 4))
opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=4)

# Draw rectangles around whatever survived the opening.
cnts = cv2.findContours(opening, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    x, y, w, h = cv2.boundingRect(c)
    # (36, 255, 12) is green in BGR order; 3 px outline.
    # Pass thickness -1 instead of 3 to draw the rectangle filled.
    cv2.rectangle(image, (x, y), (x + w, y + h), (36, 255, 12), 3)

# Alternative output name used earlier: 1_OUT.png
out_file = path+'/test_ocr3/_stuff_OUT/'+'page_1_0_TST_OUT.jpg'
if not cv2.imwrite(out_file, image):
    # cv2.imwrite reports failure through its return value, not an exception.
    raise OSError('cannot write image: ' + out_file)

解决方法

# https://stackoverflow.com/questions/59979760/how-to-detect-all-rectangular-boxes-python-opencv-without-missing-anything
import cv2
import os

path = os.getcwd()
print(path + '/test_ocr3/_stuff_IN/')

# Load image, grayscale, adaptive threshold.
# Other inputs tried during experimentation (all under _stuff_OUT/):
#   1.png, page_1.jpg, page_1_A_erode_551.jpg, page_1_B_dilate_551.jpg,
#   page_1_D_threshold_177255.jpg
image_file = path+'/test_ocr3/_stuff_OUT/'+'page_1_opt.jpg'
image = cv2.imread(image_file)
if image is None:
    # cv2.imread does not raise on a missing/unreadable file; it returns None.
    raise FileNotFoundError('cannot read image: ' + image_file)
result = image.copy()  # untouched copy of the input; not used further below
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# THRESH_BINARY_INV: dark pixels (ink, redaction boxes) become white foreground.
thresh = cv2.adaptiveThreshold(
    gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 51, 9
)

# Fill rectangular contours.
# TODO: check other contour retrieval settings to exclude the outer contour?
# https://docs.opencv.org/master/d9/d8b/tutorial_py_contours_hierarchy.html
# https://medium.com/analytics-vidhya/opencv-findcontours-detailed-guide-692ee19eeb18
# Alternative tried: cv2.RETR_EXTERNAL instead of cv2.RETR_CCOMP.
cnts = cv2.findContours(thresh, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
# findContours returns 2 values in OpenCV 2.x/4.x and 3 values in 3.x.
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    # Thickness -1 fills the contour interior with white.
    cv2.drawContours(thresh, [c], -1, (255, 255, 255), -1)
    # Key step: re-draw the same contour's outline in black, 1 px thick.
    # This cuts touching boxes apart so that a connected region is not
    # later detected as one large rectangle.
    cv2.drawContours(thresh, [c], -1, (0, 0, 0), 1)

# Morph open with a narrower kernel than the question's (30, 4), so that
# smaller boxes survive the opening.
# Alternative tried: cv2.MORPH_CLOSE instead of cv2.MORPH_OPEN.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 4))
opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=4)

# Draw rectangles around whatever survived the opening.
cnts = cv2.findContours(opening, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    x, y, w, h = cv2.boundingRect(c)
    # (36, 255, 12) is green in BGR order; 3 px outline.
    # Pass thickness -1 instead of 3 to draw the rectangle filled.
    cv2.rectangle(image, (x, y), (x + w, y + h), (36, 255, 12), 3)

# Alternative output name used earlier: 1_OUT.png
out_file = path+'/test_ocr3/_stuff_OUT/'+'page_1_0_TST_OUT.jpg'
if not cv2.imwrite(out_file, image):
    # cv2.imwrite reports failure through its return value, not an exception.
    raise OSError('cannot write image: ' + out_file)

(图 1:修改后的二值图)因为我没有更高分辨率的图像,所以对图像做了修改:我手动擦除了大框,并将边缘锐化到 1px。如果这张图与您的原始图像不一致,请上传更高分辨率的版本,我会据此更正。

关键是 `cv2.drawContours(thresh, [c], -1, (0, 0, 0), 1)`。这一步用 1px 的黑色轮廓线把一个大框(您想要移除的)切分成若干小框。如果没有这一步,相连的区域会被识别为一个大框,从而把本不该删除的信息也一并去掉。

图 2 对比了您问题中的结果与我的答案在大框处的差异;图 3 展示了我的答案的最终结果。