限制tesseract OCR识别的字符数

问题描述

我要屏蔽的图像大致如下所示。我把 tesseract 当作库来使用,遍历多张图像,识别其中的数字并将其屏蔽。每张图像中有 12 位数字,我希望在第 8 位数字之后就停止识别/屏蔽,也就是说只需要屏蔽前 8 位数字。我曾尝试通过裁剪图像的方式把范围硬编码,因为大多数图像的版式比较相似,但位置稍有偏差的图像就会出错。有没有办法让 tesseract 在第 8 位数字之后停止屏蔽?我使用 cv2.rectangle 来屏蔽图像。

# Question code, pass 1: for every JPEG under D:/dataset/allpdf/data, crop the
# image to the bounding box of its detected edges, OCR a sub-region for
# per-character boxes, draw a filled rectangle and save the result as a PDF.
#
# Location of the Tesseract executable.
# NOTE(review): the answer further down spells this attribute
# pytesseract.pytesseract.tesseract_cmd (all lowercase); the capitalisation
# here looks altered -- confirm against the import the asker actually used.
PyTesseract.PyTesseract.tesseract_cmd = "D:/Tess/tesseract.exe"


for imgfilepathactual in glob.iglob('D:/dataset/allpdf/data/*.jpeg'):
    
    # Reduce the globbed path to a bare file stem by splitting on "/", then on
    # a backslash, then on ".". This depends on the exact directory depth and on
    # the last separator being a backslash; a name containing extra dots is cut
    # at the first dot.
    imgfilepath2 = imgfilepathactual.split("/")[3]
    imgfilepath1 = imgfilepath2.split('\\')[1]
    imgfilepath = imgfilepath1.split(".")[0]
    #print(filepath)
    print(imgfilepath)
    

 
            
    # Load the source image as a single-channel grayscale array.
    img = cv2.imread('D:/dataset/allpdf/data/' + imgfilepath + '.jpeg',cv2.IMREAD_GRAYSCALE) #if using with pdf conv
    #print(str(img))


    # Gradient images used to find where the content is.
    # NOTE(review): OpenCV's Python binding is presumably cv2.Sobel (capital S),
    # and the second call passes no ddepth/dx/dy arguments, unlike the first --
    # these lines look truncated; confirm.
    sobelX1 = cv2.sobel(img,cv2.CV_64F,1,ksize = 1)
    sobelY1 = cv2.sobel(img,ksize = 1)

    # Take magnitudes and convert to 8-bit.
    sobelX1 = np.uint8(np.absolute(sobelX1))
    sobelY1 = np.uint8(np.absolute(sobelY1))

    # Computed but not read again in this loop (the blur below uses sobelX1).
    sobelCombined1 = cv2.bitwise_or(sobelX1,sobelY1)

    blurred = cv2.blur(sobelX1,(3,3)) #for pdf->img


    canny = cv2.Canny(blurred,5,250)

    # Row/column indices of every non-zero edge pixel; their min and max give
    # the bounding box of the detected edges.
    pts1 =np.argwhere(canny>0)
    y11,x11 = pts1.min(axis=0)
    y21,x21 = pts1.max(axis=0)

    # Crop the grayscale image to that bounding box.
    cropped = img[y11:y21,x11:x21]
    #cv2.imwrite("cropped.png",cropped)
    # Resize to (1080,720) and round-trip through a temporary JPEG file.
    resizedimage = cv2.resize(cropped,(1080,720),interpolation=cv2.INTER_CUBIC) #actual 
    cv2.imwrite('resizedimage' + imgfilepath + '.jpeg',resizedimage)

    # Re-read without a flag; the three-way unpack expects a 3-dimensional shape.
    img = cv2.imread('resizedimage' + imgfilepath + '.jpeg')
    h1,w1,_= img.shape

   

    # Crop the temporary file in place.
    # NOTE(review): Box1 has only two values while Box2 below has four; this
    # tuple looks truncated -- confirm the intended crop box.
    resizedimage = Image.open('resizedimage' + imgfilepath + '.jpeg')
    Box1 = (0,0.90*h1)

    resizedimage = resizedimage.crop(Box1)
    resizedimage.save('resizedimage' + imgfilepath + '.jpeg')

    # Reload the cropped file; this `img` is the array the rectangle is drawn on.
    img = cv2.imread('resizedimage' + imgfilepath + '.jpeg')
    h2,w2,_ = img.shape

    print ((h2,w2),"reso")

    # Second crop: x from 0 to 0.48*w2 and y from 0.75*h2 to h2, saved under a
    # separate 'croppedimg2...' file name. This sub-image is what gets OCR'd.
    croppedimg2 = Image.open('resizedimage' + imgfilepath + '.jpeg')
    Box2 = (0,0.75*h2,0.48*w2,h2)

    croppedimg2 = croppedimg2.crop(Box2)
    croppedimg2.save('croppedimg2' + imgfilepath + '.jpeg')

    # Per-character box output from Tesseract, one character per text line.
    # NOTE(review): the answer below calls this image_to_boxes (lowercase b);
    # the capital B here looks altered -- confirm.
    aadharBoxes = PyTesseract.image_to_Boxes(croppedimg2,lang = "eng")#,config=' --psm 7 -c tessedit_char_whitelist=0123456789abcdefghijklmnopqrstuvwxyz'
    #adharBoxes = aadharBoxes[0:8]
    for b in aadharBoxes.splitlines():
        # b becomes the list of space-separated fields of ONE box line.
        b = b.split(' ')
        high = []
        # NOTE(review): each b[i] is already a single field with no spaces, so
        # b[i].split(' ') yields a one-element list and b[i][2] cannot exist.
        # Presumably the intent was to index the first eight LINES of
        # aadharBoxes, as the answer does -- confirm.
        for i in range(8):
            b[i] = b[i].split(' ')
            high.append(int(b[i][2]))
            high.append(int(b[i][4]))
        #b = b[0:8]
        print(b)
        #print(len(b[0]))
        # Filled rectangle (thickness -1) drawn on `img`, i.e. the image loaded
        # before the Box2 crop, using coordinates OCR'd from croppedimg2.
        # NOTE(review): the colour tuple has two components; presumably (0,0,0)
        # was meant -- confirm. Tesseract box y values are also not flipped here,
        # whereas the PDF loop below uses h2 - y.
        maskedImage = cv2.rectangle(img,(int(b[0][1]),np.min(high)),(int(b[7][3]),np.max(high)),(0,0),-1)
        # The image and PDF are rewritten on every iteration of the box loop.
        cv2.imwrite("maskedImage" + imgfilepath + ".jpeg",maskedImage)
        pdf = img2pdf.convert("maskedImage"+ imgfilepath + ".jpeg")
        file = open("D:/dataset/allpdf/masked_files/masked" + imgfilepath + ".pdf","wb")
        file.write(pdf)
        file.close()
    #else:
        #pdf = img2pdf.convert("unmaskedImage"+ resizedimage + ".jpeg")
        #file = open("D:/dataset/allpdf/masked_files/masked" + imgfilepath + ".pdf","wb")
        #file.write(pdf)
        #file.close()
        
    # Only the temporary resized image is deleted; the other two removals are
    # commented out.
    os.remove('resizedimage' + imgfilepath + '.jpeg')
    #os.remove('maskedImage' + imgfilepath + '.jpeg')
    #os.remove('croppedimg2' + imgfilepath + '.jpeg')
      


# Question code, pass 2: for every PDF under D:/dataset/allpdf, render it to a
# JPEG, then OCR a sub-region and draw a rectangle per recognised character
# box before writing the result back out as a PDF.
# NOTE(review): several lines in this loop appear to have lost text (see the
# notes below); as shown, the loop is not valid Python.
for filepathactual in glob.iglob('D:/dataset/allpdf/*.pdf'):
    #print(filepathactual)
    # Reduce the globbed path to a bare file stem by splitting on "/", then on
    # a backslash, then on ".".
    filepath2 = filepathactual.split("/")[2]
    filepath1 = filepath2.split("\\")[1]
    filepath = filepath1.split(".")[0]
    print(filepath)

    # Renders the current PDF to images and saves each page as JPEG. Every page
    # is saved under the same file name, so the file ends up holding the last
    # page. The 500 is presumably the render DPI -- confirm.
    def convertPdf2img():
        pages = convert_from_path(filepathactual,500)  #converting pdf to img
        for page in pages:
            page.save('out' + filepath + '.jpg','JPEG')


    
    convertPdf2img()
    


            
    img = cv2.imread('out' + filepath + '.jpg',cv2.IMREAD_GRAYSCALE) #if using with pdf conv



    # NOTE(review): the next three statements look like remnants of the longer
    # Sobel / blur / Canny / crop / resize sequence in the JPEG loop. sobelY1 and
    # resizedimage are read here before anything in this loop assigns them, and
    # cv2.blur is given an `interpolation` keyword. Confirm against the original.
    sobelX1 = cv2.sobel(img,sobelY1)

    blurred = cv2.blur(img,interpolation=cv2.INTER_CUBIC) #actual 
    cv2.imwrite('resizedimage' +filepath + '.jpeg',resizedimage)

    # NOTE(review): unpacks two values here, where the JPEG loop unpacks three
    # (h1,w1,_) from the same kind of read -- likely truncated.
    img = cv2.imread('resizedimage' +filepath + '.jpeg')
    h1,_= img.shape

   

    # NOTE(review): Box1 has two values while Box2 below has four -- likely
    # truncated.
    resizedimage = Image.open('resizedimage' +filepath + '.jpeg')
    Box1 = (0,0.90*h1)

    resizedimage = resizedimage.crop(Box1)
    resizedimage.save('resizedimage' +filepath + '.jpeg')

    # NOTE(review): the line after the read is a fragment with an unmatched ")";
    # it is presumably what remains of an `h2,w2,_ = img.shape` assignment and
    # the "reso" print seen in the JPEG loop. h2 and w2 are used below but are
    # not assigned anywhere in this loop as shown.
    img = cv2.imread('resizedimage' +filepath + '.jpeg')
    h2,"reso")

    # Crop x from 0 to 0.6*w2 and y from 0.65*h2 to h2; this sub-image is OCR'd.
    croppedimg2 = Image.open('resizedimage' +filepath + '.jpeg')
    Box2 = (0,0.65*h2,0.6*w2,h2)

    croppedimg2 = croppedimg2.crop(Box2)
    croppedimg2.save('croppedimg2' + filepath + '.jpeg')

    aadharBoxes = PyTesseract.image_to_Boxes(croppedimg2,lang = "eng")

    # One rectangle per recognised character: every box line is drawn, so
    # nothing here limits the masking to the first eight characters.
    for b in aadharBoxes.splitlines():
        b = b.split(' ')
        # y values are flipped with h2 - y before drawing.
        # NOTE(review): the fourth argument is -1 and no colour tuple is passed,
        # unlike the JPEG loop's call -- a colour argument seems to be missing.
        maskedImage = cv2.rectangle(img,(int(b[1]),h2 - int(b[2])),(int(b[3]),h2 - int(b[4])),-1)
        #print(b,"coords")
        # The image and PDF are rewritten on every iteration of the box loop.
        cv2.imwrite("maskedImage" + filepath + ".jpeg",maskedImage)
        pdf = img2pdf.convert("maskedImage"+ filepath + ".jpeg")
        file = open("D:/dataset/allpdf/masked_files/masked" + filepath + ".pdf","wb")
        file.write(pdf)
        file.close()
        


    #print(w,h)
    # Delete all four intermediate files for this PDF.
    os.remove('out' + filepath + '.jpg')
    os.remove('resizedimage' + filepath + '.jpeg')
    os.remove('maskedImage' + filepath + '.jpeg')
    os.remove('croppedimg2' + filepath + '.jpeg')
      

解决方法

只取前八个字符的边界框,并用一个覆盖这八个字符范围的矩形来标出(屏蔽)它们,第八个字符之后的内容不作处理。

import pytesseract
import numpy as np
import cv2

def first_chars_rect(boxes_text, img_height, count=8):
    """Return the OpenCV rectangle enclosing the first *count* OCR characters.

    Args:
        boxes_text: Text returned by ``pytesseract.image_to_boxes``: one line
            per character, ``<char> <left> <bottom> <right> <top> <page>``,
            with y measured upwards from the bottom edge of the image.
        img_height: Height in pixels of the image that was OCR'd; needed to
            convert Tesseract's bottom-origin y values to OpenCV's top-origin.
        count: How many leading characters the rectangle must cover.

    Returns:
        ``((x_left, y_top), (x_right, y_bottom))`` in OpenCV pixel coordinates,
        ready to pass as ``pt1`` / ``pt2`` to ``cv2.rectangle``.

    Raises:
        ValueError: If ``count`` is below 1 or fewer than ``count`` characters
            were recognised.
    """
    if count < 1:
        raise ValueError("count must be at least 1")

    # Split from the right so the five numeric fields are always found, whatever
    # the recognised character is; blank lines (e.g. a trailing newline) are
    # skipped.
    rows = [line.rsplit(' ', 5) for line in boxes_text.splitlines() if line.strip()]
    if len(rows) < count:
        raise ValueError(
            "expected at least {} character boxes, got {}".format(count, len(rows))
        )

    selected = rows[:count]
    lefts = [int(row[1]) for row in selected]
    bottoms = [int(row[2]) for row in selected]
    rights = [int(row[3]) for row in selected]
    tops = [int(row[4]) for row in selected]

    # Tesseract's "top" is the larger y (origin bottom-left); flipping with
    # img_height turns it into the smaller y that OpenCV expects for the top.
    top_left = (min(lefts), img_height - max(tops))
    bottom_right = (max(rights), img_height - min(bottoms))
    return top_left, bottom_right


if __name__ == "__main__":
    img = cv2.imread('muTYX.jpg')
    if img is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise FileNotFoundError('muTYX.jpg')

    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    # --psm 7 treats the image as a single text line.
    text = pytesseract.image_to_boxes(img,lang = 'eng',config='--psm 7 --oem 3')

    # Outline only the first eight recognised characters; pass thickness -1 and
    # a black colour instead to mask them.
    top_left, bottom_right = first_chars_rect(text, img.shape[0], 8)
    cv2.rectangle(img,top_left,bottom_right,(0,255,0),2)
    cv2.imshow('nubmer',img)
    cv2.waitKey(0)

enter image description here

enter image description here