在 Python 中将 PDF 转换为文本时,基于图像的文件返回空结果

问题描述

我有这个 PDF 文件,它是一个基于图像的低分辨率 PDF 文件。我正在尝试提取其中的数据,但我尝试过的所有方法似乎都不起作用。

选项 1 - 使用 pdfminer

from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO


def convert_pdf_to_txt(path):
    """Return the text that pdfminer extracts from the PDF at *path*.

    Every page of the document is fed through a ``TextConverter`` that
    writes into an in-memory buffer; the buffer's contents are returned once
    all pages have been processed.

    Args:
        path: Location of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text as a single string.  Pages that consist only of
        images carry no text for pdfminer to extract, so the result can be
        empty for scanned documents.

    Raises:
        OSError: If *path* cannot be opened.
        Any exception pdfminer raises while parsing is propagated unchanged.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    password = ""
    maxpages = 0  # presumably "no page limit" in pdfminer -- confirm in its docs
    caching = True
    pagenos = set()  # empty set: no explicit page selection is passed

    # The context managers guarantee the buffer and the file are released
    # even if pdfminer raises part-way through the document.
    with StringIO() as retstr, open(path, 'rb') as fp:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before it is closed on leaving the `with`.
            return retstr.getvalue()
        finally:
            device.close()

选项 2 - 使用 tika

# Option 2: extract the text through the `tika` package.
# `path` is not assigned in this snippet; it must already hold the PDF's location.
from tika import parser # pip install tika
# The parse result is indexed by key; the 'content' entry is taken as the text.
raw = parser.from_file(path)
text=raw['content']
# I don't like to use it very much because it often corrupts the file

选项 3 - 使用 PyPDF2

    import PyPDF2

    # Option 3: read the first page with PyPDF2.
    # `path` is not assigned in this snippet; it must already hold the PDF's location.
    # The `with` block closes the file handle deterministically once the page
    # text has been pulled out, instead of leaving it open.
    with open(path,'rb') as pdf_file:
        read_pdf = PyPDF2.PdfFileReader(pdf_file)
        number_of_pages = read_pdf.getNumPages()
        # Only the first page (index 0) is extracted here.
        page = read_pdf.getPage(0)
        page_content = page.extractText()
    # encode() turns the extracted str into UTF-8 bytes.
    # NOTE(review): PdfFileReader/getNumPages/getPage/extractText are the legacy
    # PyPDF2 names; check them against the installed PyPDF2 version.
    text=page_content.encode('utf-8')

所有选项都返回空结果。我想这可能与文件的质量有关。我知道可以先对图像进行处理、增强图像特征以简化数据提取(增大图像尺寸、做阈值处理等,用 PIL 可以做很多事情)。有没有一种有效的方法来处理这类 PDF 文件?

解决方法

我只试过从非扫描的 PDF 中提取文本,我记得 pdfminer 给出的结果最好。不过,this 可能对你有帮助,此外还有一些其他的 Python OCR 库可用于此目的。

,

最后我想出了一个不理想的解决方案,但使用 pdfminer 和 pytesseract 对我有用:

from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convert_pdf_image_to_text(file_path):
    """OCR every page of the PDF at *file_path* and return the combined text.

    The document is converted with ``pdf2image.convert_from_path`` at 350 dpi
    and each resulting page is passed to ``pytesseract.image_to_string``.
    The per-page strings are concatenated in page order with no separator.

    Args:
        file_path: Location of the PDF file to run OCR on.

    Returns:
        The recognised text of all pages as one string; an empty string when
        the conversion yields no pages.

    Raises:
        Whatever ``pdf2image`` or ``pytesseract`` raise (for example when the
        file cannot be converted) is propagated to the caller.
    """
    # Imported lazily so the OCR dependencies are only needed when this
    # fallback is actually used.
    from pdf2image import convert_from_path
    import pytesseract

    dpi = 350 # dots per inch
    pages = convert_from_path(file_path,dpi)

    # Join once instead of growing a string page by page.
    return "".join(pytesseract.image_to_string(page) for page in pages)

def convert_pdf_to_txt(path, extract_image_pdfs=True):
    """Extract the text of the PDF at *path*, falling back to OCR if needed.

    The document is first run through pdfminer.  If the extracted text
    contains neither "a" nor "A" it is treated as an image-only PDF and, when
    *extract_image_pdfs* is true, the text is re-extracted with
    ``convert_pdf_image_to_text``.

    Args:
        path: Location of the PDF file; it is opened in binary mode.
        extract_image_pdfs: Whether to attempt the OCR fallback when pdfminer
            finds no usable text.  Defaults to ``True``.

    Returns:
        The pdfminer text, or the OCR text when the fallback ran, or the
        literal string ``"no text"`` when the OCR fallback itself failed.

    Raises:
        OSError: If *path* cannot be opened.
        Any exception pdfminer raises while parsing is propagated unchanged;
        only failures of the OCR fallback are swallowed.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    password = ""
    maxpages = 0  # presumably "no page limit" in pdfminer -- confirm in its docs
    caching = True
    pagenos = set()  # empty set: no explicit page selection is passed

    # The context managers guarantee the buffer and the file are released
    # even if pdfminer raises part-way through the document.
    with StringIO() as retstr, open(path, 'rb') as fp:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before it is closed on leaving the `with`.
            text = retstr.getvalue()
        finally:
            device.close()

    # extracting data from image pdfs
    # Heuristic: these PDFs always contain an "a", so text without the letter
    # in either case means pdfminer found nothing useful.  The flag is tested
    # first and the two membership tests are joined with `and`; a bare
    # `x or y and flag` would bypass the flag because `and` binds tighter.
    if extract_image_pdfs and "a" not in text and "A" not in text:
        try:
            print('starting to convert to image')
            text = convert_pdf_image_to_text(path)
            print('finished converting to image')
        except Exception:
            # Best effort: a failed OCR pass yields a placeholder rather than
            # an error, but KeyboardInterrupt/SystemExit are no longer caught.
            text = "no text"
            print("not pdf nor image")

    return text