问题描述
我正在尝试用 Python 将 PDF 文件转换为文本格式。转换本身已经成功,但在某些情况下,值的位置不正确:例如,原本位于段落中间的值会出现在转换后文本的末尾(或其他不该出现的位置)。有没有人遇到过这种情况?这与我处理的 PDF 类型有关吗?任何帮助或信息都将不胜感激。为了便于理解,我附上了 PDF 和输出结果的截图。
----代码----
from pdfminer.converter import pdfpageAggregator
from pdfminer.layout import LAParams,LTTextBox,LTTextLine
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import pdfpageInterpreter,PDFResourceManager
from pdfminer.pdfpage import pdfpage,PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser
from PyPDF2 import PdfFileReader
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager,pdfpageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import pdfpage
from io import StringIO,BytesIO
def readPDF(pdfFile):
    """Extract the text of a PDF with pdfminer and return it as one string.

    Args:
        pdfFile: Binary file-like object holding the PDF data; it is passed
            unchanged to ``PDFPage.get_pages``.

    Returns:
        Everything pdfminer's ``TextConverter`` wrote while the pages were
        processed, in the order ``get_pages`` yielded them.
    """
    # pdfminer exports these classes as ``PDFPageInterpreter`` and ``PDFPage``
    # (upper-case "PDF"); the lower-case spellings are not defined by the
    # package, so the real names are imported here where they are used.
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    rsrcmgr = PDFResourceManager()
    # ``with`` guarantees the text buffer is released even if a page fails.
    with StringIO() as retstr:
        device = TextConverter(
            rsrcmgr, retstr, codec='utf-8', laparams=LAParams()
        )
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # An empty page set and maxpages=0 place no restriction on which
            # pages are read, so the whole document is processed.
            for page in PDFPage.get_pages(
                pdfFile,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            ):
                interpreter.process_page(page)
        finally:
            # Close the converter on both the success and the error path.
            device.close()
        # Read the buffer before the ``with`` block closes it.
        return retstr.getvalue()
if __name__ == "__main__":
    # Copy the PDF into memory inside a ``with`` block so the on-disk file
    # handle is closed as soon as the bytes have been read.
    with open("file location", 'rb') as scrape:
        pdfFile = BytesIO(scrape.read())
    # BytesIO is a context manager: the buffer is closed even when
    # extraction or printing raises.
    with pdfFile:
        outputString = readPDF(pdfFile)
        print(outputString)
解决方法
您实际上并不需要那么多导入,因为有一个较新的库 PyPDF2 可以完成同样的工作。
根据我的经验,它不会漏掉任何内容。
不要忘记先安装该库:pip install PyPDF2
import PyPDF2

# NOTE(review): PdfFileReader / numPages / getPage / extractText are the
# legacy PyPDF2 names; newer releases rename them — confirm the installed
# version before running.

# Open the desired PDF; ``with`` closes the handle once extraction is done.
with open('1.pdf', 'rb') as pdfFile:
    pdfreader = PyPDF2.PdfFileReader(pdfFile)
    # Extract every page (index 0 .. numPages-1) instead of only the last
    # one, so the whole document ends up in the output.
    text = "".join(
        pdfreader.getPage(index).extractText()
        for index in range(pdfreader.numPages)
    )

# Append the extracted text to 1.txt. An explicit UTF-8 encoding avoids
# platform-dependent encode errors on non-ASCII characters.
with open("1.txt", "a", encoding="utf-8") as file1:
    file1.write(text)