问题描述
enumerate
解决方法
感谢您的回答。
我的想法是根据每个 PDF 文件自身特定位置的内容,动态地重命名该文件。由于这些文件都是工资单,员工注册号和姓名在所有文件中都位于相同的位置。
import os,PyPDF2,re
def rename_pdfs(extraced_pdf_folder, rename_folder, *,
                name_start=1280, name_end=1309, prefix="nome "):
    """Rename every PDF under a folder after text found on its first page.

    The folder tree rooted at ``extraced_pdf_folder`` is walked recursively.
    For each ``.pdf`` file the text of page 0 is extracted and the characters
    in ``pdf_text[name_start:name_end]`` are used as the new file name.

    Args:
        extraced_pdf_folder: Root directory that is searched for PDF files.
        rename_folder: Directory the renamed files are moved into. It is
            created if it does not exist.
        name_start: Index of the first character of the name in the
            extracted page text (default 1280).
        name_end: Index one past the last character of the name
            (default 1309, i.e. characters 1280..1308 inclusive).
        prefix: Text placed in front of the extracted name.

    Returns:
        None. Files are moved on disk; each extracted name is printed.

    NOTE(review): ``PdfFileReader`` / ``getPage`` / ``extractText`` are the
    legacy PyPDF2 names; newer releases spell them ``PdfReader`` / ``pages``
    / ``extract_text`` -- confirm against the installed version.
    """
    # Characters that cannot appear in a Windows file name become "-".
    forbidden = str.maketrans({ch: "-" for ch in '\\/:*?"<>|'})

    os.makedirs(rename_folder, exist_ok=True)

    # traverse down through the root directory to sub-directories
    for root, _dirs, files in os.walk(extraced_pdf_folder):
        for filename in files:
            _basename, extension = os.path.splitext(filename)
            # Only PDFs can be parsed; skip everything else.
            if extension.lower() != ".pdf":
                continue

            fullpath = os.path.join(root, filename)

            # The handle must be closed before the rename: Windows refuses
            # to move a file that is still open.
            with open(fullpath, "rb") as pdf_file_obj:
                pdf_reader = PyPDF2.PdfFileReader(pdf_file_obj)
                pdf_text = pdf_reader.getPage(0).extractText()

            # Iterating a string yields single characters, so the positions
            # are character offsets: take the whole range as ONE slice
            # instead of renaming once per character.
            employee = " ".join(pdf_text[name_start:name_end].split())
            employee = employee.translate(forbidden)
            if not employee:
                print("no name found in " + fullpath + ", skipped")
                continue

            print(employee)
            # rename the pdf based on the information in the pdf
            os.rename(fullpath,
                      os.path.join(rename_folder, prefix + employee + ".pdf"))
# parameter variables
# root_dir is not referenced by the visible code; kept for compatibility.
root_dir = r"C:\query\pdf\rename_pdf"
# Folder holding the PDFs to rename.
extract_to = r"C:\query\pdf\extracted"
# Folder passed to rename_pdfs as the destination.
rename_to = r"C:\query\pdf\renamed"

# Only touch the file system when run as a script, so importing this module
# does not rename anything as a side effect.
if __name__ == "__main__":
    rename_pdfs(extract_to, rename_to)
,
我参照链接中的做法,改用正则表达式解决了这个问题。
谢谢大家
# Snippet intended to run inside rename_pdfs, after pdf_text has been
# extracted; it relies on pdf_text, pdf_file_obj, fullpath and rename_folder
# from that scope.

# The text is already in memory, so release the handle up front: it is then
# closed exactly once, even when the anchor is not found, and before the
# rename (Windows cannot move an open file).
pdf_file_obj.close()

# Use only the first occurrence of the anchor text. Renaming once per match
# would fail on a second match because the file has already been moved.
index = re.search("SHERAZ0101", pdf_text)
if index is not None:
    anchor_end = index.end()
    # Fixed offsets relative to the anchor; they are specific to this
    # payslip layout.
    doc_ext = (pdf_text[anchor_end + 488:anchor_end + 521]
               + pdf_text[anchor_end + 471:anchor_end + 477])
    # Strip characters that are not wanted in the file name.
    doc_num = "" + doc_ext.replace('\n\n',"").replace('/',"-").replace('SPECI',"").replace(":","").replace(" ","").replace(" .pdf",".pdf")
    print(doc_ext)
    # rename the pdf based on the information in the pdf
    os.rename(fullpath, os.path.join(rename_folder, doc_num + ".pdf"))