问题描述
我编译了以下代码来解析pdf文件。当我尝试获取列的长度(在新列中针对它的每个单元格)时,它引发了错误。我无法摆脱它。怎么做。请指导。
from PyPDF2 import PdfFileReader
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
opn_pdf1= open("DEC7366.pdf","rb")
read_pdf= PdfFileReader(opn_pdf1)
book1 = pd.read_excel("Book1.xlsx")
for i in range(0,read_pdf.getNumPages()):
page= read_pdf.getPage(i)
data= page.extractText()
nn = data.split("\n")
df = pd.DataFrame(nn,columns=['Extract'])
frames = (book1,df)
book1 = pd.concat(frames)
book1['Extract'] = book1['Extract'].replace(r'^\s+$',np.nan,regex=True)
#print(book1)
book2 = book1.dropna(how="all")
book2['Duplicate'] = book2['Extract'].duplicated( keep="first")
filter = book2['Duplicate']== True
book2['Duplicate'].where(filter,inplace = True)
book2 = book2.dropna(how="any")
del book2['Duplicate']
book2['Duplicate']="Yes"
book2['Extract']=book2['Extract'].str.replace(r'^\s+$',"")
book2 = book2.dropna(how='any',subset=["Extract"])
#book2['Length'] = book2.select_dtypes(object).sum(axis=1).str.len()
book2['Length']= book2['Extract'].apply(str).str.replace(',','/')
book2 = book2.where(book2['Length']>15)
book2['Duplicates'] = book2['Extract'].duplicated(keep="first")
filter = book2['Duplicates']== False
book2['Duplicates'].where(filter,inplace = True)
#del book2['Duplicates']
#book2['Duplicate']="Yes"
#del book2['Duplicates']
#book2 = book2.drop(columns=['Duplicates','Length',"Duplicate"])
book2 = book2.dropna(how="any")
book2.to_excel("AB1.xlsx",index=False)
TypeError: '>' not supported between instances of 'str' and 'int'
问题在于以下几行
book2['Length']= book2['Extract'].apply(str).str.replace(','/')
book2 = book2.where(book2['Length']>15)
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)