问题描述
This is the link to the pdf file from which I want to extract data
def onlyenglish(text):
import re
alphabet_regular_expression = re.compile("[^a-zA-Z|()]")
text = re.sub(alphabet_regular_expression,"",text)
return text
annexure2page1 = tabula.read_pdf(file,pages = 1,lattice = True,relative_area=True)
annexure2page1_df1= annexure2page1[0]
annexure2page1_df2 = annexure2page1_df1[['एयिपोर्च\rAIRPORT','वायुयाि प्रर्ालि (संख्या में)\rAIRCRAFT MOVEMENTS (IN NOS.)','Unnamed: 4','Unnamed: 8','Unnamed: 10']]
annexure2page1_df2 = annexure2page1_df2.replace('\r',' ',regex=True)`
annexure2page1_df2['ReportMonth'] = reportmonth
annexure2page1_df2['एयरपोर्\rAIRPORT'] = annexure2page1_df2['एयरपोर्\rAIRPORT'].str.title()
annexure2page1_df2['Airports'] = annexure2page1_df2['एयरपोर्\rAIRPORT'].apply(lambda x: onlyenglish(str(x)))
annexure2page1_df2 = annexure2page1_df2.rename(columns={'वरयुयरन प्रचरलन (िंख्यर म )\rAIRCRAFT MOVEMENTS (IN NOS.)':'value','Unnamed: 8':'value_ytm','Unnamed: 4':'value_smly','Unnamed: 10':'value_ytmly'})
annexure2page1_df2 = annexure2page1_df2.replace(r'^\s*$',np.nan,regex=True)
annexure2page1_df3 = annexure2page1_df2.dropna()
annexure2page1_df3["Service"] = "International"
annexure2page1_df3["Metric"] = "ATMs"
annexure2page1_df3['ReportName'] = reportname
annexure2page1_df3['reportlink'] = file
##extracting page 1
annexure2page1extraction = annexure2page1_df3[['ReportName','reportlink','ReportMonth','Airports','Service','Metric','value','value_smly','value_ytm','value_ytmly']]
错误堆栈
> KeyError Traceback (most recent call last)
<ipython-input-14-9c5d09fa538a> in <module>()
2 annexure2page1_df1= annexure2page1[0]
3 #
----> 4 annexure2page1_df2 = annexure2page1_df1[['एयिपोर्च\rAIRPORT','Unnamed: 10']]
5 annexure2page1_df2 = annexure2page1_df2.replace('\r',regex=True)
6 annexure2page1_df2['ReportMonth'] = reportmonth
2 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _validate_read_indexer(self,key,indexer,axis,raise_missing)
1302 if raise_missing:
1303 not_found = list(set(key) - set(ax))
-> 1304 raise KeyError(f"{not_found} not in index")
1305
1306 # we skip the warning on Categorical
KeyError: "['वायुयाि प्रर्ालि (संख्या में)\\rAIRCRAFT MOVEMENTS (IN NOS.)','एयिपोर्च\\rAIRPORT'] not in index"
解决方法
替换这行代码-->
<a>
<b/>
<c>value1</c>
<d>e</d>
<c>value2</c>
<d>e</d>
<c>value3</c>
<d>e</d>
</a>
与 --> annexure2page1_df2 = annexure2page1_df1[['एयिपोर्च\rAIRPORT','वायुयाि प्रर्ालि (संख्या में)\rAIRCRAFT MOVEMENTS (IN NOS.)','Unnamed: 4','Unnamed: 8','Unnamed: 10']]
您得到的错误 (annexure2page1_df2 = annexure2page1_df1.iloc[:,[3,5,7,11,13]]
) 是因为找不到密钥。
因此,我通过直接提供要子集化的列的索引位置来绕过它。 Check out the screenshot