问题描述
我想在 pytesseract.image_to_data 返回的字典 d 的 d['text'] 值中,搜索 terms 列表里的一组特定字符串。我编写了一个函数 check(),匹配成功时它能返回正确的坐标,但是对于所有不匹配的搜索,它却返回一个默认坐标(['left','top','width','height'])。
我之所以使用 split,是因为要搜索的字符串是这样的短语:["Print Name of Subscriber","Investor's Name"],而 pytesseract.image_to_data 返回的 d['text'] 是逐词拆分的形式:['Print','Name','of','Subscriber']。
pytesseract.image_to_data 返回一个描述图像的字典,其 keys() 为:['level','page_num','block_num','par_num','line_num','word_num','left','top','width','height','conf','text']。我只关心 'text' 以及坐标 'left'、'top'、'width'、'height'。没有匹配项时,函数本应返回 None,但它却返回了一个默认坐标。
代码:
import pytesseract
from pytesseract import Output
import cv2

# Path of the image that OCR is run on.
image_path = 'image/out.jpg'
# Phrases to look for in the OCR result; check() splits each one into words.
terms = ["Print Name of Subscriber", "Investor's Name", "Name of Investor"]

image = cv2.imread(image_path)
# Output.DICT makes image_to_data return a dict of lists
# ('text', 'left', 'top', 'width', 'height', ...) instead of a TSV string.
d = pytesseract.image_to_data(image, output_type=Output.DICT)
def check(word, d):
    """Search ``d['text']`` for the words of *word* as a consecutive run.

    Args:
        word: Phrase to look for; it is split on whitespace into words.
        d: Dict with lists under 'text', 'left', 'top', 'width' and
            'height' that are indexed with the same position.

    Returns:
        ``[left, top, width, height]`` read at index ``i``, where the
        compared words are ``d['text'][i + 1:i + 1 + len(phrases)]``, or
        ``None`` if the loop ends without any window passing the test.
    """
    phrases = word.split()
    for i, _ in enumerate(d['text']):
        # The window starts one entry after i, so the box returned below is
        # the one stored for the entry just before the compared words.
        to_match = d['text'][i + 1:i + 1 + len(phrases)]
        # NOTE: zip() stops at the shorter sequence, so a truncated or empty
        # window at the end of the list still satisfies all(). This is why
        # phrases that are not present come back with the last entry's box
        # instead of None.
        if all(p == m for p, m in zip(phrases, to_match)):
            return [d[key][i] for key in ('left', 'top', 'width', 'height')]
    return None
for term in terms:
    # Keep the result under its own name: assigning it to ``check`` would
    # replace the function with its return value, so the call in the next
    # iteration would fail.
    coords = check(term, d)
    print(coords)
最后一项匹配时,它返回 [93,975,148,20],这是正确的。而对于 terms 中在图像里找不到匹配的其他条目,它返回的是默认值 [151,2078,170,8]。我试过在函数的 else 分支中返回 None,但这样一来连匹配成功的项也返回 None 了。
d的值:
{'level': [1,2,3,4,5,5],'page_num': [1,1,1],'block_num': [0,6,7,8,9,10,11,12,13,14,14],'par_num': [0,'line_num': [0,2],'word_num': [0,15,16,17,18,19,20,21,22,23,'left': [0,93,145,308,513,616,694,767,348,653,94,155,207,243,355,489,518,570,167,199,300,384,441,472,204,269,372,471,500,237,306,411,516,95,173,248,294,362,394,476,507,553,665,765,816,875,907,974,1009,1127,1270,1306,1403,228,275,323,461,478,508,554,675,818,885,966,1062,1106,1276,1300,1346,1415,177,229,288,320,397,424,451,497,568,595,713,172,251,299,360,404,528,586,621,662,692,758,790,832,994,1068,1123,1159,1200,1281,1316,1371,1402,249,282,345,409,477,496,544,600,629,688,730,852,908,937,978,1135,1168,1237,1315,1393,139,194,241,325,379,422,494,519,642,670,754,791,924,1000,1029,1069,1226,1263,1331,1400,129,198,223,264,396,462,538,581,635,660,845,886,1016,1058,1102,1134,1253,1352,1420,200,256,389,408,456,602,649,739,897,944,1001,1024,1157,1198,1355,1389,152,265,357,416,531,601,693,762,825,860,961,996,1052,1114,1178,1214,1258,1307,1382,195,255,315,358,446,501,546,650,724,804,849,883,952,977,1088,1108,1137,1183,1303,169,257,296,512,681,716,839,865,1007,1060,1079,1201,1330,135,235,292,488,529,661,168,225,277,386,428,506,614,669,712,769,793,891,939,1034,1064,1098,1186,266,298,479,524,673,717,751,827,861,904,991,1095,1184,352,191,221,268,246,274,889,985,1014,1091,1122,1301,1277,376,503,986,1010,98,150,1431,227,151],'top': [0,158,272,273,407,415,474,542,543,610,612,613,618,648,651,652,687,691,689,756,801,795,797,831,833,834,835,870,873,871,872,874,878,912,916,911,909,947,948,949,955,988,989,1026,1092,1096,1093,1130,1138,1133,1132,1202,1199,1266,1274,1267,1333,1401,1469,1522,1524,1590,1591,1594,1593,1657,1658,1661,1660,1727,1778,1782,1780,1785,1847,1849,1913,1914,1916,2043,2058,2078,2078],'width': 
[1530,43,153,197,69,64,42,234,284,109,599,52,27,102,123,433,65,92,75,50,55,101,57,90,83,134,59,97,96,1342,68,35,24,70,61,40,47,56,132,86,34,125,39,104,38,113,133,72,87,36,162,37,60,25,632,71,115,48,32,58,151,26,1345,45,1344,49,78,77,31,28,62,46,121,112,122,74,82,84,44,54,91,1341,53,80,1343,30,117,114,124,107,624,33,1152,1150,66,100,1144,51,142,157,324,89,149,63,1286,259,79,484,88,491,85,193,1337,29,175,170],'height': [2164,161,103,290,8],'conf': ['-1','-1',81,'text': ['','','THE','COLCHESTER','MULTI-STRATEGY','GLOBAL','BOND','FUND','PLC','Additional','Subscription','Form','(FOR','USE','BY','EXISTING','INVESTORS','IN','COMPANY)','Name','of','Investor','(Please','Print','or','Type)','northern','Trust','Account','Number','(if','kNown):','Designation','Code','(nominee','accounts','only):','Please','select','the','Class','Shares','Company','in','which','you','wish','to','make','an','additional','subscription','by','marking','appropriate','tick','Box','Appendix','1','this','Form.','contact','Administrator','if','invest','is','not','listed','1.','ensure','send','completed','form','fax','email','Administrator.','Forms','sent','should','be','on','+353','1542','2902.','If','email,','please','that','read','and','comply','with','‘Terms','&','Conditions','Service','for','instructions','issued','via',"Email'",'can','found','Application','Emails','accordance','procedures','will','processed','may','result','delays','your','investment.','require','same','day','dealing,'received','no','later','than','Trade','Cut-Off','Time','(1pm','Irish','time).','Cleared','funds','must','6pm','(Irish','time)','falling','three','Business','Days','after','Dealing','Day.','The','receiving',"banks'",'details','2','banking','settlement','redemptions','(and','applicable,'distribution','proceeds)','are','different','from','those','provided','Kindly','note','Trade','executed','until','deemed','good','order.','Preferred','date','left','blank,'shares','earliest','available','Day):','(dd/mm/yyyy)','Amount','S
ubscription:','Either','shares:','(amount','purchased','words)','figures)','Or','amount','relevant','currency:','GS','COLCHESTER*','Genero']}
解决方法
当 to_match 为空列表时,all() 对空序列返回 True,函数因此会错误地返回一组坐标。需要修正判断条件,使 to_match 为空时不执行 return:
d = {
# removed irrelevant stuff
'level': [],'page_num': [],'block_num': [],'par_num': [],'line_num': [],'word_num': [],'conf': [],# keep important data
'left': [0,93,145,308,513,616,694,767,348,653,94,155,207,243,355,489,518,570,167,199,300,384,441,472,204,269,372,471,500,237,306,411,516,95,173,248,294,362,394,476,507,553,665,765,816,875,907,974,1009,1127,1270,1306,1403,228,275,323,461,478,508,554,675,818,885,966,1062,1106,1276,1300,1346,1415,177,229,288,320,397,424,451,497,568,595,713,172,251,299,360,404,528,586,621,662,692,758,790,832,994,1068,1123,1159,1200,1281,1316,1371,1402,249,282,345,409,477,496,544,600,629,688,730,852,908,937,978,1135,1168,1237,1315,1393,139,194,241,325,379,422,494,519,642,670,754,791,924,1000,1029,1069,1226,1263,1331,1400,129,198,223,264,396,462,538,581,635,660,845,886,1016,1058,1102,1134,1253,1352,1420,200,256,389,408,456,602,649,739,897,944,1001,1024,1157,1198,1355,1389,152,265,357,416,531,601,693,762,825,860,961,996,1052,1114,1178,1214,1258,1307,1382,195,255,315,358,446,501,546,650,724,804,849,883,952,977,1088,1108,1137,1183,1303,169,257,296,512,681,716,839,865,1007,1060,1079,1201,1330,135,235,292,488,529,661,168,225,277,386,428,506,614,669,712,769,793,891,939,1034,1064,1098,1186,266,298,479,524,673,717,751,827,861,904,991,1095,1184,352,191,221,268,246,274,889,985,1014,1091,1122,1301,1277,376,503,986,1010,98,150,1431,227,151],'top': [0,158,272,273,407,415,474,542,543,610,612,613,618,648,651,652,687,691,689,756,801,795,797,831,833,834,835,870,873,871,872,874,878,912,916,911,909,947,948,949,955,988,989,1026,1092,1096,1093,1130,1138,1133,1132,1202,1199,1266,1274,1267,1333,1401,1469,1522,1524,1590,1591,1594,1593,1657,1658,1661,1660,1727,1778,1782,1780,1785,1847,1849,1913,1914,1916,2043,2058,2078,2078],'width': 
[1530,43,153,197,69,64,42,234,284,109,599,52,27,102,123,21,433,65,23,92,75,50,22,55,101,57,90,83,134,59,97,96,1342,68,35,24,70,17,61,40,47,56,132,86,34,125,39,104,8,38,113,133,72,87,36,162,18,37,60,25,632,19,14,71,115,48,32,58,151,26,1345,16,45,1344,49,78,77,31,28,62,46,121,112,122,74,82,15,84,44,54,91,1341,53,80,13,1343,30,117,114,124,107,624,33,1152,1150,66,100,1144,51,142,157,324,89,149,63,1286,259,79,484,88,491,85,193,1337,29,175,4,170],'height': [2164,161,20,12,103,290,8],'text': ['','','THE','COLCHESTER','MULTI-STRATEGY','GLOBAL','BOND','FUND','PLC','Additional','Subscription','Form','(FOR','USE','BY','EXISTING','INVESTORS','IN','COMPANY)','Name','of','Investor','(Please','Print','or','Type)','Northern','Trust','Account','Number','(if','known):','Designation','Code','(nominee','accounts','only):','Please','select','the','Class','Shares','Company','in','which','you','wish','to','make','an','additional','subscription','by','marking','appropriate','tick','box','Appendix','1','this','Form.','contact','Administrator','if','invest','is','not','listed','1.','ensure','send','completed','form','fax','email','Administrator.','Forms','sent','should','be','on','+353','1542','2902.','If','email,','please','that','read','and','comply','with','‘Terms','&','Conditions','Service','for','instructions','issued','via',"Email'",'can','found','Application','Emails','accordance','procedures','will','processed','may','result','delays','your','investment.','require','same','day','dealing,'received','no','later','than','Trade','Cut-Off','Time','(1pm','Irish','time).','Cleared','funds','must','6pm','(Irish','time)','falling','three','Business','Days','after','Dealing','Day.','The','receiving',"banks'",'details','2','banking','settlement','redemptions','(and','applicable,'distribution','proceeds)','are','different','from','those','provided','Kindly','note','trade','executed','until','deemed','good','order.','Preferred','date','left','blank,'shares','earliest','available','Day):','(dd/mm/
yyyy)','Amount','Subscription:','Either','shares:','(amount','purchased','words)','figures)','Or','amount','relevant','currency:','GS','COLCHESTER*','Genero']}
def check(word, d):
    """Debugging variant: print the compared window whenever it passes.

    Args:
        word: Phrase to look for; it is split on whitespace into words.
        d: Dict with lists under 'text', 'left', 'top', 'width' and
            'height' that are indexed with the same position.

    Returns:
        ``[left, top, width, height]`` read at index ``i`` for the first
        window ``d['text'][i + 1:i + 1 + len(phrases)]`` that passes the
        test, or ``None`` if no window does.
    """
    phrases = word.split()
    for i, _ in enumerate(d['text']):
        to_match = d['text'][i + 1:i + 1 + len(phrases)]
        if all(p == m for p, m in zip(phrases, to_match)):
            print(to_match)  # analyze whats wrong
            return [*map(lambda x: d[x][i], ['left', 'top', 'width', 'height'])]
    return None


terms = ["Print Name of Subscriber", "Investor's Name", "Name of Investor"]
for term in terms:
    ch = check(term, d)  # rename, dont overwrite your function
    print(ch)
调试输出:
[] # print(to_match)
[151,2078,170,8]
[] # print(to_match)
[151,2078,170,8]
['Name','of','Investor'] # print(to_match)
[94,26]
并修复它:
def check(word, d):
    """Return the box stored just before the first full match of *word*.

    Args:
        word: Phrase to look for; it is split on whitespace into words.
        d: Dict with lists under 'text', 'left', 'top', 'width' and
            'height' that are indexed with the same position.

    Returns:
        ``[left, top, width, height]`` read at index ``i`` when
        ``d['text'][i + 1:i + 1 + len(phrases)]`` equals the words of
        *word*; ``None`` when the phrase is empty or never matches.
    """
    phrases = word.split()
    if not phrases:
        # Nothing to search for; an empty phrase must not match anything.
        return None
    for i, _ in enumerate(d['text']):
        to_match = d['text'][i + 1:i + 1 + len(phrases)]
        # dont return if to_match is [] (aka falsy) or shorter than the
        # phrase: comparing the whole window rejects both cases, whereas
        # all() over zip() accepts them because zip() stops early.
        if to_match == phrases:
            # NOTE(review): the box is read at i, the entry right before the
            # matched words; presumably that is the enclosing line entry.
            # Confirm against real OCR output.
            return [d[key][i] for key in ('left', 'top', 'width', 'height')]
    return None
新输出:
None
None
[94,26]