问题描述
您好,我有以下两个文件,我想在这两个文件之间找到匹配项。 Test1.csv 中的每条记录最多可以匹配Test2.csv 中的一条记录,但Test1.csv 中的多个记录可以匹配Test2.csv 中的同一条记录。 我正在匹配 name 和 domainWithExtension 列。
这是代码:
import csv
import logging
import optparse
import os
import re

import dedupe
from unidecode import unidecode
def preProcess(column):
    """Normalize one CSV field so fuzzy matching compares like with like.

    Transliterates to ASCII, strips punctuation that carries no matching
    signal, collapses whitespace and lowercases.  Returns ``None`` for
    fields that are empty after cleaning (dedupe's "missing" marker).

    BUG FIX: the original ``re.sub`` calls on '/', "'", ',', ':' and ' +'
    were missing the replacement argument entirely, which raises
    ``TypeError: sub() missing 1 required positional argument``.
    """
    column = unidecode(column)
    column = re.sub('\n', ' ', column)
    column = re.sub('-', '', column)
    column = re.sub('/', ' ', column)
    column = re.sub("'", '', column)
    column = re.sub(",", '', column)
    column = re.sub(":", ' ', column)
    # Collapse any run of spaces (including those introduced above).
    column = re.sub(' +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    if not column:
        column = None
    return column
def readData(filename):
    """Read a CSV file into a dict of cleaned records.

    Each row is preprocessed field-by-field with ``preProcess`` and stored
    under a unique record ID built from the filename plus the row index,
    so IDs never collide across the two input files.
    """
    records = {}
    with open(filename, encoding='utf-8') as csv_file:
        for row_index, raw_row in enumerate(csv.DictReader(csv_file)):
            cleaned = {field: preProcess(value)
                       for field, value in raw_row.items()}
            records[filename + str(row_index)] = cleaned
    return records
if __name__ == '__main__':
    # ---- command line / logging ------------------------------------------
    optp = optparse.OptionParser()
    optp.add_option('-v', '--verbose', dest='verbose', action='count',
                    help='Increase verbosity (specify multiple times for more)'
                    )
    (opts, args) = optp.parse_args()
    log_level = logging.WARNING
    if opts.verbose:
        if opts.verbose == 1:
            log_level = logging.INFO
        elif opts.verbose >= 2:
            log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

    output_file = 'data_matching_output.csv'
    settings_file = 'data_matching_learned_settings'
    training_file = 'data_matching_training.json'
    left_file = 'Test1.csv'
    right_file = 'Test2.csv'

    print('importing data ...')
    data_1 = readData(left_file)
    data_2 = readData(right_file)

    # ---- build or load the linker ----------------------------------------
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)
    else:
        # BUG FIX: the original fields list was truncated and syntactically
        # invalid.  Both match columns get a complete field definition.
        # NOTE(review): 'has missing' on domainWithExtension assumed — the
        # original only showed it for 'name'; confirm against the data.
        fields = [
            {'field': 'name', 'type': 'String', 'has missing': True},
            {'field': 'domainWithExtension', 'type': 'String',
             'has missing': True},
        ]
        linker = dedupe.RecordLink(fields)

        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.prepare_training(data_1, data_2,
                                        training_file=tf,
                                        sample_size=15000)
        else:
            # BUG FIX: RecordLink.prepare_training needs BOTH data sets;
            # the original call dropped data_2.
            linker.prepare_training(data_1, data_2, sample_size=15000)

        print('starting active labeling...')
        dedupe.console_label(linker)
        linker.train()

        with open(training_file, 'w') as tf:
            linker.write_training(tf)
        with open(settings_file, 'wb') as sf:
            linker.write_settings(sf)

    # ---- link the two data sets ------------------------------------------
    print('clustering...')
    # 'many-to-one': each data_1 record links to at most one data_2 record,
    # but several data_1 records may share the same data_2 record.
    # BUG FIX: join() takes both data sets; the original omitted data_2.
    linked_records = linker.join(data_1, data_2, 0.5,
                                 constraint='many-to-one')
    print(linked_records)
    print('# duplicate sets', len(linked_records))

    # Map each record ID to its cluster ID and confidence score.
    cluster_membership = {}
    for cluster_id, (cluster, score) in enumerate(linked_records):
        for record_id in cluster:
            cluster_membership[record_id] = {'Cluster ID': cluster_id,
                                             'Link score': score}
    print(cluster_membership)

    # ---- write annotated output ------------------------------------------
    # BUG FIX: newline='' is required when handing a file to csv.writer,
    # otherwise blank rows appear on Windows (see csv module docs).
    with open(output_file, 'w', encoding="utf-8", newline='') as f:
        header_unwritten = True
        for fileno, filename in enumerate((left_file, right_file)):
            with open(filename, encoding="utf-8") as f_input:
                reader = csv.DictReader(f_input)
                if header_unwritten:
                    fieldnames = (['Cluster ID', 'Link score', 'source file'] +
                                  reader.fieldnames)
                    writer = csv.DictWriter(f, fieldnames=fieldnames)
                    writer.writeheader()
                    header_unwritten = False
                for row_id, row in enumerate(reader):
                    # Must mirror the ID scheme used in readData().
                    record_id = filename + str(row_id)
                    cluster_details = cluster_membership.get(record_id, {})
                    row['source file'] = fileno
                    row.update(cluster_details)
                    writer.writerow(row)
这有效并给出以下结果:
“Boxaround”的记录在 Test1.csv 中出现两次。因此,我希望这两条记录都与 Test2.csv 中的“Boxaround”记录匹配,并且输出中的集群 ID 应该相同;但实际输出中,集群 ID 4 只包含其中两条记录,另一条“Boxaround”记录的集群 ID 却是 0。我希望所有三条“Boxaround”记录都具有相同的集群 ID 4。我怎样才能做到这一点?请帮忙。
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)