问题描述
我正在用 Python 脚本生成 JSON 文件,但 for 循环结束后,最终结果里只保留了最后一次迭代写入的值,之前的条目都丢失了。代码如下。
1. 读取水印文件:
# Build the watermark document from the archives listed under the input
# prefix and upload it to the output bucket.
watermark_file = config_dict["watermark_file"] + "watermark.json"
current_date,flag = read_watermark_file(config_dict.get("out_bucket"),watermark_file)
contents = list_s3_files(opt={'Bucket': config_dict['inp_bucket'],'Prefix': config_dict['inp_location']})
print("contents :",contents)
for n in range(len(contents)):
    # NOTE: watermark_json is re-created on every iteration, so each pass
    # throws away the entries collected so far and only the last item is
    # left after the loop. This is the behaviour the question is about.
    watermark_json = {}
    loop = {}
    # Last path component, e.g. "Immunology_SLE_Data_202008131800_01.zip".
    zipped_fileName = contents[n].split("/")[-1]
    # Text before the first underscore.
    therapeutic_area = re.match("(.*?)_(.*)",zipped_fileName)[1]
    # Text between the first and the second underscore.
    indication = re.match("(.*?)_(.*?)_(.*)",zipped_fileName)[2]
    print("value of n:",n)
    loop['item_' + str(n)] = {"therapeutic_area": therapeutic_area,"indication": indication,"s3_path": config_dict["inp_location"] + therapeutic_area + "/" + indication + "/"}
    print("loop :",loop)
    watermark_json.update(loop)
    print("watermark_json :",watermark_json)
# update water mark file
print("watermark_file :",watermark_file)
# The method is lowercase now(); "Now" does not exist on datetime.datetime.
watermark_json['date_dir'] = datetime.datetime.now().strftime("%Y/%m/%d/%H") + "/"
watermark_json['processed_flag'] = False
print("final watermark file ",watermark_json)
# refresh watermark file
write_to_s3(config_dict['out_bucket'],watermark_file,watermark_json,config_dict)
日志:
2020-08-23T23:00:43.055+05:30
contents : ['mdit/cord/data/inBox/Immunology_COVID-19_Data_202008061200_09.zip','mdit/cord/data/inBox/Immunology_SLE_Data_202008131800_01.zip','mdit/cord/data/inBox/Neurology_ALZ_Data_202008031800_01.zip']
2020-08-23T23:00:43.055+05:30
value of n: 0
2020-08-23T23:00:43.055+05:30
loop : {'item_0': {'therapeutic_area': 'Immunology','indication': 'COVID-19','s3_path': 'mdit/cord/data/inBox/Immunology/COVID-19/'}}
2020-08-23T23:00:43.055+05:30
watermark_json : {'item_0': {'therapeutic_area': 'Immunology','indication': 'COVID-19','s3_path': 'mdit/cord/data/inBox/Immunology/COVID-19/'}}
2020-08-23T23:00:43.055+05:30
value of n: 1
2020-08-23T23:00:43.055+05:30
loop : {'item_1': {'therapeutic_area': 'Immunology','indication': 'SLE','s3_path': 'mdit/cord/data/inBox/Immunology/SLE/'}}
2020-08-23T23:00:43.055+05:30
watermark_json : {'item_1': {'therapeutic_area': 'Immunology','indication': 'SLE','s3_path': 'mdit/cord/data/inBox/Immunology/SLE/'}}
2020-08-23T23:00:43.055+05:30
value of n: 2
2020-08-23T23:00:43.055+05:30
loop : {'item_2': {'therapeutic_area': 'Neurology','indication': 'ALZ','s3_path': 'mdit/cord/data/inBox/Neurology/ALZ/'}}
2020-08-23T23:00:43.055+05:30
watermark_json : {'item_2': {'therapeutic_area': 'Neurology','indication': 'ALZ','s3_path': 'mdit/cord/data/inBox/Neurology/ALZ/'}}
2020-08-23T23:00:43.055+05:30
watermark_file : mdit/cord/technical_Metadata/watermark/watermark.json
2020-08-23T23:00:43.055+05:30
final watermark file
{'item_2': {'therapeutic_area': 'Neurology','indication': 'ALZ','s3_path': 'mdit/cord/data/inBox/Neurology/ALZ/'},'date_dir': '2020/08/23/17/','processed_flag': False}
期望的输出:
{
"loop": {
"item_0":{
"therapeutic_area": "Immunology","indication": "SLE","s3_path": "mdit/cord/data/inbound/Immunology/SLE/"
},"item_1":{
"therapeutic_area": "Immunology","indication": "COVID-19","s3_path": "mdit/cord/data/inbound/Immunology/COVID-19/"
},"item_2":{
"therapeutic_area": "Neurology","indication": "ALZ","s3_path": "mdit/cord/data/inbound/Immunology/ALZ/"
}
},"date_dir": "2020/08/23/12/","processed_flag": false
}
实际的输出:
{
"item_2": {
"therapeutic_area": "Neurology","indication": "ALZ","s3_path": "mdit/cord/data/inBox/Neurology/ALZ/"
},"date_dir": "2020/08/23/17/","processed_flag": false
}
我在代码中做错了什么?
解决方法
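代码行为异常的原因是 watermark_json = {} 这一行位于 for n in range(len(contents)): 循环内部:每次迭代都会把字典重置为空,所以循环结束后只剩下最后一个条目。这条初始化语句应该放在 for 循环之前。
此外,要得到期望的输出结构,还需要把各个条目放到 "loop" 键下面,也就是把 watermark_json 初始化为 {'loop': {}},并在循环中更新 watermark_json['loop']。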
您可以尝试以下代码:
# Rebuild the watermark document: one entry per archive found under the
# input prefix, collected under the "loop" key, followed by the bookkeeping
# fields, then written to the output bucket.
watermark_file = config_dict["watermark_file"] + "watermark.json"
current_date, flag = read_watermark_file(config_dict.get("out_bucket"), watermark_file)
contents = list_s3_files(opt={'Bucket': config_dict['inp_bucket'], 'Prefix': config_dict['inp_location']})
print("contents :", contents)

# Created once, before the loop, so entries accumulate instead of being
# reset on every iteration.
watermark_json = {'loop': {}}

# Group 1 is the text before the first underscore, group 2 the text between
# the first and second underscore, e.g. "Immunology_SLE_Data_....zip".
file_name_pattern = re.compile(r"(.*?)_(.*?)_(.*)")

for n, s3_key in enumerate(contents):
    zipped_fileName = s3_key.split("/")[-1]
    matched = file_name_pattern.match(zipped_fileName)
    if matched is None:
        # Fail with the offending key instead of an opaque
        # "'NoneType' object is not subscriptable".
        raise ValueError(
            f"file name does not look like <area>_<indication>_...: {s3_key!r}"
        )
    therapeutic_area, indication = matched[1], matched[2]
    print("value of n:", n)
    loop = {
        'item_' + str(n): {
            "therapeutic_area": therapeutic_area,
            "indication": indication,
            "s3_path": config_dict["inp_location"] + therapeutic_area + "/" + indication + "/",
        }
    }
    print("loop :", loop)
    watermark_json['loop'].update(loop)
    print("watermark_json :", watermark_json)

# update water mark file
print("watermark_file :", watermark_file)
watermark_json['date_dir'] = datetime.datetime.now().strftime("%Y/%m/%d/%H") + "/"
watermark_json['processed_flag'] = False
print("final watermark file ", watermark_json)
# refresh watermark file
write_to_s3(config_dict['out_bucket'], watermark_file, watermark_json, config_dict)