问题描述
我是 Python 新手,正在尝试了解如何把一些操作自动化。我有一个文件夹,其中的 5 个 csv 文件每天都会更新,但有时其中一两个文件在某一天不会更新。目前我必须手动检查这个文件夹。我希望改为自动完成这件事:如果某个 csv 文件在过去 24 小时内没有更新,程序就给我自己发送一封电子邮件来提醒我。
我的代码:
import datetime
import glob
import os
import smtplib
import string
# Work out how long ago the most recently created CSV file appeared.
Now = datetime.datetime.today() #Get current date (naive local time)
list_of_files = glob.glob('c:/Python/*.csv') # * means all if need specific format then *.csv
# NOTE: max() raises ValueError when the pattern matches no file at all.
latest_file = max(list_of_files,key=os.path.getctime) #get latest file created in folder
# fromtimestamp() yields naive *local* time, matching Now above; mixing it
# with utcfromtimestamp() would skew the difference by the UTC offset.
newestFileCreationDate = datetime.datetime.fromtimestamp(os.path.getctime(latest_file)) # get creation datetime of last file
dif = (Now - newestFileCreationDate) #time elapsed between now and the last creation date
logFile = "c:/Python/log.log" #defining a log file
def checkFolder(dif,Now,logFile):
    """Log the folder status and send an alert email when it is stale.

    Args:
        dif: ``datetime.timedelta`` giving the age computed by the caller.
        Now: timestamp written at the start of the log line and email text.
        logFile: path of the log file; it is opened in append mode.

    When ``dif`` exceeds one day an alert is sent through the SMTP server
    configured below and a ``[WARNING]`` line is logged; otherwise an
    ``[OK]`` line is logged. Errors raised by ``smtplib`` or by opening the
    log file propagate to the caller.
    """
    if dif > datetime.timedelta(days = 1): #Check if difference between today and last created file is greater than 1 days
        HOST = "12.55.13.12" #This must be your smtp server ip
        SUBJECT = "Alert! At least 1 day wthout a new file in folder xxxxxxx"
        TO = "xx.t@gmail.com"
        FROM = "xx.t@gmail.com"
        text = "%s - The oldest file in folder it's %s old " %(Now,dif)
        # str.join replaces string.join(), which no longer exists in Python 3.
        # The empty element separates the headers from the message body.
        BODY = "\r\n".join((
            "From: %s" % FROM,"To: %s" % TO,"Subject: %s" % SUBJECT,"",text
        ))
        # The context manager closes the connection even if sendmail() raises.
        with smtplib.SMTP(HOST) as server:
            server.sendmail(FROM,[TO],BODY)
        status = "WARNING"
    else: # If difference between today and last creation file is less than 1 days
        status = "OK"
    with open(logFile,"a") as log: #Open log file in append mode
        log.write("%s - [%s] The oldest file in folder it's %s old \n" %(Now,status,dif)) #Write a log
checkFolder(dif,Now,logFile) #Call function and pass the 3 arguments defined before (dif, Now, logFile)
但是,这段代码运行时会报错。我只想通过邮件收到文件夹中尚未更新的那些文件的通知,无论未更新的是 5 个文件中的某一个,还是全部 5 个。
解决方法
使用纯python,简洁的方式
import hashlib
import glob
import json
import smtplib
from email.message import EmailMessage
import time
import schedule #pip install schedule
# Read files 64 KiB at a time so large files are never loaded whole.
size = 64 * 1024
# CSV files in the current working directory (use an absolute path for crontab).
list_of_files = glob.glob('./*.csv')
# Module-level MD5 hash object.
hasher = hashlib.md5()
第 1 部分)首先运行此脚本,然后将其注释掉。它将创建一个带有文件哈希值的 json 文件。
# One-off bootstrap: store the current MD5 digest of every CSV file in
# hash.json so that later runs have a baseline to compare against.
first_hashes = {}
for x in list_of_files:
    # Use a fresh hash object per file. Feeding every file into one shared
    # object would make each digest depend on all files hashed before it.
    file_hasher = hashlib.md5()
    with open(x,'rb') as f:
        buf = f.read(size)
        while len(buf) > 0:
            file_hasher.update(buf)
            buf = f.read(size)
    first_hashes[x] = file_hasher.hexdigest()
with open('hash.json','w') as file:
    file.write(json.dumps(first_hashes,indent=2))
现在注释掉甚至删除它。
第 2 部分)自动化脚本:
def send_email():
    """Email the list of CSV files whose content did not change since the last run.

    Reads the module-level ``list_of_files`` and ``size``. The digests saved
    by the previous run are loaded from ``hash.json``, every listed file is
    re-hashed, and ``hash.json`` is rewritten with the new digests. Files
    whose digest equals their own previous digest are written to
    ``check_hash.txt`` and sent as an attachment; when every file changed,
    no email is sent.

    Raises:
        OSError: if ``hash.json`` or one of the CSV files cannot be opened.
        smtplib.SMTPException: if logging in or sending fails.
    """
    with open('hash.json') as f: #absolute path for crontab
        data = json.load(f)

    check_hash = {} #Contain hashes that have not changed
    for x in list_of_files:
        # Fresh hash object per file; a shared one would accumulate the
        # content of every file processed so far.
        file_hasher = hashlib.md5()
        with open(x,'rb') as f:
            buf = f.read(size)
            while len(buf) > 0:
                file_hasher.update(buf)
                buf = f.read(size)
        new_hash = file_hasher.hexdigest()
        # Compare with this file's own previous digest only. Looking the
        # digest up among all stored values would wrongly match a file that
        # merely became identical to some other file.
        if data.get(x) == new_hash:
            check_hash[x] = new_hash
        data[x] = new_hash

    #update our hashes
    with open('hash.json','w') as file: #absolute path for crontab
        file.write(json.dumps(data,indent=2))

    if not check_hash: #every file changed, nothing to report
        return

    filename="check_hash.txt" #absolute path for crontab
    report = json.dumps(check_hash,indent=2)
    #write to a text file named "check_hash.txt"
    with open(filename,'w') as f: #absolute path for crontab
        f.write(report)

    # for gmail smtp setup watch youtu.be/JRCJ6RtE3xU
    EMAIL_ADDRESS = 'SMTPAddress@gmail.com'
    EMAIL_PASSWORD = 'SMTPPassWord'
    msg = EmailMessage()
    msg['Subject'] = 'Unupdated files'
    msg['From'] = EMAIL_ADDRESS
    msg['To'] = 'receive@gmail.com'
    msg.set_content('These file(s) did not update:')
    # Attach the text already in memory instead of re-opening the file,
    # which previously left a file handle unclosed.
    msg.add_attachment(report,filename=filename)
    with smtplib.SMTP_SSL('smtp.gmail.com',465) as smtp:
        smtp.login(EMAIL_ADDRESS,EMAIL_PASSWORD)
        smtp.send_message(msg)
#for faster testing check other options here github.com/dbader/schedule
# Register the daily job, then keep the process alive, checking once per
# second whether the job is due.
schedule.every().day.at("10:30").do(send_email)
while True:
    schedule.run_pending()
    time.sleep(1)
编辑:如果你重新启动电脑,就需要再次运行这个文件来重新启动计划任务。为了避免这种情况,你可以按如下方式使用 crontab(参见 youtu.be/j-KgGVbyU08):
# mm hh DOM MON DOW command
30 10 * * * python3 path-to-file/email-script.py #Linux
30 10 * * * python path-to-file/email-script.py #Windows
如果当时电脑处于开机状态,这将在每天上午 10:30 运行脚本。为了更快地测试(每 1 分钟运行一次),请使用:
* * * * * python3 path-to-file/email-script.py
注意:如果您要使用 crontab,则必须对所有文件引用使用绝对路径,并替换下面这段调度代码:
# In-process scheduler loop.
schedule.every().day.at("10:30").do(send_email)
while True:
    schedule.run_pending()
    time.sleep(1)
改为:
# Run the check once when the file is executed as a script.
if __name__ == "__main__":
    send_email()
经过测试,效果很好!
你是想实现类似下面这样的功能吗?
import os
from datetime import datetime
import smtplib
import textwrap
def send_email_failure():
    """Send the alert email reporting how old the stalest file is.

    Reads the module-level ``oldest_time_hour`` for the message text, prints
    the raw message, and sends it through the SMTP server configured below.
    Errors raised by ``smtplib`` propagate to the caller.
    """
    SERVER = "12.55.13.12" #This must be your smtp server ip
    SUBJECT = "Alert! At least 1 day without a new file in folder xxxxxxx"
    TO = "xx.t@gmail.com"
    FROM = "xx.t@gmail.com"
    TEXT = "%s - The oldest file in folder it's %sh old " %(datetime.now(),oldest_time_hour)
    # TO is a single address string, so it is inserted as-is: joining it with
    # "," would put a comma between every character. The empty element
    # produces the blank line that must separate headers from the body.
    message = "\r\n".join((
        "From: %s" % FROM,
        "To: %s" % TO,
        "Subject: %s" % SUBJECT,
        "",
        TEXT,
    ))
    print(message)
    # Send the mail; the context manager closes the connection on error too.
    with smtplib.SMTP(SERVER) as server:
        server.sendmail(FROM,[TO],message)
def save_log(logFile,ok_or_failure,time_now,delta):
    """Append one status line to the log file.

    Args:
        logFile: path of the log file; it is opened in append mode.
        ok_or_failure: ``'ok'`` writes an ``[OK]`` line; any other value
            writes a ``[WARNING]`` line.
        time_now: timestamp placed at the start of the line.
        delta: age value placed in the message.
    """
    level = "OK" if ok_or_failure == 'ok' else "WARNING"
    # "with" guarantees the file is closed even if the write fails.
    with open(logFile,"a") as log:
        log.write("%s - [%s] The oldest file in folder it's %s old \n" %(time_now,level,delta))
def check_file(filename):
    """Return how many whole hours ago the CSV file *filename* was modified.

    Returns 0 when the name does not end in ``.csv``, when the file's
    modification time cannot be read, or when the modification time lies in
    the future.
    """
    print(filename)
    if not filename.endswith('.csv'):
        return 0
    print('csv')
    try:
        mtime = os.path.getmtime(filename) # get modified time
    except OSError:
        # An unreadable file has no meaningful age; report 0.
        return 0
    # Use the full elapsed time: timedelta.seconds holds only the
    # sub-day remainder, so it could never report 24 hours or more.
    age_seconds = datetime.now().timestamp() - mtime
    return max(0, int(age_seconds // 3600)) # convert to hours
# we check what files are in the dir 'files'
# and their modification time
# Largest age, in whole hours, reported by check_file for the files under
# './files'. send_email_failure reads this module-level name.
oldest_time_hour = 0
for path,dirs,files in os.walk('./files'): # this need to be modified by case
    for file in files:
        # get each file time of modification
        hours_old = check_file(os.path.join(path,file))
        # save the oldest time
        if hours_old > oldest_time_hour:
            oldest_time_hour = hours_old
# Hours are floored, so a value of 24 already means at least 24 hours old.
if oldest_time_hour >= 24:
    save_log('log.log','failure',datetime.now(),oldest_time_hour)
    send_email_failure()
else:
    # save_log takes four arguments; the timestamp must be passed here too.
    save_log('log.log','ok',datetime.now(),oldest_time_hour)
你还需要用一个无限循环来反复运行这个 python 脚本,或者设置一个 cron job(定时任务),让它每小时运行一次这个脚本。
为什么要检查 last_modified_date?我建议你用 md5 校验和来检查文件是否被修改。我的想法是,如果您有以下文件:
file1.csv
file2.csv
file3.csv
file4.csv
file5.csv
您可以检查他们的 md5 校验和并将结果 + DateTime 写入原始文件旁边的文件中。如下:
file1.csv
file1.csv_checksum
file1.csv_checksum 的内容
时间戳,校验和
1612820511,d41d8cd98f00b204e9800998ecf8427e
您可以使用以下代码检查文件的 md5:
>>> import hashlib
>>> hashlib.md5(open('filename.exe','rb').read()).hexdigest()
然后您可以使用校验和文件中提供的结果来检查结果(如果校验和文件不存在,则只需第一次创建)
我认为您可以通过这种方法轻松应对。
首先,我写了一个任务调度器装饰器,它可以让你以固定的延迟轮询一个目录:
import time
import functools
def scheduled(fixed_delay):
    """Build a decorator that sleeps after each call of the wrapped method.

    Args:
        fixed_delay: name of the attribute, looked up on the first positional
            argument of the wrapped callable (``self`` for methods), that
            holds the number of seconds to sleep.

    Returns:
        A decorator. The wrapped callable returns its original result after
        the sleep.
    """
    def decorator_scheduled(func):
        # Applied as a decorator so the wrapper keeps func's name and
        # docstring; calling wraps() as a bare statement has no effect.
        @functools.wraps(func)
        def wrapper_schedule(*args,**kwargs):
            result = func(*args,**kwargs)
            self = args[0]
            delay = getattr(self,fixed_delay)
            time.sleep(delay)
            return result
        return wrapper_schedule
    return decorator_scheduled
将其保存为名为 task_scheduler.py 的单独模块。 我将在我的文件观察器中使用它:
import logging
import os
import smtplib
import ssl
import time
from stat import S_ISREG

from task_scheduler import scheduled
class FileWatcher:
    """Poll a folder and email an alert for files left unmodified for over 24 hours."""

    def __init__(self,files_path='./myFiles',extension='.csv',poll_delay=2):
        """Store the configuration.

        Args:
            files_path: directory to scan.
            extension: file extension to watch, including the leading dot.
            poll_delay: seconds to sleep after each poll (read by the
                ``scheduled`` decorator on ``poll_``).
        """
        self.files_path = files_path
        self.extension = extension
        self.poll_delay = poll_delay

    def notify_host_on_nonchange(self,file_path):
        """Send an email saying that *file_path* has not changed."""
        port = 465
        smtp_server = "smtp.gmail.com"
        sender_email = "sender@gmail.com"
        receiver_email = "receiver@gmail.com"
        password = "Your password here" #You may want to read it from file
        message = f"No change in file: {file_path} for 24 hurs!"
        context = ssl.create_default_context()
        with smtplib.SMTP_SSL(smtp_server,port,context=context) as server:
            server.login(sender_email,password)
            server.sendmail(sender_email,receiver_email,message)

    def watch(self):
        """Call ``poll_`` repeatedly until interrupted from the keyboard."""
        try:
            while True:
                self.poll_()
        except KeyboardInterrupt:
            logging.getLogger(__name__).debug('Polling interrupted by user.')

    @scheduled("poll_delay")
    def poll_(self,):
        """Scan the folder once and notify for every stale matching file."""
        now = time.time()
        for f in os.listdir(self.files_path):
            full_path = os.path.join(self.files_path,f)
            _,file_ext = os.path.splitext(f)
            if file_ext != self.extension:
                continue
            path_stat = os.stat(full_path)
            if not S_ISREG(path_stat.st_mode):
                continue
            # Hours since the content was last modified. The subtraction is
            # parenthesised so the whole difference is converted to hours,
            # and st_mtime is used because st_ctime is not the modification
            # time (metadata change on Unix, creation on Windows).
            hours_since_change = (now - path_stat.st_mtime)/3600
            if hours_since_change <= 24:
                continue
            self.notify_host_on_nonchange(full_path)
if __name__ == "__main__":
    # Start polling with the default folder, extension and delay.
    FileWatcher().watch()
上面的类定义了一个 poll_ 函数,它借助 os.stat 函数来检查文件的修改时间。如果文件距上次修改不超过 24 小时,或者它不是普通文件(例如是一个目录),或者它的扩展名不是你要查找的扩展名,轮询就会跳过它;否则会调用通知函数发送电子邮件。示例中使用的是 gmail 的 smtp 服务器,你可以根据自己的环境进行更改。watch 函数是对连续轮询的一层包装。
该类改编自我的机器学习模型观察器和加载器,您可以从 my github 访问该版本和项目。有关装饰器和脚本的进一步说明,您可以查看 my medium post。
我不太了解 CSV,但我会导入 time 模块,并用 time.sleep 函数做一个定时器。time 模块的好处在于,你可以让程序在等待一段时间之后再给某个变量赋值。因此,你或许可以这样做,再加上一个 if 语句:当变量达到某个值时,就发送电子邮件。