问题描述
我一直尝试把成千上万张图像下载到本地文件系统(按目录分开保存),但每次下载到大约 5,000 张图像时就会抛出 asyncio.exceptions.TimeoutError 异常,导致下载无法正常完成。
我第一次执行下面的脚本时,成功下载了 16,000 张图像,但此后每次执行该脚本,下载到的图像数量都会减少,目前只能下载大约 5,000 张。
这是我实现的脚本:
import os
import asyncio
import aiofiles
import async_timeout
from aiohttp import ClientSession
from generator import generate_hash
from logger import logger
from typing import List,Dict,Any
async def download_file(session: Any, remote_url: str, filename: str) -> None:
    """Stream a single remote file to ``filename`` on the local disk.

    The whole operation (request plus writing every chunk) runs under one
    120-second ``async_timeout`` budget.

    Args:
        session: HTTP client session used to issue the GET request.
        remote_url: URL of the file to fetch.
        filename: Local path the response body is written to.

    Raises:
        asyncio.TimeoutError: Re-raised after logging when the budget is
            exceeded.
    """
    try:
        async with async_timeout.timeout(120):
            async with session.get(remote_url) as response:
                # Anything other than 200 is only logged; no file is written.
                if response.status != 200:
                    logger.error(f"Error to get {filename} from Remote Server")
                    return
                async with aiofiles.open(filename, mode='wb') as out_file:
                    # Copy the body in 1 KiB chunks instead of loading it
                    # into memory at once.
                    async for chunk in response.content.iter_chunked(1024):
                        await out_file.write(chunk)
    except asyncio.TimeoutError:
        logger.error(f"Timeout error to download {filename} into Local Server")
        raise
async def download_files(images: List[Dict[str, Any]], path: str) -> None:
    """Download every image in ``images`` concurrently over one session.

    Args:
        images: Records providing at least the ``'resource'`` key (the URL);
            each record is also passed to ``get_filename``.
        path: Base directory handed to ``get_filename`` for each record.

    Raises:
        Exception: ``asyncio.gather`` is called without
            ``return_exceptions``, so the first exception raised by any
            download propagates to the caller.
    """
    headers = {"user-agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
    async with ClientSession(headers=headers) as session:
        pending = []
        for image in images:
            # Read the URL before resolving the local path, so a record
            # without 'resource' fails before get_filename is called.
            source_url = image['resource']
            destination = get_filename(image, path)
            download = download_file(session, source_url, destination)
            pending.append(asyncio.ensure_future(download))
        await asyncio.gather(*pending)
def download_images(images: List[Dict[str, Any]], path: str) -> None:
    """Synchronous entry point: run ``download_files`` to completion.

    Args:
        images: Image records forwarded unchanged to ``download_files``.
        path: Base directory forwarded unchanged to ``download_files``.

    Raises:
        Exception: Any error from the download run is logged and re-raised.
    """
    try:
        loop = asyncio.get_event_loop()
        future = asyncio.ensure_future(download_files(images, path))
        loop.run_until_complete(future)
        logger.info('Images from Remote Server have been downloaded successfully')
    except Exception as error:
        logger.error(f'Error to download images from Remote Server: {error}')
        raise
def get_filename(image: Dict[str, Any], path: str) -> str:
    """Return the local path for ``image``, creating its directory if needed.

    The file is placed in ``<path>/<image['id']>`` and named
    ``<generate_hash(image['resource'])>.jpg``.

    Args:
        image: Record providing the ``'id'`` and ``'resource'`` keys.
        path: Base directory under which the per-image directory lives.

    Returns:
        The full path of the target ``.jpg`` file.
    """
    image_dir = '{}/{}'.format(path, image['id'])
    image_file = '{}.jpg'.format(generate_hash(image['resource']))
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(image_dir, exist_ok=True)
    return os.path.join(image_dir, image_file)
def main():
    """Download a small hard-coded sample of images."""
    # Every record must use the 'resource' key: that is the key the
    # download code reads the URL from.
    images = [
        {'id': '10755431', 'resource': 'http://image1.jpg'},
        {'id': '10755432', 'resource': 'http://image2.jpg'},
        {'id': '101426201', 'resource': 'http://image3.jpg'},
    ]
    images_path = '/home/stivenramireza'
    download_images(images, images_path)


if __name__ == "__main__":
    main()
我收到此错误:
ERROR:root:Timeout error to download /home/stivenramireza/10755431/664e3bdd10cd69452774f38ec822a9eb.jpg into Local Server
ERROR:root:Error to download images from Remote Server:
Traceback (most recent call last):
File "/home/stivenramireza/storage/main.py",line 17,in download_file
async for data in response.content.iter_chunked(1024):
File "/home/stivenramireza/.local/lib/python3.8/site-packages/aiohttp/streams.py",line 39,in __anext__
rv = await self.read_func()
File "/home/stivenramireza/.local/lib/python3.8/site-packages/aiohttp/streams.py",line 368,in read
await self._wait('read')
File "/home/stivenramireza/.local/lib/python3.8/site-packages/aiohttp/streams.py",line 296,in _wait
await waiter
asyncio.exceptions.CancelledError
During handling of the above exception,another exception occurred:
Traceback (most recent call last):
File "main.py",line 70,in <module>
main()
File "main.py",line 67,in main
download_images(images,IMAGES_PATH)
File "/home/stivenramireza/storage/main.py",line 34,in download_images
loop.run_until_complete(future)
File "/usr/lib/python3.8/asyncio/base_events.py",line 616,in run_until_complete
return future.result()
File "/home/stivenramireza/storage/main.py",line 28,in download_files
await asyncio.gather(*[asyncio.ensure_future(download_file(session,image['recurso'],path))) for image in images])
File "/home/stivenramireza/storage/main.py",line 20,in download_file
logger.error(f"Error to get {filename} from Re Server")
File "/home/stivenramireza/.local/lib/python3.8/site-packages/async_timeout/__init__.py",line 55,in __aexit__
self._do_exit(exc_type)
File "/home/stivenramireza/.local/lib/python3.8/site-packages/async_timeout/__init__.py",line 92,in _do_exit
raise asyncio.TimeoutError
asyncio.exceptions.TimeoutError
我该怎么办?
谢谢。
解决方法
您的 download_file 函数捕获了超时错误,然后又将其重新引发。您的 download_files 函数使用了 asyncio.gather(),它会在遇到第一个异常时立即退出,并将该异常传播给调用者。可以合理地推测,在下载大量文件时,其中某个文件迟早会超时,而一旦发生这种情况,您的整个程序就会被中断。
我该怎么办?
这取决于您的程序在超时情况下想要执行的操作。例如,您可能想重试该文件,或者您想要放弃。但是,由于单个文件超时,您很可能不想中断整个下载。
虽然在很多情况下,重新引发所捕获的异常是正确的做法,但在这里并非如此。您可以将 download_file 末尾的 raise 改为 return (remote_url, filename),这样 gather() 返回的结果中就会包含下载失败的文件,您可以尝试重新下载它们。