Problem description
My spider crawls thousands of homepages. The problem is that it recently started raising errors like:
2020-09-05 21:02:17 [scrapy.downloadermiddlewares.retry] ERROR: Gave up retrying <GET https://www.xxx.co.uk> (Failed 4 times): Couldn't bind: 24: Too many open files.
twisted.internet.error.ConnectBindError: Couldn't bind: 24: Too many open files.
along with many DNS and timeout errors.
As I understand it, Twisted/Scrapy uses sockets (which count as open files), so it is probably keeping too many sockets open.
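To test that theory, here is a minimal diagnostic sketch I could drop into the project (my own addition, Linux-only): counting the entries in /proc/self/fd, which holds one entry per open descriptor, sockets included:

import os

def open_fd_count() -> int:
    # each entry in /proc/self/fd is one open file descriptor
    # (regular files, sockets, pipes), so this approximates current FD usage
    return len(os.listdir('/proc/self/fd'))

Logging this value periodically, e.g. from parse(), would show whether descriptors keep climbing toward the limit instead of being released.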
So I found a "solution": ulimit -n 1000000
But it did not help at all.
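One caveat I know of: ulimit -n only changes the limit for the shell it runs in and for processes started from that shell, so if the spider is launched some other way (cron, systemd, a supervisor), the bigger limit may never reach it. As a sanity check (my own sketch, using the standard resource module), the limit can be inspected and raised from inside the process itself:

import resource

# current per-process limit on open file descriptors
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f'RLIMIT_NOFILE: soft={soft}, hard={hard}')

# an unprivileged process may raise its soft limit up to the hard limit
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))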
Edit:
As you can see, I already have CONCURRENT_REQUESTS = 20 set, so this makes absolutely no sense...
Server: Ubuntu 18.04, 4 GB RAM
I would appreciate any advice, or at least a pointer to where a solution might be found. I do not know how to fix this, and it does not add up - here are my settings:
CONCURRENT_REQUESTS = 20
SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
CONCURRENT_REQUESTS_PER_DOMAIN = 4
DEPTH_PRIORITY = 1
Some of these are overridden by custom_settings:
# imports restored for the snippet; utils, WebSite, WebSiteProfile and
# BaseSpiderMixin are project-local names from my codebase
import scrapy
from django.utils.timezone import now  # assumption: Django's now(), given the ORM-style models
from scrapy.http import Response
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TCPTimedOutError, TimeoutError

class ProfilesSpider(BaseSpiderMixin, scrapy.Spider):
    name = 'myspider'
    headers = {
        ...
    }
    custom_settings = {
        'CONCURRENT_REQUESTS': 20,
        'LOG_FILE': 'profiles_spider.log',
        'DOWNLOAD_TIMEOUT': 80,
        'DNS_TIMEOUT': 80,
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
        'RETRY_TIMES': 3,
        'USER_AGENT': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    }

    def start_requests(self):
        self._lock()
        self.load_websites()
        self.buffer = []
        for website in self.websites:
            try:
                yield scrapy.Request(
                    website.url,
                    self.parse,
                    meta={'website': website},
                    headers=self.headers,
                )
            except ValueError:
                continue

    def parse(self, response: Response):
        meta = response.meta
        website = meta['website']
        title = utils.parsing.extract_title(response)
        meta_tags = utils.meta_tags.extract_meta_tags(response)
        social = utils.social_extractors.extract_social_links(response)
        logos, regex_logos = utils.website_profiles.extract_logos(response, include_regex=True)
        first_logo = logos[0] if logos else None
        first_regex_logo = regex_logos[0] if regex_logos else None
        msapp_image = utils.meta_tags.search_meta_content(meta_tags, 'name', 'msapplication-TileImage')
        profile: WebSiteProfile = website.get_or_create_profile()
        website.title = title
        website.meta_tags = meta_tags
        profile.profile_name = utils.meta_tags.search_meta_content(meta_tags, 'property', 'og:site_name') or website.title
        profile.description = utils.meta_tags.search_meta_content(meta_tags, 'og:description') or utils.meta_tags.search_meta_content(meta_tags, 'description')
        meta_og_image = utils.meta_tags.search_meta_content(meta_tags, 'og:image')
        profile.profile_image = meta_og_image or first_logo or msapp_image or first_regex_logo
        website.profile_scraped_at = now()
        website.save()
        profile.save()

    def error(self, failure):
        # errback: log all failures on the website record
        meta = failure.request.meta
        website = meta['website']
        if failure.check(HttpError):
            # these exceptions come from the HttpError spider middleware;
            # the non-200 response is available here
            response = failure.value.response
            website.set_response_code(response.status, save=False)
        elif failure.check(DNSLookupError):
            website.set_response_code(WebSite.RESPONSE_CODE__DNS_LOOKUP_ERROR, save=False)
        elif failure.check(TimeoutError, TCPTimedOutError):
            website.set_response_code(WebSite.RESPONSE_CODE__TIMEOUT, save=False)
        else:
            website.set_response_code(WebSite.RESPONSE_CODE__UNKNOWN, save=False)
        website.profile_scraped_at = now()
        website.save()
Here is the BaseSpiderMixin:
import os

class BaseSpiderMixin():
    # simple file-based lock so only one spider instance runs at a time
    lock_filepath = '/tmp/spiderlock.lock'
    name = 'base_spider_mixin'

    @classmethod
    def _unlock(cls):
        if cls._is_locked():
            os.remove(cls.lock_filepath)

    def _lock(self):
        locked_spider = self._is_locked()
        if locked_spider:
            raise SpiderAlreadyLockedException(locked_spider)
        with open(self.lock_filepath, 'w') as f:
            f.write(self.name)

    @classmethod
    def _is_locked(cls):
        # returns the name of the spider holding the lock, or None
        if os.path.exists(cls.lock_filepath):
            with open(cls.lock_filepath, 'r') as f:
                spider_name = f.read().replace(os.linesep, '').strip()
            return spider_name

    def load_websites(self):
        by = getattr(self, 'by')
        val = getattr(self, 'val')
        self.websites_ids = None
        self.topic = None
        if by == 'ids':
            self.websites_ids = val.split('.')
            self.websites = WebSite.objects.filter(id__in=self.websites_ids)
        elif by == 'topic':
            self.websites = WebSite.objects.filter(topic__code=val)

    def close(self, spider, reason):
        self._unlock()
        super().close(spider, reason)
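Since some of the project settings above are overridden by custom_settings, here is a small sketch (my own addition, using Scrapy's standard self.settings API) to log the values that are actually in effect, e.g. at the top of start_requests():

# self.settings holds the merged project + custom_settings values
self.logger.info(
    'effective CONCURRENT_REQUESTS=%s, CONCURRENT_REQUESTS_PER_DOMAIN=%s',
    self.settings.getint('CONCURRENT_REQUESTS'),
    self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN'),
)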
Solution
No working solution for this problem has been found yet.