问题描述
item.py和其他文件的代码如下所述。最后还提到了日志。我没有收到任何错误,但根据日志,scrapy尚未刮取任何页面。
```
import scrapy
class YelpItem(scrapy.Item):
# define the fields for your item here like:
name = scrapy.Field()
name_url = scrapy.Field()
rating = scrapy.Field()
date = scrapy.Field()
review_text = scrapy.Field()
user_pic = scrapy.Field()
city = scrapy.Field()
is_true = scrapy.Field()
```
settings.py的代码
import pathlib
BOT_NAME = 'yelp-scrapy-dev'
SPIDER_MODULES = ['yelp-scrapy-dev.spiders']
NEWSPIDER_MODULE = 'yelp-scrapy-dev.spiders'
{
pathlib.Path('output1.csv'):{
'format':'csv',},}
ROBOTSTXT_OBEY = False
class YelpPipeline:
def open_spider(self,spider):
self.file = open('output1.csv','w')
def close_spider(self,spider):
self.file.close()
def process_item(self,item,spider):
return item
middlewares.py的代码
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item,ItemAdapter
class YelpSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls,crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened,signal=signals.spider_opened)
return s
def process_spider_input(self,response,spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self,result,spider):
# Called with the results returned from the Spider,after
# it has processed the response.
# Must return an iterable of Request,or item objects.
for i in result:
yield i
def process_spider_exception(self,exception,spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self,start_requests,spider):
# Called with the start requests of the spider,and works
# similarly to the process_spider_output() method,except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self,spider):
spider.logger.info('Spider opened: %s' % spider.name)
class YelpDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls,signal=signals.spider_opened)
return s
def process_request(self,request,spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self,spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self,spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self,spider):
spider.logger.info('Spider opened: %s' % spider.name)
import scrapy
from ..items import YelpItem
# currently will grab the first 100 reviews from the first 10 businesses from start url
class CitySpider(scrapy.Spider):
name = 'city'
start_urls = [
'https://www.yelp.com/search?find_desc=&find_loc=Seattle%2C+WA','https://www.yelp.com/search?find_desc=&find_loc=SanFrancisco%2C+CA','https://www.yelp.com/search?find_desc=&find_loc=NewYork%2C+NY','https://www.yelp.com/search?find_desc=&find_loc=Dallas%2C+TX','https://www.yelp.com/search?find_desc=&find_loc=Atlanta%2C+GA',]
# gets the first 10 businesses from the start url
def parse(self,response):
business_pages = response.css('.text-weight--bold__373c0__1elNz a')
yield from response.follow_all(business_pages,self.parse_business)
# extracts the first 100 reviews from the yelp-scrapy-dev business
def parse_business(self,response):
items = YelpItem()
all_reviews = response.css('.sidebaractionsHoverTarget__373c0__2kfhe')
address = response.request.url.split('?')
src = address[0].split('/')
biz = src[-1].split('-')
loc = biz[-1] if not biz[-1].isdigit() else biz[-2]
if loc == 'seattle':
city = 'Seattle,WA'
elif loc == 'dallas':
city = 'Dallas,TX'
elif loc == 'francisco':
city = 'San Francisco,CA'
elif loc == 'york':
city = 'New York,NY'
elif loc == 'atlanta':
city = 'Atlanta,GA'
else:
city = 'outofrange'
for review in all_reviews:
name = review.css('.link-size--inherit__373c0__1VFlE::text').extract_first()
name_url = review.css('.link-size--inherit__373c0__1VFlE::attr(href)').extract_first().split('=')
rating = review.css('.overflow--hidden__373c0__2y4YK::attr(aria-label)').extract()
date = review.css('.arrange-unit-fill__373c0__3Sfw1 .text-color--mid__373c0__jCeOG::text').extract()
review_text = review.css('.raw__373c0__3rKqk::text').extract()
user_pic = review.css('.gutter-1__373c0__2l5bx .photo-Box-img__373c0__35y5v::attr(src)').extract()
if city != 'outofrange':
# making sure data is stored as a str
items['name'] = name
items['name_url'] = name_url[1]
items['rating'] = rating[0]
items['date'] = date[0]
items['review_text'] = review_text[0]
items['user_pic'] = user_pic[0] != 'https://s3-media0.fl.yelpcdn.com/assets/srv0/yelp_styleguide/514f6997a318/assets/img/default_avatars/user_60_square.png'
items['city'] = city
items['is_true'] = True
yield items
source = response.request.url
# prevent duplicate secondary pages from being recrawled
if '?start=' not in source:
# gets 20th-100th reviews,pages are every 20 reviews
for i in range(1,5):
next_page = source + '?start=' + str(i*20)
yield response.follow(next_page,callback=self.parse_business)
下面是日志行。
(venv) C:\Users\somar\yelp-scrapy\yelp>scrapy crawl city
2020-10-09 22:34:53 [scrapy.utils.log] INFO: Scrapy 2.1.0 started (bot: yelp-scrapy-dev)
2020-10-09 22:34:53 [scrapy.utils.log] INFO: Versions: lxml 4.5.2.0,libxml2 2.9.5,cssselect 1.1.0,parsel 1.6.0,w3lib 1.22.0,Twisted 20.3.0,Python 3.7
.6 (default,Jan 8 2020,20:23:39) [MSC v.1916 64 bit (AMD64)],pyOpenSSL 19.1.0 (OpenSSL 1.1.1h 22 Sep 2020),cryptography 3.1.1,Platform Windows-10-10
.0.18362-SP0
2020-10-09 22:34:53 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-10-09 22:34:53 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'yelp-scrapy-dev','NEWSPIDER_MODULE': 'yelp-scrapy-dev.spiders','SPIDER_MODULES': ['yelp-scrapy-dev.spiders']}
2020-10-09 22:34:53 [scrapy.extensions.telnet] INFO: Telnet Password: 1f95c571b9245c42
2020-10-09 22:34:53 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats','scrapy.extensions.telnet.TelnetConsole','scrapy.extensions.logstats.LogStats']
2020-10-09 22:34:54 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware','scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware','scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware','scrapy.downloadermiddlewares.useragent.UserAgentMiddleware','scrapy.downloadermiddlewares.retry.RetryMiddleware','scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware','scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware','scrapy.downloadermiddlewares.redirect.RedirectMiddleware','scrapy.downloadermiddlewares.cookies.CookiesMiddleware','scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware','scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-10-09 22:34:54 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware','scrapy.spidermiddlewares.offsite.OffsiteMiddleware','scrapy.spidermiddlewares.referer.RefererMiddleware','scrapy.spidermiddlewares.urllength.UrlLengthMiddleware','scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-10-09 22:34:54 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2020-10-09 22:34:54 [scrapy.core.engine] INFO: Spider opened
2020-10-09 22:34:54 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min),scraped 0 items (at 0 items/min)
2020-10-09 22:34:54 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-10-09 22:34:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.yelp.com/search?find_desc=&find_loc=Dallas%2C+TX> (referer: None)
2020-10-09 22:34:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.yelp.com/search?find_desc=&find_loc=Atlanta%2C+GA> (referer: None)
2020-10-09 22:34:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.yelp.com/search?find_desc=&find_loc=Seattle%2C+WA> (referer: None)
2020-10-09 22:34:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.yelp.com/search?find_desc=&find_loc=NewYork%2C+NY> (referer: None)
2020-10-09 22:34:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.yelp.com/search?find_desc=&find_loc=SanFrancisco%2C+CA> (referer: None)
2020-10-09 22:34:56 [scrapy.core.engine] INFO: Closing spider (finished)
2020-10-09 22:34:56 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1264,'downloader/request_count': 5,'downloader/request_method_count/GET': 5,'downloader/response_bytes': 278234,'downloader/response_count': 5,'downloader/response_status_count/200': 5,'elapsed_time_seconds': 2.159687,'finish_reason': 'finished','finish_time': datetime.datetime(2020,10,5,34,56,173193),'log_count/DEBUG': 5,'log_count/INFO': 10,'response_received_count': 5,'scheduler/dequeued': 5,'scheduler/dequeued/memory': 5,'scheduler/enqueued': 5,'scheduler/enqueued/memory': 5,'start_time': datetime.datetime(2020,54,13506)}
2020-10-09 22:34:56 [scrapy.core.engine] INFO: Spider closed (finished)
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)