问题描述
我们的目标是从“vivino (dot) com”获取数据。
具体来说,对于每种葡萄酒,我们都需要获取用户评分和评论。不幸的是,我们在处理查询字符串参数、无限滚动以及函数回调时遇到了困难。我们收到以下错误:
Traceback (most recent call last):
File "/Users/utente/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/defer.py",line 654,in _runcallbacks
current.result = callback(current.result,*args,**kw)
File "/Users/utente/opt/anaconda3/lib/python3.8/site-packages/scrapy/spiders/__init__.py",line 90,in _parse
return self.parse(response,**kwargs)
File "/Users/utente/opt/anaconda3/lib/python3.8/site-packages/scrapy/spiders/__init__.py",line 93,in parse
raise NotImplementedError(f'{self.__class__.__name__}.parse callback is not defined')
NotImplementedError: ScrollSpider.parse callback is not defined
请看下面的代码:
import scrapy
import json
class ScrollSpider(scrapy.Spider):
    """Spider from the question: tries to collect ratings and reviews per wine.

    Intended chain of callbacks, as written below:
    ``parse_explore`` -> ``parse_products`` -> ``parse_wine`` -> ``parse_review``.

    NOTE(review): the class defines neither ``parse`` nor ``start_requests``,
    and nothing visible here schedules ``parse_explore``. The response for
    ``start_urls`` is therefore handed to the inherited ``parse``, which raises
    the ``NotImplementedError`` shown in the traceback above.
    """
    name = 'scroll'
    start_urls = ["https://www.vivino.com/IT/en/"]
    # Headers attached to the explore and review requests; they ask for JSON.
    # NOTE(review): "be" in Accept-Encoding looks like a typo for "br" - confirm.
    headers={
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/88.0.4324.150 Safari/537.36 OPR/74.0.3911.107","Accept": "application/json","Content-Type": "application/json","Accept-Encoding": "gzip,deflate,be","Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",}
    # Filter, sort and paging values meant for the explore request.
    query_string = {
        "country_code": "IT","currency_code":"EUR","grape_filter":"varietal","min_rating":"1","order_by":"price","order":"asc","page": 1,"price_range_max":"500","price_range_min":"1","region_ids[]":"394",}
    def parse_explore(self,response): # go to explore page
        """Yield one request for the explore page, handled by ``parse_products``."""
        # NOTE(review): ``body`` receives the ``query_string`` dict unchanged.
        # Scrapy documents ``body`` as str/bytes, and query parameters would
        # normally be encoded into the URL - confirm before relying on this.
        yield scrapy.Request(
            url = "https://www.vivino.com/explore",headers=self.headers,body=self.query_string,callback=self.parse_products,)
    # Build a request for each result of the query
    def parse_products(self,response):
        """Decode the response body as JSON and yield one wine-page request per element."""
        raw_json = response.body
        data = json.loads(raw_json)
        # NOTE(review): if the decoded JSON is an object rather than a list,
        # this loop iterates over its keys - confirm the payload shape.
        for wine in data:
            # Every field below is read from the match at fixed index 2.
            # NOTE(review): presumably each match was meant to be visited - confirm.
            wine_id = wine['explore_vintage']['matches'][2]['vintage']['wine']['id']
            wine_price_id = wine['explore_vintage']['matches'][2]['price']['id']
            wine_year_id = wine['explore_vintage']['matches'][2]['vintage']['year']
            wine_name_id = wine['explore_vintage']['matches'][2]['vintage']['SEO_name']
            yield scrapy.Request(
                f"https://www.vivino.com/IT/en/{wine_name_id}/w/{wine_id}?year={wine_year_id}&price_id={wine_price_id}",callback=self.parse_wine,)
    def parse_wine(self,response):
        """Decode the wine response as JSON and request that wine's reviews."""
        raw_json = response.body
        data = json.loads(raw_json)
        # The wine id and year are taken from the first entry under 'reviews'.
        wine_id = data['reviews'][0]['vintage']['wine']['id']
        wine_year_id = data['reviews'][0]['vintage']['year']
        yield scrapy.Request(
            url = f"https://www.vivino.com/api/wines/{wine_id}/reviews?per_page=1000&page=1&year={wine_year_id}",# 1000 per page
            headers=self.headers,callback=self.parse_review,)
    def parse_review(self,response): # look for product rating and review
        """Yield a ``{"rating", "review"}`` item for each element of the decoded JSON."""
        reviews = json.loads(response.body)
        for review in reviews:
            # Only the first entry under 'reviews' of each element is read.
            yield {
                "rating": review['reviews'][0]['rating'],"review": review['reviews'][0]['note'],}
解决方法
遗憾的是,Vivino 将 `https://www.vivino.com/api/wines/{wine_id}/reviews` 端点上 `per_page` 参数的最大值限制为 50,因此您还需要逐页遍历评论。
您也可以用 `requests` 完成同样的事情,而且(在我看来)它更直接:
import requests
# Walks every page of Vivino's explore API and, for each wine found,
# pages through its reviews until an empty page is returned.

# Instantiate a dictionary of headers
# We only need to `manipulate` an User-Agent key
# NOTE(review): a browser-like value is assumed to be what the server
# expects - confirm, and replace it with the agent you want to present.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
}

# Instantiate a dictionary of query strings
# Defines the only needed payload
payload = {
    "min_rating": 1,
    "order_by": "price",
    "order": "asc",
    "price_range_max": 500,
    "price_range_min": 1,
    "region_ids[]": 394,
}

EXPLORE_URL = 'https://www.vivino.com/api/explore/explore?'
RESULTS_PER_PAGE = 25   # number of explore results returned per page
REVIEWS_PER_PAGE = 50   # the reviews endpoint caps `per_page` at 50
REQUEST_TIMEOUT = 30    # seconds; avoids hanging forever on a stalled request

# Performs an initial request and gathers the amount of results
r = requests.get(EXPLORE_URL, params=payload, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error instead of a confusing KeyError/JSON error below
r.raise_for_status()
n_matches = r.json()['explore_vintage']['records_matched']

# Number of pages = ceil(n_matches / RESULTS_PER_PAGE).
# Rounding up keeps the last, partially filled page (truncating would drop
# it, and would drop everything when there are fewer than 25 results).
n_pages = (n_matches + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE

# Iterates through the amount of possible pages
for i in range(n_pages):
    # Adds the page on the payload
    payload['page'] = i + 1

    print(f'Requesting data from page: {payload["page"]}')

    # Performs the request and saves the matches.
    # The payload must be sent here, otherwise the page number is ignored
    # and the same default page is fetched on every iteration.
    r = requests.get(EXPLORE_URL, params=payload, headers=headers, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    matches = r.json()['explore_vintage']['matches']

    # Iterates through every match
    for match in matches:
        # Defines the wine's identifier
        _id = match['vintage']['wine']['id']

        # Defines a page counter
        page_counter = 1

        # Loops until the endpoint returns an empty page of reviews
        while True:
            print(f'Requesting reviews from wine: {_id} and page: {page_counter}')

            # Performs the request and saves the reviews
            r = requests.get(
                f'https://www.vivino.com/api/wines/{_id}/reviews',
                params={'per_page': REVIEWS_PER_PAGE, 'page': page_counter},
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
            r.raise_for_status()
            reviews = r.json()['reviews']

            print(f'Number of reviews: {len(reviews)}')

            # If there are no reviews anymore,
            # it indicates that the loop can be broken
            if not reviews:
                break

            # Otherwise, increments the counter
            page_counter += 1
很抱歉这个脚本把所有内容都写在了一起,并且有一些重复的定义;我只是想尽快验证它是否可行。
最好的问候, 古斯塔沃。