问题描述
这里的代码很多,先简要说明:我抓取该站点评分最高的餐馆(例如排名前10),并通过 yield Request(url=item['detail_url'])
成功进入这10家餐馆的详细信息页面。每个详细信息页面都有一个侧边栏导航,点击每个导航项后,右侧会显示对应的详细内容。
我用谷歌搜索并提到我必须使用Selenium,看来点击确实有效,但是我不确定如何才能正确地获得新内容。
我做了一些关于使用 self.driver.page_source
的研究,但仍然不确定如何像 response.css('selector') 那样从它里面提取内容。
下面的代码
import time

import scrapy
from scrapy import Request
from scrapy.selector import Selector
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options

from scrape.items.pj_fx110 import PjFx110Item
class PjFx110Spider(scrapy.Spider):
    """Scrape top-rated restaurant detail pages.

    Scrapy fetches the list and detail pages; a single shared Selenium
    driver is used only for the sidebar clicks that load extra content
    via JavaScript.  Because the driver is shared, it must be navigated
    *inside* the detail callback (synchronously with the response being
    parsed), never from the listing loop — otherwise Scrapy's async
    scheduling mixes one restaurant's clicked content with another's
    response.
    """
    name = "pj_fx110"
    # Restrict this spider to its own pipeline only.
    custom_settings = {
        'ITEM_PIPELINES': {
            'scrape.pipelines.pj_fx110.PjJsonWriterPipeline': 300,
        }
    }
    ROOT_URL = 'https://site'
    start_urls = [
        ROOT_URL
    ]

    def __init__(self, *args, **kwargs):
        # BUG FIX: forward constructor args to scrapy.Spider — the
        # original override dropped them, breaking `scrapy crawl -a ...`
        # and the base class's name/start_urls handling.
        super().__init__(*args, **kwargs)
        options = Options()
        options.add_argument("--headless")
        self.driver = webdriver.Chrome('./chromedriver', options=options)

    def closed(self, reason):
        # BUG FIX: quit the browser when the spider finishes; the
        # original leaked the chromedriver process.
        self.driver.quit()

    def parse(self, response):
        """Yield one detail-page request per restaurant on the listing."""
        for rest in response.css('.rest'):
            item = PjFx110Item()
            item['detail_url'] = f'{self.ROOT_URL}{rest.css(".more::attr(href)").get().strip()}'
            scraped_data = {
                'detail_url': item['detail_url'],
            }
            # BUG FIX: do NOT call self.driver.get() here.  Requests are
            # scheduled asynchronously, so navigating the shared driver
            # inside this loop races ahead of the callbacks — that is
            # why restaurant #2 ended up with restaurant #4's content.
            yield Request(
                url=item['detail_url'],
                callback=self.parse_detail_link,
                cb_kwargs={'main_page_data': scraped_data},
            )

    def parse_detail_link(self, response, main_page_data):
        """Parse one detail page and emit the combined item."""
        # Navigate the shared driver here, synchronously with this
        # callback, so the browser is on the same page as `response`
        # when get_main_contents() clicks the sidebar.
        self.driver.get(response.url)
        main_content = response.css('.main_content')
        yield {
            'detail_url': main_page_data['detail_url'],
            'main': self.get_main_contents(main_content),
        }

    def main_left_content(self, main_left, id):
        """Return heading/score for the sidebar entry `li[data-id=id]`.

        `contents` is filled in later by the caller.
        """
        content = main_left.css(f'li[data-id="{id}"]')
        return {
            'heading': content.css('.span_left::text').get(),
            'score': content.css('.span_right em::text').getall(),
            'contents': None,
        }

    def main_right_content(self, detail_txt):
        """Collect, per <p>: the image src, else the text, else the span
        text — dropping empty results."""
        result = []
        for p in detail_txt.css('p'):
            has_image = p.css('img::attr(src)').get()
            has_text = p.css('p::text').get()
            has_span = p.css('p span::text').get()
            if has_image:
                result.append(has_image)
            elif has_text:
                result.append(has_text)
            else:
                result.append(has_span)
        return list(filter(None, result))

    def get_main_contents(self, main_content):
        """Build the ranking/score/established_growth sections, clicking
        the sidebar via Selenium to reveal the third section."""
        main_left = main_content.css('.main_left')
        main_right = main_content.css('.main_right')
        detail_text = main_right.css('.detail_txt')
        ranking = self.main_left_content(main_left, 7)
        score = self.main_left_content(main_left, 8)
        ranking['contents'] = self.main_right_content(detail_text[0])
        score['contents'] = self.main_right_content(detail_text[1])
        # Click the sidebar heading that loads the next panel via JS.
        next_li = self.driver.find_element_by_xpath('//*[@id="roll"]/h2[2]')
        next_li.click()
        # Give the JS time to render the new panel.  (A WebDriverWait on
        # the panel element would be more robust than a fixed sleep.)
        time.sleep(1)
        # BUG FIX: page_source is a plain string and has no .css();
        # wrap it in a scrapy Selector so the same CSS selectors work
        # on the refreshed DOM.
        new_main_content = Selector(text=self.driver.page_source).css('.main_content')
        new_main_right = new_main_content.css('.main_right')
        new_detail_text = new_main_right.css('.detail_txt')
        established_growth = self.main_left_content(main_left, 9)
        established_growth['contents'] = self.main_right_content(new_detail_text[0])
        return {
            'ranking': ranking,
            'score': score,
            'established_growth': established_growth,
        }
编辑
发现我必须做些类似的事情来解决上面的问题,并且需要睡眠,因此单击后便有时间刷新BUT
def get_main_contents(self, main_content):
    """Sketch of the fix: after the sidebar click, wait for the JS to
    render, then re-read the refreshed panel through the live driver
    instead of the stale Scrapy response."""
    # other codes
    # Give the page time to render the newly-loaded panel.  (Polling
    # with WebDriverWait would be more robust than a fixed sleep.)
    time.sleep(1)
    # Re-query the refreshed DOM through Selenium, not the old response.
    new_detail_text = self.driver.find_elements_by_class_name('detail_txt')
    established_growth = self.main_left_content(main_left, 9)
    # BUG FIX: the original split this assignment across two lines with
    # no continuation character, which is a SyntaxError in Python.
    established_growth['contents'] = self.main_right_content_selenium(new_detail_text[0])
    # other codes
def main_right_content_selenium(self, detail_txt):
    """Selenium counterpart of main_right_content(): walk every <p>
    inside a live .detail_txt element and collect, in order of
    preference, the image URL, the paragraph text, or the span text.

    Returns the collected values with falsy entries filtered out.
    """
    result = []
    for p in detail_txt.find_elements_by_tag_name('p'):
        # BUG FIX: catch the specific NoSuchElementException that
        # find_element_* raises for a missing child; the original
        # blanket `except Exception` also swallowed unrelated
        # WebDriver failures.
        try:
            has_image = p.find_element_by_css_selector('img').get_attribute('src')
        except NoSuchElementException:
            has_image = None
        # .text on an element does not raise NoSuchElementException,
        # so no try is needed here (a stale element would be a real
        # error worth surfacing, not something to silence).
        has_text = p.text
        try:
            has_span = p.find_element_by_tag_name('span').text
        except NoSuchElementException:
            has_span = None
        if has_image:
            result.append(has_image)
        elif has_text:
            result.append(has_text)
        else:
            result.append(has_span)
    return list(filter(None, result))
即使这样有助于获取内容,内容本身还是有问题。如果我只抓取一个 detail_url
,那么内容是正确的;但当我循环抓取多个时,所有内容就混在一起了。比如第一家餐厅的内容是正确的,但点击后第二家餐厅拿到的内容实际上是第四家餐厅的。
我相信这是由餐厅循环与点击后的页面刷新延迟共同引起的。我尝试在 self.driver.get(item['detail_url'])
之前加上 sleep(5)
,或者使用 self.driver.implicitly_wait(6)
,但运行几分钟后会出现连接错误。
任何人都知道如何解决此问题?
在此先感谢您的帮助和建议
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)