使用 scrapy-selenium 模块借助 Selenium 从多个 JavaScript 页面中抓取数据

问题描述

各位现代世界的英雄,大家好:

我目前正在抓取这个基于 JS 的网页 https://golden.com/list-of-cryptocurrency-companies/ ,下面是我到目前为止已经实现的代码:

import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException


class ScrapperSpider(scrapy.Spider):
    """Scrape the golden.com cryptocurrency company list with Selenium.

    The listing is rendered by JavaScript, so every page is loaded in a
    Selenium-driven Chrome instance and the rendered HTML is then parsed
    with a Scrapy ``Selector``.
    """

    name = 'scrapper'
    allowed_domains = ['golden.com']
    start_urls = ['https://golden.com/list-of-cryptocurrency-companies/']
    current_page = 1

    # Output keys for the five industry slots, in positional order.
    # 'indsutry_5' (sic) is kept as-is so the exported field names stay
    # identical to what this spider has always produced.
    _INDUSTRY_KEYS = (
        'industry_1',
        'industry_2',
        'industry_3',
        'industry_4',
        'indsutry_5',
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the Chrome driver.

        Positional and keyword arguments are forwarded to
        ``scrapy.Spider.__init__`` so spider arguments keep working.
        """
        super().__init__(*args, **kwargs)
        chrome_path = which('chromedriver')
        # NOTE(review): ``executable_path`` is deprecated in Selenium 4 and
        # no longer accepted by recent 4.x releases; confirm the installed
        # Selenium version before relying on it.
        self.driver = webdriver.Chrome(executable_path=chrome_path)

    def parse(self, response):
        """Visit every listing page in the browser and yield its rows.

        ``response`` is not read; each page is fetched through Selenium.
        The browser is always shut down when this generator finishes,
        including when a page raises.
        """
        import time  # local import: ``time`` is not imported at file level

        driver = self.driver
        number_of_pages = 27
        url = 'https://golden.com/list-of-cryptocurrency-companies/'

        try:
            # The window size does not depend on the page, so set it once.
            driver.set_window_size(1920, 1080)

            for page in range(1, number_of_pages + 1):
                driver.get(url + str(page))

                # Explicit waits only: the page-size option and the table
                # body are both waited for, so no implicit wait is set.
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located(
                        (By.XPATH,
                         "//select[contains(@class,'PageSize')]/option[3]")
                    )
                ).click()

                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, "NewTable__body")
                    )
                )

                driver.execute_script(
                    "window.scrollTo(0,document.body.scrollHeight);"
                )
                # Fixed pause after scrolling, before the page source is read.
                time.sleep(5)

                self.html = driver.page_source
                yield from self._parse_rows(self.html)
        finally:
            # quit() closes every window and ends the driver session.
            driver.quit()

    def _parse_rows(self, html):
        """Yield one item dict per industry container of each table row."""
        rows = Selector(text=html).xpath("//div[@class='NewTable__body']/div")
        for currency in rows:
            exchange_name = currency.xpath(
                './/div[1]/div/div/div/span/a/span/text()'
            ).get()
            website = currency.xpath(
                ".//div[3]/div/div/div/div/span/a/@href"
            ).get()
            # Same for every industry container of this row.
            location = currency.xpath(
                ".//div[5]/div/div/div/div/div/span/a/span/text()"
            ).get()

            for industry in currency.xpath(".//div[4]/div/div/div/div"):
                item = {'ex_name': exchange_name, 'url': website}
                for position, key in enumerate(self._INDUSTRY_KEYS, start=1):
                    item[key] = industry.xpath(
                        ".//div[%d]/span/a/span/text()" % position
                    ).get()
                item['location'] = location
                yield item

我的主要问题是:网页从 https://golden.com/list-of-cryptocurrency-companies/ 跳转到 https://golden.com/list-of-cryptocurrency-companies/2 之后,立刻又回到了原来的页面,结果没有从其他页面抓取到任何内容。我已经为此折腾了整整一个星期,却怎么也弄不明白到底发生了什么。

如果有人能在这里帮帮我,我将非常感激,因为我真的被难住了。

解决方法

下面是一段示例代码,演示如何等待 URL 变为指定的地址。这段代码会从每个页面上抓取公司名称。

# Print the text of every topic link in the query results of each listing
# page, waiting for the browser to report the requested URL before reading.
number_of_pages = 27
base_url = 'https://golden.com/list-of-cryptocurrency-companies/'

try:
    for page in range(1, number_of_pages + 1):
        url = base_url + str(page)
        driver.get(url)
        # Wait up to 10 seconds until the current URL is exactly `url`.
        WebDriverWait(driver, 10).until(EC.url_to_be(url))
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath,
        # which is not available in current Selenium 4 releases.
        companies = driver.find_elements(
            By.XPATH,
            "//div[@class='QueryResults']//span[@class='TopicLink__text']",
        )
        print("Printing from page#", page)
        for company in companies:
            print(company.text)
finally:
    # Always release the browser, even if a page fails to load;
    # quit() closes every window and ends the driver session.
    driver.quit()

以下是输出:

Printing from page# 1
Temtum
CRYPTOCURRENCY
BLOCKCHAIN
Tortola
National Digital Asset Exchange Inc. (NDAX)
CRYPTOCURRENCY
...
Printing from page# 2
Dentacoin
CRYPTOCURRENCY
BLOCKCHAIN
HEALTHCARE
Netherlands
Waves Platform
...

相关问答

依赖报错 idea导入项目后依赖报错,解决方案:https://blog....
错误1:代码生成器依赖和mybatis依赖冲突 启动项目时报错如下...
错误1:gradle项目控制台输出为乱码 # 解决方案:https://bl...
错误还原:在查询的过程中,传入的workType为0时,该条件不起...
报错如下,gcc版本太低 ^ server.c:5346:31: 错误:‘struct...