使用Python和Selenium抓取Tripadvisor上特定酒店所有页面的评论

问题描述

我正在使用Python和Selenium抓取Tripadvisor上某家酒店的所有评论,我在网页抓取方面还是新手。但目前,它只抓取了36页中前6页的评论。我需要抓取该酒店所有页面的评论,并将其保存到csv文件中。以下是我正在使用的代码:

import csv
import time
import requests
import re
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver

# Module-level Chrome WebDriver shared by every function below.
# NOTE(review): the positional argument is presumably the path to the
# chromedriver executable (Selenium 3 style) -- confirm against the installed
# Selenium version.
driver = webdriver.Chrome("./chromedriver")


def check_exists_by_xpath(xpath):
    """Return True if the module-level ``driver`` can find an element for *xpath*.

    The lookup is issued on ``driver`` itself, so the XPath is evaluated
    against the whole current document, not against any particular element.

    Args:
        xpath: XPath expression to look up.

    Returns:
        True when ``driver.find_element_by_xpath`` succeeds, False when it
        raises ``NoSuchElementException``.
    """
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    # The original had a ``time.sleep(2)`` after this return; it could never
    # run, so it has been removed.
    return True


def getHotelReviews():
    """Scrape the reviews on the page shown in the current tab into the CSV.

    Uses the module-level ``driver`` and ``csvWriter``. For each review
    container one ``(date, title, review)`` row is written. When finished, the
    current tab is closed and focus returns to the first window handle.

    Containers that do not have the expected date/title elements, or that
    fail with another WebDriver error, are reported and skipped instead of
    silently ending the whole page.
    """
    # Imported locally so this function does not depend on edits to the
    # file-level import block.
    from selenium.common.exceptions import WebDriverException

    more_xpath = ".//div[contains(@class,'_2f_ruteS _1bona3Pu _2uD5bLZZ')]/div[2]/div/span[1]"
    text_xpath = ".//div[contains(@class,'_2f_ruteS _1bona3Pu')]/div/q/span"

    # Find and click the More link (to load all reviews)
    driver.find_element_by_xpath("//span[@class='_33O9dg0j']").click()
    time.sleep(20)

    reviews = driver.find_elements_by_xpath("//div[@data-test-target='reviews-tab']/div")
    reviews_count = len(reviews)
    print(reviews_count)

    # The first two children are skipped, as in the original loop
    # (range(2, reviews_count)) -- presumably they are not review cards;
    # TODO confirm against the live page markup.
    for container in reviews[2:]:
        try:
            # Look inside *this* container. find_elements returns an empty
            # list instead of raising, so no document-wide pre-check is needed.
            more_links = container.find_elements_by_xpath(more_xpath)
            if more_links:
                more_links[0].click()
                time.sleep(20)

            # Reset per review so a missing body never reuses the previous
            # review's text or leaves the name unbound.
            review = ""
            text_nodes = container.find_elements_by_xpath(text_xpath)
            if text_nodes:
                review = text_nodes[0].text
                print(review)

            date = container.find_element_by_xpath(".//span[contains(@class,'_34Xs-BQm')]").text
            print(date)
            title = container.find_element_by_xpath(".//div[contains(@class,'glasR4aX')]/a/span").text
            print(title)
        except WebDriverException as exc:
            # Best effort: report and move on to the next container.
            print("Skipping review block:", exc)
            continue

        # Save to CSV
        csvWriter.writerow((date, title, review))

    driver.close()
    driver.switch_to.window(driver.window_handles[0])


def getHotelPages(url):
    """Open *url* and append the paginator links found there to ``URLs``.

    Uses the module-level ``driver`` and mutates the module-level ``URLs``
    list. Only the page-number anchors present in the DOM at the time of the
    lookup are collected.

    NOTE(review): the accompanying question reports that only 6 of 36 pages
    get scraped, which suggests the paginator does not render a link for every
    page -- confirm against the live site.

    Args:
        url: Address of the first review page.
    """
    driver.get(url)
    # to maximize the driver
    driver.maximize_window()

    page_links = driver.find_elements_by_xpath("//a[contains(@class,'pageNum cx_brand_refresh_phase2 ')]")
    print(len(page_links))

    for link in page_links:
        # Fetch the attribute once; each call is a round-trip to the browser.
        href = link.get_attribute("href")
        print(href)
        URLs.append(href)


# Seed list of review pages; getHotelPages() appends further page URLs to it.
URLs = [
    'https://www.tripadvisor.com/Hotel_Review-g304141-d3895228-Reviews-The_Hideout_Sigiriya-Sigiriya_Central_Province.html#REVIEWS']

# Prepare CSV file. The context manager guarantees the file is flushed and
# closed even if scraping fails part-way through.
with open("hideoutSigiriyab_reviews1.csv", "w", newline='', encoding="utf-8") as csvFile:
    csvWriter = csv.writer(csvFile)
    csvWriter.writerow(['Date', 'Title', 'Review'])

    try:
        try:
            getHotelPages(URLs[0])
        except Exception as exc:
            # Still best effort (the seed URL is scraped regardless), but the
            # cause is now reported and Ctrl-C is no longer swallowed.
            print("Error!!", exc)

        time.sleep(60)

        for url in URLs:
            # Each page is loaded in a fresh tab; getHotelReviews() closes that
            # tab and switches back to the first window when it is done.
            driver.execute_script("window.open('');")
            driver.switch_to.window(driver.window_handles[1])
            driver.get(url)

            getHotelReviews()
            time.sleep(20)
    finally:
        # quit() ends the whole browser session, so it is not leaked when an
        # error interrupts the loop.
        driver.quit()

您能否建议一种方法或提供可用的代码,帮助我获取该酒店所有页面的评论?

解决方法

逐页点击第1-36页的简单方法:

# Read the total page count from the last anchor inside the paginator.
# NOTE(review): assumes that anchor's text is the highest page number -- confirm.
size = int(driver.find_element_by_css_selector('div.pageNumbers >a:nth-last-child(1)').text)

# Start at 2 (presumably page 1 is the one already displayed -- confirm);
# the upper bound is size + 1 so that the final page is clicked as well
# (range() excludes its stop value).
for i in range(2, size + 1):
    pageNums = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.pageNumbers")))
    # Leading "." keeps the XPath relative to the paginator element; a bare
    # "//" would search the entire document.
    pageNums.find_element_by_xpath(".//a[text()='{}']".format(i)).click()
    time.sleep(5)

所需的导入:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC