使用 Selenium 在 R 中抓取网页以单击新页面

问题描述

我正在尝试遍历这个动态网站 (https://es.gofundme.com/s?q=covid) 搜索结果中的不同页面,我的目的是进入每一个项目。每页有 12 个项目。

enter image description here

一旦进入这些项目中的每一个并获取了所需的信息,我希望脚本继续前往下一页。也就是说,一旦获取了第 1 页的 12 个项目,就要接着获取第 2 页的 12 个项目,依此类推。

enter image description here

我该怎么做?非常感谢你的帮助!

这是我的代码

#Loading the rvest package
library(rvest)
library(magrittr) # for the '%>%' pipe symbols
library(RSelenium) # to get the loaded html of 
library(purrr) # for 'map_chr' to get reply 
library(tidyr) #extract_numeric(years)
library(stringr)

# Empty accumulator: one row per scraped project, every column character.
df_0 <- data.frame(
  project   = character(),
  name      = character(),
  location  = character(),
  dates     = character(),
  objective = character(),
  collected = character(),
  donor     = character(),
  shares    = character(),
  follow    = character(),
  comments  = character(),
  category  = character()
)

# URL of a single campaign page to scrape (the starting point of the example).
url <- 'https://es.gofundme.com/f/ayuda-a-ta-josefina-snchez-por-covid-en-pulmn?qid=00dc4567cb859c97b9c3cefd893e1ed9&utm_campaign=p_cp_url&utm_medium=os&utm_source=customer'

# Start a local Selenium server via wdman (retcommand = TRUE returns the shell
# command instead of launching it), then run that command detached.
# NOTE(review): shell() is Windows-only — on macOS/Linux use system() instead.
# NOTE(review): port 4567L below must match the port the Selenium server was
# actually started on — confirm against the wdman/selenium output.
selCommand <- wdman::selenium(jvmargs = c("-Dwebdriver.chrome.verboseLogging=true"),retcommand = TRUE)
shell(selCommand,wait = FALSE,minimized = TRUE)
remDr <- remoteDriver(port = 4567L,browserName = "firefox")
remDr$open()

  
# RSelenium is already attached via library(RSelenium) above; the redundant
# require(RSelenium) call was removed (require() should not be used for
# loading dependencies anyway).

# Go to the campaign page in the remote browser.
remDr$navigate(url)

# Get the rendered page source and parse it with rvest.
html_obj <- remDr$getPageSource(header = TRUE)[[1]] %>% read_html()

# 1) Project name
project <- html_obj %>% html_nodes(".a-campaign-title") %>% html_text()

# 2) Organizer name (raw person-info text; cleaned below)
info <- html_obj %>% html_nodes(".m-person-info") %>% html_text()

# 3) Location
location <- html_obj %>% html_nodes(".m-person-info-content") %>% html_text()

# 4) Creation date
dates <- html_obj %>% html_nodes(".a-created-date") %>% html_text()

# 5) Money: collected amount and objective share one heading node
money <- html_obj %>% html_nodes(".m-progress-meter-heading") %>% html_text()

# 6) Donors, shares and followers
popularity <- html_obj %>% html_nodes(".text-stat-value") %>% html_text()

# 7) Comments (count is embedded in the wrapper text)
comments <- html_obj %>% html_nodes(".o-expansion-list-wrapper") %>% html_text()

# 8) Category
category <- html_obj %>% html_nodes(".a-link") %>% html_text()

# Tokenize the money heading once instead of splitting it twice.
money_parts <- unlist(strsplit(money, " "))

# Assemble one row per campaign.
# NOTE(review): the hard-coded indices (info[7], location[7], category[17],
# money_parts[8]) depend on the page's exact node order and will silently
# pick the wrong element if the layout changes — verify against a live page.
review_data <- data.frame(
  project   = project,
  # Fixed stray backslash: "\\Organizador.*" sent the invalid escape \O to the
  # regex engine; the intended pattern is simply "Organizador.*".
  name      = gsub("Organizador.*", "", info[7]),
  location  = str_remove(location[7], "Organizador"),
  dates     = dates,
  collected = money_parts[1],
  objective = money_parts[8],
  donor     = popularity[1],
  shares    = popularity[2],
  follow    = popularity[3],
  # Base-R replacement for the deprecated tidyr::extract_numeric(): strip
  # everything that is not a digit, dot or minus sign, then coerce.
  comments  = as.numeric(gsub("[^0-9.-]+", "", comments)),
  category  = category[17],
  stringsAsFactors = FALSE  # spelled out instead of the reassignable F
)

解决方法

页面执行您可以模仿/简化的 POST 请求。为了保持动态,您需要首先从源 js 文件中获取 api 密钥和应用程序 ID,然后在随后的 POST 请求中传递它们。

在下面,我只是从每个请求中提取 url。我将 POST 的查询字符串设置为每页最多 20 个结果。在我检索页数的初始请求之后,然后我在页码之间映射一个函数,从每个 POST 响应中提取 url;改变 page 参数。

您最终会得到所有项目的 url 列表,然后您可以访问以从中提取信息;或者,可能会发出 xmlhttp 请求。

注意代码可以稍微重构一下。

library(httr)
library(stringr)
library(purrr)
library(tidyverse)

get_df <- function(x){
  # Flatten a list of Algolia hit records into a one-column tibble of
  # unique project urls, prefixed with the gofundme campaign base path.
  hits <- map_dfr(x, .f = as_tibble)
  unique_urls <- unique(select(hits, c('url')))
  mutate(unique_urls, url = paste0('https://es.gofundme.com/f/', url))
}

# Download the site's bundled js file as text; it embeds the Algolia client
# credentials that the search page uses for its own POST requests.
r <- httr::GET('https://es.gofundme.com/static/js/main~4f8b914b.bfe3a91b38d67631e0fa.js') %>% content(as='text')

# Capture the two constructor arguments of algoliaClient:
# group 1 = application id, group 2 = api key.
# NOTE(review): the bundle filename above is build-specific and the regex
# depends on the minifier's output — both may break when the site redeploys.
matches <- stringr::str_match_all(r,'t\\.algoliaClient=r\\.default\\("(.*?)","(.*?)"')

application_id <- matches[[1]][,2]
api_key <-matches[[1]][,3]

# HTTP headers shared by every Algolia POST request below.
headers <- c(
  'User-Agent' = 'Mozilla/5.0',
  'content-type' = 'application/x-www-form-urlencoded',
  'Referer' = 'https://es.gofundme.com/'
)

# Query-string parameters carrying the credentials scraped from the js bundle.
params <- list(
  'x-algolia-agent' = 'Algolia for JavaScript (4.7.0); Browser (lite); JS Helper (3.2.2); react (16.12.0); react-instantsearch (6.8.2)',
  'x-algolia-api-key' = api_key,
  'x-algolia-application-id' = application_id
)
# JSON body template for the Algolia multi-query endpoint; the page number is
# appended between this prefix and the closing '"}]}' suffix.
post_body <- '{"requests":[{"indexName":"prod_funds_feed_replica_1","params":"filters=status%3D1%20AND%20custom_complete%3D1&exactOnSingleWordQuery=word&query=covid&hitsPerPage=20&attributesToRetrieve=%5B%22fundname%22%2C%22username%22%2C%22bene_name%22%2C%22objectID%22%2C%22thumb_img_url%22%2C%22url%22%5D&clickAnalytics=true&userToken=00-e940a6572f1b47a7b2338b563aa09b9f-6841178f&page='

# First request fetches page 0 and also tells us the total page count.
page_num <- 0
data <- paste0(post_body, page_num, '"}]}')
res <- httr::POST(
  url = 'https://e7phe9bb38-dsn.algolia.net/1/indexes/*/queries',
  httr::add_headers(.headers = headers),
  query = params,
  body = data
) %>% content()
num_pages <- res$results[[1]]$nbPages
df <- get_df(res$results[[1]]$hits)

# Remaining pages are 1 .. num_pages-1 (page 0 is already in df).
# BUG FIX: c(1:num_pages-1) parses as (1:num_pages) - 1, i.e. 0:(num_pages-1),
# which re-fetched page 0 and duplicated its rows after rbind. seq_len() also
# handles the single-page case (num_pages == 1) by yielding an empty vector.
pages <- seq_len(num_pages - 1)

# Fetch every remaining page and row-bind the extracted url tibbles.
df2 <- map_dfr(pages, function(page_num){
  # BUG FIX: the page number was missing from the body (paste0(post_body,'"}]}')),
  # so every iteration sent the same request with an empty "page=" value.
  data <- paste0(post_body, page_num, '"}]}')
  # BUG FIX: the Algolia credentials (headers + query params) used by the first
  # request were dropped inside the loop; without them the POST is rejected.
  res <- httr::POST(
    'https://e7phe9bb38-dsn.algolia.net/1/indexes/*/queries',
    httr::add_headers(.headers = headers),
    query = params,
    body = data
  ) %>% content()
  get_df(res$results[[1]]$hits)
})

# Combine page 0 with all subsequent pages.
df <- rbind(df, df2)