使用 Selenium 在 R 中抓取网页以单击新页面

问题描述

我正在尝试遍历这个动态网站 (https://es.gofundme.com/s?q=covid) 搜索结果中的不同页面,我的目的是进入每一个项目。每页有 12 个项目。

enter image description here

一旦进入这些项目中的每一个并获取了所需的信息,我希望脚本继续前往下一页。也就是说,一旦获取了第 1 页的 12 个项目,就要接着获取第 2 页的 12 个项目,依此类推。

enter image description here

我该怎么做?非常感谢你的帮助!

这是我的代码

#Loading the rvest package
library(rvest)
library(magrittr) # for the '%>%' pipe symbols
library(RSelenium) # to get the loaded html of 
library(purrr) # for 'map_chr' to get reply 
library(tidyr) #extract_numeric(years)
library(stringr)

# Empty accumulator: one row per scraped project, every column character.
df_0 <- data.frame(
  project   = character(),
  name      = character(),
  location  = character(),
  dates     = character(),
  objective = character(),
  collected = character(),
  donor     = character(),
  shares    = character(),
  follow    = character(),
  comments  = character(),
  category  = character()
)

# URL of a single campaign page to scrape (the starting point of the example).
url <- 'https://es.gofundme.com/f/ayuda-a-ta-josefina-snchez-por-covid-en-pulmn?qid=00dc4567cb859c97b9c3cefd893e1ed9&utm_campaign=p_cp_url&utm_medium=os&utm_source=customer'

# Start a local Selenium server via wdman (retcommand = TRUE returns the shell
# command instead of launching it), then run that command detached.
# NOTE(review): shell() is Windows-only — on macOS/Linux use system() instead.
# NOTE(review): port 4567L below must match the port the Selenium server was
# actually started on — confirm against the wdman/selenium output.
selCommand <- wdman::selenium(jvmargs = c("-Dwebdriver.chrome.verboseLogging=true"),retcommand = TRUE)
shell(selCommand,wait = FALSE,minimized = TRUE)
remDr <- remoteDriver(port = 4567L,browserName = "firefox")
remDr$open()

  
# RSelenium is already attached via library(RSelenium) above; the redundant
# require(RSelenium) call was removed (require() should not be used for
# loading dependencies anyway).

# Go to the campaign page in the remote browser.
remDr$navigate(url)

# Get the rendered page source and parse it with rvest.
html_obj <- remDr$getPageSource(header = TRUE)[[1]] %>% read_html()

# 1) Project name
project <- html_obj %>% html_nodes(".a-campaign-title") %>% html_text()

# 2) Organizer name (raw person-info text; cleaned below)
info <- html_obj %>% html_nodes(".m-person-info") %>% html_text()

# 3) Location
location <- html_obj %>% html_nodes(".m-person-info-content") %>% html_text()

# 4) Creation date
dates <- html_obj %>% html_nodes(".a-created-date") %>% html_text()

# 5) Money: collected amount and objective share one heading node
money <- html_obj %>% html_nodes(".m-progress-meter-heading") %>% html_text()

# 6) Donors, shares and followers
popularity <- html_obj %>% html_nodes(".text-stat-value") %>% html_text()

# 7) Comments (count is embedded in the wrapper text)
comments <- html_obj %>% html_nodes(".o-expansion-list-wrapper") %>% html_text()

# 8) Category
category <- html_obj %>% html_nodes(".a-link") %>% html_text()

# Tokenize the money heading once instead of splitting it twice.
money_parts <- unlist(strsplit(money, " "))

# Assemble one row per campaign.
# NOTE(review): the hard-coded indices (info[7], location[7], category[17],
# money_parts[8]) depend on the page's exact node order and will silently
# pick the wrong element if the layout changes — verify against a live page.
review_data <- data.frame(
  project   = project,
  # Fixed stray backslash: "\\Organizador.*" sent the invalid escape \O to the
  # regex engine; the intended pattern is simply "Organizador.*".
  name      = gsub("Organizador.*", "", info[7]),
  location  = str_remove(location[7], "Organizador"),
  dates     = dates,
  collected = money_parts[1],
  objective = money_parts[8],
  donor     = popularity[1],
  shares    = popularity[2],
  follow    = popularity[3],
  # Base-R replacement for the deprecated tidyr::extract_numeric(): strip
  # everything that is not a digit, dot or minus sign, then coerce.
  comments  = as.numeric(gsub("[^0-9.-]+", "", comments)),
  category  = category[17],
  stringsAsFactors = FALSE  # spelled out instead of the reassignable F
)

解决方法

页面执行您可以模仿/简化的 POST 请求。为了保持动态,您需要首先从源 js 文件中获取 api 密钥和应用程序 ID,然后在随后的 POST 请求中传递它们。

在下面,我只是从每个请求中提取 url。我将 POST 的查询字符串设置为每页最多 20 个结果。在我检索页数的初始请求之后,然后我在页码之间映射一个函数,从每个 POST 响应中提取 url;改变 page 参数。

您最终会得到所有项目的 url 列表,然后您可以访问以从中提取信息;或者,可能会发出 xmlhttp 请求。

注意代码可以稍微重构一下。

library(httr)
library(stringr)
library(purrr)
library(tidyverse)

get_df <- function(x){
  # Flatten a list of Algolia hit records into a one-column tibble of
  # unique project urls, prefixed with the gofundme campaign base path.
  hits <- map_dfr(x, .f = as_tibble)
  unique_urls <- unique(select(hits, c('url')))
  mutate(unique_urls, url = paste0('https://es.gofundme.com/f/', url))
}

# Download the site's bundled js file as text; it embeds the Algolia client
# credentials that the search page uses for its own POST requests.
r <- httr::GET('https://es.gofundme.com/static/js/main~4f8b914b.bfe3a91b38d67631e0fa.js') %>% content(as='text')

# Capture the two constructor arguments of algoliaClient:
# group 1 = application id, group 2 = api key.
# NOTE(review): the bundle filename above is build-specific and the regex
# depends on the minifier's output — both may break when the site redeploys.
matches <- stringr::str_match_all(r,'t\\.algoliaClient=r\\.default\\("(.*?)","(.*?)"')

application_id <- matches[[1]][,2]
api_key <-matches[[1]][,3]

# HTTP headers shared by every Algolia POST request below.
headers <- c(
  'User-Agent' = 'Mozilla/5.0',
  'content-type' = 'application/x-www-form-urlencoded',
  'Referer' = 'https://es.gofundme.com/'
)

# Query-string parameters carrying the credentials scraped from the js bundle.
params <- list(
  'x-algolia-agent' = 'Algolia for JavaScript (4.7.0); Browser (lite); JS Helper (3.2.2); react (16.12.0); react-instantsearch (6.8.2)',
  'x-algolia-api-key' = api_key,
  'x-algolia-application-id' = application_id
)
# JSON body template for the Algolia multi-query endpoint; the page number is
# appended between this prefix and the closing '"}]}' suffix.
post_body <- '{"requests":[{"indexName":"prod_funds_feed_replica_1","params":"filters=status%3D1%20AND%20custom_complete%3D1&exactOnSingleWordQuery=word&query=covid&hitsPerPage=20&attributesToRetrieve=%5B%22fundname%22%2C%22username%22%2C%22bene_name%22%2C%22objectID%22%2C%22thumb_img_url%22%2C%22url%22%5D&clickAnalytics=true&userToken=00-e940a6572f1b47a7b2338b563aa09b9f-6841178f&page='

# First request fetches page 0 and also tells us the total page count.
page_num <- 0
data <- paste0(post_body, page_num, '"}]}')
res <- httr::POST(
  url = 'https://e7phe9bb38-dsn.algolia.net/1/indexes/*/queries',
  httr::add_headers(.headers = headers),
  query = params,
  body = data
) %>% content()
num_pages <- res$results[[1]]$nbPages
df <- get_df(res$results[[1]]$hits)

# Remaining pages are 1 .. num_pages-1 (page 0 is already in df).
# BUG FIX: c(1:num_pages-1) parses as (1:num_pages) - 1, i.e. 0:(num_pages-1),
# which re-fetched page 0 and duplicated its rows after rbind. seq_len() also
# handles the single-page case (num_pages == 1) by yielding an empty vector.
pages <- seq_len(num_pages - 1)

# Fetch every remaining page and row-bind the extracted url tibbles.
df2 <- map_dfr(pages, function(page_num){
  # BUG FIX: the page number was missing from the body (paste0(post_body,'"}]}')),
  # so every iteration sent the same request with an empty "page=" value.
  data <- paste0(post_body, page_num, '"}]}')
  # BUG FIX: the Algolia credentials (headers + query params) used by the first
  # request were dropped inside the loop; without them the POST is rejected.
  res <- httr::POST(
    'https://e7phe9bb38-dsn.algolia.net/1/indexes/*/queries',
    httr::add_headers(.headers = headers),
    query = params,
    body = data
  ) %>% content()
  get_df(res$results[[1]]$hits)
})

# Combine page 0 with all subsequent pages.
df <- rbind(df, df2)