R中的网页抓取搜索结果

问题描述

我正在尝试创建一个从 this webpage 提取搜索结果的函数,但我希望该函数能自动搜索多个基金会,这让我感到困惑:我不确定该如何修改函数——要么循环遍历 mydf 中的所有行,要么使用某个 apply() 族函数达到同样的循环效果,从而对 mydf 的每一行都抓取结果。

当我在“ mydf”的单行上运行下面的函数时,结果是正确的,但是当我不指定特定行时,出现以下错误Error in parse_url(url) : length(url) == 1 is not TRUE

示例data.frame:

# construct sample data frame with Name,City,State:
# Construct a sample data frame with Name, City, State.
# Fix 1: `state` originally had only 2 elements while `name`/`city` had 3,
# which makes data.frame() fail (lengths must match or recycle evenly).
name <- c("johnny carson foundation","melinda gates foundation","macarthur foundation")
city <- c("","","")
state <- c("","","")
mydf <- data.frame(name, city, state, stringsAsFactors = FALSE)

# Replace spaces between words with '+' for consistent formatting of the 'url' object.
# Fix 2: the original str_replace_all(mydf$city, "+") was missing the replacement
# argument (and stringr was never loaded); base gsub() with fixed = TRUE needs no
# extra package and treats "+" literally rather than as a regex quantifier.
mydf$name <- gsub(" ", "+", mydf$name, fixed = TRUE)
mydf$city <- gsub(" ", "+", mydf$city, fixed = TRUE)

我目前对该函数的尝试:

# Scrape IRS tax-exempt-organization search results for every row of `df`.
#
# @param df A data.frame with columns `name`, `city`, `state`, already
#   formatted with '+' in place of spaces.
# @return `df` with five new columns (result_org, result_ein, result_city,
#   result_state, result_country); NA where a search returned nothing.
#
# Fixes vs. the original attempt:
#  - GET()/parse_url() accept exactly ONE url, so the rows are processed one
#    at a time (this was the cause of `length(url) == 1 is not TRUE`).
#  - The function now uses its `df` argument and RETURNS the result instead
#    of assigning to a function-local copy of the global `mydf`.
#  - The ifelse(str_starts(...)) lines had unbalanced parentheses and some
#    xpathSApply() calls were missing `xmlValue`; all fixed below.
#  - Typo `postDateto` corrected to `postDateTo`.
get_data <- function(df) {
        # Root components of the search URL:
        root <- "http://apps.irs.gov/app/eos/allSearch.do?ein1=&names="
        root2 <- "&resultsPerPage=25&indexOfFirstRow=0&dispatchMethod=searchAll&city="
        root3 <- "&country=US&postDateFrom=&postDateTo=&exemptTypeCode=al&deductibility=all&sortColumn=orgName&isDescending=false&submitName=Search"

        # One URL per row of `df`:
        urls <- paste0(root, df$name, root2, df$city, "&state=", df$state, root3)

        # Fetch and parse a single results page, returning a one-row
        # data.frame (all NA when the search found nothing).
        scrape_one <- function(u) {
                parsedHtml <- htmlParse(content(GET(u)), asText = TRUE)

                body_text <- xpathSApply(parsedHtml,
                                         "//div[@class='row results-body-row']",
                                         xmlValue, trim = TRUE)

                no_hit <- length(body_text) == 0 ||
                        str_starts(body_text[1],
                                   "Your search did not return any results")
                if (no_hit) {
                        return(data.frame(result_org = NA, result_ein = NA,
                                          result_city = NA, result_state = NA,
                                          result_country = NA,
                                          stringsAsFactors = FALSE))
                }

                # Helper for the <span> children holding EIN/city/state/country.
                span <- function(i) {
                        xp <- sprintf("//div[@class='search-excerpt']/span[%d]", i)
                        xpathSApply(parsedHtml, xp, xmlValue)[1]
                }

                data.frame(
                        result_org = xpathSApply(parsedHtml,
                                                 "//h3[@class='result-orgname']",
                                                 xmlValue, trim = TRUE)[1],
                        result_ein = span(1),
                        result_city = span(2),
                        result_state = span(3),
                        result_country = span(4),
                        stringsAsFactors = FALSE
                )
        }

        # Row-wise scrape, stacked and bound back onto the input.
        cbind(df, do.call(rbind, lapply(urls, scrape_one)))
}


# NOTE(review): as written, get_data() only modifies a function-local copy of
# `mydf`, so this call cannot change the global `mydf` printed below. Capture
# the return value instead (e.g. `mydf <- get_data(mydf)`) — TODO confirm intent.
get_data(mydf)


mydf

为我凌乱、不够优雅的代码致歉,非常感谢大家的帮助。

解决方法

  1. 您需要将 state 参数指定为 All+States
  2. 您需要通过purrr::map_dfr()来映射URL

library(rvest)

# Construct a sample data frame with Name, City, State.
# Per the fix: `state` must be "All+States" for the IRS search to match.
name <- c("johnny carson foundation","melinda gates foundation","macarthur foundation")
city <- c("","","")
state <- c("All+States","All+States","All+States")
mydf <- data.frame(name, city, state, stringsAsFactors = FALSE)

# Replace spaces between words with '+' for consistent formatting of the
# 'url' object. Fix: the original str_replace_all(mydf$city, "+") was missing
# its replacement argument, and stringr was never attached (library(rvest)
# does not load it); base gsub() with fixed = TRUE needs no extra package.
mydf$name <- gsub(" ", "+", mydf$name, fixed = TRUE)
mydf$city <- gsub(" ", "+", mydf$city, fixed = TRUE)

# Root components of the search URL:
root <- "http://apps.irs.gov/app/eos/allSearch.do?ein1=&names="
root2 <- "&resultsPerPage=25&indexOfFirstRow=0&dispatchMethod=searchAll&city="
root3 <- "&country=US&postDateFrom=&postDateTo=&exemptTypeCode=al&deductibility=all&sortColumn=orgName&isDescending=false&submitName=Search"

# Construct one URL per row by concatenating roots and search strings:
url <- paste0(root, mydf$name, root2, mydf$city, "&state=", mydf$state, root3)

# Scrape one results page into a data.frame; map over all URLs and
# row-bind the per-page frames with purrr::map_dfr().
scrape_page <- function(page_url) {
  rows <- read_html(page_url) %>% html_nodes("ul.views-row > li")

  # Extract the text of one node per result row for a given xpath.
  grab <- function(xp) {
    rows %>% html_node(xpath = xp) %>% html_text()
  }

  data.frame(
    name    = rows %>%
      html_node("h3") %>%
      html_text() %>%
      stringi::stri_trans_totitle(),
    ein     = grab("./div[@class='search-excerpt']/span[1]"),
    city    = grab("./div[@class='search-excerpt']/span[2]"),
    state   = grab("./div[@class='search-excerpt']/span[3]"),
    country = grab("./div[@class='search-excerpt']/span[4]")
  )
}

data <- purrr::map_dfr(url, scrape_page)