问题描述
我想使用 R 抓取下面的 HTML 文本。我知道可以使用类似下面的函数来提取节点的属性,
示例:
# Read one attribute value per group node.
#
# Arguments:
#   css       - CSS selector evaluated inside each group node.
#   group     - CSS selector that picks the group nodes within `html`.
#   attribute - name of the attribute to read from the `css` matches.
#   html      - parsed document (or node) to search in.
#
# Returns the unlist()-ed per-group results: for every group node, the
# attribute of its first `css` match, or NA when the group has no match.
scrape_css_attr <- function(css, group, attribute, html) {
  group_nodes <- html_nodes(html, group)

  first_or_na <- function(node) {
    found <- html_attr(html_nodes(node, css), attribute)
    # Exactly one value per group: NA for "no match", otherwise only the
    # first match (same result as a length-one ifelse() on the vector).
    if (identical(found, character(0))) NA else found[1]
  }

  unlist(lapply(group_nodes, first_or_na))
}
urls <- scrape_css_attr("a","#w0","href",html)
但是我的问题是,我该如何抓取 data-key 属性的值?
<div data-key="34356"><a href="/storingen/34356-18-september-2020-defecte-trein-sittard-maastricht" class="disruption-list-item resolved" title="Sittard-Maastricht">
<div class="disruption-icon">
<img src="/images/disruptions/defective-stock.svg" alt=""> </div>
<div class="disruption-content">
<span class="disruption-line">Sittard-Maastricht</span>
<br>
<em>
defecte trein </em>
<div class="timestamp">
<span class="glyphicon glyphicon-time" aria-hidden="true"></span>
11:32 -
12:34 •
<span class="glyphicon glyphicon-record" aria-hidden="true"></span>
1 uur,2 minuten </div>
</div>
</a></div>
1 个答案:
答案 0 :(得分:0)
如果您想获取 data-key 值的列表,请找到带有 data-key 属性的 <div> 标签并提取该属性:
library(rvest)
library(purrr)
# Number of listing pages to download.
last_page <- 5 # change it to 1655 to scrape all pages

# Collect the data-key attribute of every <div data-key="..."> that is a
# direct child of the #w0 container, over pages 1..last_page, as one
# character vector.
datakeys <-
  flatten_chr(map(
    # seq_len() gives an empty sequence for last_page = 0, whereas
    # 1:last_page would count down and request pages 1 and 0.
    seq_len(last_page),
    function(i) {
      page_url <- paste0("https://www.rijdendetreinen.nl/storingen?page=", i)
      read_html(page_url) %>%
        html_nodes(xpath = "//div[@id='w0']/div[@data-key]") %>%
        html_attr("data-key")
    }
  ))
或者您可以直接抓取这些标签的数据:
library(rvest)
library(purrr)
# Number of listing pages to download.
last_page <- 5 # change it to 1655 to scrape all pages

# Build one data frame with a row per <div data-key="..."> item found
# directly under the #w0 container, over pages 1..last_page.
data <-
  map_dfr(
    # seq_len() gives an empty sequence for last_page = 0, whereas
    # 1:last_page would count down and request pages 1 and 0.
    seq_len(last_page),
    function(i) {
      page_url <- paste0("https://www.rijdendetreinen.nl/storingen?page=", i)
      items <- read_html(page_url) %>%
        html_nodes(xpath = "//div[@id='w0']/div[@data-key]")
      print(i) # progress: page number just fetched

      href <- items %>%
        html_node(xpath = "./a[1]") %>%
        html_attr("href")

      data.frame(
        # Text of the nearest <h4> that precedes the item among its siblings
        # (presumably the date heading of the group -- verify on the page).
        date = items %>%
          html_node(xpath = "(./preceding-sibling::h4)[last()]") %>%
          html_text(trim = TRUE),
        datakey = items %>% html_attr("data-key"),
        # The hrefs are root-relative (they start with "/"), so the host is
        # joined without a trailing slash; items without a link stay NA.
        link = ifelse(
          is.na(href),
          NA_character_,
          paste0("https://www.rijdendetreinen.nl", href)
        ),
        title = items %>%
          html_node(xpath = "./descendant::div[@class='disruption-content']/descendant::span[@class='disruption-line']") %>%
          html_text(trim = TRUE),
        description = items %>%
          html_node(xpath = "./descendant::div[@class='disruption-content']/descendant::em") %>%
          html_text(trim = TRUE),
        timestamp = items %>%
          html_node(xpath = "./descendant::div[@class='timestamp']") %>%
          html_text(trim = TRUE)
      )
    }
  )
解决方法
如果您想获取 data-key 值的列表,请找到带有 data-key 属性的 <div> 标签并提取该属性:
library(rvest)
library(purrr)
# Number of listing pages to download.
last_page <- 5 # change it to 1655 to scrape all pages

# Collect the data-key attribute of every <div data-key="..."> that is a
# direct child of the #w0 container, over pages 1..last_page, as one
# character vector.
datakeys <-
  flatten_chr(map(
    # seq_len() gives an empty sequence for last_page = 0, whereas
    # 1:last_page would count down and request pages 1 and 0.
    seq_len(last_page),
    function(i) {
      page_url <- paste0("https://www.rijdendetreinen.nl/storingen?page=", i)
      read_html(page_url) %>%
        html_nodes(xpath = "//div[@id='w0']/div[@data-key]") %>%
        html_attr("data-key")
    }
  ))
或者您可以直接抓取这些标签的数据:
library(rvest)
library(purrr)
# Number of listing pages to download.
last_page <- 5 # change it to 1655 to scrape all pages

# Build one data frame with a row per <div data-key="..."> item found
# directly under the #w0 container, over pages 1..last_page.
data <-
  map_dfr(
    # seq_len() gives an empty sequence for last_page = 0, whereas
    # 1:last_page would count down and request pages 1 and 0.
    seq_len(last_page),
    function(i) {
      page_url <- paste0("https://www.rijdendetreinen.nl/storingen?page=", i)
      items <- read_html(page_url) %>%
        html_nodes(xpath = "//div[@id='w0']/div[@data-key]")
      print(i) # progress: page number just fetched

      href <- items %>%
        html_node(xpath = "./a[1]") %>%
        html_attr("href")

      data.frame(
        # Text of the nearest <h4> that precedes the item among its siblings
        # (presumably the date heading of the group -- verify on the page).
        date = items %>%
          html_node(xpath = "(./preceding-sibling::h4)[last()]") %>%
          html_text(trim = TRUE),
        datakey = items %>% html_attr("data-key"),
        # The hrefs are root-relative (they start with "/"), so the host is
        # joined without a trailing slash; items without a link stay NA.
        link = ifelse(
          is.na(href),
          NA_character_,
          paste0("https://www.rijdendetreinen.nl", href)
        ),
        title = items %>%
          html_node(xpath = "./descendant::div[@class='disruption-content']/descendant::span[@class='disruption-line']") %>%
          html_text(trim = TRUE),
        description = items %>%
          html_node(xpath = "./descendant::div[@class='disruption-content']/descendant::em") %>%
          html_text(trim = TRUE),
        timestamp = items %>%
          html_node(xpath = "./descendant::div[@class='timestamp']") %>%
          html_text(trim = TRUE)
      )
    }
  )