抓取 HTML 时将值从网站添加到表格

问题描述

这是我需要的数据:

https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0

我已经将表导入到 R 中:

library(tidyverse)
library(rvest)

webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.PHP?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=normal%20table&dr=&page=0")

tbls <- html_nodes(webpage,"table")
tbls_ls <- webpage %>%
  html_nodes("table") %>%
  .[5] %>%
  html_table(fill = TRUE)

data = as.tibble(tbls_ls[[1]]) 

然而,我还需要在表格中添加一件事。对于某些陨石,有可用的氧同位素值。单击“地块”部分下的陨石名称时,可以看到这一点。单击该图时,我们将被重定向到具有三个同位素值的页面。我想要做的是在我的表中添加三列,包含每个陨石各自的同位素值。我尝试分别为每个“情节”部分编写代码,但我觉得可能有一个更优雅的解决方案。

解决方法

如果您决定使用同位素,您可以抓取没有同位素的表格,然后模仿页面所做的发布请求;然后在 Name 列上左连接这两个。您将获得比左表中更多的行(没有同位素),因为有多个 Change values,但这与您在查看您描述的同位素的方法中看到的相匹配,其中有逗号分隔的值列表同位素,在图中,而不是按行分割。

我选择更具选择性的 css 选择器来最初定位感兴趣的特定表,而不是索引到列表中。

我使用 write_excel_csv 在写出时保留标头的字符编码(我从 @stefan 得到的想法)。

在写出(子集/选择等)之前,您可以从 joint_table 的输出中删除不需要的列。


r

library(dyplr)
library(httr)
library(rvest)
library(readr)
library(magrittr)
library(stringr)

webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0")

no_isotopes <- webpage %>%
  html_node("#maintable") %>%
  html_table(fill = T) 

data <- list(
  'sfor' = "names",'stype' = "contains",'country' = "All",'categ' = "Ungrouped achondrites",'page' = "0",'map' = "ge",'srt' = "name",'lrec' = "200",'pnt' = "Oxygen isotopes",'mblist' = "All",'snew' = "0",'sea' = "*"
)


r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php",body = data)

isotopes <- content(r,"text") %>%
  read_html(encoding = "UTF-8") %>%
  html_node("#maintable") %>%
  html_table(fill = T)


joint_table <- dplyr::left_join(no_isotopes,isotopes,by = "Name",copy = FALSE)

write_excel_csv(x = joint_table,path = "joint.csv",col_names = T,na = "")

示例输出:

enter image description here


编辑:

根据您在评论中的要求添加来自其他网址的附加信息。我必须动态确定要选择哪个表号,以及处理没有表的情况。

library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(httr)
#> Warning: package 'httr' was built under R version 4.0.3
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
#> 
#> Attaching package: 'rvest'
#> The following object is masked from 'package:purrr':
#> 
#>     pluck
#> The following object is masked from 'package:readr':
#> 
#>     guess_encoding
library(readr)
library(furrr)

get_table <- function(url) {
  page <- read_html(url)
  test_list <- page %>%
    html_nodes("#maintable tr > .inside:nth-child(odd)") %>%
    html_text() # get left hand column %>%
  index <- match(TRUE,stringr::str_detect(test_list,"Data from:")) + 1
  table <- page %>%
    html_node(paste0("#maintable tr:nth-of-type(",index,") table")) %>%
    html_table() %>%
    as_tibble()
  temp <- set_names(data.frame(t(table[,-1]),row.names = c()),t(table[,1])) # https://www.nesono.com/node/456 ; https://stackoverflow.com/a/7970267/6241235
  return(temp)
}


start_url <- "https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0"
base <- "https://www.lpi.usra.edu"
webpage <- read_html(start_url)

no_isotopes <- webpage %>%
  html_node("#maintable") %>%
  html_table(fill = T)

data <- list(
  "sfor" = "names","stype" = "contains","country" = "All","categ" = "Ungrouped achondrites","page" = "0","map" = "ge","srt" = "name","lrec" = "200","pnt" = "Oxygen isotopes","mblist" = "All","snew" = "0","sea" = "*"
)

r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php","text") %>%
  read_html(encoding = "UTF-8") %>%
  html_node("#maintable") %>%
  html_table(fill = T)

joint_table <- dplyr::left_join(no_isotopes,copy = FALSE)

lookups <- webpage %>%
  html_node("#maintable") %>%
  html_nodes("td:nth-of-type(1) a") %>%
  map_df(~ c(html_text(.),html_attr(.,"href")) %>%
    set_names("Name","Link")) %>%
  mutate(Link = paste0(base,gsub("\\s+","%20",Link)))

error_df <- tibble(
  `State/Prov/County:` = NA_character_,`Origin or pseudonym:` = NA_character_,`Date:` = NA_character_,`Latitude:` = NA_character_,`Longitude:` = NA_character_,`Mass (g):` = NA_character_,`Pieces:` = NA_character_,`Class:` = NA_character_,`Shock stage:` = NA_character_,`Fayalite (mol%):` = NA_character_,`Ferrosilite (mol%):` = NA_character_,`Wollastonite (mol%):` = NA_character_,`Magnetic suscept.:` = NA_character_,`Classifier:` = NA_character_,`Type spec mass (g):` = NA_character_,`Type spec location:` = NA_character_,`Main mass:` = NA_character_,`Finder:` = NA_character_,`Comments:` = NA_character_,)

no_cores <- future::availableCores() - 1

future::plan(future::multisession,workers = no_cores)

df <- furrr::future_map_dfr(lookups$Link,~ tryCatch(get_table(.x),error = function(e) error_df))

colnames(df) <- sub(":","",colnames(df))

df2 <- df %>%
  mutate(
    `Mass (g)` = gsub(",",`Mass (g)`),across(c(`Mass (g)`,`Magnetic suscept.`),as.numeric)
  )

if (nrow(df2) == nrow(no_isotopes)) {
  additional_info <- cbind(lookups,df2)
  joint_table$Name <- gsub(" \\*\\*",joint_table$Name)
  final_table <- dplyr::left_join(joint_table,additional_info,copy = FALSE)
  write_excel_csv(x = final_table,file = "joint.csv",na = "")
}

reprex package (v0.3.0) 于 2021 年 2 月 27 日创建


注意

OP 由于某种原因在查找变量方面存在问题,所以这里是我写的一个对他们有用的替代方法:

lookups <- map_df(
  webpage %>% html_node("#maintable") %>% html_nodes("td:nth-of-type(1) a"),~
    data.frame(
      Name = .x %>% html_text(),Link =  paste0(base,.x %>%  html_attr("href")))
    )
) %>% as_tibble()