如何解析化学式以获得原子成分?

问题描述

我的数据集存储在名为“Formula”的单列表中,如下所示:

row.identity..main.ID.
C5H6O2N3
C10H12N
C5H6O2N3S

我想扩展当前表格:以每个元素字母作为一列的列名,并在每一行对应的列中填入该元素的原子个数(未出现的元素填 0)。基本上我想要这样的东西:

row.identity..main.ID.   C  H  O  N  S  X
C5H6O2N3                 5  6  2  3  0  0   
C10H12N                 10 12  0  1  0  0
C5H6O2N3S                5  6  2  3  1  0

如果代码可以灵活处理带有不同字母的更长数据集,那就太好了。到目前为止,我尝试实施 Onyambu 的解决方案。

library(tidyverse)
library(stringr)    
# Asker's attempt, based on Onyambu's solution. It is reported to fail with the
# errors quoted right after this snippet, so it is kept unchanged here.
# mutate(): rewrites the ID column with gsub(), then stores the letter runs in
# `elements` and the digit runs in `value` (both from str_extract_all()).
# unnest() is called without `cols`, and pivot_wider() receives `elements`
# and `value` positionally.
Formula%>%mutate(row.identity..main.ID.=gsub("\\b([A-Za-z]+)\\b","\\30",row.identity..main.ID.),elements=str_extract_all(row.identity..main.ID.,"[A-Za-z]+"),value=str_extract_all(row.identity..main.ID.,"\\d+"))%>%
          unnest()%>%pivot_wider(elements,value,fill=0)

然而,这会导致一些错误,例如“长度不兼容:4、3”,和/或提示“现在使用 unnest() 时必须指定 cols”。

解决方法

你也可以这样做:

# Rewrite each formula as DCF-style "Element:count" lines, e.g.
# "C5H6O2N3" -> "C:5\nH:6\nO:2\nN:3\n". A trailing element with no digits
# gets an explicit count of 1 ("...N" -> "...N:1").
# df[[1]] always yields a plain vector, also when df is a tibble.
a <- sub("([A-Z]$)", "\\1:1", gsub("(\\D+)(\\d+)", "\\1:\\2\n", df[[1]]))

# Parse every record into a one-row data frame. lapply() always returns a
# list; sapply() would collapse the result into a list-matrix whenever all
# formulas contain the same number of elements, and rbind.fill() then fails.
e <- lapply(a, function(x) {
  con <- textConnection(x)
  on.exit(close(con)) # do not leak the connection
  data.frame(read.dcf(con))
})

# Stack the rows; elements absent from a formula come back as NA -> set to 0.
f <- cbind(df, plyr::rbind.fill(e))
f[is.na(f)] <- 0
f

  row.identity..main.ID.  C  H O N S
1               C5H6O2N3  5  6 2 3 0
2                C10H12N 10 12 0 1 0
3              C5H6O2N3S  5  6 2 3 1

另一种选择是将文本转换为 JSON,然后将其读入 R:

# Turn every "<letter><digits>" pair into a JSON member followed by a comma,
# e.g. "C5H6O2N3" -> '"C":5,"H":6,"O":2,"N":3,'.
a <- gsub("(\\D)(\\d+)", '"\\1":\\2,', df[[1]])
# Strip the dangling comma, and give a trailing element that has no digits
# an explicit count of 1 ('...,N' -> '...,"N":1').
b <- gsub("([A-Z])$", '"\\1":1', trimws(a, whitespace = ","))

# One JSON object per formula, wrapped in a single array. The result must be
# assigned to `f`, otherwise the replace() call below has nothing to work on.
f <- cbind(df, jsonlite::fromJSON(sprintf("[{%s}]", paste(b, collapse = "},{"))))
# Elements missing from a formula are NA after parsing; report them as 0.
replace(f, is.na(f), 0)

  row.identity..main.ID.  C  H O N S
1               C5H6O2N3  5  6 2 3 0
2                C10H12N 10 12 0 1 0
3              C5H6O2N3S  5  6 2 3 1
,

你可以试试下面的代码

# Append one count column per element (C, H, O, N, S, X) to df.
# Each formula in df$ID is split into "<symbol><optional digits>" tokens;
# a symbol is one capital letter plus any lower-case letters, so adjacent
# elements without a count ("CH4" -> "C", "H4") are kept apart.
df <- cbind(
  df,
  do.call(
    rbind,
    Map(
      function(x) {
        # A token that ends in a letter has an implicit count of 1.
        x <- sub("(?<=[A-Za-z])$", "1", x, perl = TRUE)
        # Repeat every symbol `count` times and tabulate against a fixed set
        # of levels, so elements absent from the formula are reported as 0.
        table(
          factor(
            rep(gsub("\\d+", "", x), as.numeric(gsub("\\D+", "", x))),
            levels = c("C", "H", "O", "N", "S", "X")
          )
        )
      },
      regmatches(df$ID, gregexpr("[A-Z][a-z]*\\d*", df$ID))
    )
  )
)

给出

> df
         ID  C  H O N S X
1  C5H6O2N3  5  6 2 3 0 0
2   C10H12N 10 12 0 1 0 0
3 C5H6O2N3S  5  6 2 3 1 0

数据

> dput(df)
# Resulting data frame: one row per formula, one integer count per element.
# Every column needs exactly three values to match row.names = c(NA, -3L).
structure(list(ID = c("C5H6O2N3","C10H12N","C5H6O2N3S"),C = c(5L,10L,5L),H = c(6L,12L,6L),O = c(2L,0L,2L),N = c(3L,1L,3L),S = c(0L,0L,1L),X = c(0L,0L,0L)),class = "data.frame",row.names = c(NA,-3L))
,

这是使用 purrr 的方法:

library(purrr); library(dplyr); library(stringr)
# Build a table of per-letter counts from the formulas in `Formula`.
# pmap_dfr() visits each row; inside it, map2_dbl() pairs the row's value(s)
# (c(...)) with each entry of LETTERS and computes, per letter, the max() of:
#   * the digits that directly follow the letter (look-behind regex),
#     converted to integer, and
#   * the logical "this letter occurs in the formula" test,
# with NA values dropped (na.rm = TRUE).
# Infinite results are then replaced by 0 (presumably the -Inf that max()
# gives when a letter is absent and nothing is left after na.rm).
# The 26 values are named after LETTERS, and select_if() finally keeps only
# the columns whose sum is greater than 0.
Formula %>% 
  pmap_dfr(~map2_dbl(c(...),LETTERS,~max((str_extract(.x,paste0("(?<=",.y,")[0-9]+"))%>% as.integer),(str_extract(.x,.y) == .y),na.rm = TRUE)) %>%
             replace(is.infinite(.),0) %>%
             set_names(LETTERS)) %>%
    select_if(~sum(.) > 0)
# A tibble: 3 x 5
      C     H     N     O     S
  <dbl> <dbl> <dbl> <dbl> <dbl>
1     5     6     3     2     0
2    10    12     1     0     0
3     5     6     3     2     1

数据

# Example input: a one-column data frame holding the three molecular
# formulas used throughout (the same rows shown in the result tables).
Formula <- structure(
  list(row.identity..main.ID. = c("C5H6O2N3", "C10H12N", "C5H6O2N3S")),
  class = "data.frame",
  row.names = c(NA, -3L)
)