如何在迭代中使用mutate函数

问题描述

我想在数据框的序列中搜索一组模式(基序,motif)。下面附上示例数据集和代码:

# Example data: one promoter sequence per gene.
# NOTE(review): the Gene1 sequence ends in "H", which is an IUPAC ambiguity
# code rather than a plain base -- confirm it is intended and not a typo.
Gene_and_Promoter <- tibble::tibble(
  Gene = c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5", "Gene6"),
  Promoter = c(
    "AGTCACGTGCGTGCATACGTGCAAATTGGGCGTACGTGGCTATCTCAACTATCH",
    "AACGTGGCGTGGCAGTGCACGTGCCAGTTGTCCCGCAGTGTGCATACTACTCT",
    "ACTGGCTACGTGCTGCAATGCGTGCGTAGTGCGTACCAAAGTTAAACCGGCG",
    "GCAATACGTGCAAGTGCGTGTACGTGCGTGATGTCGTACGTAACCGGCCGGT",
    "ATACGTGCGTCGTACGTGCGTACTAATACATACATCATAATTTAAACCCG",
    "GGGGGAATCTCGTTCCTACGTCAAGGATAGATGCTGATAGTCGTA"
  )
)

# Motifs to look for in the promoter sequences.
Motifs <- tibble::tibble(
  MOTIF = c("CGTGC", "GGAATA", "CCAG", "CGTA")
)


 # Count how often each motif occurs in every promoter sequence.
# Each mutate() adds one integer column named after its motif. The repetition
# is kept on purpose: it is exactly what the question asks how to reduce.
library(dplyr)       # %>% and mutate()
library(Biostrings)  # DNAStringSet() and vcountPattern()

Gene_and_Promoter %>%
  # Use the bare column name `Promoter` rather than
  # `Gene_and_Promoter$Promoter`, so the counts always come from the rows that
  # are actually flowing through the pipe.
  mutate(CGTGC = vcountPattern("CGTGC", DNAStringSet(Promoter))) %>%
  mutate(GGAATA = vcountPattern("GGAATA", DNAStringSet(Promoter))) %>%
  mutate(CCAG = vcountPattern("CCAG", DNAStringSet(Promoter))) %>%
  mutate(CGTA = vcountPattern("CGTA", DNAStringSet(Promoter)))

上面的代码给出了所需的输出(每个启动子中各基序(motif)的出现情况)。

我可以通过减少使用mutate的次数来优化上面的代码吗? (可能是通过迭代)

解决方法

我对 DNAStringSet 函数了解不深,不过可以试试下面这样的写法:

library(data.table)
library(purrr)

# Build the DNAStringSet once so that every motif search reuses it.
vec <- DNAStringSet(Gene_and_Promoter[["Promoter"]])
Motifs <- c("CGTGC", "GGAATA", "CCAG", "CGTA")

# Convert to a data.table by reference, then add one count column per motif.
# `(Motifs)` on the left of `:=` uses the motif strings as the column names.
setDT(Gene_and_Promoter)
Gene_and_Promoter[
  ,
  (Motifs) := map(Motifs, function(motif) vcountPattern(motif, vec))
]
---

这里给出另一种做法,思路与 @det 的答案类似,但完全使用 tidyverse 实现:

library(tidyverse)

pat <- c("CGTGC", "CGTA")

# Name each motif after itself; map() keeps these names, and they become the
# names of the count columns.
lpat <- set_names(as.list(pat), pat)

# Add one count column per motif to the original table.
# An unnamed data-frame result inside mutate() is spliced in as new columns,
# so there is no need to pack the counts into a single column, unpack them
# again, and join the result back onto Gene_and_Promoter.
dd <-
  Gene_and_Promoter %>%
  mutate(
    as_tibble(
      map(lpat, function(motif) vcountPattern(motif, DNAStringSet(Promoter)))
    )
  )