问题描述
我想在数据框中搜索一组基序(motif),并统计每个基序在各启动子序列中出现的次数。下面附上示例数据集和代码。
# Example data: one row per gene, holding the gene name and its promoter
# sequence as a plain character string.
# NOTE(review): the Gene1 sequence ends in "H" -- either an IUPAC ambiguity
# code or a typo in the sample data; confirm before relying on it.
Gene_and_Promoter <- tibble::tribble(
  ~Gene, ~Promoter,
  "Gene1", "AGTCACGTGCGTGCATACGTGCAAATTGGGCGTACGTGGCTATCTCAACTATCH",
  "Gene2", "AACGTGGCGTGGCAGTGCACGTGCCAGTTGTCCCGCAGTGTGCATACTACTCT",
  "Gene3", "ACTGGCTACGTGCTGCAATGCGTGCGTAGTGCGTACCAAAGTTAAACCGGCG",
  "Gene4", "GCAATACGTGCAAGTGCGTGTACGTGCGTGATGTCGTACGTAACCGGCCGGT",
  "Gene5", "ATACGTGCGTCGTACGTGCGTACTAATACATACATCATAATTTAAACCCG",
  "Gene6", "GGGGGAATCTCGTTCCTACGTCAAGGATAGATGCTGATAGTCGTA"
)
# Motifs to look for, stored as a one-column tibble (column MOTIF).
Motifs <- tibble::tribble(
  ~MOTIF,
  "CGTGC",
  "GGAATA",
  "CCAG",
  "CGTA"
)
# Add one count column per motif: each new column holds, for every row, the
# value vcountPattern() returns for that motif against the row's promoter.
# The four mutate() calls are intentionally left separate -- reducing them is
# what the question below asks about.
# `Promoter` is taken from the data mask (the piped-in data frame) rather than
# from `Gene_and_Promoter$Promoter`, so the counts stay row-aligned even if
# the pipeline input is filtered or reordered before this step.
Gene_and_Promoter %>%
  mutate(CGTGC = vcountPattern("CGTGC", DNAStringSet(Promoter))) %>%
  mutate(GGAATA = vcountPattern("GGAATA", DNAStringSet(Promoter))) %>%
  mutate(CCAG = vcountPattern("CCAG", DNAStringSet(Promoter))) %>%
  mutate(CGTA = vcountPattern("CGTA", DNAStringSet(Promoter)))
我可以通过减少使用mutate的次数来优化上面的代码吗? (可能是通过迭代)
解决方法
我对 DNAStringSet 这个函数了解不深,不过也许可以尝试这样做:
library(data.table)
library(purrr)

# Motifs to count; these strings are also used as the new column names.
Motifs <- c("CGTGC", "GGAATA", "CCAG", "CGTA")

# Build the sequence set once so it is shared by every motif lookup.
vec <- DNAStringSet(Gene_and_Promoter$Promoter)

# Convert to a data.table by reference so `:=` can add columns in place.
setDT(Gene_and_Promoter)

# map() returns one result per motif; `(Motifs) :=` assigns them, in order,
# to columns named after the motifs.
Gene_and_Promoter[
  ,
  (Motifs) := map(Motifs, function(motif) vcountPattern(motif, vec))
]
,
下面是一种与 @det 的答案类似的做法,但完全使用 tidyverse 来实现……
library(tidyverse)

pat <- c("CGTGC", "CGTA")

# Name each list element after its motif so that map_df() uses the motif
# strings as the names of the resulting count columns.
lpat <- set_names(as.list(pat), pat)

# Build the sequence set once instead of once per motif.
promoter_set <- DNAStringSet(Gene_and_Promoter$Promoter)

# One column per motif, one row per promoter (same row order as the input).
motif_counts <- map_df(lpat, ~ vcountPattern(.x, promoter_set))

# The counts are already row-aligned with Gene_and_Promoter, so attach them
# by position. Joining on Gene instead would duplicate rows whenever a gene
# name occurs more than once.
dd <- bind_cols(Gene_and_Promoter, motif_counts)