确定在火山图上过滤了哪些 RNA SEQ 数据

问题描述

我正在使用 RNA seq 数据通过 R 中的火山图(比较使用和不使用抗生素的细菌的差异基因表达)来分析基因。创建我的图后,我不确定为什么我的一些值显示为“0 " 一种情况下的 TPM 和另一种情况下不为“0”的 TPM 值未确定为差异表达。我的火山图上的一些基因在 TPM 上有这种差异,并在我的图上显示为显着,而根据我的图,其他具有“0”值差异的基因不被认为是显着的。

这是我的数据示例 (UO1_D712_#####代表基因座编号,顶栏代表不同的重复,其中“1”未处理,“2”用抗生素处理,数字旁边的字母代表不同的技术重复):

structure(list(`1` = c("U01_D712_00001","U01_D712_00002","U01_D712_00003","U01_D712_00004","U01_D712_00005","U01_D712_00006","U01_D712_00007","U01_D712_00008","U01_D712_00009","U01_D712_00010","U01_D712_00011","U01_D712_00012","U01_D712_00013","U01_D712_00014","U01_D712_00015","U01_D712_00016","U01_D712_00017","U01_D712_00018","U01_D712_00019","U01_D712_00020","U01_D712_00021","U01_D712_00022","U01_D712_00023","U01_D712_00024","U01_D712_00025","U01_D712_00026","U01_D712_00027","U01_D712_00028","U01_D712_00029","U01_D712_00030"),`1a` = c("6.590456502","6.741694688","7.110342585","6.299482433","2.77173982","2.470330508","3.827125186","6.267229842","5.524708663","2.657913228","2.87209859","7.479820548","4.980185572","4.210955199","0","4.492492822","3.611091371","3.714270433","7.455036914","7.045203025","6.061860857","2.925268313","6.544077039","5.747318013","9.97083444","9.22523089","4.20205383","6.097040679","2.621192351","0"),`1b` = c("6.544454427","6.601489488","7.134224619","5.814043553","3.280958379","2.649180803","3.860542083","6.256648363","5.380427766","2.581027705","3.016132165","7.405329447","4.701503289","4.073814818","4.304196924","3.515977329","3.843535649","7.342972625","6.966606769","6.122878624","3.007522306","6.495797641","5.965431621","9.828050269","9.219563915","4.065778989","6.105331066","3.061209408",`1c` = c("6.608196006","6.743010138","7.102600793","6.146518601","3.555184202","2.364971542","4.034053983","6.158523627","5.656051812","2.658660735","3.054717455","7.392164473","4.950953264","4.277770477","4.507936666","3.794842979","3.610794578","7.471646548","7.104792624","6.484767016","3.071205184","6.584425715","6.20466015","9.986342122","9.282943758","4.179958213","6.219551653","2.984738345",`1d` = c("6.547155382","6.558328892","6.992501615","6.449558793","3.059801464","2.418800257","3.96498952","6.013208538","5.279919645","2.893295085","1.750510471","7.408735671","4.9425624","3.804549986","4.174835979","3.806006888","3.570390524","7.641137006","6.976672494","6.363030106","3.083061726","6.300910093","6.007490342","9.926316442","9.09671588","4.320556917","6.153860107","2.877230446",`1e` = c("6.626724417","6.577345176","7.156821278","6.296873411","3.618089702","2.394444986","4.129376392","6.011715246","5.31197869","2.00754706","2.695493528","7.538910448","4.606060035","3.909472643","4.346616047","3.468681284","3.338231445","7.559599613","7.1527452","6.232923513","3.108624209","6.535435309","6.12922864","10.13108497","9.310331313","3.959568571","6.182335537","2.902736258",`1f` = c("6.419219179","6.650459302","7.319125725","5.570869357","2.962845933","2.55903176","4.087597573","5.995610538","5.386268651","2.750800859","2.416572678","7.579148955","3.952633067","3.615227674","4.562838935","3.76942104","3.920096905","7.935320749","7.501470652","6.13700099","3.123910608","6.035971952","5.706235015","9.254751395","8.379630979","4.51391973","5.6890651","2.43285316",`1g` = c("6.553221221","6.633949852","7.182305386","5.769365973","2.721354972","1.668390466","4.148367057","6.240883382","5.458877133","1.733842637","2.723803355","7.522249899","4.149567197","3.780763096","4.496306813","3.645643535","3.851001768","7.678552875","7.283411279","6.591585956","2.879345378","6.389427003","5.911222165","9.851084493","9.084575304","4.272587776","5.974762147","2.98852705",`2a` = c("9.769887737","4.550226652","1.869021464","4.944848987","7.9678549","2.682865013","8.495575559","2.234521659","3.667196316","9.180445037","3.210107621","7.21523691","5.714579923","5.423986751","9.118981459","8.635701597","2.742889473","3.712618983","8.006057144","4.999541279","10.54351774","5.880978085","7.145433526","6.982416661","9.339651188","5.360835327","4.699680905","3.423826225","6.408271885","5.038170992"),`2b` = c("10.26397519","1.945664005","2.086763158","2.800904763","8.583418657","1.536094563","9.32057547","3.10685839","4.224502319","10.1252842","1.811175407","5.714439316","6.039142559","3.833361174","9.757360286","9.565906731","1.523640473","2.315033488","6.312524363","4.889986456","10.23020108","4.848727685","7.533256999","7.138160378","10.30380331","3.955469283","3.167940742","3.599655687","4.828945262","3.701043054"),`2c` = c("9.478481216","2.789289131","3.393949527","3.754810933","8.710777154","1.806170784","9.150005253","2.612275457","4.961073313","9.802701699","2.933183115","6.532384958","6.919449225","4.432699799","9.715063475","9.265691356","1.412064593","3.330131873","6.665979896","5.158526421","9.417365584","4.899531204","8.173459354","7.271400938","9.813068613","4.384622077","3.700645365","4.457874829","5.649440022","3.531010379"
    ),`2d` = c("8.199795497","2.565711524","2.202287889","3.856354444","7.380224849","2.192476466","8.14446837","1.144258862","3.31447122","8.713146629","2.697890381","6.304428859","5.745291803","4.898396114","9.173362747","8.339933849","3.159678152","4.094587234","7.608649692","5.280206424","10.34630403","5.098585806","7.262400625","6.150190905","9.316698845","5.073027993","4.695003229","2.485847024","5.545300465","4.350571411"),`2e` = c("8.74935033","3.739489484","0.217205045","4.413657999","8.745525588","3.657060086","9.279834921","2.898951179","4.282874018","9.610485827","3.561102455","7.228334332","6.388491443","4.389908652","8.781086564","9.178866581","2.374603596","3.961037408","7.864369809","3.654728044","10.15284858","5.894439123","7.68020282","7.12243523","9.998637438","5.092174395","4.111530392","3.776835632","5.624523213","4.095011377"),`2f` = c("7.648310926","4.345557215","1.986576876","4.99426288","7.087937177","2.810917253","7.77706637","2.62822773","3.581811188","8.470225989","3.335757437","7.416094847","5.208841128","5.536128034","8.255571138","7.993319997","1.9209089","3.573861828","7.318814519","5.233804806","11.05855833","6.247720809","6.673407583","6.029960625","8.806867591","5.459208493","4.001428729","3.609936979","5.876522973","4.652839671"),`2g` = c("7.555235468","3.892899549","1.726443458","5.304546796","7.039588042","3.027235295","7.703852207","1.753183519","2.909288568","8.385169315","3.902707541","7.523315081","4.978364017","5.49103181","8.096218606","7.944822989","2.352609608","4.155433517","7.227355741","5.532668321","11.24946953","6.159185473","6.443203375","5.931761874","8.7421732","5.502000205","4.652883503","3.458323017","6.566487449","4.89790353")),row.names = c(NA,30L),class = "data.frame")

设计矩阵:

structure(list(SampleID = c("1a ","1b","1c","1d","1e","1f","1g","2a","2b","2c","2d","2e","2f","2g"),Group = c(1L,1L,2L,2L),Replicate = c("a","b","c","d","e","f","g","a","g"
)),14L),class = "data.frame")

我通读了这个 edgeR 用户手册来编写我的代码http://bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf

这是我编写代码以对 RNA seq 数据进行采样的方式(注意 ~/Documents/VOLCANO/R10LB_0ugvs0.5ug.csv 指的是上面的数据样本,而 ~/Documents/VOLCANO/R10LB_0vs0.5_designmatrix.csv 指的是到上面的设计矩阵):

R10LB_0vs0.5 <- read.csv2("~/Documents/VOLCANO/R10LB_0ugvs0.5ug.csv",sep=",",check.names = F)
R10LB_0vs0.5 <- janitor::remove_empty(R10LB_0vs0.5,which = "cols")
R10LB_0vs0.5_design <- read.csv2("~/Documents/VOLCANO/R10LB_0vs0.5_designmatrix.csv",")

#Set rownames
rownames(R10LB_0vs0.5) <- R10LB_0vs0.5$`1`
R10LB_0vs0.5 <- R10LB_0vs0.5[,-1]

#colnames(R10LB_0ugvs0.5ug) <- R10LB_0ugvs0.5ug[1,]
#R10LB_0ugvs0.5ug <- R10LB_0ugvs0.5ug[-1,]

#Convert to Matrix
R10LB_0vs0.5 <- data.matrix(R10LB_0vs0.5)

#Create DGEList
R10LB_0vs0.5 <- DGEList(counts = R10LB_0vs0.5,group = R10LB_0vs0.5_design$Group)
str(R10LB_0vs0.5)

#Design Matrix
Group <- as.vector(as.character(R10LB_0vs0.5_design$Group))
Replicate <- as.vector((R10LB_0vs0.5_design$Replicate))
R10LB_0vs0.5_designmatrix <- model.matrix(~0+Group+Replicate)

#Filter
keep <- filterByExpr(R10LB_0vs0.5)
R10LB_0vs0.5_filter <- R10LB_0vs0.5[keep,keep.lib.sizes = FALSE]

#Estimate Sample dispersion
R10LB_0vs0.5_disp <- estimatedisp(R10LB_0vs0.5_filter,R10LB_0vs0.5_designmatrix)

#Create Contrasts (Group Comparisons)
CONTRASTS <- makeContrasts( Group1vs2 = Group1-Group2,levels = R10LB_0vs0.5_designmatrix)

#GLM Fit for Group 1 vs. Group 2
R10LB_0vs0.5fit <- glmFit(R10LB_0vs0.5_disp,contrast = CONTRASTS[,1])
R10LB_0vs0.5lrt <- glmLRT(R10LB_0vs0.5fit)
R10LB_0vs0.5TT <- topTags(R10LB_0vs0.5lrt,n=nrow(R10LB_0vs0.5_disp))
write.csv(R10LB_0vs0.5TT,file = "R10LB_0vs0.5_comparison.csv")
saveRDS(R10LB_0vs0.5TT,file = "R10LB_0vs0.5_comparison.RDS")

这是我将数据格式化为火山图的代码

    #Basic scatter plot: x is "logFC",y is "PValue"
ggplot(data = R10LB_0vs0.5_comparison,aes(x = logFC,y = PValue)) + geom_point()

#Doesn't look quite like a Volcano plot... convert the p-value into a -log10(p-value)
p4 <- ggplot(data = R10LB_0vs0.5_comparison,y = -log10(PValue))) + geom_point() + theme_minimal()

#Add vertical lines for logFC thresholds,and one horizontal line for the p-value threshold
p5 <- p4 + geom_vline(xintercept = c(-0.6,0.6),col = "red") + geom_hline(yintercept = -log10(0.05),col = "red")

# The significantly differentially expressed genes are the ones found in the upper-left and upper-right corners.
# Add a column to the data frame to specify if they are UP- or DOWN- regulated (log2FoldChange respectively positive or negative)

# add a column of NAs
R10LB_0vs0.5_comparison$diffexpressed <- "NO"
# if logFC > 0.6 and PValue < 0.05,set as "UP" 
R10LB_0vs0.5_comparison$diffexpressed[R10LB_0vs0.5_comparison$logFC > 0.6 & R10LB_0vs0.5_comparison$PValue < 0.05] <- "UP"
# if logFC < -0.6 and PValue < 0.05,set as "DOWN"
R10LB_0vs0.5_comparison$diffexpressed[R10LB_0vs0.5_comparison$logFC < -0.6 & R10LB_0vs0.5_comparison$PValue < 0.05] <- "DOWN"

# Re-plot but this time color the points with "diffexpressed"
p4 <- ggplot(data=R10LB_0vs0.5_comparison,aes(x=logFC,y=-log10(PValue),col=diffexpressed)) + geom_point() + theme_minimal()

# Add lines as before...
p5 <- p4 + geom_vline(xintercept=c(-0.6,col="red") + geom_hline(yintercept=-log10(0.05),col="red")

## Change point color 

# 1. by default,it is assigned to the categories in an alphabetical order):
p6 <- p5 + scale_color_manual(values=c("blue","black","red"))

# 2. to automate a bit: create a named vector: the values are the colors to be used,the names are the categories they will be assigned to:
mycolors <- c("blue","red","black")
names(mycolors) <- c("DOWN","UP","NO")
p6 <- p5 + scale_colour_manual(values = mycolors)

# Now write down the name of genes beside the points...
# Create a new column "delabel" to de,that will contain the name of genes differentially expressed (NA in case they are not)
R10LB_0vs0.5_comparison$delabel <- NA
R10LB_0vs0.5_comparison$delabel[R10LB_0vs0.5_comparison$diffexpressed != "NO"] <- R10LB_0vs0.5_comparison$gene_symbol[R10LB_0vs0.5_comparison$diffexpressed != "NO"]

ggplot(data=R10LB_0vs0.5_comparison,col=diffexpressed,label=delabel)) + geom_point() + theme_minimal() +geom_text()

# Finally,we can organize the labels nicely using the "ggrepel" package and the geom_text_repel() function
# load library
library(ggrepel)
# plot adding up all layers we have seen so far
ggplot(data=R10LB_0vs0.5_comparison,label=delabel)) +geom_point() + theme_minimal() +geom_text_repel() + scale_color_manual(values=c("blue","red")) + geom_vline(xintercept=c(-0.6,col="red") +geom_hline(yintercept=-log10(0.05),col="red")

这是我目前拥有的所有数据的火山图:

volcano plot

你能解释一下这些函数是如何过滤我的数据的吗(例如 filterByExpr、estimatedisp、makeContrasts、glmFit、glmLRT)?为什么我的某些数据点会从一种条件下的 0 TPM 变为另一种条件下的某个值,而其他数据点却没有?

您是否会推荐其他特定的过滤流程来更改、修复和/或改进我的火山图?

解决方法

暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!

如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@)