This document depends on the following packages:
library(devtools)
library(Biobase)
library(limma)
library(edge)
library(genefilter)
To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE
from the chunk.)
install.packages(c("devtools"))
source("http://www.bioconductor.org/biocLite.R")
biocLite(c("Biobase","limma","genefilter","jdstorey/edge"))
Here we are going to use some data from the paper Evaluating gene expression in C57BL/6J and DBA/2J mouse striatum using RNA-Seq and microarrays. that is a comparative RNA-seq analysis of different mouse strains.
con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bottomly_eset.RData")
load(file=con)
close(con)
bot = bottomly.eset
pdata=pData(bot)
edata=as.matrix(exprs(bot))
fdata = fData(bot)
ls()
## [1] "bot" "bottomly.eset" "con" "edata"
## [5] "fdata" "pdata" "tropical"
Here we will transform the data and remove lowly expressed genes.
edata = log2(as.matrix(edata) + 1)
edata = edata[rowMeans(edata) > 10, ]
The genefilter
package lets you compute statistics rapidly for very simple cases (two or multi-group) comparisons. These are not moderated in any way.
tstats_obj = rowttests(edata,pdata$strain)
names(tstats_obj)
## [1] "statistic" "dm" "p.value"
hist(tstats_obj$statistic,col=2)
For multi-group comparisons use the F-statistic
fstats_obj = rowFtests(edata,as.factor(pdata$lane.number))
names(fstats_obj)
## [1] "statistic" "p.value"
hist(fstats_obj$statistic,col=2)
This approach fits many moderated statistics simultaneously
mod = model.matrix(~ pdata$strain)
fit_limma = lmFit(edata,mod)
ebayes_limma = eBayes(fit_limma)
head(ebayes_limma$t)
## (Intercept) pdata$strainDBA/2J
## ENSMUSG00000000131 61.56232 1.48876396
## ENSMUSG00000000149 63.84978 -0.09514821
## ENSMUSG00000000374 63.48896 1.00196514
## ENSMUSG00000000399 60.07990 0.19593890
## ENSMUSG00000000532 64.97678 0.80640805
## ENSMUSG00000000605 68.18274 0.79136446
If the model is unadjusted you get the moderated version of the statistic from rowttests
plot(ebayes_limma$t[,2],-tstats_obj$statistic,col=4,
xlab="Moderated T-stat",ylab="T-stat")
abline(c(0,1),col="darkgrey",lwd=3)
Here we adjust for the lane number, now the test-statistic for the strain is adjusted for the lane number (a surrogate for a batch effect).
mod_adj = model.matrix(~ pdata$strain + as.factor(pdata$lane.number))
fit_limma_adj = lmFit(edata,mod_adj)
ebayes_limma_adj = eBayes(fit_limma_adj)
head(ebayes_limma_adj$t)
## (Intercept) pdata$strainDBA/2J
## ENSMUSG00000000131 27.42608 0.993341906
## ENSMUSG00000000149 28.30319 -0.302293604
## ENSMUSG00000000374 28.43334 0.684161612
## ENSMUSG00000000399 26.89729 -0.007509614
## ENSMUSG00000000532 29.20894 0.552296098
## ENSMUSG00000000605 30.52908 0.450401742
## as.factor(pdata$lane.number)2
## ENSMUSG00000000131 0.106103723
## ENSMUSG00000000149 0.283169296
## ENSMUSG00000000374 0.062660205
## ENSMUSG00000000399 0.066998156
## ENSMUSG00000000532 -0.036205851
## ENSMUSG00000000605 0.001102388
## as.factor(pdata$lane.number)3
## ENSMUSG00000000131 0.04870234
## ENSMUSG00000000149 0.15268674
## ENSMUSG00000000374 -0.04788353
## ENSMUSG00000000399 -0.13957244
## ENSMUSG00000000532 -0.13080316
## ENSMUSG00000000605 -0.08760933
## as.factor(pdata$lane.number)5
## ENSMUSG00000000131 0.137709189
## ENSMUSG00000000149 0.114049004
## ENSMUSG00000000374 -0.031272733
## ENSMUSG00000000399 0.156240669
## ENSMUSG00000000532 -0.007169364
## ENSMUSG00000000605 0.185393375
## as.factor(pdata$lane.number)6
## ENSMUSG00000000131 0.7274749
## ENSMUSG00000000149 1.0154366
## ENSMUSG00000000374 0.5228153
## ENSMUSG00000000399 0.8271336
## ENSMUSG00000000532 0.3135213
## ENSMUSG00000000605 0.4940291
## as.factor(pdata$lane.number)7
## ENSMUSG00000000131 0.41722714
## ENSMUSG00000000149 0.58175790
## ENSMUSG00000000374 0.23222225
## ENSMUSG00000000399 0.05475727
## ENSMUSG00000000532 0.14890971
## ENSMUSG00000000605 0.27370736
## as.factor(pdata$lane.number)8
## ENSMUSG00000000131 0.28886043
## ENSMUSG00000000149 0.39833581
## ENSMUSG00000000374 0.11020429
## ENSMUSG00000000399 -0.01182217
## ENSMUSG00000000532 -0.02978295
## ENSMUSG00000000605 0.18942807
If the model is adjusted you get the moderated version of the statistic but it doesn’t match rowttests
because it is adjusted
plot(ebayes_limma_adj$t[,2],-tstats_obj$statistic,col=3,
xlab="Moderated T-stat",ylab="T-stat")
abline(c(0,1),lwd=3,col="darkgrey")
Sometimes we want to compare the null model to the alternative model with some additional covariates. Here we have to know which coefficients we want to test in the alternative model.
Suppose we wanted to find lane effects then we can fit a limma model and find which coefficients belong to the lane variable.
mod_lane = model.matrix(~ as.factor(pdata$lane.number))
fit_limma_lane = lmFit(edata,mod_lane)
ebayes_limma_lane = eBayes(fit_limma_lane)
head(ebayes_limma_lane$t)
## (Intercept) as.factor(pdata$lane.number)2
## ENSMUSG00000000131 29.18101 0.108054558
## ENSMUSG00000000149 29.75296 0.288375678
## ENSMUSG00000000374 30.15864 0.063812283
## ENSMUSG00000000399 28.35119 0.068229991
## ENSMUSG00000000532 30.94034 -0.036871536
## ENSMUSG00000000605 32.30420 0.001122656
## as.factor(pdata$lane.number)3
## ENSMUSG00000000131 0.04959779
## ENSMUSG00000000149 0.15549405
## ENSMUSG00000000374 -0.04876393
## ENSMUSG00000000399 -0.14213864
## ENSMUSG00000000532 -0.13320812
## ENSMUSG00000000605 -0.08922013
## as.factor(pdata$lane.number)5
## ENSMUSG00000000131 0.33389895
## ENSMUSG00000000149 0.06002328
## ENSMUSG00000000374 0.09926018
## ENSMUSG00000000399 0.16048443
## ENSMUSG00000000532 0.09886277
## ENSMUSG00000000605 0.27882684
## as.factor(pdata$lane.number)6
## ENSMUSG00000000131 0.9451392
## ENSMUSG00000000149 0.9942322
## ENSMUSG00000000374 0.6735237
## ENSMUSG00000000399 0.8558060
## ENSMUSG00000000532 0.4312304
## ENSMUSG00000000605 0.5987006
## as.factor(pdata$lane.number)7
## ENSMUSG00000000131 0.62359473
## ENSMUSG00000000149 0.54476241
## ENSMUSG00000000374 0.37234955
## ENSMUSG00000000399 0.05530581
## ENSMUSG00000000532 0.26062501
## ENSMUSG00000000605 0.37035652
## as.factor(pdata$lane.number)8
## ENSMUSG00000000131 0.49055391
## ENSMUSG00000000149 0.35466155
## ENSMUSG00000000374 0.24588867
## ENSMUSG00000000399 -0.01369792
## ENSMUSG00000000532 0.07542578
## ENSMUSG00000000605 0.28300845
Then we can get the F-statistics with topTable
top_lane = topTable(ebayes_limma_lane, coef=2:7,
number=dim(edata)[1],sort.by="none")
head(top_lane)
## as.factor.pdata.lane.number.2
## ENSMUSG00000000131 0.0545143060
## ENSMUSG00000000149 0.1454876148
## ENSMUSG00000000374 0.0321937582
## ENSMUSG00000000399 0.0344225239
## ENSMUSG00000000532 -0.0186019566
## ENSMUSG00000000605 0.0005663883
## as.factor.pdata.lane.number.3
## ENSMUSG00000000131 0.02502244
## ENSMUSG00000000149 0.07844787
## ENSMUSG00000000374 -0.02460175
## ENSMUSG00000000399 -0.07170997
## ENSMUSG00000000532 -0.06720446
## ENSMUSG00000000605 -0.04501220
## as.factor.pdata.lane.number.5
## ENSMUSG00000000131 0.16845444
## ENSMUSG00000000149 0.03028218
## ENSMUSG00000000374 0.05007748
## ENSMUSG00000000399 0.08096556
## ENSMUSG00000000532 0.04987698
## ENSMUSG00000000605 0.14067016
## as.factor.pdata.lane.number.6
## ENSMUSG00000000131 0.4768296
## ENSMUSG00000000149 0.5015973
## ENSMUSG00000000374 0.3397976
## ENSMUSG00000000399 0.4317603
## ENSMUSG00000000532 0.2175589
## ENSMUSG00000000605 0.3020488
## as.factor.pdata.lane.number.7
## ENSMUSG00000000131 0.31460805
## ENSMUSG00000000149 0.27483657
## ENSMUSG00000000374 0.18785304
## ENSMUSG00000000399 0.02790218
## ENSMUSG00000000532 0.13148720
## ENSMUSG00000000605 0.18684754
## as.factor.pdata.lane.number.8 AveExpr F
## ENSMUSG00000000131 0.24748800 10.59389 0.23962863
## ENSMUSG00000000149 0.17892932 10.78688 0.23360430
## ENSMUSG00000000374 0.12405261 10.86015 0.12886955
## ENSMUSG00000000399 -0.00691070 10.18493 0.21598161
## ENSMUSG00000000532 0.03805285 11.08784 0.07307893
## ENSMUSG00000000605 0.14277981 11.62820 0.12134214
## P.Value adj.P.Val
## ENSMUSG00000000131 0.9635205 0.9998831
## ENSMUSG00000000149 0.9657594 0.9998831
## ENSMUSG00000000374 0.9927726 0.9998831
## ENSMUSG00000000399 0.9718839 0.9998831
## ENSMUSG00000000532 0.9985087 0.9998831
## ENSMUSG00000000605 0.9938655 0.9998831
This is again the moderated version of the F-statistic we saw earlier
plot(top_lane$F,fstats_obj$statistic,
xlab="Moderated F-statistic",ylab="F-statistic",col=3)
We can also perform the unmoderated nested comparisons in the edge
package, which also has functions for calculating a more powerful odp statistic
edge_study = build_study(edata, grp = as.factor(pdata$lane.number))
de_obj = lrt(edge_study)
qval = qvalueObj(de_obj)
plot(qval$stat,fstats_obj$statistic,col=4,
xlab="F-stat from edge",ylab="F-stat from genefilter")
We can easily adjust for variables by passing arguments to the adj.var
variable.
edge_study2 = build_study(edata, grp = as.factor(pdata$lane.number),
adj.var=pdata$strain)
de_obj2 = lrt(edge_study2)
qval2 = qvalueObj(de_obj2)
plot(qval2$stat,fstats_obj$statistic,col=4,
xlab="F-stat from edge",ylab="F-stat from genefilter")
You can find a lot more information on this model fitting strategy in:
Here is the session information
devtools::session_info()
## setting value
## version R version 3.2.1 (2015-06-18)
## system x86_64, darwin10.8.0
## ui RStudio (0.99.447)
## language (EN)
## collate en_US.UTF-8
## tz America/New_York
##
## package * version date
## acepack 1.3-3.3 2014-11-24
## annotate 1.46.1 2015-07-11
## AnnotationDbi * 1.30.1 2015-04-26
## assertthat 0.1 2013-12-06
## BiasedUrn * 1.06.1 2013-12-29
## Biobase * 2.28.0 2015-04-17
## BiocGenerics * 0.14.0 2015-04-17
## BiocInstaller * 1.18.4 2015-07-22
## BiocParallel 1.2.20 2015-08-07
## biomaRt 2.24.0 2015-04-17
## Biostrings 2.36.3 2015-08-12
## bitops 1.0-6 2013-08-17
## bladderbatch * 1.6.0 2015-08-26
## broom * 0.3.7 2015-05-06
## caTools 1.17.1 2014-09-10
## cluster 2.0.3 2015-07-21
## colorspace 1.2-6 2015-03-11
## corpcor 1.6.8 2015-07-08
## curl 0.9.2 2015-08-08
## DBI * 0.3.1 2014-09-24
## dendextend * 1.1.0 2015-07-31
## DESeq2 * 1.8.1 2015-05-02
## devtools * 1.8.0 2015-05-09
## digest 0.6.8 2014-12-31
## dplyr * 0.4.3 2015-09-01
## edge * 2.1.0 2015-09-06
## evaluate 0.7.2 2015-08-13
## foreign 0.8-66 2015-08-19
## formatR 1.2 2015-04-21
## Formula * 1.2-1 2015-04-07
## futile.logger 1.4.1 2015-04-20
## futile.options 1.0.0 2010-04-06
## gdata 2.17.0 2015-07-04
## genefilter * 1.50.0 2015-04-17
## geneLenDataBase * 1.4.0 2015-09-06
## geneplotter 1.46.0 2015-04-17
## GenomeInfoDb * 1.4.2 2015-08-15
## GenomicAlignments 1.4.1 2015-04-24
## GenomicFeatures 1.20.2 2015-08-14
## GenomicRanges * 1.20.5 2015-06-09
## genstats * 0.1.02 2015-09-05
## ggplot2 * 1.0.1 2015-03-17
## git2r 0.11.0 2015-08-12
## GO.db 3.1.2 2015-09-06
## goseq * 1.20.0 2015-04-17
## gplots * 2.17.0 2015-05-02
## gridExtra 2.0.0 2015-07-14
## gtable 0.1.2 2012-12-05
## gtools 3.5.0 2015-05-29
## highr 0.5 2015-04-21
## HistData * 0.7-5 2014-04-26
## Hmisc * 3.16-0 2015-04-30
## htmltools 0.2.6 2014-09-08
## httr 1.0.0 2015-06-25
## IRanges * 2.2.7 2015-08-09
## KernSmooth 2.23-15 2015-06-29
## knitr * 1.11 2015-08-14
## lambda.r 1.1.7 2015-03-20
## lattice * 0.20-33 2015-07-14
## latticeExtra 0.6-26 2013-08-15
## lazyeval 0.1.10 2015-01-02
## limma * 3.24.15 2015-08-06
## lme4 1.1-9 2015-08-20
## locfit 1.5-9.1 2013-04-20
## magrittr 1.5 2014-11-22
## MASS * 7.3-43 2015-07-16
## Matrix * 1.2-2 2015-07-08
## MatrixEQTL * 2.1.1 2015-02-03
## memoise 0.2.1 2014-04-22
## mgcv * 1.8-7 2015-07-23
## minqa 1.2.4 2014-10-09
## mnormt 1.5-3 2015-05-25
## munsell 0.4.2 2013-07-11
## nlme * 3.1-122 2015-08-19
## nloptr 1.0.4 2014-08-04
## nnet 7.3-10 2015-06-29
## org.Hs.eg.db * 3.1.2 2015-07-17
## plyr 1.8.3 2015-06-12
## preprocessCore * 1.30.0 2015-04-17
## proto 0.3-10 2012-12-22
## psych 1.5.6 2015-07-08
## qvalue * 2.0.0 2015-04-17
## R6 2.1.1 2015-08-19
## RColorBrewer 1.1-2 2014-12-07
## Rcpp * 0.12.0 2015-07-25
## RcppArmadillo * 0.5.400.2.0 2015-08-17
## RCurl 1.95-4.7 2015-06-30
## reshape2 1.4.1 2014-12-06
## rmarkdown 0.7 2015-06-13
## rpart 4.1-10 2015-06-29
## Rsamtools 1.20.4 2015-06-01
## RSkittleBrewer * 1.1 2015-09-05
## RSQLite * 1.0.0 2014-10-25
## rstudioapi 0.3.1 2015-04-07
## rtracklayer 1.28.9 2015-08-19
## rversions 1.0.2 2015-07-13
## S4Vectors * 0.6.5 2015-09-01
## scales 0.3.0 2015-08-25
## snm 1.16.0 2015-04-17
## snpStats * 1.18.0 2015-04-17
## stringi 0.5-5 2015-06-29
## stringr 1.0.0 2015-04-30
## survival * 2.38-3 2015-07-02
## sva * 3.14.0 2015-04-17
## tidyr 0.2.0 2014-12-05
## UsingR * 2.0-5 2015-08-06
## whisker 0.3-2 2013-04-28
## XML 3.98-1.3 2015-06-30
## xml2 0.1.2 2015-09-01
## xtable 1.7-4 2014-09-12
## XVector 0.8.0 2015-04-17
## yaml 2.1.13 2014-06-12
## zlibbioc 1.14.0 2015-04-17
## source
## CRAN (R 3.2.0)
## Bioconductor
## Bioconductor
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## CRAN (R 3.2.0)
## Bioconductor
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## Bioconductor
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.2)
## Github (jdstorey/edge@a1947b5)
## CRAN (R 3.2.2)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## local
## CRAN (R 3.2.0)
## CRAN (R 3.2.2)
## Bioconductor
## Bioconductor
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## Bioconductor
## CRAN (R 3.2.1)
## CRAN (R 3.2.2)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## Bioconductor
## CRAN (R 3.2.2)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## Bioconductor
## CRAN (R 3.2.1)
## Bioconductor
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## Bioconductor
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.1)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.1)
## Bioconductor
## Github (alyssafrazee/RSkittleBrewer@0a96a20)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## Bioconductor
## CRAN (R 3.2.1)
## Bioconductor
## CRAN (R 3.2.2)
## Bioconductor
## Bioconductor
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## Bioconductor
## CRAN (R 3.2.0)
## CRAN (R 3.2.2)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.2)
## CRAN (R 3.2.0)
## Bioconductor
## CRAN (R 3.2.0)
## Bioconductor
It is also useful to compile the time the document was processed. This document was processed on: 2015-09-06.