Calculating statistics

Dependencies
Download the data
Transform the data
Calculate t- or F-statistics rapidly
Fit many statistics with limma
Fit many adjusted statistics with limma
Calculating a nested model comparison with limma
Calculating a nested comparison with edge
More information
Session information

Dependencies

This document depends on the following packages:

  library(devtools)
  library(Biobase)
  library(limma)
  library(edge)
  library(genefilter)

To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE from the chunk.)

install.packages(c("devtools"))
source("http://www.bioconductor.org/biocLite.R")
biocLite(c("Biobase","limma","genefilter","jdstorey/edge"))

Download the data

Here we are going to use some data from the paper Evaluating gene expression in C57BL/6J and DBA/2J mouse striatum using RNA-Seq and microarrays. that is a comparative RNA-seq analysis of different mouse strains.

con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bottomly_eset.RData")
load(file=con)
close(con)
bot = bottomly.eset
pdata=pData(bot)
edata=as.matrix(exprs(bot))
fdata = fData(bot)
ls()

## [1] "bot"           "bottomly.eset" "con"           "edata"        
## [5] "fdata"         "pdata"         "tropical"

Transform the data

Here we will transform the data and remove lowly expressed genes.

edata = log2(as.matrix(edata) + 1)
edata = edata[rowMeans(edata) > 10, ]

Calculate t- or F-statistics rapidly

The genefilter package lets you compute statistics rapidly for very simple cases (two or multi-group) comparisons. These are not moderated in any way.

tstats_obj = rowttests(edata,pdata$strain)
names(tstats_obj)

## [1] "statistic" "dm"        "p.value"

hist(tstats_obj$statistic,col=2)

For multi-group comparisons use the F-statistic

fstats_obj = rowFtests(edata,as.factor(pdata$lane.number))
names(fstats_obj)

## [1] "statistic" "p.value"

hist(fstats_obj$statistic,col=2)

Fit many statistics with limma

This approach fits many moderated statistics simultaneously

mod = model.matrix(~ pdata$strain)
fit_limma = lmFit(edata,mod)
ebayes_limma = eBayes(fit_limma)
head(ebayes_limma$t)

##                    (Intercept) pdata$strainDBA/2J
## ENSMUSG00000000131    61.56232         1.48876396
## ENSMUSG00000000149    63.84978        -0.09514821
## ENSMUSG00000000374    63.48896         1.00196514
## ENSMUSG00000000399    60.07990         0.19593890
## ENSMUSG00000000532    64.97678         0.80640805
## ENSMUSG00000000605    68.18274         0.79136446

If the model is unadjusted you get the moderated version of the statistic from rowttests

plot(ebayes_limma$t[,2],-tstats_obj$statistic,col=4,
     xlab="Moderated T-stat",ylab="T-stat")
abline(c(0,1),col="darkgrey",lwd=3)

Fit many adjusted statistics with limma

Here we adjust for the lane number, now the test-statistic for the strain is adjusted for the lane number (a surrogate for a batch effect).

mod_adj = model.matrix(~ pdata$strain + as.factor(pdata$lane.number))
fit_limma_adj = lmFit(edata,mod_adj)
ebayes_limma_adj = eBayes(fit_limma_adj)
head(ebayes_limma_adj$t)

##                    (Intercept) pdata$strainDBA/2J
## ENSMUSG00000000131    27.42608        0.993341906
## ENSMUSG00000000149    28.30319       -0.302293604
## ENSMUSG00000000374    28.43334        0.684161612
## ENSMUSG00000000399    26.89729       -0.007509614
## ENSMUSG00000000532    29.20894        0.552296098
## ENSMUSG00000000605    30.52908        0.450401742
##                    as.factor(pdata$lane.number)2
## ENSMUSG00000000131                   0.106103723
## ENSMUSG00000000149                   0.283169296
## ENSMUSG00000000374                   0.062660205
## ENSMUSG00000000399                   0.066998156
## ENSMUSG00000000532                  -0.036205851
## ENSMUSG00000000605                   0.001102388
##                    as.factor(pdata$lane.number)3
## ENSMUSG00000000131                    0.04870234
## ENSMUSG00000000149                    0.15268674
## ENSMUSG00000000374                   -0.04788353
## ENSMUSG00000000399                   -0.13957244
## ENSMUSG00000000532                   -0.13080316
## ENSMUSG00000000605                   -0.08760933
##                    as.factor(pdata$lane.number)5
## ENSMUSG00000000131                   0.137709189
## ENSMUSG00000000149                   0.114049004
## ENSMUSG00000000374                  -0.031272733
## ENSMUSG00000000399                   0.156240669
## ENSMUSG00000000532                  -0.007169364
## ENSMUSG00000000605                   0.185393375
##                    as.factor(pdata$lane.number)6
## ENSMUSG00000000131                     0.7274749
## ENSMUSG00000000149                     1.0154366
## ENSMUSG00000000374                     0.5228153
## ENSMUSG00000000399                     0.8271336
## ENSMUSG00000000532                     0.3135213
## ENSMUSG00000000605                     0.4940291
##                    as.factor(pdata$lane.number)7
## ENSMUSG00000000131                    0.41722714
## ENSMUSG00000000149                    0.58175790
## ENSMUSG00000000374                    0.23222225
## ENSMUSG00000000399                    0.05475727
## ENSMUSG00000000532                    0.14890971
## ENSMUSG00000000605                    0.27370736
##                    as.factor(pdata$lane.number)8
## ENSMUSG00000000131                    0.28886043
## ENSMUSG00000000149                    0.39833581
## ENSMUSG00000000374                    0.11020429
## ENSMUSG00000000399                   -0.01182217
## ENSMUSG00000000532                   -0.02978295
## ENSMUSG00000000605                    0.18942807

If the model is adjusted you get the moderated version of the statistic but it doesn’t match rowttests because it is adjusted

plot(ebayes_limma_adj$t[,2],-tstats_obj$statistic,col=3,
     xlab="Moderated T-stat",ylab="T-stat")
abline(c(0,1),lwd=3,col="darkgrey")

Calculating a nested model comparison with limma

Sometimes we want to compare the null model to the alternative model with some additional covariates. Here we have to know which coefficients we want to test in the alternative model.

Suppose we wanted to find lane effects then we can fit a limma model and find which coefficients belong to the lane variable.

mod_lane = model.matrix(~ as.factor(pdata$lane.number))
fit_limma_lane = lmFit(edata,mod_lane)
ebayes_limma_lane = eBayes(fit_limma_lane) 
head(ebayes_limma_lane$t)

##                    (Intercept) as.factor(pdata$lane.number)2
## ENSMUSG00000000131    29.18101                   0.108054558
## ENSMUSG00000000149    29.75296                   0.288375678
## ENSMUSG00000000374    30.15864                   0.063812283
## ENSMUSG00000000399    28.35119                   0.068229991
## ENSMUSG00000000532    30.94034                  -0.036871536
## ENSMUSG00000000605    32.30420                   0.001122656
##                    as.factor(pdata$lane.number)3
## ENSMUSG00000000131                    0.04959779
## ENSMUSG00000000149                    0.15549405
## ENSMUSG00000000374                   -0.04876393
## ENSMUSG00000000399                   -0.14213864
## ENSMUSG00000000532                   -0.13320812
## ENSMUSG00000000605                   -0.08922013
##                    as.factor(pdata$lane.number)5
## ENSMUSG00000000131                    0.33389895
## ENSMUSG00000000149                    0.06002328
## ENSMUSG00000000374                    0.09926018
## ENSMUSG00000000399                    0.16048443
## ENSMUSG00000000532                    0.09886277
## ENSMUSG00000000605                    0.27882684
##                    as.factor(pdata$lane.number)6
## ENSMUSG00000000131                     0.9451392
## ENSMUSG00000000149                     0.9942322
## ENSMUSG00000000374                     0.6735237
## ENSMUSG00000000399                     0.8558060
## ENSMUSG00000000532                     0.4312304
## ENSMUSG00000000605                     0.5987006
##                    as.factor(pdata$lane.number)7
## ENSMUSG00000000131                    0.62359473
## ENSMUSG00000000149                    0.54476241
## ENSMUSG00000000374                    0.37234955
## ENSMUSG00000000399                    0.05530581
## ENSMUSG00000000532                    0.26062501
## ENSMUSG00000000605                    0.37035652
##                    as.factor(pdata$lane.number)8
## ENSMUSG00000000131                    0.49055391
## ENSMUSG00000000149                    0.35466155
## ENSMUSG00000000374                    0.24588867
## ENSMUSG00000000399                   -0.01369792
## ENSMUSG00000000532                    0.07542578
## ENSMUSG00000000605                    0.28300845

Then we can get the F-statistics with topTable

top_lane = topTable(ebayes_limma_lane, coef=2:7,
                    number=dim(edata)[1],sort.by="none")
head(top_lane)

##                    as.factor.pdata.lane.number.2
## ENSMUSG00000000131                  0.0545143060
## ENSMUSG00000000149                  0.1454876148
## ENSMUSG00000000374                  0.0321937582
## ENSMUSG00000000399                  0.0344225239
## ENSMUSG00000000532                 -0.0186019566
## ENSMUSG00000000605                  0.0005663883
##                    as.factor.pdata.lane.number.3
## ENSMUSG00000000131                    0.02502244
## ENSMUSG00000000149                    0.07844787
## ENSMUSG00000000374                   -0.02460175
## ENSMUSG00000000399                   -0.07170997
## ENSMUSG00000000532                   -0.06720446
## ENSMUSG00000000605                   -0.04501220
##                    as.factor.pdata.lane.number.5
## ENSMUSG00000000131                    0.16845444
## ENSMUSG00000000149                    0.03028218
## ENSMUSG00000000374                    0.05007748
## ENSMUSG00000000399                    0.08096556
## ENSMUSG00000000532                    0.04987698
## ENSMUSG00000000605                    0.14067016
##                    as.factor.pdata.lane.number.6
## ENSMUSG00000000131                     0.4768296
## ENSMUSG00000000149                     0.5015973
## ENSMUSG00000000374                     0.3397976
## ENSMUSG00000000399                     0.4317603
## ENSMUSG00000000532                     0.2175589
## ENSMUSG00000000605                     0.3020488
##                    as.factor.pdata.lane.number.7
## ENSMUSG00000000131                    0.31460805
## ENSMUSG00000000149                    0.27483657
## ENSMUSG00000000374                    0.18785304
## ENSMUSG00000000399                    0.02790218
## ENSMUSG00000000532                    0.13148720
## ENSMUSG00000000605                    0.18684754
##                    as.factor.pdata.lane.number.8  AveExpr          F
## ENSMUSG00000000131                    0.24748800 10.59389 0.23962863
## ENSMUSG00000000149                    0.17892932 10.78688 0.23360430
## ENSMUSG00000000374                    0.12405261 10.86015 0.12886955
## ENSMUSG00000000399                   -0.00691070 10.18493 0.21598161
## ENSMUSG00000000532                    0.03805285 11.08784 0.07307893
## ENSMUSG00000000605                    0.14277981 11.62820 0.12134214
##                      P.Value adj.P.Val
## ENSMUSG00000000131 0.9635205 0.9998831
## ENSMUSG00000000149 0.9657594 0.9998831
## ENSMUSG00000000374 0.9927726 0.9998831
## ENSMUSG00000000399 0.9718839 0.9998831
## ENSMUSG00000000532 0.9985087 0.9998831
## ENSMUSG00000000605 0.9938655 0.9998831

This is again the moderated version of the F-statistic we saw earlier

plot(top_lane$F,fstats_obj$statistic,
     xlab="Moderated F-statistic",ylab="F-statistic",col=3)

Calculating a nested comparison with edge

We can also perform the unmoderated nested comparisons in the edge package, which also has functions for calculating a more powerful odp statistic

edge_study = build_study(edata, grp = as.factor(pdata$lane.number))
de_obj = lrt(edge_study)
qval = qvalueObj(de_obj)
plot(qval$stat,fstats_obj$statistic,col=4,
      xlab="F-stat from edge",ylab="F-stat from genefilter")

We can easily adjust for variables by passing arguments to the adj.var variable.

edge_study2 = build_study(edata, grp = as.factor(pdata$lane.number),
                        adj.var=pdata$strain)
de_obj2 = lrt(edge_study2)
qval2 = qvalueObj(de_obj2)
plot(qval2$stat,fstats_obj$statistic,col=4,
      xlab="F-stat from edge",ylab="F-stat from genefilter")

More information

You can find a lot more information on this model fitting strategy in:

The limma paper
The limma vignette

Session information

Here is the session information

devtools::session_info()

##  setting  value                       
##  version  R version 3.2.1 (2015-06-18)
##  system   x86_64, darwin10.8.0        
##  ui       RStudio (0.99.447)          
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  tz       America/New_York            
## 
##  package           * version     date      
##  acepack             1.3-3.3     2014-11-24
##  annotate            1.46.1      2015-07-11
##  AnnotationDbi     * 1.30.1      2015-04-26
##  assertthat          0.1         2013-12-06
##  BiasedUrn         * 1.06.1      2013-12-29
##  Biobase           * 2.28.0      2015-04-17
##  BiocGenerics      * 0.14.0      2015-04-17
##  BiocInstaller     * 1.18.4      2015-07-22
##  BiocParallel        1.2.20      2015-08-07
##  biomaRt             2.24.0      2015-04-17
##  Biostrings          2.36.3      2015-08-12
##  bitops              1.0-6       2013-08-17
##  bladderbatch      * 1.6.0       2015-08-26
##  broom             * 0.3.7       2015-05-06
##  caTools             1.17.1      2014-09-10
##  cluster             2.0.3       2015-07-21
##  colorspace          1.2-6       2015-03-11
##  corpcor             1.6.8       2015-07-08
##  curl                0.9.2       2015-08-08
##  DBI               * 0.3.1       2014-09-24
##  dendextend        * 1.1.0       2015-07-31
##  DESeq2            * 1.8.1       2015-05-02
##  devtools          * 1.8.0       2015-05-09
##  digest              0.6.8       2014-12-31
##  dplyr             * 0.4.3       2015-09-01
##  edge              * 2.1.0       2015-09-06
##  evaluate            0.7.2       2015-08-13
##  foreign             0.8-66      2015-08-19
##  formatR             1.2         2015-04-21
##  Formula           * 1.2-1       2015-04-07
##  futile.logger       1.4.1       2015-04-20
##  futile.options      1.0.0       2010-04-06
##  gdata               2.17.0      2015-07-04
##  genefilter        * 1.50.0      2015-04-17
##  geneLenDataBase   * 1.4.0       2015-09-06
##  geneplotter         1.46.0      2015-04-17
##  GenomeInfoDb      * 1.4.2       2015-08-15
##  GenomicAlignments   1.4.1       2015-04-24
##  GenomicFeatures     1.20.2      2015-08-14
##  GenomicRanges     * 1.20.5      2015-06-09
##  genstats          * 0.1.02      2015-09-05
##  ggplot2           * 1.0.1       2015-03-17
##  git2r               0.11.0      2015-08-12
##  GO.db               3.1.2       2015-09-06
##  goseq             * 1.20.0      2015-04-17
##  gplots            * 2.17.0      2015-05-02
##  gridExtra           2.0.0       2015-07-14
##  gtable              0.1.2       2012-12-05
##  gtools              3.5.0       2015-05-29
##  highr               0.5         2015-04-21
##  HistData          * 0.7-5       2014-04-26
##  Hmisc             * 3.16-0      2015-04-30
##  htmltools           0.2.6       2014-09-08
##  httr                1.0.0       2015-06-25
##  IRanges           * 2.2.7       2015-08-09
##  KernSmooth          2.23-15     2015-06-29
##  knitr             * 1.11        2015-08-14
##  lambda.r            1.1.7       2015-03-20
##  lattice           * 0.20-33     2015-07-14
##  latticeExtra        0.6-26      2013-08-15
##  lazyeval            0.1.10      2015-01-02
##  limma             * 3.24.15     2015-08-06
##  lme4                1.1-9       2015-08-20
##  locfit              1.5-9.1     2013-04-20
##  magrittr            1.5         2014-11-22
##  MASS              * 7.3-43      2015-07-16
##  Matrix            * 1.2-2       2015-07-08
##  MatrixEQTL        * 2.1.1       2015-02-03
##  memoise             0.2.1       2014-04-22
##  mgcv              * 1.8-7       2015-07-23
##  minqa               1.2.4       2014-10-09
##  mnormt              1.5-3       2015-05-25
##  munsell             0.4.2       2013-07-11
##  nlme              * 3.1-122     2015-08-19
##  nloptr              1.0.4       2014-08-04
##  nnet                7.3-10      2015-06-29
##  org.Hs.eg.db      * 3.1.2       2015-07-17
##  plyr                1.8.3       2015-06-12
##  preprocessCore    * 1.30.0      2015-04-17
##  proto               0.3-10      2012-12-22
##  psych               1.5.6       2015-07-08
##  qvalue            * 2.0.0       2015-04-17
##  R6                  2.1.1       2015-08-19
##  RColorBrewer        1.1-2       2014-12-07
##  Rcpp              * 0.12.0      2015-07-25
##  RcppArmadillo     * 0.5.400.2.0 2015-08-17
##  RCurl               1.95-4.7    2015-06-30
##  reshape2            1.4.1       2014-12-06
##  rmarkdown           0.7         2015-06-13
##  rpart               4.1-10      2015-06-29
##  Rsamtools           1.20.4      2015-06-01
##  RSkittleBrewer    * 1.1         2015-09-05
##  RSQLite           * 1.0.0       2014-10-25
##  rstudioapi          0.3.1       2015-04-07
##  rtracklayer         1.28.9      2015-08-19
##  rversions           1.0.2       2015-07-13
##  S4Vectors         * 0.6.5       2015-09-01
##  scales              0.3.0       2015-08-25
##  snm                 1.16.0      2015-04-17
##  snpStats          * 1.18.0      2015-04-17
##  stringi             0.5-5       2015-06-29
##  stringr             1.0.0       2015-04-30
##  survival          * 2.38-3      2015-07-02
##  sva               * 3.14.0      2015-04-17
##  tidyr               0.2.0       2014-12-05
##  UsingR            * 2.0-5       2015-08-06
##  whisker             0.3-2       2013-04-28
##  XML                 3.98-1.3    2015-06-30
##  xml2                0.1.2       2015-09-01
##  xtable              1.7-4       2014-09-12
##  XVector             0.8.0       2015-04-17
##  yaml                2.1.13      2014-06-12
##  zlibbioc            1.14.0      2015-04-17
##  source                                      
##  CRAN (R 3.2.0)                              
##  Bioconductor                                
##  Bioconductor                                
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.0)                              
##  Bioconductor                                
##  Bioconductor                                
##  Bioconductor                                
##  Bioconductor                                
##  Bioconductor                                
##  Bioconductor                                
##  CRAN (R 3.2.0)                              
##  Bioconductor                                
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  Bioconductor                                
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.2)                              
##  Github (jdstorey/edge@a1947b5)              
##  CRAN (R 3.2.2)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  Bioconductor                                
##  Bioconductor                                
##  Bioconductor                                
##  Bioconductor                                
##  Bioconductor                                
##  Bioconductor                                
##  Bioconductor                                
##  local                                       
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.2)                              
##  Bioconductor                                
##  Bioconductor                                
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  Bioconductor                                
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.2)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.0)                              
##  Bioconductor                                
##  CRAN (R 3.2.2)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  Bioconductor                                
##  CRAN (R 3.2.1)                              
##  Bioconductor                                
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  Bioconductor                                
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.1)                              
##  Bioconductor                                
##  Github (alyssafrazee/RSkittleBrewer@0a96a20)
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.0)                              
##  Bioconductor                                
##  CRAN (R 3.2.1)                              
##  Bioconductor                                
##  CRAN (R 3.2.2)                              
##  Bioconductor                                
##  Bioconductor                                
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  Bioconductor                                
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.2)                              
##  CRAN (R 3.2.0)                              
##  CRAN (R 3.2.1)                              
##  CRAN (R 3.2.2)                              
##  CRAN (R 3.2.0)                              
##  Bioconductor                                
##  CRAN (R 3.2.0)                              
##  Bioconductor

It is also useful to compile the time the document was processed. This document was processed on: 2015-09-06.