## Dependencies

This document depends on the following packages:

  library(devtools)
library(Biobase)
library(snpStats)
library(broom)
library(MASS)
library(DESeq2)

To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE from the chunk.)

install.packages(c("devtools","broom","MASS"))
source("http://www.bioconductor.org/biocLite.R")
biocLite(c("Biobase","snpStats","DESeq2"))

## Logistic regression

Here we use example SNP data from a case-control genome-wide association study Load an example data set and take a smaller subset of samples for computational efficiency

data(for.exercise)
use <- seq(1, ncol(snps.10), 10)
sub.10 <- snps.10[,use]

Calculate the PCs

xxmat <- xxt(sub.10, correct.for.missing=FALSE)
evv <- eigen(xxmat, symmetric=TRUE)
pcs <- evv$vectors[,1:5] ### A single logistic regression First we do an unadjusted logistic regression assuming an additive model.The coefficient is the change in log-odds for a one unit decrease (because homozygous major allele is coded 1) in the number of copies of the minor allele. snpdata = sub.10@.Data status = subject.support$cc
snp1 = as.numeric(snpdata[,1])
snp1[snp1==0] = NA
glm1 = glm(status ~ snp1,family="binomial")
tidy(glm1)
##          term   estimate std.error  statistic   p.value
## 1 (Intercept)  0.1122026 0.2322000  0.4832155 0.6289427
## 2        snp1 -0.1010806 0.2012159 -0.5023489 0.6154221

We can also easily code in other models. For example suppose we want to code a dominant model (so only an association of risk with the two copies of the common allele, now the coefficient on snp1_dom is the increase in log odds associated with two copies of the major allele).

snp1_dom = (snp1 == 1)
glm1_dom = glm(status ~ snp1_dom,family="binomial")
tidy(glm1_dom)
##           term   estimate std.error  statistic   p.value
## 1  (Intercept) -0.1112256 0.1927477 -0.5770531 0.5639036
## 2 snp1_domTRUE  0.1248313 0.2041740  0.6113966 0.5409371
tidy(glm1)
##          term   estimate std.error  statistic   p.value
## 1 (Intercept)  0.1122026 0.2322000  0.4832155 0.6289427
## 2        snp1 -0.1010806 0.2012159 -0.5023489 0.6154221

We can also easily adjust for other variables.

glm2 = glm(status ~ snp1 + pcs[,1:5],family="binomial")
tidy(glm2)
##          term    estimate std.error   statistic     p.value
## 1 (Intercept)  0.08993531 0.2341663  0.38406597 0.700929553
## 2        snp1 -0.08072045 0.2029343 -0.39776640 0.690802386
## 3 pcs[, 1:5]1  5.11070256 2.0261844  2.52232842 0.011658081
## 4 pcs[, 1:5]2  0.16534907 2.0365104  0.08119235 0.935288981
## 5 pcs[, 1:5]3  2.40608659 2.0434133  1.17748405 0.239002361
## 6 pcs[, 1:5]4 -5.39830830 2.0466337 -2.63765245 0.008348209
## 7 pcs[, 1:5]5  0.58408419 2.0355288  0.28694470 0.774154663

### Fit many glms at once

For logistic regression modeling of many SNPs at once we can use the snps.rhs.tests function which computes an asymptotic chi-squared statistic. This isnâ€™t quite the same thing as the F-statistics we have been calculating but can be used in the same way for significance calculations.

glm_all = snp.rhs.tests(status ~ 1,snp.data=sub.10)
slotNames(glm_all)
## [1] "snp.names" "var.names" "chisq"     "df"        "N"
qq.chisq(chi.squared(glm_all),df=1)

##           N     omitted      lambda
## 2851.000000    0.000000    1.657571

We can also adjust for variables like principal components

glm_all_adj = snp.rhs.tests(status ~ pcs,snp.data=sub.10)
qq.chisq(chi.squared(glm_all_adj),df=1)

##            N      omitted       lambda
## 2851.0000000    0.0000000    0.9770849

## Poisson/negative binomial regression

Here we are going to use some data from the paper Evaluating gene expression in C57BL/6J and DBA/2J mouse striatum using RNA-Seq and microarrays. that is a comparative RNA-seq analysis of different mouse strains.

con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bottomly_eset.RData")
close(con)
bot = bottomly.eset
pdata=pData(bot)
edata=as.matrix(exprs(bot))
fdata = fData(bot)
ls()
##  [1] "bot"             "bottomly.eset"   "con"
##  [4] "edata"           "evv"             "fdata"
## [10] "glm1_dom"        "glm2"            "pcs"
## [13] "pdata"           "snp.support"     "snp1"
## [16] "snp1_dom"        "snpdata"         "snps.10"
## [19] "status"          "sub.10"          "subject.support"
## [22] "tropical"        "use"             "xxmat"

### Transform the data

Here we remove lowly expressed genes but we will leave them as counts

edata = edata[rowMeans(edata) > 10, ]

### A single Poisson regression

The coefficient in this case is the increase in the log number of counts comparing one strain to the other. If you exponentiate the coefficients then $$\exp(intercept)$$ is expected count for C57BL/6J and $$\exp(intercept)\exp(strainDBA/2J)$$ is the expected count for the strain DBA/2J.

glm3 = glm(edata[1, ] ~ pdata$strain,family="poisson") tidy(glm3) ## term estimate std.error statistic p.value ## 1 (Intercept) 6.23988591 0.01396450 446.839073 0.000000000 ## 2 pdata$strainDBA/2J 0.04982967 0.01907011   2.612972 0.008975864

You can also fit a negative binomial regression one at a time in R with:

glm.nb1 = glm.nb(edata[1, ] ~ pdata$strain) tidy(glm.nb1) ## term estimate std.error statistic p.value ## 1 (Intercept) 6.23988591 0.1205228 51.7735102 0.0000000 ## 2 pdata$strainDBA/2J 0.04982967 0.1665002  0.2992768 0.7647288

### Multiple negative binomial regressions

We can use the DESeq2 package to perform many (moderated) negative binomial regressions at once. We first need to create a DESeq data set.

de = DESeqDataSetFromMatrix(edata, pdata, ~strain)
glm_all_nb = DESeq(de)
result_nb = results(glm_all_nb)
hist(result_nb\$stat)

You can find a lot more information on this model fitting strategy in:

## Session information

Here is the session information

devtools::session_info()
##  setting  value
##  version  R version 3.2.1 (2015-06-18)
##  system   x86_64, darwin10.8.0
##  ui       RStudio (0.99.447)
##  language (EN)
##  collate  en_US.UTF-8
##  tz       America/New_York
##
##  package           * version     date
##  acepack             1.3-3.3     2014-11-24
##  annotate            1.46.1      2015-07-11
##  AnnotationDbi     * 1.30.1      2015-04-26
##  assertthat          0.1         2013-12-06
##  BiasedUrn         * 1.06.1      2013-12-29
##  Biobase           * 2.28.0      2015-04-17
##  BiocGenerics      * 0.14.0      2015-04-17
##  BiocInstaller     * 1.18.4      2015-07-22
##  BiocParallel        1.2.20      2015-08-07
##  biomaRt             2.24.0      2015-04-17
##  Biostrings          2.36.3      2015-08-12
##  bitops              1.0-6       2013-08-17
##  broom             * 0.3.7       2015-05-06
##  caTools             1.17.1      2014-09-10
##  cluster             2.0.3       2015-07-21
##  colorspace          1.2-6       2015-03-11
##  corpcor             1.6.8       2015-07-08
##  curl                0.9.2       2015-08-08
##  DBI               * 0.3.1       2014-09-24
##  dendextend        * 1.1.0       2015-07-31
##  DESeq2            * 1.8.1       2015-05-02
##  devtools          * 1.8.0       2015-05-09
##  digest              0.6.8       2014-12-31
##  dplyr             * 0.4.3       2015-09-01
##  edge              * 2.1.0       2015-09-06
##  evaluate            0.7.2       2015-08-13
##  foreign             0.8-66      2015-08-19
##  formatR             1.2         2015-04-21
##  Formula           * 1.2-1       2015-04-07
##  futile.logger       1.4.1       2015-04-20
##  futile.options      1.0.0       2010-04-06
##  gdata               2.17.0      2015-07-04
##  genefilter        * 1.50.0      2015-04-17
##  geneLenDataBase   * 1.4.0       2015-09-06
##  geneplotter         1.46.0      2015-04-17
##  GenomeInfoDb      * 1.4.2       2015-08-15
##  GenomicAlignments   1.4.1       2015-04-24
##  GenomicFeatures     1.20.2      2015-08-14
##  GenomicRanges     * 1.20.5      2015-06-09
##  genstats          * 0.1.02      2015-09-05
##  ggplot2           * 1.0.1       2015-03-17
##  git2r               0.11.0      2015-08-12
##  GO.db               3.1.2       2015-09-06
##  goseq             * 1.20.0      2015-04-17
##  gplots            * 2.17.0      2015-05-02
##  gridExtra           2.0.0       2015-07-14
##  gtable              0.1.2       2012-12-05
##  gtools              3.5.0       2015-05-29
##  highr               0.5         2015-04-21
##  HistData          * 0.7-5       2014-04-26
##  Hmisc             * 3.16-0      2015-04-30
##  htmltools           0.2.6       2014-09-08
##  httr                1.0.0       2015-06-25
##  IRanges           * 2.2.7       2015-08-09
##  KernSmooth          2.23-15     2015-06-29
##  knitr             * 1.11        2015-08-14
##  lambda.r            1.1.7       2015-03-20
##  lattice           * 0.20-33     2015-07-14
##  latticeExtra        0.6-26      2013-08-15
##  lazyeval            0.1.10      2015-01-02
##  limma             * 3.24.15     2015-08-06
##  lme4                1.1-9       2015-08-20
##  locfit              1.5-9.1     2013-04-20
##  magrittr            1.5         2014-11-22
##  MASS              * 7.3-43      2015-07-16
##  Matrix            * 1.2-2       2015-07-08
##  MatrixEQTL        * 2.1.1       2015-02-03
##  memoise             0.2.1       2014-04-22
##  mgcv              * 1.8-7       2015-07-23
##  minqa               1.2.4       2014-10-09
##  mnormt              1.5-3       2015-05-25
##  munsell             0.4.2       2013-07-11
##  nlme              * 3.1-122     2015-08-19
##  nloptr              1.0.4       2014-08-04
##  nnet                7.3-10      2015-06-29
##  org.Hs.eg.db      * 3.1.2       2015-07-17
##  plyr                1.8.3       2015-06-12
##  preprocessCore    * 1.30.0      2015-04-17
##  proto               0.3-10      2012-12-22
##  psych               1.5.6       2015-07-08
##  qvalue            * 2.0.0       2015-04-17
##  R6                  2.1.1       2015-08-19
##  RColorBrewer        1.1-2       2014-12-07
##  Rcpp              * 0.12.0      2015-07-25
##  RCurl               1.95-4.7    2015-06-30
##  reshape2            1.4.1       2014-12-06
##  rmarkdown           0.7         2015-06-13
##  rpart               4.1-10      2015-06-29
##  Rsamtools           1.20.4      2015-06-01
##  RSkittleBrewer    * 1.1         2015-09-05
##  RSQLite           * 1.0.0       2014-10-25
##  rstudioapi          0.3.1       2015-04-07
##  rtracklayer         1.28.9      2015-08-19
##  rversions           1.0.2       2015-07-13
##  S4Vectors         * 0.6.5       2015-09-01
##  scales              0.3.0       2015-08-25
##  snm                 1.16.0      2015-04-17
##  snpStats          * 1.18.0      2015-04-17
##  stringi             0.5-5       2015-06-29
##  stringr             1.0.0       2015-04-30
##  survival          * 2.38-3      2015-07-02
##  sva               * 3.14.0      2015-04-17
##  tidyr               0.2.0       2014-12-05
##  UsingR            * 2.0-5       2015-08-06
##  whisker             0.3-2       2013-04-28
##  XML                 3.98-1.3    2015-06-30
##  xml2                0.1.2       2015-09-01
##  xtable              1.7-4       2014-09-12
##  XVector             0.8.0       2015-04-17
##  yaml                2.1.13      2014-06-12
##  zlibbioc            1.14.0      2015-04-17
##  source
##  CRAN (R 3.2.0)
##  Bioconductor
##  Bioconductor
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.0)
##  Bioconductor
##  Bioconductor
##  Bioconductor
##  Bioconductor
##  Bioconductor
##  Bioconductor
##  CRAN (R 3.2.0)
##  Bioconductor
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  Bioconductor
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.2)
##  Github (jdstorey/edge@a1947b5)
##  CRAN (R 3.2.2)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  Bioconductor
##  Bioconductor
##  Bioconductor
##  Bioconductor
##  Bioconductor
##  Bioconductor
##  Bioconductor
##  local
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.2)
##  Bioconductor
##  Bioconductor
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  Bioconductor
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.2)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.0)
##  Bioconductor
##  CRAN (R 3.2.2)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  Bioconductor
##  CRAN (R 3.2.1)
##  Bioconductor
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  Bioconductor
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.1)
##  Bioconductor
##  Github (alyssafrazee/RSkittleBrewer@0a96a20)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.0)
##  Bioconductor
##  CRAN (R 3.2.1)
##  Bioconductor
##  CRAN (R 3.2.2)
##  Bioconductor
##  Bioconductor
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  Bioconductor
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.2)
##  CRAN (R 3.2.0)
##  CRAN (R 3.2.1)
##  CRAN (R 3.2.2)
##  CRAN (R 3.2.0)
##  Bioconductor
##  CRAN (R 3.2.0)
##  Bioconductor

It is also useful to compile the time the document was processed. This document was processed on: 2015-09-06.