Dependencies

This document depends on the following packages:

  library(devtools)
  library(Biobase)
  library(broom)

To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE from the chunk.)

install.packages(c("devtools","broom"))
source("http://www.bioconductor.org/biocLite.R")
biocLite(c("Biobase"))

Download the data

Here we are going to use some data from the paper Detection of redundant fusion transcripts as biomarkers or disease-specific therapeutic targets in breast cancer. that uses data from different normal human tissues (called the Illumina BodyMap data).

con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData")
load(file=con)
close(con)
bm = bodymap.eset
pdata=pData(bm)
edata=as.data.frame(exprs(bm))
fdata = fData(bm)
ls()
## [1] "bm"           "bodymap.eset" "con"          "edata"       
## [5] "fdata"        "pdata"        "tropical"

Fit a simple linear regression

Here we regress the counts for the first gene on the age of the people from whom the tissue was drawn.

edata = as.matrix(edata)
lm1 = lm(edata[1,] ~ pdata$age)
tidy(lm1)
##          term   estimate  std.error statistic      p.value
## 1 (Intercept) 2187.90554 402.998816  5.429062 8.884114e-05
## 2   pdata$age  -23.24999   6.390269 -3.638343 2.686225e-03

Is fitting the line a good idea?

Visual diagnostics are often some of the most helpful

plot(pdata$age,edata[1,], col=1)
abline(lm1$coeff[1],lm1$coeff[2], col=2,lwd=3)

What about categorical variables?

pdata$gender
##  [1] F    M    F    F    F    M    F    M    M    F    <NA> <NA> <NA> F   
## [15] M    M    M    F    M   
## Levels: F M
table(pdata$gender)
## 
## F M 
## 8 8

Visualize the difference between genders

boxplot(edata[1,] ~ pdata$gender)
points(edata[1,] ~ jitter(as.numeric(pdata$gender)),
       col=as.numeric(pdata$gender))

“dummy” variables

dummy_m = pdata$gender=="M"
dummy_m
##  [1] FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE    NA
## [12]    NA    NA FALSE  TRUE  TRUE  TRUE FALSE  TRUE
dummy_f = pdata$gender=="F"
dummy_f
##  [1]  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE    NA
## [12]    NA    NA  TRUE FALSE FALSE FALSE  TRUE FALSE

R’s modeling formulae do this for you automatically

lm2 = lm(edata[1,] ~ pdata$gender)
tidy(lm2)
##            term estimate std.error  statistic     p.value
## 1   (Intercept)  837.000  228.8055  3.6581292 0.002582905
## 2 pdata$genderM -105.625  323.5798 -0.3264264 0.748930374

Peaking “under the hood” of the variables used in the model

mod2 = model.matrix(~pdata$gender)
mod2
##    (Intercept) pdata$genderM
## 1            1             0
## 2            1             1
## 3            1             0
## 4            1             0
## 5            1             0
## 6            1             1
## 7            1             0
## 8            1             1
## 9            1             1
## 10           1             0
## 14           1             0
## 15           1             1
## 16           1             1
## 17           1             1
## 18           1             0
## 19           1             1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$`pdata$gender`
## [1] "contr.treatment"

Categorical variables with multiple levels

table(pdata$tissue.type)
## 
##          adipose          adrenal            brain           breast 
##                1                1                1                1 
##            colon            heart           kidney            liver 
##                1                1                1                1 
##             lung        lymphnode          mixture            ovary 
##                1                1                3                1 
##         prostate  skeletal_muscle           testes          thyroid 
##                1                1                1                1 
## white_blood_cell 
##                1
pdata$tissue.type == "adipose"
##  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
pdata$tissue.type == "adrenal"
##  [1] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

This leads to multiple coefficients for one variable

tidy(lm(edata[1,] ~ pdata$tissue.type ))
##                                 term  estimate std.error   statistic
## 1                        (Intercept)  1354.000  1017.983  1.33008119
## 2           pdata$tissue.typeadrenal -1138.000  1439.645 -0.79047247
## 3             pdata$tissue.typebrain -1139.000  1439.645 -0.79116709
## 4            pdata$tissue.typebreast  -430.000  1439.645 -0.29868468
## 5             pdata$tissue.typecolon  -629.000  1439.645 -0.43691317
## 6             pdata$tissue.typeheart -1229.000  1439.645 -0.85368248
## 7            pdata$tissue.typekidney  -558.000  1439.645 -0.38759547
## 8             pdata$tissue.typeliver   600.000  1439.645  0.41676932
## 9              pdata$tissue.typelung  -539.000  1439.645 -0.37439777
## 10        pdata$tissue.typelymphnode -1105.000  1439.645 -0.76755016
## 11          pdata$tissue.typemixture  4043.333  1175.465  3.43977205
## 12            pdata$tissue.typeovary    96.000  1439.645  0.06668309
## 13         pdata$tissue.typeprostate  -573.000  1439.645 -0.39801470
## 14  pdata$tissue.typeskeletal_muscle -1285.000  1439.645 -0.89258095
## 15           pdata$tissue.typetestes   534.000  1439.645  0.37092469
## 16          pdata$tissue.typethyroid  -371.000  1439.645 -0.25770236
## 17 pdata$tissue.typewhite_blood_cell -1351.000  1439.645 -0.93842558
##       p.value
## 1  0.31489301
## 2  0.51209555
## 3  0.51176898
## 4  0.79335656
## 5  0.70482162
## 6  0.48321178
## 7  0.73567624
## 8  0.71731923
## 9  0.74407734
## 10 0.52298771
## 11 0.07511726
## 12 0.95290026
## 13 0.72908593
## 14 0.46626631
## 15 0.74629792
## 16 0.82072898
## 17 0.44708910

Adjusting for variables

lm3 = lm(edata[1,] ~ pdata$age + pdata$gender)
tidy(lm3)
##            term  estimate  std.error  statistic     p.value
## 1   (Intercept) 2331.5814 438.181127  5.3210448 0.000138739
## 2     pdata$age  -23.9133   6.488058 -3.6857414 0.002743642
## 3 pdata$genderM -207.2565 236.430512 -0.8766066 0.396610372

Interactions (careful!)

You can add interaction terms, but there are a couple of things to keep in mind:

lm4 = lm(edata[1,] ~ pdata$age*pdata$gender)
tidy(lm4)
##                      term   estimate  std.error statistic    p.value
## 1             (Intercept) 1679.37818 610.165662  2.752332 0.01752641
## 2               pdata$age  -13.47805   9.425164 -1.430007 0.17823223
## 3           pdata$genderM  912.51047 793.378850  1.150157 0.27248454
## 4 pdata$age:pdata$genderM  -18.46210  12.536549 -1.472662 0.16658057

A few things to be aware of

Outliers can have a big impact on the regression, depending on where they land. Here there isn’t much impact

lm4 = lm(edata[6,] ~ pdata$age)
plot(pdata$age,edata[6,],col=2)
abline(lm4,col=1,lwd=3)

But in this case there is a huge impact

index = 1:19
lm5 = lm(edata[6,] ~ index)
plot(index,edata[6,],col=2)
abline(lm5,col=1,lwd=3)

lm6 = lm(edata[6,-19] ~ index[-19])
abline(lm6,col=3,lwd=3)

legend(5,1000,c("With outlier","Without outlier"),col=c(1,3),lwd=3)

In general you’d like your residuals to looks symmetrically (e.g. approximately Normally) distributed but they aren’t here. Outliers in the residuals aren’t great either.

par(mfrow=c(1,2))
hist(lm6$residuals,col=2)
hist(lm5$residuals,col=3)

Data transforms are often applied before regression and the residuals look a little better here.

gene1 = log2(edata[1,]+1)
lm7 = lm(gene1 ~ index)
hist(lm7$residuals,col=4)