Load packages

You will need the RSkittleBrewer package for this vignette to run. Installation instructions are available here:

https://github.com/alyssafrazee/RskittleBrewer

This code also accesses data from the ReCount database:

http://bowtie-bio.sourceforge.net/recount/

You will also need R version 3.1.0 or greater and Bioconductor 3.0 or greater. The zebrafishRNASeq package might need to be installed from source. These analyses are based on the devel version of sva (version 3.11.2 or greater).

library(zebrafishRNASeq)
library(RSkittleBrewer)
library(genefilter)
library(RUVSeq)
library(edgeR)
library(sva)
library(ffpe)

## Warning: replacing previous import by 'graphics::image' when loading
## 'methylumi'

library(RColorBrewer)
library(corrplot)
library(limma)
trop = RSkittleBrewer('tropical')

Read data from ReCount

load(url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/montpick_eset.RData"))
mpdat= exprs(montpick.eset)
mppheno = pData(montpick.eset)
mpdat = mpdat[rowMeans(mpdat) > 5,]

Read pedigree info from Hapmap

# CEU

z = gzcon(url("http://hapmap.ncbi.nlm.nih.gov/downloads/samples_individuals/pedinfo2sample_CEU.txt.gz"))
raw = textConnection(readLines(z))
close(z)
ceuped = read.table(raw)
close(raw)


## YRI

z = gzcon(url("http://hapmap.ncbi.nlm.nih.gov/downloads/samples_individuals/pedinfo2sample_YRI.txt.gz"))
raw = textConnection(readLines(z))
close(z)
yriped = read.table(raw)
close(raw)

Match sex info with samples

Remove all samples where we can not identify sex information exactly.

mppheno$sex = rep(NA,129)
for(i in 1:dim(mppheno)[1]){
  if(mppheno$population[i] == "CEU"){
    ind = grep(mppheno$sample.id[i],ceuped[,7])
    if(length(ind)==1){
      mppheno$sex[i] = ceuped[ind,5]
    }
  }
  if(mppheno$population[i] == "YRI"){
    ind = grep(mppheno$sample.id[i],yriped[,7])
    if(length(ind)==1){
      mppheno$sex[i] = yriped[ind,5]
    }
  }  
}

dat0 = mpdat[,!is.na(mppheno$sex)]
mppheno = mppheno[!is.na(mppheno$sex),]
group = as.numeric(mppheno$sex)
study = as.numeric(mppheno$study)
table(study,group)

##      group
## study  1  2
##     1 23 22
##     2 25 29

Estimate latent factors with different methods

## Set null and alternative models (ignore batch)
mod1 = model.matrix(~group)
mod0 = cbind(mod1[,1])

## Estimate batch with svaseq (unsupervised)
batch_unsup_sva = svaseq(dat0,mod1,mod0,n.sv=1)$sv

## Number of significant surrogate variables is:  1 
## Iteration (out of 5 ):1  2  3  4  5

## Estimate batch with pca
ldat0 = log(dat0 + 1)
batch_pca = svd(ldat0 - rowMeans(ldat0))$v[,1]


## Estimate batch with ruv (residuals)
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf

x <- as.factor(group)
design <- model.matrix(~x)
y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
res <- residuals(fit, type="deviance")
seqUQ <- betweenLaneNormalization(dat0, which="upper")
controls = rep(TRUE,dim(dat0)[1])
batch_ruv_res = RUVr(seqUQ,controls,k=1,res)$W

## Estimate batch with ruv empirical controls
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf

y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)

fit <- glmFit(y, design)
lrt <- glmLRT(fit, coef=2)

controls = rank(lrt$table$LR) <= 400
batch_ruv_emp <- RUVg(dat0, controls, k=1)$W

Plot batch estimates

## Plot the results
plot(batch_unsup_sva ~ mppheno$study,pch=19,col=trop[1],main="unsupervised sva")

plot(batch_pca ~ mppheno$study,pch=19,col=trop[2],main="pca")

plot of chunk plotbatch

plot(batch_ruv_res ~ mppheno$study,pch=19,col=trop[3],main="residual ruv")

plot of chunk plotbatch

plot(batch_ruv_emp ~ mppheno$study,pch=19,col=trop[3],main="empirical controls ruv")

plot of chunk plotbatch

Plot absolute correlation between batch estimates

batchEstimates = cbind(group,study,batch_unsup_sva,batch_pca,
                       batch_ruv_res,batch_ruv_emp)
colnames(batchEstimates) = c("group","study","usva","pca","ruvres","ruvemp")

corr = abs(cor(batchEstimates))
corr

##           group   study    usva     pca  ruvres   ruvemp
## group  1.000000 0.04797 0.01824 0.01657 0.01157 0.002352
## study  0.047971 1.00000 0.68805 0.68531 0.76133 0.687062
## usva   0.018238 0.68805 1.00000 0.99993 0.62335 0.998506
## pca    0.016571 0.68531 0.99993 1.00000 0.61640 0.998532
## ruvres 0.011568 0.76133 0.62335 0.61640 1.00000 0.622780
## ruvemp 0.002352 0.68706 0.99851 0.99853 0.62278 1.000000

cols = colorRampPalette(c(trop[2],"white",trop[1]))
par(mar=c(5,5,5,5))
corrplot(corr,method="ellipse",type="lower",col=cols(100),tl.pos="d")

plot of chunk corbatch

Compare results with different methods to just using study

dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
catplots = tstats = vector("list",6)
adj = c("+ study", "+ batch_unsup_sva",
         "+ batch_ruv_res", "+ batch_ruv_emp",
        "+ batch_pca","")


for(i in 1:6){
  design = model.matrix(as.formula(paste0("~ group",adj[i])))
  v <- voom(dge,design,plot=FALSE)
  fit <- lmFit(v,design)
  fit <- eBayes(fit)
  tstats[[i]] = abs(fit$t[,2])
  names(tstats[[i]]) = as.character(1:dim(dat0)[1])
  catplots[[i]] = CATplot(-rank(tstats[[i]]),-rank(tstats[[1]]),maxrank=1000,make.plot=F)
}

Make the CATplot

plot(catplots[[2]],ylim=c(0,1),col=trop[2],lwd=3,type="l",ylab="Concordance Between using study and different methods",xlab="Rank")
lines(catplots[[3]],col=trop[1],lwd=3,lty=2)
lines(catplots[[4]],col=trop[1],lwd=3)
lines(catplots[[5]],col=trop[3],lwd=3,lty=3)
lines(catplots[[6]],col=trop[4],lwd=3)

legend(200,0.5,legend=c("Unsup. svaseq","RUV Res.", "RUV Emp.","PCA","No adjustment"),col=trop[c(2,1,1,3,4)],lty=c(1,2,1,3,1),lwd=3)

plot of chunk catplot

Now try an unbalanced design

set.seed(35353)
index = c(sample(which(group==1 & study==1),size=20),sample(which(group==1 & study==2),size=10),
          sample(which(group==2 & study==1),size=10),sample(which(group==2 & study==2),size=20))
dat0 = dat0[,index]
group = group[index]
study = study[index]

Estimate latent factors with different methods

## Set null and alternative models (ignore batch)
mod1 = model.matrix(~group)
mod0 = cbind(mod1[,1])

## Estimate batch with svaseq (unsupervised)
batch_unsup_sva = svaseq(dat0,mod1,mod0,n.sv=1)$sv

## Number of significant surrogate variables is:  1 
## Iteration (out of 5 ):1  2  3  4  5

## Estimate batch with pca
ldat0 = log(dat0 + 1)
batch_pca = svd(ldat0 - rowMeans(ldat0))$v[,1]


## Estimate batch with ruv (residuals)
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf

x <- as.factor(group)
design <- model.matrix(~x)
y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
res <- residuals(fit, type="deviance")
seqUQ <- betweenLaneNormalization(dat0, which="upper")
controls = rep(TRUE,dim(dat0)[1])
batch_ruv_res = RUVr(seqUQ,controls,k=1,res)$W

## Estimate batch with ruv empirical controls
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf

y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)

fit <- glmFit(y, design)
lrt <- glmLRT(fit, coef=2)

controls = rank(lrt$table$LR) <= 400
batch_ruv_emp <- RUVg(dat0, controls, k=1)$W

Plot batch estimates

## Plot the results
plot(batch_unsup_sva ~ mppheno$study[index],pch=19,col=trop[1],main="unsupervised sva")

plot of chunk plotbatch2

plot(batch_pca ~ mppheno$study[index],pch=19,col=trop[2],main="pca")

plot of chunk plotbatch2

plot(batch_ruv_res ~ mppheno$study[index],pch=19,col=trop[3],main="residual ruv")

plot of chunk plotbatch2

plot(batch_ruv_emp ~ mppheno$study[index],pch=19,col=trop[3],main="empirical controls ruv")

plot of chunk plotbatch2

Plot absolute correlation between batch estimates

batchEstimates = cbind(group,study,batch_unsup_sva,batch_pca,
                       batch_ruv_res,batch_ruv_emp)
colnames(batchEstimates) = c("group","study","usva","pca","ruvres","ruvemp")

corr = abs(cor(batchEstimates))
corr

##          group  study   usva    pca  ruvres ruvemp
## group  1.00000 0.3333 0.1840 0.1850 0.02194 0.1619
## study  0.33333 1.0000 0.7578 0.7571 0.78504 0.7521
## usva   0.18400 0.7578 1.0000 1.0000 0.67364 0.9982
## pca    0.18496 0.7571 1.0000 1.0000 0.67190 0.9982
## ruvres 0.02194 0.7850 0.6736 0.6719 1.00000 0.6634
## ruvemp 0.16188 0.7521 0.9982 0.9982 0.66336 1.0000

cols = colorRampPalette(c(trop[2],"white",trop[1]))
par(mar=c(5,5,5,5))
corrplot(corr,method="ellipse",type="lower",col=cols(100),tl.pos="d")

plot of chunk corbatch2

Compare results with different methods to just using study

dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
catplots = tstats = vector("list",6)
adj = c("+ study", "+ batch_unsup_sva",
         "+ batch_ruv_res", "+ batch_ruv_emp",
        "+ batch_pca","")


for(i in 1:6){
  design = model.matrix(as.formula(paste0("~ group",adj[i])))
  v <- voom(dge,design,plot=FALSE)
  fit <- lmFit(v,design)
  fit <- eBayes(fit)
  tstats[[i]] = abs(fit$t[,2])
  names(tstats[[i]]) = as.character(1:dim(dat0)[1])
  catplots[[i]] = CATplot(-rank(tstats[[i]]),-rank(tstats[[1]]),maxrank=1000,make.plot=F)
}

Make the CATplot

plot(catplots[[2]],ylim=c(0,1),col=trop[2],lwd=3,type="l",ylab="Concordance Between using study and different methods",xlab="Rank")
lines(catplots[[3]],col=trop[1],lwd=3,lty=2)
lines(catplots[[4]],col=trop[1],lwd=3)
lines(catplots[[5]],col=trop[3],lwd=3,lty=3)
lines(catplots[[6]],col=trop[4],lwd=3)



legend(200,0.5,legend=c("Unsup. svaseq","RUV Res.", "RUV Emp.","PCA","No adjustment"),col=trop[c(2,1,1,3,4)],lty=c(1,2,1,3,1),lwd=3)

plot of chunk catplot2

Session Info

sessionInfo()

## R version 3.1.0 (2014-04-10)
## Platform: x86_64-apple-darwin10.8.0 (64-bit)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] zebrafishRNASeq_0.99.2   sva_3.11.1              
##  [3] RUVSeq_0.99.2            RSkittleBrewer_1.1      
##  [5] RColorBrewer_1.0-5       mgcv_1.7-29             
##  [7] nlme_3.1-117             genefilter_1.47.5       
##  [9] ffpe_1.9.0               TTR_0.22-0              
## [11] xts_0.9-7                zoo_1.7-11              
## [13] edgeR_3.7.2              limma_3.21.6            
## [15] EDASeq_1.99.1            ShortRead_1.23.11       
## [17] GenomicAlignments_1.1.11 Rsamtools_1.17.22       
## [19] GenomicRanges_1.17.17    GenomeInfoDb_1.1.6      
## [21] corrplot_0.73            corpcor_1.6.6           
## [23] Biostrings_2.33.10       XVector_0.5.6           
## [25] IRanges_1.99.15          S4Vectors_0.0.8         
## [27] BiocParallel_0.7.2       Biobase_2.25.0          
## [29] BiocGenerics_0.11.2      knitr_1.6               
## 
## loaded via a namespace (and not attached):
##  [1] affy_1.43.2            affyio_1.33.0          annotate_1.43.4       
##  [4] AnnotationDbi_1.27.7   aroma.light_2.1.0      base64_1.1            
##  [7] BatchJobs_1.2          BBmisc_1.6             beanplot_1.1          
## [10] BiocInstaller_1.15.5   biomaRt_2.21.0         bitops_1.0-6          
## [13] brew_1.0-6             BSgenome_1.33.8        bumphunter_1.5.3      
## [16] codetools_0.2-8        colorspace_1.2-4       DBI_0.2-7             
## [19] DESeq_1.17.0           digest_0.6.4           doRNG_1.6             
## [22] evaluate_0.5.5         fail_1.2               foreach_1.4.2         
## [25] formatR_0.10           geneplotter_1.43.0     GenomicFeatures_1.17.9
## [28] grid_3.1.0             htmltools_0.2.4        hwriter_1.3           
## [31] illuminaio_0.7.0       iterators_1.0.7        KernSmooth_2.23-12    
## [34] lattice_0.20-29        latticeExtra_0.6-26    locfit_1.5-9.1        
## [37] lumi_2.17.0            MASS_7.3-33            Matrix_1.1-3          
## [40] matrixStats_0.8.14     mclust_4.3             methylumi_2.11.1      
## [43] minfi_1.11.7           multtest_2.21.0        nleqslv_2.1.1         
## [46] nor1mix_1.1-4          pkgmaker_0.22          plyr_1.8.1            
## [49] preprocessCore_1.27.0  R.methodsS3_1.6.1      R.oo_1.18.0           
## [52] Rcpp_0.11.1            RCurl_1.95-4.1         registry_0.2          
## [55] reshape_0.8.5          rmarkdown_0.2.46       rngtools_1.2.4        
## [58] RSQLite_0.11.4         rtracklayer_1.25.9     sendmailR_1.1-2       
## [61] sfsmisc_1.0-25         siggenes_1.39.0        splines_3.1.0         
## [64] stats4_3.1.0           stringr_0.6.2          survival_2.37-7       
## [67] tools_3.1.0            XML_3.98-1.1           xtable_1.7-3          
## [70] yaml_2.1.11            zlibbioc_1.11.1

Analysis of Recount data

Jeff Leek

Load packages

Read data from ReCount

Read pedigree info from Hapmap

Match sex info with samples

Estimate latent factors with different methods

Plot batch estimates

Plot absolute correlation between batch estimates

Compare results with different methods to just using study

Make the CATplot

Now try an unbalanced design

Estimate latent factors with different methods

Plot batch estimates

Plot absolute correlation between batch estimates

Compare results with different methods to just using study

Make the CATplot

Session Info