Load packages

You will need the RSkittleBrewer, polyester, and ballgown packages for this vignette to run. Installation instructions are available here:

You will also need R version 3.1.0 or greater and Bioconductor 3.0 or greater. The zebrafishRNASeq package might need to be installed from source. These analyses are based on the devel version of sva (version 3.11.2 or greater).

library(zebrafishRNASeq)
library(RSkittleBrewer)
library(genefilter)
library(polyester)
library(RUVSeq)
library(edgeR)
library(sva)
library(ffpe)
library(RColorBrewer)
library(corrplot)
library(limma)
trop = RSkittleBrewer('tropical')
set.seed(3532333)

Load and filter zebrafish data

Here we are going to load the zebrafish data to get an overall feeling for the count data and relationship between mean and variance. We first filter data according to the RUVSeq vignette: http://bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.R.

data(zfGenes)
filter <- apply(zfGenes, 1, function(x) length(x[x>5])>=2)
counts <- zfGenes[filter,]

Look at mean variance relationship

plot(rowMeans(log(counts+1)),rowVars(log(counts+1)),pch=19,col=trop[1])

plot of chunk unnamed-chunk-1

Estimate zero inflated negative binomial parameters from the zebrafish data

## Estimate the zero inflated negative binomial parameters
params = get_params(counts)

plot(rowMeans(log(counts+1)),rowVars(log(counts+1)),pch=19,col=trop[1],main="zebrafish data w/fit")
lines(params$fit,col=trop[2])

plot of chunk nbparams

Generate an equal sized simulated data set and compare

## Create some data from the model

dat0 = create_read_numbers(params$mu,params$fit,
                           params$p0,m=dim(counts)[1],n=dim(counts)[2],seed=53535)
## Generating data from baseline model.
par(mfrow=c(1,2))
plot(rowMeans(log(dat0+1)),rowVars(log(dat0+1)),pch=19,col=trop[2],main="simulated data",xlim=c(0,15),xlab="Mean(Count)", ylab="Var(Count)")
plot(rowMeans(log(counts+1)),rowVars(log(counts+1)),pch=19,col=trop[1],main="zebrafish data",xlim=c(0,15),xlab="Mean(Count)", ylab="Var(Count)")

plot of chunk simdata

Make sure there is no differential expression using voom

group = rep(c(0,1),each=3)
design = model.matrix(~group)
dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
v <- voom(dge,design,plot=TRUE)

plot of chunk nodiffexp

fit <- lmFit(v,design)
fit <- eBayes(fit)
hist(fit$p.value[,2],col=trop[3],main="No genes DE",breaks=100)

plot of chunk nodiffexp

Generate a data set with some differential expression and test

group = rep(c(-1,1),each=10)
mod = model.matrix(~-1 + group)
coeffs = cbind(c(rnorm(200), rep(0,800)))
dat0 = create_read_numbers(params$mu,params$fit,
                           params$p0,m=dim(counts)[1],n=dim(counts)[2],
                           beta=coeffs,mod=mod,seed=32332)

design = model.matrix(~group)
dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
v <- voom(dge,design,plot=TRUE)

plot of chunk diffexp

fit <- lmFit(v,design)
fit <- eBayes(fit)
hist(fit$p.value[,2],col=trop[3],main="200 genes DE",breaks=100)

plot of chunk diffexp

Generate a data set with a batch effect uncorrelated with group

group = rep(c(-1,1),each=10)
batch = rep(c(-1,1),10)
gcoeffs = rep(0,1000)
bcoeffs = c(rep(0,100),rnorm(400,sd=2),rep(0,500))
coeffs = cbind(bcoeffs,gcoeffs)
mod = model.matrix(~-1 + batch + group)

dat0 = create_read_numbers(params$mu,params$fit,
                           params$p0,m=dim(counts)[1],n=dim(counts)[2],
                           beta=coeffs,mod=mod,seed=323)

design = model.matrix(~group)
dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
v <- voom(dge,design,plot=TRUE)

plot of chunk firstbatch

fit <- lmFit(v,design)
fit <- eBayes(fit)
hist(fit$p.value[,2],col=trop[3],main="400 genes batch/0 DE",breaks=100)

plot of chunk firstbatch

Generate a data set with orthogonal batch and group effects and compare methods.

Here we compare unsupervised svaseq, supervised svaseq, ruv with control probes, ruv with empirical controls, and principal components analysis for estimating batch

group = rep(c(-1,1),each=10)
batch = 1 - 2*rbinom(20,size=1,prob=0.5)
gcoeffs = c(rnorm(400),rep(0,600))
nullindex=401:1000
bcoeffs = c(rep(0,100),rnorm(400,sd=1),rep(0,500))
coeffs = cbind(bcoeffs,gcoeffs)
controls = (bcoeffs != 0) & (gcoeffs==0)
mod = model.matrix(~-1 + batch + group)

dat0 = create_read_numbers(params$mu,params$fit,
                           params$p0,m=dim(counts)[1],n=dim(counts)[2],
                           beta=coeffs,mod=mod,seed=4353)
rm(mod)


## Set null and alternative models (ignore batch)
mod1 = model.matrix(~group)
mod0 = cbind(mod1[,1])

## Estimate batch with svaseq (unsupervised)
batch_unsup_sva = svaseq(dat0,mod1,mod0)$sv
## Number of significant surrogate variables is:  1 
## Iteration (out of 5 ):1  2  3  4  5
## Estimate batch with svaseq (supervised)
batch_sup_sva = svaseq(dat0,mod1,mod0,controls=controls)$sv
## sva warning: controls provided so supervised sva is being performed.
## Number of significant surrogate variables is:  1
## Estimate batch with pca
ldat0 = log(dat0 + 1)
batch_pca = svd(ldat0 - rowMeans(ldat0))$v[,1]

## Estimate batch with ruv (controls known)
batch_ruv_cp <- RUVg(dat0, cIdx= controls, k=1)$W

## Estimate batch with ruv (residuals)
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf

x <- as.factor(group)
design <- model.matrix(~x)
y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
res <- residuals(fit, type="deviance")
seqUQ <- betweenLaneNormalization(dat0, which="upper")
controls = rep(TRUE,dim(dat0)[1])
batch_ruv_res = RUVr(seqUQ,controls,k=1,res)$W

## Estimate batch with ruv empirical controls
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf

y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)

fit <- glmFit(y, design)
lrt <- glmLRT(fit, coef=2)

controls =rank(lrt$table$LR) <= 400
batch_ruv_emp <- RUVg(dat0, controls, k=1)$W


## Plot the results
plot(batch,col=trop[1],pch=19,main="batch")

plot of chunk batchestimates

plot(batch_unsup_sva,pch=19,col=trop[2],main="unsupervised sva")

plot of chunk batchestimates

plot(batch_sup_sva,pch=19,col=trop[2],main="supervised sva")

plot of chunk batchestimates

plot(batch_pca,pch=19,col=trop[3],main="pca")

plot of chunk batchestimates

plot(batch_ruv_cp,pch=19,col=trop[4],main="control probes ruv")

plot of chunk batchestimates

plot(batch_ruv_res,pch=19,col=trop[4],main="residual ruv")

plot of chunk batchestimates

plot(batch_ruv_emp,pch=19,col=trop[4],main="empirical controls ruv")

plot of chunk batchestimates

Plot absolute correlation between batch estimates

In this case, all the estimates are highly correlated

batchEstimates = cbind(batch,group,batch_unsup_sva,batch_sup_sva,
                       batch_pca,batch_ruv_cp,batch_ruv_res,batch_ruv_emp)
colnames(batchEstimates) = c("batch","group","usva","ssva","pca","ruvcp","ruvres","ruvemp")

corr = abs(cor(batchEstimates))
cols = colorRampPalette(c(trop[2],"white",trop[1]))
corrplot(corr,method="ellipse",type="lower",col=cols(100),tl.pos="d")

plot of chunk corbatch

dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
catplots = tstats = pvals = vector("list",8)
truevals = -abs(gcoeffs)
names(truevals) = as.character(1:1000)
adj = c("+ batch","+ batch_pca", "+ batch_sup_sva",
                "+ batch_unsup_sva", "+ batch_ruv_cp", "+ batch_ruv_res",
                "+ batch_ruv_emp", "")

for(i in 1:8){
  design = model.matrix(as.formula(paste0("~ group",adj[i])))
  v <- voom(dge,design,plot=FALSE)
  fit <- lmFit(v,design)
  fit <- eBayes(fit)
  tstats[[i]] = abs(fit$t[,2])
  pvals[[i]] = fit$p.value[,2]
  names(tstats[[i]]) = as.character(1:1000)
  catplots[[i]] = CATplot(-rank(tstats[[i]]),truevals,maxrank=400,make.plot=F)
}

plot(catplots[[1]],ylim=c(0,1),col=trop[1],lwd=3,type="l",ylab="Concordance Between True rank and rank with different methods",xlab="Rank")
lines(catplots[[2]],col=trop[2],lwd=3)
lines(catplots[[3]],col=trop[3],lwd=3,lty=2)
lines(catplots[[4]],col=trop[3],lwd=3,lty=1)
lines(catplots[[5]],col=trop[4],lwd=3,lty=2)
lines(catplots[[6]],col=trop[4],lwd=3,lty=1)
lines(catplots[[7]],col=trop[4],lwd=3,lty=3)
lines(catplots[[8]],col=trop[5],lwd=3)

legend(200,0.5,legend=c("Known batch","PCA","Sup. svaseq", "Unsup. svaseq","RUV CP","RUV Res.", "RUV Emp.", "No adjustment"),col=trop[c(1,2,3,3,4,4,4,5)],lty=c(1,1,1,2,2,1,3),lwd=3)

plot of chunk comparede

nullindex = 401:1000
par(mfrow=c(4,2))
colvec = trop[c(1,2,3,3,4,4,4,5)]
adjvec = c("Known batch","PCA","Sup. svaseq", "Unsup. svaseq","RUV CP","RUV Res.", "RUV Emp.", "No adjustment")
for(i in 1:8){
  hist(pvals[[i]][nullindex],main=adjvec[i],col=colvec[i])
}

plot of chunk pvals1

Generate a data set with correlated batch and group effects and compare methods.

Here we compare unsupervised svaseq, supervised svaseq, ruv with control probes, ruv with empirical controls, and principal components analysis for estimating batch

group = rep(c(-1,1),each=10)
coinflip = rbinom(20,size=1,prob=0.9)
batch = group*coinflip + -group*(1-coinflip)
gcoeffs = c(rnorm(400),rep(0,600))
bcoeffs = c(rep(0,100),rnorm(400,sd=1),rep(0,500))
coeffs = cbind(bcoeffs,gcoeffs)
controls = (bcoeffs != 0) & (gcoeffs==0)
mod = model.matrix(~-1 + batch + group)

dat0 = create_read_numbers(params$mu,params$fit,
                           params$p0,m=dim(counts)[1],n=dim(counts)[2],
                           beta=coeffs,mod=mod,seed=333)
rm(mod)


## Set null and alternative models (ignore batch)
mod1 = model.matrix(~group)
mod0 = cbind(mod1[,1])

## Estimate batch with svaseq (unsupervised)
batch_unsup_sva = svaseq(dat0,mod1,mod0)$sv
## Number of significant surrogate variables is:  1 
## Iteration (out of 5 ):1  2  3  4  5
## Estimate batch with svaseq (supervised)
batch_sup_sva = svaseq(dat0,mod1,mod0,controls=controls)$sv
## sva warning: controls provided so supervised sva is being performed.
## Number of significant surrogate variables is:  1
## Estimate batch with pca
ldat0 = log(dat0 + 1)
batch_pca = svd(ldat0 - rowMeans(ldat0))$v[,1]

## Estimate batch with ruv (controls known)
batch_ruv_cp <- RUVg(dat0, cIdx= controls, k=1)$W

## Estimate batch with ruv (residuals)
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf

x <- as.factor(group)
design <- model.matrix(~x)
y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
res <- residuals(fit, type="deviance")
seqUQ <- betweenLaneNormalization(dat0, which="upper")
controls = rep(TRUE,dim(dat0)[1])
batch_ruv_res = RUVr(seqUQ,controls,k=1,res)$W

## Estimate batch with ruv empirical controls
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf

y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)

fit <- glmFit(y, design)
lrt <- glmLRT(fit, coef=2)

controls =rank(lrt$table$LR) <= 400
batch_ruv_emp <- RUVg(dat0, controls, k=1)$W


## Plot the results
plot(batch,col=trop[1],pch=19,main="batch")

plot of chunk batchestimates2

plot(batch_unsup_sva,pch=19,col=trop[2],main="unsupervised sva")

plot of chunk batchestimates2

plot(batch_sup_sva,pch=19,col=trop[2],main="supervised sva")

plot of chunk batchestimates2

plot(batch_pca,pch=19,col=trop[3],main="pca")

plot of chunk batchestimates2

plot(batch_ruv_cp,pch=19,col=trop[4],main="control probes ruv")

plot of chunk batchestimates2

plot(batch_ruv_res,pch=19,col=trop[4],main="residual ruv")

plot of chunk batchestimates2

plot(batch_ruv_emp,pch=19,col=trop[4],main="empirical controls ruv")

plot of chunk batchestimates2

Plot absolute correlation between batch estimates

batchEstimates = cbind(batch,group,batch_unsup_sva,batch_sup_sva,
                       batch_pca,batch_ruv_cp,batch_ruv_res,batch_ruv_emp)
colnames(batchEstimates) = c("batch","group","usva","ssva","pca","ruvcp","ruvres","ruvemp")

corr = abs(cor(batchEstimates))
cols = colorRampPalette(c(trop[2],"white",trop[1]))
par(mar=c(5,5,5,5))
corrplot(corr,method="ellipse",type="lower",col=cols(100),tl.pos="d")

plot of chunk corbatch2

Compare differential expression patterns with different adjustments

dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
catplots = tstats = pvals = vector("list",8)
truevals = -abs(gcoeffs)
names(truevals) = as.character(1:1000)
adj = c("+ batch","+ batch_pca", "+ batch_sup_sva",
                "+ batch_unsup_sva", "+ batch_ruv_cp", "+ batch_ruv_res",
                "+ batch_ruv_emp", "")

for(i in 1:8){
  design = model.matrix(as.formula(paste0("~ group",adj[i])))
  v <- voom(dge,design,plot=FALSE)
  fit <- lmFit(v,design)
  fit <- eBayes(fit)
  tstats[[i]] = abs(fit$t[,2])
  pvals[[i]] = fit$p.value[,2]
  names(tstats[[i]]) = as.character(1:1000)
  catplots[[i]] = CATplot(-rank(tstats[[i]]),truevals,maxrank=400,make.plot=F)
}

plot(catplots[[1]],ylim=c(0,1),col=trop[1],lwd=3,type="l",ylab="Concordance Between True rank and rank with different methods",xlab="Rank")
lines(catplots[[2]],col=trop[2],lwd=3)
lines(catplots[[3]],col=trop[3],lwd=3,lty=1)
lines(catplots[[4]],col=trop[3],lwd=3,lty=2)
lines(catplots[[5]],col=trop[4],lwd=3,lty=2)
lines(catplots[[6]],col=trop[4],lwd=3,lty=1)
lines(catplots[[7]],col=trop[4],lwd=3,lty=3)
lines(catplots[[8]],col=trop[5],lwd=3)

legend(200,0.5,legend=c("Known batch","PCA","Sup. svaseq", "Unsup. svaseq","RUV CP","RUV Res.", "RUV Emp.", "No adjustment"),col=trop[c(1,2,3,3,4,4,4,5)],lty=c(1,1,1,2,2,1,3),lwd=3)

plot of chunk comparede2

Compare p-value histograms with different batch adjustments

nullindex = 401:1000
par(mfrow=c(4,2))
colvec = trop[c(1,2,3,3,4,4,4,5)]
adjvec = c("Known batch","PCA","Sup. svaseq", "Unsup. svaseq","RUV CP","RUV Res.", "RUV Emp.", "No adjustment")
for(i in 1:8){
  hist(pvals[[i]][nullindex],main=adjvec[i],col=colvec[i])
}

plot of chunk pvals2

Session Info

sessionInfo()
## R version 3.1.0 (2014-04-10)
## Platform: x86_64-apple-darwin10.8.0 (64-bit)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] corrplot_0.73            RColorBrewer_1.0-5      
##  [3] ffpe_1.9.0               TTR_0.22-0              
##  [5] xts_0.9-7                zoo_1.7-11              
##  [7] sva_3.11.3               mgcv_1.7-29             
##  [9] nlme_3.1-117             RUVSeq_0.99.2           
## [11] edgeR_3.7.2              limma_3.21.6            
## [13] EDASeq_1.99.1            ShortRead_1.23.11       
## [15] GenomicAlignments_1.1.11 Rsamtools_1.17.27       
## [17] GenomicRanges_1.17.17    GenomeInfoDb_1.1.6      
## [19] Biostrings_2.33.10       XVector_0.5.6           
## [21] IRanges_1.99.15          S4Vectors_0.0.8         
## [23] BiocParallel_0.7.2       Biobase_2.25.0          
## [25] BiocGenerics_0.11.2      polyester_0.99.0        
## [27] genefilter_1.47.5        RSkittleBrewer_1.1      
## [29] zebrafishRNASeq_0.99.2   knitr_1.6               
## 
## loaded via a namespace (and not attached):
##  [1] affy_1.43.2            affyio_1.33.0          annotate_1.43.4       
##  [4] AnnotationDbi_1.27.7   aroma.light_2.1.0      base64_1.1            
##  [7] BatchJobs_1.2          BBmisc_1.6             beanplot_1.1          
## [10] BiocInstaller_1.15.5   biomaRt_2.21.0         bitops_1.0-6          
## [13] brew_1.0-6             BSgenome_1.33.8        bumphunter_1.5.3      
## [16] codetools_0.2-8        colorspace_1.2-4       DBI_0.2-7             
## [19] DESeq_1.17.0           digest_0.6.4           doRNG_1.6             
## [22] evaluate_0.5.5         fail_1.2               foreach_1.4.2         
## [25] formatR_0.10           geneplotter_1.43.0     GenomicFeatures_1.17.9
## [28] grid_3.1.0             htmltools_0.2.4        hwriter_1.3           
## [31] illuminaio_0.7.0       iterators_1.0.7        KernSmooth_2.23-12    
## [34] lattice_0.20-29        latticeExtra_0.6-26    locfit_1.5-9.1        
## [37] lumi_2.17.0            MASS_7.3-33            Matrix_1.1-3          
## [40] matrixStats_0.8.14     mclust_4.3             methylumi_2.11.1      
## [43] minfi_1.11.7           multtest_2.21.0        nleqslv_2.1.1         
## [46] nor1mix_1.1-4          pkgmaker_0.22          plyr_1.8.1            
## [49] preprocessCore_1.27.0  R.methodsS3_1.6.1      R.oo_1.18.0           
## [52] Rcpp_0.11.1            RCurl_1.95-4.1         registry_0.2          
## [55] reshape_0.8.5          rmarkdown_0.2.46       rngtools_1.2.4        
## [58] RSQLite_0.11.4         rtracklayer_1.25.9     sendmailR_1.1-2       
## [61] sfsmisc_1.0-25         siggenes_1.39.0        splines_3.1.0         
## [64] stats4_3.1.0           stringr_0.6.2          survival_2.37-7       
## [67] tools_3.1.0            XML_3.98-1.1           xtable_1.7-3          
## [70] yaml_2.1.11            zlibbioc_1.11.1