You will need the RSkittleBrewer, polyester, and ballgown packages for this vignette to run. Installation instructions are available here:
You will also need R version 3.1.0 or greater and Bioconductor 3.0 or greater. The zebrafishRNASeq package might need to be installed from source. These analyses are based on the devel version of sva (version 3.11.2 or greater).
library(zebrafishRNASeq)
library(RSkittleBrewer)
library(genefilter)
library(polyester)
library(RUVSeq)
library(edgeR)
library(sva)
library(ffpe)
library(RColorBrewer)
library(corrplot)
library(limma)
trop = RSkittleBrewer('tropical')
set.seed(3532333)
Here we are going to load the zebrafish data to get an overall feeling for the count data and relationship between mean and variance. We first filter data according to the RUVSeq vignette: http://bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.R.
data(zfGenes)
filter <- apply(zfGenes, 1, function(x) length(x[x>5])>=2)
counts <- zfGenes[filter,]
plot(rowMeans(log(counts+1)),rowVars(log(counts+1)),pch=19,col=trop[1])
## Estimate the zero inflated negative binomial parameters
params = get_params(counts)
plot(rowMeans(log(counts+1)),rowVars(log(counts+1)),pch=19,col=trop[1],main="zebrafish data w/fit")
lines(params$fit,col=trop[2])
## Create some data from the model
dat0 = create_read_numbers(params$mu,params$fit,
params$p0,m=dim(counts)[1],n=dim(counts)[2],seed=53535)
## Generating data from baseline model.
par(mfrow=c(1,2))
plot(rowMeans(log(dat0+1)),rowVars(log(dat0+1)),pch=19,col=trop[2],main="simulated data",xlim=c(0,15),xlab="Mean(Count)", ylab="Var(Count)")
plot(rowMeans(log(counts+1)),rowVars(log(counts+1)),pch=19,col=trop[1],main="zebrafish data",xlim=c(0,15),xlab="Mean(Count)", ylab="Var(Count)")
group = rep(c(0,1),each=3)
design = model.matrix(~group)
dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
v <- voom(dge,design,plot=TRUE)
fit <- lmFit(v,design)
fit <- eBayes(fit)
hist(fit$p.value[,2],col=trop[3],main="No genes DE",breaks=100)
group = rep(c(-1,1),each=10)
mod = model.matrix(~-1 + group)
coeffs = cbind(c(rnorm(200), rep(0,800)))
dat0 = create_read_numbers(params$mu,params$fit,
params$p0,m=dim(counts)[1],n=dim(counts)[2],
beta=coeffs,mod=mod,seed=32332)
design = model.matrix(~group)
dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
v <- voom(dge,design,plot=TRUE)
fit <- lmFit(v,design)
fit <- eBayes(fit)
hist(fit$p.value[,2],col=trop[3],main="200 genes DE",breaks=100)
Here we compare unsupervised svaseq, supervised svaseq, ruv with control probes, ruv with empirical controls, and principal components analysis for estimating batch
group = rep(c(-1,1),each=10)
batch = 1 - 2*rbinom(20,size=1,prob=0.5)
gcoeffs = c(rnorm(400),rep(0,600))
nullindex=401:1000
bcoeffs = c(rep(0,100),rnorm(400,sd=1),rep(0,500))
coeffs = cbind(bcoeffs,gcoeffs)
controls = (bcoeffs != 0) & (gcoeffs==0)
mod = model.matrix(~-1 + batch + group)
dat0 = create_read_numbers(params$mu,params$fit,
params$p0,m=dim(counts)[1],n=dim(counts)[2],
beta=coeffs,mod=mod,seed=4353)
rm(mod)
## Set null and alternative models (ignore batch)
mod1 = model.matrix(~group)
mod0 = cbind(mod1[,1])
## Estimate batch with svaseq (unsupervised)
batch_unsup_sva = svaseq(dat0,mod1,mod0)$sv
## Number of significant surrogate variables is: 1
## Iteration (out of 5 ):1 2 3 4 5
## Estimate batch with svaseq (supervised)
batch_sup_sva = svaseq(dat0,mod1,mod0,controls=controls)$sv
## sva warning: controls provided so supervised sva is being performed.
## Number of significant surrogate variables is: 1
## Estimate batch with pca
ldat0 = log(dat0 + 1)
batch_pca = svd(ldat0 - rowMeans(ldat0))$v[,1]
## Estimate batch with ruv (controls known)
batch_ruv_cp <- RUVg(dat0, cIdx= controls, k=1)$W
## Estimate batch with ruv (residuals)
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf
x <- as.factor(group)
design <- model.matrix(~x)
y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
res <- residuals(fit, type="deviance")
seqUQ <- betweenLaneNormalization(dat0, which="upper")
controls = rep(TRUE,dim(dat0)[1])
batch_ruv_res = RUVr(seqUQ,controls,k=1,res)$W
## Estimate batch with ruv empirical controls
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf
y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
lrt <- glmLRT(fit, coef=2)
controls =rank(lrt$table$LR) <= 400
batch_ruv_emp <- RUVg(dat0, controls, k=1)$W
## Plot the results
plot(batch,col=trop[1],pch=19,main="batch")
plot(batch_unsup_sva,pch=19,col=trop[2],main="unsupervised sva")
plot(batch_sup_sva,pch=19,col=trop[2],main="supervised sva")
plot(batch_pca,pch=19,col=trop[3],main="pca")
plot(batch_ruv_cp,pch=19,col=trop[4],main="control probes ruv")
plot(batch_ruv_res,pch=19,col=trop[4],main="residual ruv")
plot(batch_ruv_emp,pch=19,col=trop[4],main="empirical controls ruv")
In this case, all the estimates are highly correlated
batchEstimates = cbind(batch,group,batch_unsup_sva,batch_sup_sva,
batch_pca,batch_ruv_cp,batch_ruv_res,batch_ruv_emp)
colnames(batchEstimates) = c("batch","group","usva","ssva","pca","ruvcp","ruvres","ruvemp")
corr = abs(cor(batchEstimates))
cols = colorRampPalette(c(trop[2],"white",trop[1]))
corrplot(corr,method="ellipse",type="lower",col=cols(100),tl.pos="d")
dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
catplots = tstats = pvals = vector("list",8)
truevals = -abs(gcoeffs)
names(truevals) = as.character(1:1000)
adj = c("+ batch","+ batch_pca", "+ batch_sup_sva",
"+ batch_unsup_sva", "+ batch_ruv_cp", "+ batch_ruv_res",
"+ batch_ruv_emp", "")
for(i in 1:8){
design = model.matrix(as.formula(paste0("~ group",adj[i])))
v <- voom(dge,design,plot=FALSE)
fit <- lmFit(v,design)
fit <- eBayes(fit)
tstats[[i]] = abs(fit$t[,2])
pvals[[i]] = fit$p.value[,2]
names(tstats[[i]]) = as.character(1:1000)
catplots[[i]] = CATplot(-rank(tstats[[i]]),truevals,maxrank=400,make.plot=F)
}
plot(catplots[[1]],ylim=c(0,1),col=trop[1],lwd=3,type="l",ylab="Concordance Between True rank and rank with different methods",xlab="Rank")
lines(catplots[[2]],col=trop[2],lwd=3)
lines(catplots[[3]],col=trop[3],lwd=3,lty=2)
lines(catplots[[4]],col=trop[3],lwd=3,lty=1)
lines(catplots[[5]],col=trop[4],lwd=3,lty=2)
lines(catplots[[6]],col=trop[4],lwd=3,lty=1)
lines(catplots[[7]],col=trop[4],lwd=3,lty=3)
lines(catplots[[8]],col=trop[5],lwd=3)
legend(200,0.5,legend=c("Known batch","PCA","Sup. svaseq", "Unsup. svaseq","RUV CP","RUV Res.", "RUV Emp.", "No adjustment"),col=trop[c(1,2,3,3,4,4,4,5)],lty=c(1,1,1,2,2,1,3),lwd=3)
nullindex = 401:1000
par(mfrow=c(4,2))
colvec = trop[c(1,2,3,3,4,4,4,5)]
adjvec = c("Known batch","PCA","Sup. svaseq", "Unsup. svaseq","RUV CP","RUV Res.", "RUV Emp.", "No adjustment")
for(i in 1:8){
hist(pvals[[i]][nullindex],main=adjvec[i],col=colvec[i])
}
batchEstimates = cbind(batch,group,batch_unsup_sva,batch_sup_sva,
batch_pca,batch_ruv_cp,batch_ruv_res,batch_ruv_emp)
colnames(batchEstimates) = c("batch","group","usva","ssva","pca","ruvcp","ruvres","ruvemp")
corr = abs(cor(batchEstimates))
cols = colorRampPalette(c(trop[2],"white",trop[1]))
par(mar=c(5,5,5,5))
corrplot(corr,method="ellipse",type="lower",col=cols(100),tl.pos="d")
dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
catplots = tstats = pvals = vector("list",8)
truevals = -abs(gcoeffs)
names(truevals) = as.character(1:1000)
adj = c("+ batch","+ batch_pca", "+ batch_sup_sva",
"+ batch_unsup_sva", "+ batch_ruv_cp", "+ batch_ruv_res",
"+ batch_ruv_emp", "")
for(i in 1:8){
design = model.matrix(as.formula(paste0("~ group",adj[i])))
v <- voom(dge,design,plot=FALSE)
fit <- lmFit(v,design)
fit <- eBayes(fit)
tstats[[i]] = abs(fit$t[,2])
pvals[[i]] = fit$p.value[,2]
names(tstats[[i]]) = as.character(1:1000)
catplots[[i]] = CATplot(-rank(tstats[[i]]),truevals,maxrank=400,make.plot=F)
}
plot(catplots[[1]],ylim=c(0,1),col=trop[1],lwd=3,type="l",ylab="Concordance Between True rank and rank with different methods",xlab="Rank")
lines(catplots[[2]],col=trop[2],lwd=3)
lines(catplots[[3]],col=trop[3],lwd=3,lty=1)
lines(catplots[[4]],col=trop[3],lwd=3,lty=2)
lines(catplots[[5]],col=trop[4],lwd=3,lty=2)
lines(catplots[[6]],col=trop[4],lwd=3,lty=1)
lines(catplots[[7]],col=trop[4],lwd=3,lty=3)
lines(catplots[[8]],col=trop[5],lwd=3)
legend(200,0.5,legend=c("Known batch","PCA","Sup. svaseq", "Unsup. svaseq","RUV CP","RUV Res.", "RUV Emp.", "No adjustment"),col=trop[c(1,2,3,3,4,4,4,5)],lty=c(1,1,1,2,2,1,3),lwd=3)
nullindex = 401:1000
par(mfrow=c(4,2))
colvec = trop[c(1,2,3,3,4,4,4,5)]
adjvec = c("Known batch","PCA","Sup. svaseq", "Unsup. svaseq","RUV CP","RUV Res.", "RUV Emp.", "No adjustment")
for(i in 1:8){
hist(pvals[[i]][nullindex],main=adjvec[i],col=colvec[i])
}
sessionInfo()
## R version 3.1.0 (2014-04-10)
## Platform: x86_64-apple-darwin10.8.0 (64-bit)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] corrplot_0.73 RColorBrewer_1.0-5
## [3] ffpe_1.9.0 TTR_0.22-0
## [5] xts_0.9-7 zoo_1.7-11
## [7] sva_3.11.3 mgcv_1.7-29
## [9] nlme_3.1-117 RUVSeq_0.99.2
## [11] edgeR_3.7.2 limma_3.21.6
## [13] EDASeq_1.99.1 ShortRead_1.23.11
## [15] GenomicAlignments_1.1.11 Rsamtools_1.17.27
## [17] GenomicRanges_1.17.17 GenomeInfoDb_1.1.6
## [19] Biostrings_2.33.10 XVector_0.5.6
## [21] IRanges_1.99.15 S4Vectors_0.0.8
## [23] BiocParallel_0.7.2 Biobase_2.25.0
## [25] BiocGenerics_0.11.2 polyester_0.99.0
## [27] genefilter_1.47.5 RSkittleBrewer_1.1
## [29] zebrafishRNASeq_0.99.2 knitr_1.6
##
## loaded via a namespace (and not attached):
## [1] affy_1.43.2 affyio_1.33.0 annotate_1.43.4
## [4] AnnotationDbi_1.27.7 aroma.light_2.1.0 base64_1.1
## [7] BatchJobs_1.2 BBmisc_1.6 beanplot_1.1
## [10] BiocInstaller_1.15.5 biomaRt_2.21.0 bitops_1.0-6
## [13] brew_1.0-6 BSgenome_1.33.8 bumphunter_1.5.3
## [16] codetools_0.2-8 colorspace_1.2-4 DBI_0.2-7
## [19] DESeq_1.17.0 digest_0.6.4 doRNG_1.6
## [22] evaluate_0.5.5 fail_1.2 foreach_1.4.2
## [25] formatR_0.10 geneplotter_1.43.0 GenomicFeatures_1.17.9
## [28] grid_3.1.0 htmltools_0.2.4 hwriter_1.3
## [31] illuminaio_0.7.0 iterators_1.0.7 KernSmooth_2.23-12
## [34] lattice_0.20-29 latticeExtra_0.6-26 locfit_1.5-9.1
## [37] lumi_2.17.0 MASS_7.3-33 Matrix_1.1-3
## [40] matrixStats_0.8.14 mclust_4.3 methylumi_2.11.1
## [43] minfi_1.11.7 multtest_2.21.0 nleqslv_2.1.1
## [46] nor1mix_1.1-4 pkgmaker_0.22 plyr_1.8.1
## [49] preprocessCore_1.27.0 R.methodsS3_1.6.1 R.oo_1.18.0
## [52] Rcpp_0.11.1 RCurl_1.95-4.1 registry_0.2
## [55] reshape_0.8.5 rmarkdown_0.2.46 rngtools_1.2.4
## [58] RSQLite_0.11.4 rtracklayer_1.25.9 sendmailR_1.1-2
## [61] sfsmisc_1.0-25 siggenes_1.39.0 splines_3.1.0
## [64] stats4_3.1.0 stringr_0.6.2 survival_2.37-7
## [67] tools_3.1.0 XML_3.98-1.1 xtable_1.7-3
## [70] yaml_2.1.11 zlibbioc_1.11.1