You will need the RSkittleBrewer package for this vignette to run. Installation instructions are available here:
This code also accesses data from the ReCount database:
You will also need R version 3.1.0 or greater and Bioconductor 3.0 or greater. The zebrafishRNASeq package might need to be installed from source. These analyses are based on the devel version of sva (version 3.11.2 or greater).
library(zebrafishRNASeq)
library(RSkittleBrewer)
library(genefilter)
library(RUVSeq)
library(edgeR)
library(sva)
library(ffpe)
## Warning: replacing previous import by 'graphics::image' when loading
## 'methylumi'
library(RColorBrewer)
library(corrplot)
library(limma)
trop = RSkittleBrewer('tropical')
load(url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/montpick_eset.RData"))
mpdat= exprs(montpick.eset)
mppheno = pData(montpick.eset)
mpdat = mpdat[rowMeans(mpdat) > 5,]
# CEU
z = gzcon(url("http://hapmap.ncbi.nlm.nih.gov/downloads/samples_individuals/pedinfo2sample_CEU.txt.gz"))
raw = textConnection(readLines(z))
close(z)
ceuped = read.table(raw)
close(raw)
## YRI
z = gzcon(url("http://hapmap.ncbi.nlm.nih.gov/downloads/samples_individuals/pedinfo2sample_YRI.txt.gz"))
raw = textConnection(readLines(z))
close(z)
yriped = read.table(raw)
close(raw)
Remove all samples where we can not identify sex information exactly.
mppheno$sex = rep(NA,129)
for(i in 1:dim(mppheno)[1]){
if(mppheno$population[i] == "CEU"){
ind = grep(mppheno$sample.id[i],ceuped[,7])
if(length(ind)==1){
mppheno$sex[i] = ceuped[ind,5]
}
}
if(mppheno$population[i] == "YRI"){
ind = grep(mppheno$sample.id[i],yriped[,7])
if(length(ind)==1){
mppheno$sex[i] = yriped[ind,5]
}
}
}
dat0 = mpdat[,!is.na(mppheno$sex)]
mppheno = mppheno[!is.na(mppheno$sex),]
group = as.numeric(mppheno$sex)
study = as.numeric(mppheno$study)
table(study,group)
## group
## study 1 2
## 1 23 22
## 2 25 29
## Set null and alternative models (ignore batch)
mod1 = model.matrix(~group)
mod0 = cbind(mod1[,1])
## Estimate batch with svaseq (unsupervised)
batch_unsup_sva = svaseq(dat0,mod1,mod0,n.sv=1)$sv
## Number of significant surrogate variables is: 1
## Iteration (out of 5 ):1 2 3 4 5
## Estimate batch with pca
ldat0 = log(dat0 + 1)
batch_pca = svd(ldat0 - rowMeans(ldat0))$v[,1]
## Estimate batch with ruv (residuals)
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf
x <- as.factor(group)
design <- model.matrix(~x)
y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
res <- residuals(fit, type="deviance")
seqUQ <- betweenLaneNormalization(dat0, which="upper")
controls = rep(TRUE,dim(dat0)[1])
batch_ruv_res = RUVr(seqUQ,controls,k=1,res)$W
## Estimate batch with ruv empirical controls
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf
y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
lrt <- glmLRT(fit, coef=2)
controls = rank(lrt$table$LR) <= 400
batch_ruv_emp <- RUVg(dat0, controls, k=1)$W
## Plot the results
plot(batch_unsup_sva ~ mppheno$study,pch=19,col=trop[1],main="unsupervised sva")
plot(batch_pca ~ mppheno$study,pch=19,col=trop[2],main="pca")
plot(batch_ruv_res ~ mppheno$study,pch=19,col=trop[3],main="residual ruv")
plot(batch_ruv_emp ~ mppheno$study,pch=19,col=trop[3],main="empirical controls ruv")
batchEstimates = cbind(group,study,batch_unsup_sva,batch_pca,
batch_ruv_res,batch_ruv_emp)
colnames(batchEstimates) = c("group","study","usva","pca","ruvres","ruvemp")
corr = abs(cor(batchEstimates))
corr
## group study usva pca ruvres ruvemp
## group 1.000000 0.04797 0.01824 0.01657 0.01157 0.002352
## study 0.047971 1.00000 0.68805 0.68531 0.76133 0.687062
## usva 0.018238 0.68805 1.00000 0.99993 0.62335 0.998506
## pca 0.016571 0.68531 0.99993 1.00000 0.61640 0.998532
## ruvres 0.011568 0.76133 0.62335 0.61640 1.00000 0.622780
## ruvemp 0.002352 0.68706 0.99851 0.99853 0.62278 1.000000
cols = colorRampPalette(c(trop[2],"white",trop[1]))
par(mar=c(5,5,5,5))
corrplot(corr,method="ellipse",type="lower",col=cols(100),tl.pos="d")
dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
catplots = tstats = vector("list",6)
adj = c("+ study", "+ batch_unsup_sva",
"+ batch_ruv_res", "+ batch_ruv_emp",
"+ batch_pca","")
for(i in 1:6){
design = model.matrix(as.formula(paste0("~ group",adj[i])))
v <- voom(dge,design,plot=FALSE)
fit <- lmFit(v,design)
fit <- eBayes(fit)
tstats[[i]] = abs(fit$t[,2])
names(tstats[[i]]) = as.character(1:dim(dat0)[1])
catplots[[i]] = CATplot(-rank(tstats[[i]]),-rank(tstats[[1]]),maxrank=1000,make.plot=F)
}
plot(catplots[[2]],ylim=c(0,1),col=trop[2],lwd=3,type="l",ylab="Concordance Between using study and different methods",xlab="Rank")
lines(catplots[[3]],col=trop[1],lwd=3,lty=2)
lines(catplots[[4]],col=trop[1],lwd=3)
lines(catplots[[5]],col=trop[3],lwd=3,lty=3)
lines(catplots[[6]],col=trop[4],lwd=3)
legend(200,0.5,legend=c("Unsup. svaseq","RUV Res.", "RUV Emp.","PCA","No adjustment"),col=trop[c(2,1,1,3,4)],lty=c(1,2,1,3,1),lwd=3)
set.seed(35353)
index = c(sample(which(group==1 & study==1),size=20),sample(which(group==1 & study==2),size=10),
sample(which(group==2 & study==1),size=10),sample(which(group==2 & study==2),size=20))
dat0 = dat0[,index]
group = group[index]
study = study[index]
## Set null and alternative models (ignore batch)
mod1 = model.matrix(~group)
mod0 = cbind(mod1[,1])
## Estimate batch with svaseq (unsupervised)
batch_unsup_sva = svaseq(dat0,mod1,mod0,n.sv=1)$sv
## Number of significant surrogate variables is: 1
## Iteration (out of 5 ):1 2 3 4 5
## Estimate batch with pca
ldat0 = log(dat0 + 1)
batch_pca = svd(ldat0 - rowMeans(ldat0))$v[,1]
## Estimate batch with ruv (residuals)
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf
x <- as.factor(group)
design <- model.matrix(~x)
y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
res <- residuals(fit, type="deviance")
seqUQ <- betweenLaneNormalization(dat0, which="upper")
controls = rep(TRUE,dim(dat0)[1])
batch_ruv_res = RUVr(seqUQ,controls,k=1,res)$W
## Estimate batch with ruv empirical controls
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf
y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
lrt <- glmLRT(fit, coef=2)
controls = rank(lrt$table$LR) <= 400
batch_ruv_emp <- RUVg(dat0, controls, k=1)$W
## Plot the results
plot(batch_unsup_sva ~ mppheno$study[index],pch=19,col=trop[1],main="unsupervised sva")
plot(batch_pca ~ mppheno$study[index],pch=19,col=trop[2],main="pca")
plot(batch_ruv_res ~ mppheno$study[index],pch=19,col=trop[3],main="residual ruv")
plot(batch_ruv_emp ~ mppheno$study[index],pch=19,col=trop[3],main="empirical controls ruv")
batchEstimates = cbind(group,study,batch_unsup_sva,batch_pca,
batch_ruv_res,batch_ruv_emp)
colnames(batchEstimates) = c("group","study","usva","pca","ruvres","ruvemp")
corr = abs(cor(batchEstimates))
corr
## group study usva pca ruvres ruvemp
## group 1.00000 0.3333 0.1840 0.1850 0.02194 0.1619
## study 0.33333 1.0000 0.7578 0.7571 0.78504 0.7521
## usva 0.18400 0.7578 1.0000 1.0000 0.67364 0.9982
## pca 0.18496 0.7571 1.0000 1.0000 0.67190 0.9982
## ruvres 0.02194 0.7850 0.6736 0.6719 1.00000 0.6634
## ruvemp 0.16188 0.7521 0.9982 0.9982 0.66336 1.0000
cols = colorRampPalette(c(trop[2],"white",trop[1]))
par(mar=c(5,5,5,5))
corrplot(corr,method="ellipse",type="lower",col=cols(100),tl.pos="d")
dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
catplots = tstats = vector("list",6)
adj = c("+ study", "+ batch_unsup_sva",
"+ batch_ruv_res", "+ batch_ruv_emp",
"+ batch_pca","")
for(i in 1:6){
design = model.matrix(as.formula(paste0("~ group",adj[i])))
v <- voom(dge,design,plot=FALSE)
fit <- lmFit(v,design)
fit <- eBayes(fit)
tstats[[i]] = abs(fit$t[,2])
names(tstats[[i]]) = as.character(1:dim(dat0)[1])
catplots[[i]] = CATplot(-rank(tstats[[i]]),-rank(tstats[[1]]),maxrank=1000,make.plot=F)
}
plot(catplots[[2]],ylim=c(0,1),col=trop[2],lwd=3,type="l",ylab="Concordance Between using study and different methods",xlab="Rank")
lines(catplots[[3]],col=trop[1],lwd=3,lty=2)
lines(catplots[[4]],col=trop[1],lwd=3)
lines(catplots[[5]],col=trop[3],lwd=3,lty=3)
lines(catplots[[6]],col=trop[4],lwd=3)
legend(200,0.5,legend=c("Unsup. svaseq","RUV Res.", "RUV Emp.","PCA","No adjustment"),col=trop[c(2,1,1,3,4)],lty=c(1,2,1,3,1),lwd=3)
sessionInfo()
## R version 3.1.0 (2014-04-10)
## Platform: x86_64-apple-darwin10.8.0 (64-bit)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] zebrafishRNASeq_0.99.2 sva_3.11.1
## [3] RUVSeq_0.99.2 RSkittleBrewer_1.1
## [5] RColorBrewer_1.0-5 mgcv_1.7-29
## [7] nlme_3.1-117 genefilter_1.47.5
## [9] ffpe_1.9.0 TTR_0.22-0
## [11] xts_0.9-7 zoo_1.7-11
## [13] edgeR_3.7.2 limma_3.21.6
## [15] EDASeq_1.99.1 ShortRead_1.23.11
## [17] GenomicAlignments_1.1.11 Rsamtools_1.17.22
## [19] GenomicRanges_1.17.17 GenomeInfoDb_1.1.6
## [21] corrplot_0.73 corpcor_1.6.6
## [23] Biostrings_2.33.10 XVector_0.5.6
## [25] IRanges_1.99.15 S4Vectors_0.0.8
## [27] BiocParallel_0.7.2 Biobase_2.25.0
## [29] BiocGenerics_0.11.2 knitr_1.6
##
## loaded via a namespace (and not attached):
## [1] affy_1.43.2 affyio_1.33.0 annotate_1.43.4
## [4] AnnotationDbi_1.27.7 aroma.light_2.1.0 base64_1.1
## [7] BatchJobs_1.2 BBmisc_1.6 beanplot_1.1
## [10] BiocInstaller_1.15.5 biomaRt_2.21.0 bitops_1.0-6
## [13] brew_1.0-6 BSgenome_1.33.8 bumphunter_1.5.3
## [16] codetools_0.2-8 colorspace_1.2-4 DBI_0.2-7
## [19] DESeq_1.17.0 digest_0.6.4 doRNG_1.6
## [22] evaluate_0.5.5 fail_1.2 foreach_1.4.2
## [25] formatR_0.10 geneplotter_1.43.0 GenomicFeatures_1.17.9
## [28] grid_3.1.0 htmltools_0.2.4 hwriter_1.3
## [31] illuminaio_0.7.0 iterators_1.0.7 KernSmooth_2.23-12
## [34] lattice_0.20-29 latticeExtra_0.6-26 locfit_1.5-9.1
## [37] lumi_2.17.0 MASS_7.3-33 Matrix_1.1-3
## [40] matrixStats_0.8.14 mclust_4.3 methylumi_2.11.1
## [43] minfi_1.11.7 multtest_2.21.0 nleqslv_2.1.1
## [46] nor1mix_1.1-4 pkgmaker_0.22 plyr_1.8.1
## [49] preprocessCore_1.27.0 R.methodsS3_1.6.1 R.oo_1.18.0
## [52] Rcpp_0.11.1 RCurl_1.95-4.1 registry_0.2
## [55] reshape_0.8.5 rmarkdown_0.2.46 rngtools_1.2.4
## [58] RSQLite_0.11.4 rtracklayer_1.25.9 sendmailR_1.1-2
## [61] sfsmisc_1.0-25 siggenes_1.39.0 splines_3.1.0
## [64] stats4_3.1.0 stringr_0.6.2 survival_2.37-7
## [67] tools_3.1.0 XML_3.98-1.1 xtable_1.7-3
## [70] yaml_2.1.11 zlibbioc_1.11.1