This document depends on the following packages:
library(devtools)
library(Biobase)
library(goseq)
library(DESeq2)
To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE
from the chunk.)
install.packages(c("devtools","MatrixEQTL"))
source("http://www.bioconductor.org/biocLite.R")
biocLite(c("Biobase","goseq","DESeq2"))
Here we are going to follow along with the tutorial on goseq. You can see what genomes are supported by this package
head(supportedGenomes())
## db species date name
## 1 hg38 Human Dec. 2013 Genome Reference Consortium GRCh38
## 2 hg19 Human Feb. 2009 Genome Reference Consortium GRCh37
## 3 hg18 Human Mar. 2006 NCBI Build 36.1
## 4 hg17 Human May 2004 NCBI Build 35
## 5 hg16 Human Jul. 2003 NCBI Build 34
## 6 vicPac2 Alpaca Mar. 2013 Broad Institute Vicugna_pacos-2.0.1
## AvailableGeneIDs
## 1
## 2 ccdsGene,ensGene,exoniphy,geneSymbol,knownGene,nscanGene,refGene,xenoRefGene
## 3 acembly,acescan,ccdsGene,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,knownGeneOld3,refGene,sgpGene,sibGene,xenoRefGene
## 4 acembly,acescan,ccdsGene,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,refGene,sgpGene,vegaGene,vegaPseudoGene,xenoRefGene
## 5 acembly,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,refGene,sgpGene
## 6
head(supportedGeneIDs())
## db track subtrack GeneID
## 1 knownGene UCSC Genes <NA> Entrez Gene ID
## 2 knownGeneOld3 Old UCSC Genes <NA>
## 3 ccdsGene CCDS <NA>
## 4 refGene RefSeq Genes <NA> Entrez Gene ID
## 5 xenoRefGene Other RefSeq <NA>
## 6 vegaGene Vega Genes Vega Protein Genes HAVANA Pseudogene ID
## AvailableGenomes
## 1 hg16,hg17,hg18,hg19,mm7,mm8,mm9,rn3,rn4
## 2 hg18
## 3 hg17,hg18,hg19,mm8,mm9
## 4 bosTau2,bosTau3,bosTau4,canFam1,canFam2,ce2,ce4,ce6,ci1,ci2,danRer3,danRer4,danRer5,danRer6,dm1,dm2,dm3,equCab1,equCab2,felCat3,galGal2,galGal3,hg16,hg17,hg18,hg19,mm7,mm8,mm9,monDom4,monDom5,ornAna1,oryLat2,panTro2,ponAbe2,rheMac2,rn3,rn4,strPur1,strPur2,taeGut1,xenTro2
## 5 anoCar1,aplCal1,braFlo1,caeJap1,caePb1,caePb2,caeRem2,caeRem3,calJac1,canFam1,canFam2,cavPor3,cb1,cb3,ce4,ce6,ci1,ci2,danRer6,dp2,dp3,droAna1,droAna2,droEre1,droGri1,droMoj1,droMoj2,droPer1,droSec1,droSim1,droVir1,droVir2,droYak1,droYak2,equCab2,felCat3,galGal3,hg17,hg18,hg19,loxAfr3,mm7,mm8,mm9,monDom4,monDom5,ornAna1,oryLat2,panTro1,panTro2,petMar1,ponAbe2,priPac1,rheMac2,rn3,rn4,strPur1,strPur2,taeGut1
## 6 danRer5,hg17
Here we load the example frmo the goseq
package.
temp_data =read.table(system.file("extdata","Li_sum.txt",
package="goseq"),sep="\t",
header=TRUE,
stringsAsFactors=FALSE)
expr= temp_data[,-1]
rownames(expr) = temp_data[,1]
expr = expr[rowMeans(expr) > 5,]
grp=factor(rep(c("Control","Treated"),times=c(4,3)))
pdata = data.frame(grp)
Now we perform a differential expression analysis for the group variable with DESeq2
de = DESeqDataSetFromMatrix(expr, pdata, ~grp)
de_fit = DESeq(de)
de_results = results(de_fit)
Get the differentially expressed genes after FDR correction
genes = as.integer(de_results$padj < 0.05)
not_na = !is.na(genes)
names(genes) = rownames(expr)
genes = genes[not_na]
Here we look at some of the automatically supported genomes
head(supportedGenomes(),n=12)[,1:4]
## db species date name
## 1 hg38 Human Dec. 2013 Genome Reference Consortium GRCh38
## 2 hg19 Human Feb. 2009 Genome Reference Consortium GRCh37
## 3 hg18 Human Mar. 2006 NCBI Build 36.1
## 4 hg17 Human May 2004 NCBI Build 35
## 5 hg16 Human Jul. 2003 NCBI Build 34
## 6 vicPac2 Alpaca Mar. 2013 Broad Institute Vicugna_pacos-2.0.1
## 7 vicPac1 Alpaca Jul. 2008 Broad Institute VicPac1.0
## 8 dasNov3 Armadillo Dec. 2011 Broad Institute DasNov3
## 9 otoGar3 Bushbaby Mar. 2011 Broad Institute OtoGar3
## 10 papHam1 Baboon Nov. 2008 Baylor College of Medicine HGSC Pham_1.0
## 11 papAnu2 Baboon Mar. 2012 Baylor College of Medicine Panu_2.0
## 12 felCat5 Cat Sep. 2011 ICGSC Felis_catus-6.2
We need to set up a weighting function for all the genes in that genome
pwf=nullp(genes,"hg19","ensGene")
head(pwf)
## DEgenes bias.data pwf
## ENSG00000124208 0 1978 0.2362794
## ENSG00000224628 0 1510 0.2265504
## ENSG00000125835 0 954 0.2147844
## ENSG00000125834 0 6013 0.2972752
## ENSG00000197818 0 6295 0.2996675
## ENSG00000101181 0 2644 0.2520835
Here we use a parametric test to look for differences in enrichment with respect to different categories. This is the 2 x 2 table approach. You can also use random sampling to define the null distribution by setting the parameters method="Sampling"
and repcnt=1000
for the number of null repititions.
GO.wall=goseq(pwf,"hg19","ensGene")
head(GO.wall)
## category over_represented_pvalue under_represented_pvalue
## 9656 GO:0044763 2.243186e-18 1
## 9578 GO:0044421 2.244623e-18 1
## 2058 GO:0005576 3.691127e-18 1
## 6756 GO:0031982 6.248427e-18 1
## 6761 GO:0031988 1.472125e-17 1
## 9636 GO:0044699 5.082258e-17 1
## numDEInCat numInCat term ontology
## 9656 1750 5932 single-organism cellular process BP
## 9578 616 1798 extracellular region part CC
## 2058 654 1930 extracellular region CC
## 6756 669 1984 vesicle CC
## 6761 651 1929 membrane-bounded vesicle CC
## 9636 1894 6537 single-organism process BP
Suppose there is a particular category or function you are interested in. You can limit to just that category
GO.MF=goseq(pwf,"hg19","ensGene",test.cats=c("GO:MF"))
head(GO.MF)
## category over_represented_pvalue under_represented_pvalue
## 909 GO:0005198 5.318741e-08 1.0000000
## 1004 GO:0005515 2.398716e-05 0.9999808
## 2566 GO:0044822 2.639927e-05 0.9999810
## 205 GO:0003723 6.656537e-05 0.9999495
## 1941 GO:0023023 1.045388e-04 0.9999963
## 1942 GO:0023026 1.045388e-04 0.9999963
## numDEInCat numInCat term ontology
## 909 129 334 structural molecule activity MF
## 1004 1677 6006 protein binding MF
## 2566 322 1024 poly(A) RNA binding MF
## 205 395 1297 RNA binding MF
## 1941 8 9 MHC protein complex binding MF
## 1942 8 9 MHC class II protein complex binding MF
Gene set and other enrichment analyses are widely used to make sense of genomic results. Here are a couple of good places to start
Here is the session information
devtools::session_info()
## setting value
## version R version 3.2.1 (2015-06-18)
## system x86_64, darwin10.8.0
## ui RStudio (0.99.447)
## language (EN)
## collate en_US.UTF-8
## tz America/New_York
##
## package * version date
## acepack 1.3-3.3 2014-11-24
## annotate 1.46.1 2015-07-11
## AnnotationDbi * 1.30.1 2015-04-26
## assertthat 0.1 2013-12-06
## BiasedUrn * 1.06.1 2013-12-29
## Biobase * 2.28.0 2015-04-17
## BiocGenerics * 0.14.0 2015-04-17
## BiocInstaller * 1.18.4 2015-07-22
## BiocParallel 1.2.20 2015-08-07
## biomaRt 2.24.0 2015-04-17
## Biostrings 2.36.3 2015-08-12
## bitops 1.0-6 2013-08-17
## bladderbatch * 1.6.0 2015-08-26
## broom * 0.3.7 2015-05-06
## caTools 1.17.1 2014-09-10
## cluster 2.0.3 2015-07-21
## colorspace 1.2-6 2015-03-11
## corpcor 1.6.8 2015-07-08
## curl 0.9.2 2015-08-08
## DBI * 0.3.1 2014-09-24
## dendextend * 1.1.0 2015-07-31
## DESeq2 * 1.8.1 2015-05-02
## devtools * 1.8.0 2015-05-09
## digest 0.6.8 2014-12-31
## dplyr * 0.4.3 2015-09-01
## edge * 2.1.0 2015-09-06
## evaluate 0.7.2 2015-08-13
## foreign 0.8-66 2015-08-19
## formatR 1.2 2015-04-21
## Formula * 1.2-1 2015-04-07
## futile.logger 1.4.1 2015-04-20
## futile.options 1.0.0 2010-04-06
## gdata 2.17.0 2015-07-04
## genefilter * 1.50.0 2015-04-17
## geneLenDataBase * 1.4.0 2015-09-06
## geneplotter 1.46.0 2015-04-17
## GenomeInfoDb * 1.4.2 2015-08-15
## GenomicAlignments 1.4.1 2015-04-24
## GenomicFeatures 1.20.2 2015-08-14
## GenomicRanges * 1.20.5 2015-06-09
## genstats * 0.1.02 2015-09-05
## ggplot2 * 1.0.1 2015-03-17
## git2r 0.11.0 2015-08-12
## GO.db 3.1.2 2015-09-06
## goseq * 1.20.0 2015-04-17
## gplots * 2.17.0 2015-05-02
## gridExtra 2.0.0 2015-07-14
## gtable 0.1.2 2012-12-05
## gtools 3.5.0 2015-05-29
## highr 0.5 2015-04-21
## HistData * 0.7-5 2014-04-26
## Hmisc * 3.16-0 2015-04-30
## htmltools 0.2.6 2014-09-08
## httr 1.0.0 2015-06-25
## IRanges * 2.2.7 2015-08-09
## KernSmooth 2.23-15 2015-06-29
## knitr * 1.11 2015-08-14
## lambda.r 1.1.7 2015-03-20
## lattice * 0.20-33 2015-07-14
## latticeExtra 0.6-26 2013-08-15
## lazyeval 0.1.10 2015-01-02
## limma * 3.24.15 2015-08-06
## lme4 1.1-9 2015-08-20
## locfit 1.5-9.1 2013-04-20
## magrittr 1.5 2014-11-22
## MASS * 7.3-43 2015-07-16
## Matrix * 1.2-2 2015-07-08
## MatrixEQTL * 2.1.1 2015-02-03
## memoise 0.2.1 2014-04-22
## mgcv * 1.8-7 2015-07-23
## minqa 1.2.4 2014-10-09
## mnormt 1.5-3 2015-05-25
## munsell 0.4.2 2013-07-11
## nlme * 3.1-122 2015-08-19
## nloptr 1.0.4 2014-08-04
## nnet 7.3-10 2015-06-29
## org.Hs.eg.db * 3.1.2 2015-07-17
## plyr 1.8.3 2015-06-12
## preprocessCore * 1.30.0 2015-04-17
## proto 0.3-10 2012-12-22
## psych 1.5.6 2015-07-08
## qvalue * 2.0.0 2015-04-17
## R6 2.1.1 2015-08-19
## RColorBrewer 1.1-2 2014-12-07
## Rcpp * 0.12.0 2015-07-25
## RcppArmadillo * 0.5.400.2.0 2015-08-17
## RCurl 1.95-4.7 2015-06-30
## reshape2 1.4.1 2014-12-06
## rmarkdown 0.7 2015-06-13
## rpart 4.1-10 2015-06-29
## Rsamtools 1.20.4 2015-06-01
## RSkittleBrewer * 1.1 2015-09-05
## RSQLite * 1.0.0 2014-10-25
## rstudioapi 0.3.1 2015-04-07
## rtracklayer 1.28.9 2015-08-19
## rversions 1.0.2 2015-07-13
## S4Vectors * 0.6.5 2015-09-01
## scales 0.3.0 2015-08-25
## snm 1.16.0 2015-04-17
## snpStats * 1.18.0 2015-04-17
## stringi 0.5-5 2015-06-29
## stringr 1.0.0 2015-04-30
## survival * 2.38-3 2015-07-02
## sva * 3.14.0 2015-04-17
## tidyr 0.2.0 2014-12-05
## UsingR * 2.0-5 2015-08-06
## whisker 0.3-2 2013-04-28
## XML 3.98-1.3 2015-06-30
## xml2 0.1.2 2015-09-01
## xtable 1.7-4 2014-09-12
## XVector 0.8.0 2015-04-17
## yaml 2.1.13 2014-06-12
## zlibbioc 1.14.0 2015-04-17
## source
## CRAN (R 3.2.0)
## Bioconductor
## Bioconductor
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## CRAN (R 3.2.0)
## Bioconductor
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## Bioconductor
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.2)
## Github (jdstorey/edge@a1947b5)
## CRAN (R 3.2.2)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## local
## CRAN (R 3.2.0)
## CRAN (R 3.2.2)
## Bioconductor
## Bioconductor
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## Bioconductor
## CRAN (R 3.2.1)
## CRAN (R 3.2.2)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## Bioconductor
## CRAN (R 3.2.2)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## Bioconductor
## CRAN (R 3.2.1)
## Bioconductor
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## Bioconductor
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.1)
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.1)
## Bioconductor
## Github (alyssafrazee/RSkittleBrewer@0a96a20)
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## Bioconductor
## CRAN (R 3.2.1)
## Bioconductor
## CRAN (R 3.2.2)
## Bioconductor
## Bioconductor
## CRAN (R 3.2.1)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## Bioconductor
## CRAN (R 3.2.0)
## CRAN (R 3.2.2)
## CRAN (R 3.2.0)
## CRAN (R 3.2.1)
## CRAN (R 3.2.2)
## CRAN (R 3.2.0)
## Bioconductor
## CRAN (R 3.2.0)
## Bioconductor
It is also useful to compile the time the document was processed. This document was processed on: 2015-09-06.