Dependencies

This document depends on the following packages:

  library(gplots)
  library(devtools)
  library(Biobase)
  library(RSkittleBrewer)
  library(org.Hs.eg.db)
  library(AnnotationDbi)

To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE from the chunk.)

install.packages(c("devtools","gplots"))
source("http://www.bioconductor.org/biocLite.R")
biocLite(c("Biobase","org.Hs.eg.db","AnnotationDbi"))
biocLite("alyssafrazee/RSkittleBrewer")

General principles

  • Use plots as often as possible
  • Use tables for phenotype data
  • Look for
  • Missing values
  • Outlier values
  • Mislabeled samples
  • Naming consistency

Make the plots pretty

Typically we will use color to explore data sets and label different values. There are a large number of color options in R. I like the RSkittleBrewer package, but you can also check out Jenny Bryan’s excellent lecture on colors in R for more information.

Load the library and set the color palette with the palette function. Now when I type col = 1 it will look for the first color in the trop colors. We also set the character to be a filled dot with par(pch=19).

library(RSkittleBrewer)
# Make the colors pretty
trop = RSkittleBrewer("tropical")
palette(trop)
par(pch=19)

Load some data

We will use this expression set to look at how we use plots and tables to check for different characteristics

con = url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData")
load(file=con)
close(con)
bm = bodymap.eset
pdata=pData(bm)
edata=exprs(bm)
fdata = fData(bm)
ls()
## [1] "bm"           "bodymap.eset" "con"          "edata"       
## [5] "fdata"        "pdata"        "tropical"

Tables for factor/character variables

Tables are good for looking at factor or character variables, especially in phenotype data

table(pdata$gender)
## 
## F M 
## 8 8
table(pdata$gender,pdata$race)
##    
##     african_american caucasian
##   F                1         7
##   M                0         8

Look for missing values

First check a summary of the distribution to look for scale, this is also one way to check for NA values.

summary(edata)
##    ERS025098           ERS025092          ERS025085       
##  Min.   :      0.0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:     0.0  
##  Median :      0.0   Median :     0.0   Median :     0.0  
##  Mean   :    455.6   Mean   :   361.1   Mean   :   399.3  
##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:     0.0  
##  Max.   :1584528.0   Max.   :499802.0   Max.   :808641.0  
##    ERS025088           ERS025089         ERS025082      
##  Min.   :      0.0   Min.   :      0   Min.   :      0  
##  1st Qu.:      0.0   1st Qu.:      0   1st Qu.:      0  
##  Median :      0.0   Median :      0   Median :      0  
##  Mean   :    445.5   Mean   :    445   Mean   :    509  
##  3rd Qu.:      0.0   3rd Qu.:      0   3rd Qu.:      0  
##  Max.   :1014579.0   Max.   :1415741   Max.   :2484692  
##    ERS025081           ERS025096         ERS025099       
##  Min.   :      0.0   Min.   :      0   Min.   :     0.0  
##  1st Qu.:      0.0   1st Qu.:      0   1st Qu.:     0.0  
##  Median :      0.0   Median :      0   Median :     0.0  
##  Mean   :    430.4   Mean   :    558   Mean   :   445.5  
##  3rd Qu.:      0.0   3rd Qu.:      0   3rd Qu.:     0.0  
##  Max.   :1356643.0   Max.   :3517964   Max.   :572374.0  
##    ERS025086          ERS025084          ERS025087     
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0  
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0  
##  Median :     0.0   Median :     0.0   Median :     0  
##  Mean   :   370.7   Mean   :   592.1   Mean   :  1097  
##  3rd Qu.:     0.0   3rd Qu.:     0.0   3rd Qu.:     0  
##  Max.   :458168.0   Max.   :287539.0   Max.   :519683  
##    ERS025093           ERS025083          ERS025095        
##  Min.   :      0.0   Min.   :     0.0   Min.   :      0.0  
##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:      0.0  
##  Median :      0.0   Median :     0.0   Median :      0.0  
##  Mean   :    997.7   Mean   :   434.7   Mean   :    479.6  
##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:      0.0  
##  Max.   :1332827.0   Max.   :626824.0   Max.   :1605570.0  
##    ERS025097           ERS025094          ERS025090       
##  Min.   :      0.0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:     0.0  
##  Median :      0.0   Median :     0.0   Median :     0.0  
##  Mean   :    540.1   Mean   :   518.5   Mean   :   465.3  
##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:     0.0  
##  Max.   :1888083.0   Max.   :776623.0   Max.   :535410.0  
##    ERS025091        
##  Min.   :      0.0  
##  1st Qu.:      0.0  
##  Median :      0.0  
##  Mean   :    530.1  
##  3rd Qu.:      0.0  
##  Max.   :1221921.0

NA is the most common character for missing values, but sometimes they are coded as spaces, 999, -1 or “missing”. Check for missing values in a variety of ways

# Use option useNA to include NA's in table
table(pdata$age,useNA="ifany")
## 
##   19   29   37   47   58   60   65   68   73   77   86 <NA> 
##    1    1    1    1    1    3    1    1    2    3    1    3
# is.na checks for NA values
table(is.na(pdata$age))
## 
## FALSE  TRUE 
##    16     3
# Check for other common missing names
sum(pdata$age==" ")
## [1] NA
# Check genomic data for NAs
sum(is.na(edata))
## [1] 0
# Make the distribution of NA's by genes
gene_na = rowSums(is.na(edata))
table(gene_na)
## gene_na
##     0 
## 52580
# Make the distribution of NA's by samples
sample_na = rowSums(is.na(edata))
table(sample_na)
## sample_na
##     0 
## 52580

Make sure dimensions match up

The number of rows of the feature data should match the number of rows of the genomic data (both are the number of genes). The number of rows of the phenotype data should match the number of columns of the genomic data (both are the number of samples).

dim(fdata)
## [1] 52580     1
dim(pdata)
## [1] 19  6
dim(edata)
## [1] 52580    19

Look at overall distributions

Here we see that there are a lot of outliers

boxplot(log2(edata+1),col=2,range=0)

We can also look at this sample by sample with histograms

par(mfrow=c(1,2))
hist(log2(edata[,1]+1),col=2)
hist(log2(edata[,2]+1),col=2)

Or with density plots

plot(density(log2(edata[,1]+1)),col=2)
lines(density(log2(edata[,2]+1)),col=3)

A very common task is to compare distributions of measurements (say before normalization). You can do this with a qq-plot

qqplot(log2(edata[,1]+1), log2(edata[,2]+1),col=3)

A very widely used plot is what is known as a M-A plot, sometimes called a Bland Altman plot. The basic idea is to plot the sum of the two values on the x-axis and the difference on the y-axis. This can be used to see any difference between the (samples, averages, etc.) and to see if there is any intensity-specific biases.

mm = log2(edata[,1]+1) - log2(edata[,2]+1)
aa = log2(edata[,1]+1) + log2(edata[,2]+1)
plot(aa,mm,col=2)