Replication paper

Load packages you need

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.2.2

library(RCurl)
library(grid)
library(ggplot2)
library(downloader)
library(gridExtra)
library(extrafont)
library(Hmisc)

Download the data

We follow the data reading approach given in the code reproducing the paper.

download("https://osf.io/fgjvw/?action=download",destfile="../data/rpp_data.csv")
download("https://osf.io/bhcsf/?action=download",destfile="../data/rpp_data_codebook.csv")
date_downloaded = date()
date()

## [1] "Thu Sep 17 14:33:12 2015"

Reproducing Figure 3 from the paper

First we download the data following the lines 30-43 of the original code and updating because we had trouble getting the functions to work from OSF.

## Here we follow 

dat = read.csv("../data/rpp_data.csv",stringsAsFactors=F)
fdat = dplyr::filter(dat, !is.na(T_pval_USE..O.), !is.na(T_pval_USE..R.))

## 99 expected
nrow(fdat)

## [1] 99

idOK <- complete.cases(fdat$T_r..O.,fdat$T_r..R.)

## 97 expected
sum(idOK)

## [1] 97

Next make the plot following lines 192-250 of the original code and updating because we had trouble getting the functions to work from OSF

## Got this from https://github.com/FredHasselman/toolboxR/blob/master/C-3PR.R was having trouble with the
## sourcing

source("utils.R")
mytheme <- gg.theme("clean")

## Line 149 needed to make numeric
fdat$Power..R. <- as.numeric(fdat$Power..R.)

fdat$oriSig = "Not Significant"
fdat$oriSig[fdat$T_pval_USE..O.<=.06] = "Significant"
fdat$oriSig = factor(fdat$oriSig)

fdat$repSig = "Not Significant"
fdat$repSig[fdat$T_pval_USE..R.<=.05] = "Significant"
fdat$repSig = factor(fdat$repSig)

blankPlot <- plotHolder()


xDense <- ggplot(fdat, aes(x=T_r..O., fill=oriSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(0,1) +
  gg.theme("noax") + 
  theme(legend.position = "none",plot.margin = unit(c(0,0,0,4), "lines"))


yDense <- ggplot(fdat, aes(x=T_r..R., fill=repSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(-.5,1) + 
  coord_flip() + 
  gg.theme("noax") + 
  theme(legend.position = "none", plot.margin = unit(c(0,0,3,0), "lines")) 


scatterP<-
  ggplot(fdat,aes(x=T_r..O.,y=T_r..R.)) +  
  geom_hline(aes(yintercept=0),linetype=2) +
  geom_abline(intercept=0,slope=1,color="Grey60")+
  geom_point(aes(size=Power..R.,fill=repSig),color="Grey30",shape=21,alpha=.8) + 
  geom_rug(aes(color=oriSig),size=1,sides="b",alpha=.6) + 
  geom_rug(aes(color=repSig),size=1,sides="l",alpha=.6) + 
  scale_x_continuous(name="Original Effect Size",limits=c(0,1),breaks=c(0,.25,.5,.75,1)) + 
  scale_y_continuous(name="Replication Effect Size",limits=c(-.5,1),breaks=c(-.5,-.25,0,.25,.5,.75,1)) + 
  ggtitle("") + xlab("") + ylab("") + 
  scale_size_continuous(name="Replication Power",range=c(2,9)) + 
  scale_color_discrete(name="p-value") +
  scale_fill_discrete(name="p-value") +
  gg.theme("clean") + 
  theme(legend.position=c(.9,.6), plot.margin = unit(c(-2,-1.5,2,2), "lines")) 

grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1.4), heights=c(1.4, 4))

dim(fdat)

## [1]  99 140

Now we transform to Fisher

We follow the lines 490-516 to get the Fisher transforms include the degrees of freedom adjustments used by the original authors.

colnames(dat)[1] = "ID"

cor_orig = dat$T_r..O.
cor_rep = dat$T_r..R.

n_orig = dat$T_df2..O. + 2
n_rep = dat$T_df2..R. + 2


### Partial correlation, so degrees of freedom plus 2 in order to get N
n_orig[dat$ID == 82] <- dat$T_df1..O.[82]+2
n_rep[dat$ID == 82] <- dat$T_df1..R.[82]+2

### Correlation
n_orig[dat$ID == 120] <- dat$T_N..O.[120]
n_rep[dat$ID == 120] <- dat$T_N..R.[120]
n_orig[dat$ID == 154] <- dat$T_N..O.[154]
n_rep[dat$ID == 154] <- dat$T_N..R.[154]
n_orig[dat$ID == 155] <- dat$T_N..O.[155]
n_rep[dat$ID == 155] <- dat$T_N..R.[155]

### t
n_orig[dat$ID == 121] <- dat$T_N..O.[121]
n_rep[dat$ID == 121] <- dat$T_N..R.[121]

### Transform to Fisher's z
fish_orig = atanh(cor_orig)
fish_rep = atanh(cor_rep)

Calculate the prediction interval

Here we calculate prediction intervals based on Fisher’s z-scores.

se_total <- sqrt(1/(n_orig-3) + 1/(n_rep-3))
low = tanh(fish_orig - se_total * 1.96)
high = tanh(fish_orig + se_total * 1.96)
too_high = (cor_rep > high)
too_low = (cor_rep < low)
use_index = (!is.na(dat$T_pval_USE..O.) & !is.na(dat$T_pval_USE..R.))

pi_dat = data.frame(cor_orig, cor_rep, low, high, se_total, too_low, too_high, n_orig, n_rep,val = (too_low+2*too_high+1),use_index)

pi_dat = dplyr::filter(pi_dat, use_index > 0)

Here we make a plot. We haven’t filtered out the cases where we have more than one degree of freedom for the test yet.

pi_dat_nona = pi_dat[rowSums(is.na(pi_dat))==0,]
cols1 = c("grey","hotpink","dodgerblue")
plot(pi_dat_nona$cor_orig,pi_dat_nona$cor_rep,ylim=c(-1,1),xlim=c(0,1),
     xlab="Original Effect",ylab="Replication Effect",col=cols1[pi_dat_nona$val],pch=19)
abline(c(0,1),lwd=3,lty=2)
abline(h=0,lwd=3,lty=2)
segments(pi_dat_nona$cor_orig,pi_dat_nona$low,pi_dat_nona$cor_orig,pi_dat_nona$high,col=cols1[pi_dat_nona$val],lwd=0.75)
legend(0.55,-0.5,col=c("grey","hotpink","dodgerblue"),pch=19,
       legend=c("In prediction interval","Below prediction interval","Above prediction interval"))

Here we save the plot to a pdf

font_import(pattern="Lato",prompt=FALSE)
loadfonts()
pdf(file = "../results/pi_figure_nofilter.pdf",family="Lato")
plot(pi_dat_nona$cor_orig,pi_dat_nona$cor_rep,ylim=c(-1,1),xlim=c(0,1),
     xlab="Original Effect",ylab="Replication Effect",bg=cols1[pi_dat_nona$val], col="darkgrey",
     pch=21,cex.lab=1.2,cex.axis=1.2,cex=1.5)
abline(c(0,1),lwd=3)
abline(h=0,lwd=3,lty=2)
segments(pi_dat_nona$cor_orig,pi_dat_nona$low,pi_dat_nona$cor_orig,pi_dat_nona$high,col=cols1[pi_dat_nona$val],lwd=0.75)
legend(0.55,-0.5,pt.bg=c("grey","hotpink","dodgerblue"),col="darkgrey",pch=21,
       legend=c("In prediction interval","Below prediction interval","Above prediction interval"),cex=1.2)
dev.off()

## quartz_off_screen 
##                 2

Now plot sample size colored the same way

plot(log10(pi_dat_nona$n_orig),log10(pi_dat_nona$n_rep),
     xlab="Original n",ylab="Replication n",col=cols1[pi_dat_nona$val],pch=19,
     xaxt="n",yaxt="n")
axis(1,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
axis(2,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
abline(c(0,1),lwd=3)

Save the sample size plot in a pdf

pdf(file = "../results/samplesize_figure_nofilter.pdf",family="Lato")
plot(log10(pi_dat_nona$n_orig),log10(pi_dat_nona$n_rep),
     xlab="Original n",ylab="Replication n",col="darkgrey",bg=cols1[pi_dat_nona$val],pch=21,
     xaxt="n",yaxt="n",cex=1.5)
axis(1,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
axis(2,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
abline(c(0,1),lwd=3)
dev.off()

## quartz_off_screen 
##                 2

What percentage are in the table in various ways

table(pi_dat_nona$cor_rep < pi_dat_nona$low)

## 
## FALSE  TRUE 
##    71    21

table(pi_dat_nona$cor_rep < pi_dat_nona$high)

## 
## FALSE  TRUE 
##     2    90

Now lets do the same thing, but filtering as in the paper

pi_dat = cbind(pi_dat,stat=as.character(fdat$T_Test.Statistic..R.),df1 = fdat$T_df1..O.)
pi_dat = mutate(pi_dat, stat_index= (stat == "F" & df1 == 1) | stat == "t" | stat == "r")
pi_dat_filt = filter(pi_dat,stat_index > 0)

Now we make the plot after filtering

cols1 = c("grey","hotpink","dodgerblue")
plot(pi_dat_filt$cor_orig,pi_dat_filt$cor_rep,ylim=c(-1,1),xlim=c(0,1),
     xlab="Original Effect",ylab="Replication Effect",col=cols1[pi_dat_filt$val],pch=19)
abline(c(0,1),lwd=3,lty=2)
abline(h=0,lwd=3,lty=2)
segments(pi_dat_filt$cor_orig,pi_dat_filt$low,pi_dat_filt$cor_orig,pi_dat_filt$high,col=cols1[pi_dat_filt$val],lwd=0.75)
legend(0.55,-0.5,col=c("grey","hotpink","dodgerblue"),pch=19,
       legend=c("In prediction interval","Below prediction interval","Above prediction interval"))

Here we save the plot to a pdf

pdf(file = "../results/pi_figure_filter.pdf",family="Lato")
plot(pi_dat_filt$cor_orig,pi_dat_filt$cor_rep,ylim=c(-1,1),xlim=c(0,1),
     xlab="Original Effect",ylab="Replication Effect",bg=cols1[pi_dat_filt$val], col="darkgrey",
     pch=21,cex.lab=1.2,cex.axis=1.2,cex=1.2)
abline(c(0,1),lwd=3)
abline(h=0,lwd=3,lty=2)
segments(pi_dat_filt$cor_orig,pi_dat_filt$low,pi_dat_filt$cor_orig,pi_dat_filt$high,col=cols1[pi_dat_filt$val],lwd=1)
legend(0.55,-0.5,pt.bg=c("grey","hotpink","dodgerblue"),col="darkgrey",pch=21,
       legend=c("In prediction interval","Below prediction interval","Above prediction interval"),cex=1.2)
dev.off()

## quartz_off_screen 
##                 2

Now plot sample size colored the same way

plot(log10(pi_dat_filt$n_orig),log10(pi_dat_filt$n_rep),
     xlab="Original n",ylab="Replication n",col=cols1[pi_dat_filt$val],pch=19,
     xaxt="n",yaxt="n")
axis(1,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
axis(2,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
abline(c(0,1),lwd=3)

Save the sample size plot in a pdf

pdf(file = "../results/samplesize_figure_filter.pdf",family="Lato")
plot(log10(pi_dat_filt$n_orig),log10(pi_dat_filt$n_rep),
     xlab="Original n",ylab="Replication n",col="darkgrey",bg=cols1[pi_dat_filt$val],pch=21,
     xaxt="n",yaxt="n",cex=1.5)
axis(1,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
axis(2,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
abline(c(0,1),lwd=3)
legend(log10(9),log10(2000),pt.bg=c("grey","hotpink","dodgerblue"),col="darkgrey",pch=21,
       legend=c("In prediction interval","Below prediction interval","Above prediction interval"),cex=1.2)
dev.off()

## quartz_off_screen 
##                 2

What percentage are in the table in various ways

table(pi_dat_filt$cor_rep < pi_dat_filt$low)

## 
## FALSE  TRUE 
##    53    20

table(pi_dat_filt$cor_rep < pi_dat_filt$high)

## 
## FALSE  TRUE 
##     2    71

P-value simulation…how often should the p-values agree?

set.seed(35353)
get_ps <- function(cor_orig, cor_rep, n_orig, n_rep,nproject){
    # If we don't have a sample size for the replicate, use the sample size
    # of the original study. If that is also NA, use the median original sample
    # size.
    n_rep_tmp <- ifelse(is.na(n_rep), ifelse(is.na(n_orig), 55, n_orig), n_rep)

    # If we don't have an original effect size, use the median orignal
    # effect size.
    cor_orig_tmp <- ifelse(is.na(cor_orig), 0.3501, cor_orig)
    mean_term <- 0.5*log((1 + tanh(cor_orig_tmp))/(1 - tanh(cor_orig_tmp)))

    # Simulate 100 arcanh(r) assuming the original association is the true
    # one, but using the sample size of the replication study to compute the SD
    new_arc_rs <- rnorm(nproject, mean_term, 1/sqrt(n_rep_tmp - 3))

    # Convert to R^2
    new_r2 <- tanh(new_arc_rs)^2

    # Convert to F(1, df2), where df2 is the degrees of freedom from the new study (n_rep_tmp -2)
    f_stats <- new_r2/((1/(n_rep_tmp - 2)) - new_r2*(1/(n_rep_tmp - 2)))

    # Calculate p-values for all replicates
    pf(f_stats, df1=1, df2=(n_rep_tmp-2), lower.tail = FALSE)
}

#tmp <- get_ps(fish_orig[1], fish_rep[1], n_orig[1], n_rep[1])

p.vals <- mapply(get_ps, pi_dat_filt$cor_orig, pi_dat_filt$cor_rep, pi_dat_filt$n_orig, pi_dat_filt$n_rep, nproject=100, SIMPLIFY=TRUE)

# These are the number of times P < 0.05 for each study
p_means <- apply(p.vals, 1, function(x){mean(x < 0.05)})

summary(p_means)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6575  0.7637  0.7945  0.7901  0.8116  0.8904

# Make the plot for the studies

p_per_study <- apply(p.vals, 2, function(x){sum(x < 0.05)})

pdf(file = "../results/pval-replication.pdf",family="Lato")
# Code to make figures (assumes output is just # p vals < 0.05
df <- data.frame("cor_orig"=pi_dat_filt$cor_orig, "cor_rep"=pi_dat_filt$cor_rep, "psums"= p_per_study)

colfunc <- colorRampPalette(c("darkorange1", "purple3"))
cols <- colfunc(100)

plot(df$cor_orig, df$cor_rep, col=cols[df$psums], pch=19, cex=2, xlim=c(0,1), ylim=c(-1, 1), main="Effect sizes colored by fraction of P < 0.05 in replication", xlab="Original Effect Size", ylab="Replication Effect Size")
abline(0,1)
abline(h=0, lty="dashed")

legend(0.65,0,col=cols[seq(0,100,by=20)],pch=19,cex=1.5,
       legend=c("0%","20%","40%","60%","80%","100%"),title="% Rep. P < 0.05")
dev.off()

## quartz_off_screen 
##                 2

Replication paper

Jeff Leek

August 31, 2015

Load packages you need

Download the data

Reproducing Figure 3 from the paper

Now we transform to Fisher

Calculate the prediction interval

What percentage are in the table in various ways

Now lets do the same thing, but filtering as in the paper

Now we make the plot after filtering

What percentage are in the table in various ways

P-value simulation…how often should the p-values agree?