Load packages you need

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.2
library(RCurl)
library(grid)
library(ggplot2)
library(downloader)
library(gridExtra)
library(extrafont)
library(Hmisc)

Download the data

We follow the data reading approach given in the code reproducing the paper.

download("https://osf.io/fgjvw/?action=download",destfile="../data/rpp_data.csv")
download("https://osf.io/bhcsf/?action=download",destfile="../data/rpp_data_codebook.csv")
date_downloaded = date()
date()
## [1] "Thu Sep 17 14:33:12 2015"

Reproducing Figure 3 from the paper

First we download the data following the lines 30-43 of the original code and updating because we had trouble getting the functions to work from OSF.

## Here we follow 

dat = read.csv("../data/rpp_data.csv",stringsAsFactors=F)
fdat = dplyr::filter(dat, !is.na(T_pval_USE..O.), !is.na(T_pval_USE..R.))

## 99 expected
nrow(fdat)
## [1] 99
idOK <- complete.cases(fdat$T_r..O.,fdat$T_r..R.)

## 97 expected
sum(idOK)
## [1] 97

Next make the plot following lines 192-250 of the original code and updating because we had trouble getting the functions to work from OSF

## Got this from https://github.com/FredHasselman/toolboxR/blob/master/C-3PR.R was having trouble with the
## sourcing

source("utils.R")
mytheme <- gg.theme("clean")

## Line 149 needed to make numeric
fdat$Power..R. <- as.numeric(fdat$Power..R.)

fdat$oriSig = "Not Significant"
fdat$oriSig[fdat$T_pval_USE..O.<=.06] = "Significant"
fdat$oriSig = factor(fdat$oriSig)

fdat$repSig = "Not Significant"
fdat$repSig[fdat$T_pval_USE..R.<=.05] = "Significant"
fdat$repSig = factor(fdat$repSig)

blankPlot <- plotHolder()


xDense <- ggplot(fdat, aes(x=T_r..O., fill=oriSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(0,1) +
  gg.theme("noax") + 
  theme(legend.position = "none",plot.margin = unit(c(0,0,0,4), "lines"))


yDense <- ggplot(fdat, aes(x=T_r..R., fill=repSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(-.5,1) + 
  coord_flip() + 
  gg.theme("noax") + 
  theme(legend.position = "none", plot.margin = unit(c(0,0,3,0), "lines")) 


scatterP<-
  ggplot(fdat,aes(x=T_r..O.,y=T_r..R.)) +  
  geom_hline(aes(yintercept=0),linetype=2) +
  geom_abline(intercept=0,slope=1,color="Grey60")+
  geom_point(aes(size=Power..R.,fill=repSig),color="Grey30",shape=21,alpha=.8) + 
  geom_rug(aes(color=oriSig),size=1,sides="b",alpha=.6) + 
  geom_rug(aes(color=repSig),size=1,sides="l",alpha=.6) + 
  scale_x_continuous(name="Original Effect Size",limits=c(0,1),breaks=c(0,.25,.5,.75,1)) + 
  scale_y_continuous(name="Replication Effect Size",limits=c(-.5,1),breaks=c(-.5,-.25,0,.25,.5,.75,1)) + 
  ggtitle("") + xlab("") + ylab("") + 
  scale_size_continuous(name="Replication Power",range=c(2,9)) + 
  scale_color_discrete(name="p-value") +
  scale_fill_discrete(name="p-value") +
  gg.theme("clean") + 
  theme(legend.position=c(.9,.6), plot.margin = unit(c(-2,-1.5,2,2), "lines")) 

grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1.4), heights=c(1.4, 4))

dim(fdat)
## [1]  99 140

Now we transform to Fisher

We follow the lines 490-516 to get the Fisher transforms include the degrees of freedom adjustments used by the original authors.

colnames(dat)[1] = "ID"

cor_orig = dat$T_r..O.
cor_rep = dat$T_r..R.

n_orig = dat$T_df2..O. + 2
n_rep = dat$T_df2..R. + 2


### Partial correlation, so degrees of freedom plus 2 in order to get N
n_orig[dat$ID == 82] <- dat$T_df1..O.[82]+2
n_rep[dat$ID == 82] <- dat$T_df1..R.[82]+2

### Correlation
n_orig[dat$ID == 120] <- dat$T_N..O.[120]
n_rep[dat$ID == 120] <- dat$T_N..R.[120]
n_orig[dat$ID == 154] <- dat$T_N..O.[154]
n_rep[dat$ID == 154] <- dat$T_N..R.[154]
n_orig[dat$ID == 155] <- dat$T_N..O.[155]
n_rep[dat$ID == 155] <- dat$T_N..R.[155]

### t
n_orig[dat$ID == 121] <- dat$T_N..O.[121]
n_rep[dat$ID == 121] <- dat$T_N..R.[121]

### Transform to Fisher's z
fish_orig = atanh(cor_orig)
fish_rep = atanh(cor_rep)

Calculate the prediction interval

Here we calculate prediction intervals based on Fisher’s z-scores.

se_total <- sqrt(1/(n_orig-3) + 1/(n_rep-3))
low = tanh(fish_orig - se_total * 1.96)
high = tanh(fish_orig + se_total * 1.96)
too_high = (cor_rep > high)
too_low = (cor_rep < low)
use_index = (!is.na(dat$T_pval_USE..O.) & !is.na(dat$T_pval_USE..R.))

pi_dat = data.frame(cor_orig, cor_rep, low, high, se_total, too_low, too_high, n_orig, n_rep,val = (too_low+2*too_high+1),use_index)

pi_dat = dplyr::filter(pi_dat, use_index > 0)

Here we make a plot. We haven’t filtered out the cases where we have more than one degree of freedom for the test yet.

pi_dat_nona = pi_dat[rowSums(is.na(pi_dat))==0,]
cols1 = c("grey","hotpink","dodgerblue")
plot(pi_dat_nona$cor_orig,pi_dat_nona$cor_rep,ylim=c(-1,1),xlim=c(0,1),
     xlab="Original Effect",ylab="Replication Effect",col=cols1[pi_dat_nona$val],pch=19)
abline(c(0,1),lwd=3,lty=2)
abline(h=0,lwd=3,lty=2)
segments(pi_dat_nona$cor_orig,pi_dat_nona$low,pi_dat_nona$cor_orig,pi_dat_nona$high,col=cols1[pi_dat_nona$val],lwd=0.75)
legend(0.55,-0.5,col=c("grey","hotpink","dodgerblue"),pch=19,
       legend=c("In prediction interval","Below prediction interval","Above prediction interval"))

Here we save the plot to a pdf

font_import(pattern="Lato",prompt=FALSE)
loadfonts()
pdf(file = "../results/pi_figure_nofilter.pdf",family="Lato")
plot(pi_dat_nona$cor_orig,pi_dat_nona$cor_rep,ylim=c(-1,1),xlim=c(0,1),
     xlab="Original Effect",ylab="Replication Effect",bg=cols1[pi_dat_nona$val], col="darkgrey",
     pch=21,cex.lab=1.2,cex.axis=1.2,cex=1.5)
abline(c(0,1),lwd=3)
abline(h=0,lwd=3,lty=2)
segments(pi_dat_nona$cor_orig,pi_dat_nona$low,pi_dat_nona$cor_orig,pi_dat_nona$high,col=cols1[pi_dat_nona$val],lwd=0.75)
legend(0.55,-0.5,pt.bg=c("grey","hotpink","dodgerblue"),col="darkgrey",pch=21,
       legend=c("In prediction interval","Below prediction interval","Above prediction interval"),cex=1.2)
dev.off()
## quartz_off_screen 
##                 2

Now plot sample size colored the same way

plot(log10(pi_dat_nona$n_orig),log10(pi_dat_nona$n_rep),
     xlab="Original n",ylab="Replication n",col=cols1[pi_dat_nona$val],pch=19,
     xaxt="n",yaxt="n")
axis(1,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
axis(2,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
abline(c(0,1),lwd=3)

Save the sample size plot in a pdf

pdf(file = "../results/samplesize_figure_nofilter.pdf",family="Lato")
plot(log10(pi_dat_nona$n_orig),log10(pi_dat_nona$n_rep),
     xlab="Original n",ylab="Replication n",col="darkgrey",bg=cols1[pi_dat_nona$val],pch=21,
     xaxt="n",yaxt="n",cex=1.5)
axis(1,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
axis(2,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
abline(c(0,1),lwd=3)
dev.off()
## quartz_off_screen 
##                 2

What percentage are in the table in various ways

table(pi_dat_nona$cor_rep < pi_dat_nona$low)
## 
## FALSE  TRUE 
##    71    21
table(pi_dat_nona$cor_rep < pi_dat_nona$high)
## 
## FALSE  TRUE 
##     2    90

Now lets do the same thing, but filtering as in the paper

pi_dat = cbind(pi_dat,stat=as.character(fdat$T_Test.Statistic..R.),df1 = fdat$T_df1..O.)
pi_dat = mutate(pi_dat, stat_index= (stat == "F" & df1 == 1) | stat == "t" | stat == "r")
pi_dat_filt = filter(pi_dat,stat_index > 0)

Now we make the plot after filtering

cols1 = c("grey","hotpink","dodgerblue")
plot(pi_dat_filt$cor_orig,pi_dat_filt$cor_rep,ylim=c(-1,1),xlim=c(0,1),
     xlab="Original Effect",ylab="Replication Effect",col=cols1[pi_dat_filt$val],pch=19)
abline(c(0,1),lwd=3,lty=2)
abline(h=0,lwd=3,lty=2)
segments(pi_dat_filt$cor_orig,pi_dat_filt$low,pi_dat_filt$cor_orig,pi_dat_filt$high,col=cols1[pi_dat_filt$val],lwd=0.75)
legend(0.55,-0.5,col=c("grey","hotpink","dodgerblue"),pch=19,
       legend=c("In prediction interval","Below prediction interval","Above prediction interval"))

Here we save the plot to a pdf

pdf(file = "../results/pi_figure_filter.pdf",family="Lato")
plot(pi_dat_filt$cor_orig,pi_dat_filt$cor_rep,ylim=c(-1,1),xlim=c(0,1),
     xlab="Original Effect",ylab="Replication Effect",bg=cols1[pi_dat_filt$val], col="darkgrey",
     pch=21,cex.lab=1.2,cex.axis=1.2,cex=1.2)
abline(c(0,1),lwd=3)
abline(h=0,lwd=3,lty=2)
segments(pi_dat_filt$cor_orig,pi_dat_filt$low,pi_dat_filt$cor_orig,pi_dat_filt$high,col=cols1[pi_dat_filt$val],lwd=1)
legend(0.55,-0.5,pt.bg=c("grey","hotpink","dodgerblue"),col="darkgrey",pch=21,
       legend=c("In prediction interval","Below prediction interval","Above prediction interval"),cex=1.2)
dev.off()
## quartz_off_screen 
##                 2

Now plot sample size colored the same way

plot(log10(pi_dat_filt$n_orig),log10(pi_dat_filt$n_rep),
     xlab="Original n",ylab="Replication n",col=cols1[pi_dat_filt$val],pch=19,
     xaxt="n",yaxt="n")
axis(1,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
axis(2,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
abline(c(0,1),lwd=3)

Save the sample size plot in a pdf

pdf(file = "../results/samplesize_figure_filter.pdf",family="Lato")
plot(log10(pi_dat_filt$n_orig),log10(pi_dat_filt$n_rep),
     xlab="Original n",ylab="Replication n",col="darkgrey",bg=cols1[pi_dat_filt$val],pch=21,
     xaxt="n",yaxt="n",cex=1.5)
axis(1,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
axis(2,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
     labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
abline(c(0,1),lwd=3)
legend(log10(9),log10(2000),pt.bg=c("grey","hotpink","dodgerblue"),col="darkgrey",pch=21,
       legend=c("In prediction interval","Below prediction interval","Above prediction interval"),cex=1.2)
dev.off()
## quartz_off_screen 
##                 2

What percentage are in the table in various ways

table(pi_dat_filt$cor_rep < pi_dat_filt$low)
## 
## FALSE  TRUE 
##    53    20
table(pi_dat_filt$cor_rep < pi_dat_filt$high)
## 
## FALSE  TRUE 
##     2    71

P-value simulation…how often should the p-values agree?

set.seed(35353)
get_ps <- function(cor_orig, cor_rep, n_orig, n_rep,nproject){
    # If we don't have a sample size for the replicate, use the sample size
    # of the original study. If that is also NA, use the median original sample
    # size.
    n_rep_tmp <- ifelse(is.na(n_rep), ifelse(is.na(n_orig), 55, n_orig), n_rep)

    # If we don't have an original effect size, use the median orignal
    # effect size.
    cor_orig_tmp <- ifelse(is.na(cor_orig), 0.3501, cor_orig)
    mean_term <- 0.5*log((1 + tanh(cor_orig_tmp))/(1 - tanh(cor_orig_tmp)))

    # Simulate 100 arcanh(r) assuming the original association is the true
    # one, but using the sample size of the replication study to compute the SD
    new_arc_rs <- rnorm(nproject, mean_term, 1/sqrt(n_rep_tmp - 3))

    # Convert to R^2
    new_r2 <- tanh(new_arc_rs)^2

    # Convert to F(1, df2), where df2 is the degrees of freedom from the new study (n_rep_tmp -2)
    f_stats <- new_r2/((1/(n_rep_tmp - 2)) - new_r2*(1/(n_rep_tmp - 2)))

    # Calculate p-values for all replicates
    pf(f_stats, df1=1, df2=(n_rep_tmp-2), lower.tail = FALSE)
}

#tmp <- get_ps(fish_orig[1], fish_rep[1], n_orig[1], n_rep[1])

p.vals <- mapply(get_ps, pi_dat_filt$cor_orig, pi_dat_filt$cor_rep, pi_dat_filt$n_orig, pi_dat_filt$n_rep, nproject=100, SIMPLIFY=TRUE)

# These are the number of times P < 0.05 for each study
p_means <- apply(p.vals, 1, function(x){mean(x < 0.05)})

summary(p_means)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6575  0.7637  0.7945  0.7901  0.8116  0.8904
# Make the plot for the studies

p_per_study <- apply(p.vals, 2, function(x){sum(x < 0.05)})

pdf(file = "../results/pval-replication.pdf",family="Lato")
# Code to make figures (assumes output is just # p vals < 0.05
df <- data.frame("cor_orig"=pi_dat_filt$cor_orig, "cor_rep"=pi_dat_filt$cor_rep, "psums"= p_per_study)

colfunc <- colorRampPalette(c("darkorange1", "purple3"))
cols <- colfunc(100)

plot(df$cor_orig, df$cor_rep, col=cols[df$psums], pch=19, cex=2, xlim=c(0,1), ylim=c(-1, 1), main="Effect sizes colored by fraction of P < 0.05 in replication", xlab="Original Effect Size", ylab="Replication Effect Size")
abline(0,1)
abline(h=0, lty="dashed")

legend(0.65,0,col=cols[seq(0,100,by=20)],pch=19,cex=1.5,
       legend=c("0%","20%","40%","60%","80%","100%"),title="% Rep. P < 0.05")
dev.off()
## quartz_off_screen 
##                 2