library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.2
library(RCurl)
library(grid)
library(ggplot2)
library(downloader)
library(gridExtra)
library(extrafont)
library(Hmisc)
We follow the data reading approach given in the code reproducing the paper.
## Download the RPP dataset and its codebook from OSF (same sources as the
## original paper's code) and record when the download happened, for
## reproducibility.
download("https://osf.io/fgjvw/?action=download",destfile="../data/rpp_data.csv")
download("https://osf.io/bhcsf/?action=download",destfile="../data/rpp_data_codebook.csv")
date_downloaded = date()
date()
## [1] "Thu Sep 17 14:33:12 2015"
First we download the data, following lines 30-43 of the original code, with some updates because we had trouble getting the functions to work directly from OSF.
## Here we follow
## Read the full dataset, then keep only study pairs where both the
## original and the replication p-values are available.
dat = read.csv("../data/rpp_data.csv",stringsAsFactors=F)
fdat = dplyr::filter(dat, !is.na(T_pval_USE..O.), !is.na(T_pval_USE..R.))
## 99 expected
nrow(fdat)
## [1] 99
## Rows where both the original and replication effect sizes (r) are present
idOK <- complete.cases(fdat$T_r..O.,fdat$T_r..R.)
## 97 expected
sum(idOK)
## [1] 97
Next we make the plot, following lines 192-250 of the original code, with some updates because we had trouble getting the functions to work directly from OSF.
## Got this from https://github.com/FredHasselman/toolboxR/blob/master/C-3PR.R was having trouble with the
## sourcing
source("utils.R")
mytheme <- gg.theme("clean")
## Line 149 needed to make numeric
fdat$Power..R. <- as.numeric(fdat$Power..R.)
## Flag significance of the ORIGINAL studies.
## NOTE(review): the cutoff here is .06, not .05 -- presumably matching the
## original RPP code's treatment of "marginally significant" originals;
## confirm against lines 192-250 of that code.
fdat$oriSig = "Not Significant"
fdat$oriSig[fdat$T_pval_USE..O.<=.06] = "Significant"
fdat$oriSig = factor(fdat$oriSig)
## Flag significance of the REPLICATIONS at the conventional .05 level
fdat$repSig = "Not Significant"
fdat$repSig[fdat$T_pval_USE..R.<=.05] = "Significant"
fdat$repSig = factor(fdat$repSig)
## Empty placeholder panel for the top-right cell of the 2x2 layout below
blankPlot <- plotHolder()
## Marginal density of ORIGINAL effect sizes, filled by original significance;
## y = ..count.. scales the densities by the number of studies
xDense <- ggplot(fdat, aes(x=T_r..O., fill=oriSig)) +
geom_density(aes(y= ..count..),trim=F,alpha=.5) +
xlab("") + ylab("") + xlim(0,1) +
gg.theme("noax") +
theme(legend.position = "none",plot.margin = unit(c(0,0,0,4), "lines"))
## Marginal density of REPLICATION effect sizes, flipped so it sits along
## the y axis of the scatterplot
yDense <- ggplot(fdat, aes(x=T_r..R., fill=repSig)) +
geom_density(aes(y= ..count..),trim=F,alpha=.5) +
xlab("") + ylab("") + xlim(-.5,1) +
coord_flip() +
gg.theme("noax") +
theme(legend.position = "none", plot.margin = unit(c(0,0,3,0), "lines"))
## Central scatter: replication vs original effect size. Point size encodes
## replication power, fill encodes replication significance, and the rugs on
## each axis are colored by the corresponding study's significance.
scatterP<-
ggplot(fdat,aes(x=T_r..O.,y=T_r..R.)) +
geom_hline(aes(yintercept=0),linetype=2) +
geom_abline(intercept=0,slope=1,color="Grey60")+
geom_point(aes(size=Power..R.,fill=repSig),color="Grey30",shape=21,alpha=.8) +
geom_rug(aes(color=oriSig),size=1,sides="b",alpha=.6) +
geom_rug(aes(color=repSig),size=1,sides="l",alpha=.6) +
scale_x_continuous(name="Original Effect Size",limits=c(0,1),breaks=c(0,.25,.5,.75,1)) +
scale_y_continuous(name="Replication Effect Size",limits=c(-.5,1),breaks=c(-.5,-.25,0,.25,.5,.75,1)) +
ggtitle("") + xlab("") + ylab("") +
scale_size_continuous(name="Replication Power",range=c(2,9)) +
scale_color_discrete(name="p-value") +
scale_fill_discrete(name="p-value") +
gg.theme("clean") +
theme(legend.position=c(.9,.6), plot.margin = unit(c(-2,-1.5,2,2), "lines"))
## Assemble marginals + scatter into the 2x2 layout (blank panel top-right)
grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1.4), heights=c(1.4, 4))
dim(fdat)
## [1] 99 140
We follow lines 490-516 to compute the Fisher transforms, including the degrees of freedom adjustments used by the original authors.
## Rename the first column so the ID-based fixups below can match on it
colnames(dat)[1] <- "ID"

## Effect sizes (correlations) for original and replication studies
cor_orig <- dat$T_r..O.
cor_rep <- dat$T_r..R.

## Default sample size: error degrees of freedom plus 2
n_orig <- dat$T_df2..O. + 2
n_rep <- dat$T_df2..R. + 2

## Partial correlation (study 82): N comes from df1 + 2 instead.
## NOTE(review): the left side matches on ID while the right side indexes by
## row position -- this assumes ID equals row number; verify against the data.
n_orig[dat$ID == 82] <- dat$T_df1..O.[82] + 2
n_rep[dat$ID == 82] <- dat$T_df1..R.[82] + 2

## Plain correlations (120, 154, 155) and one t-test (121) report N directly
for (i in c(120, 154, 155, 121)) {
  n_orig[dat$ID == i] <- dat$T_N..O.[i]
  n_rep[dat$ID == i] <- dat$T_N..R.[i]
}

## Fisher z-transform of both sets of correlations
fish_orig <- atanh(cor_orig)
fish_rep <- atanh(cor_rep)
Here we calculate prediction intervals based on Fisher’s z-scores.
## Standard error of the difference of two Fisher z values (1/(n-3) each)
se_total <- sqrt(1/(n_orig - 3) + 1/(n_rep - 3))
## 95% prediction interval on the z scale, mapped back to the r scale
margin <- 1.96 * se_total
low <- tanh(fish_orig - margin)
high <- tanh(fish_orig + margin)
## Flag replications landing outside their interval
too_high <- cor_rep > high
too_low <- cor_rep < low
## Usable study pairs: both p-values present
use_index <- !is.na(dat$T_pval_USE..O.) & !is.na(dat$T_pval_USE..R.)
## val encodes interval status for coloring: 1 = inside, 2 = below, 3 = above
pi_dat <- data.frame(cor_orig, cor_rep, low, high, se_total,
                     too_low, too_high, n_orig, n_rep,
                     val = too_low + 2 * too_high + 1,
                     use_index)
pi_dat <- dplyr::filter(pi_dat, use_index > 0)
Here we make a plot. We have not yet filtered out the cases where the test has more than one degree of freedom.
## Drop rows with any missing values, then plot replication vs original
## effect size; color encodes interval status (1 grey = inside,
## 2 hotpink = below, 3 dodgerblue = above the prediction interval)
pi_dat_nona = pi_dat[rowSums(is.na(pi_dat))==0,]
cols1 = c("grey","hotpink","dodgerblue")
plot(pi_dat_nona$cor_orig,pi_dat_nona$cor_rep,ylim=c(-1,1),xlim=c(0,1),
xlab="Original Effect",ylab="Replication Effect",col=cols1[pi_dat_nona$val],pch=19)
## Dashed identity line and dashed zero line for reference
abline(c(0,1),lwd=3,lty=2)
abline(h=0,lwd=3,lty=2)
## Vertical segment per study showing its 95% prediction interval
segments(pi_dat_nona$cor_orig,pi_dat_nona$low,pi_dat_nona$cor_orig,pi_dat_nona$high,col=cols1[pi_dat_nona$val],lwd=0.75)
legend(0.55,-0.5,col=c("grey","hotpink","dodgerblue"),pch=19,
legend=c("In prediction interval","Below prediction interval","Above prediction interval"))
Here we save the plot to a pdf
## Register the Lato font (extrafont) and redraw the same prediction-interval
## figure into a PDF, using filled circles (pch=21) with grey outlines for
## the print version
font_import(pattern="Lato",prompt=FALSE)
loadfonts()
pdf(file = "../results/pi_figure_nofilter.pdf",family="Lato")
plot(pi_dat_nona$cor_orig,pi_dat_nona$cor_rep,ylim=c(-1,1),xlim=c(0,1),
xlab="Original Effect",ylab="Replication Effect",bg=cols1[pi_dat_nona$val], col="darkgrey",
pch=21,cex.lab=1.2,cex.axis=1.2,cex=1.5)
## Solid identity line, dashed zero line
abline(c(0,1),lwd=3)
abline(h=0,lwd=3,lty=2)
segments(pi_dat_nona$cor_orig,pi_dat_nona$low,pi_dat_nona$cor_orig,pi_dat_nona$high,col=cols1[pi_dat_nona$val],lwd=0.75)
legend(0.55,-0.5,pt.bg=c("grey","hotpink","dodgerblue"),col="darkgrey",pch=21,
legend=c("In prediction interval","Below prediction interval","Above prediction interval"),cex=1.2)
dev.off()
## quartz_off_screen
## 2
Now plot sample size colored the same way
## Replication vs original sample size on log10 axes, colored by the same
## prediction-interval status; default axes suppressed in favor of
## hand-picked tick labels
plot(log10(pi_dat_nona$n_orig),log10(pi_dat_nona$n_rep),
xlab="Original n",ylab="Replication n",col=cols1[pi_dat_nona$val],pch=19,
xaxt="n",yaxt="n")
axis(1,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
axis(2,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
## Identity line: points above it had larger replication samples
abline(c(0,1),lwd=3)
Save the sample size plot in a pdf
## Save the sample-size figure to PDF (filled circles for the print version)
pdf(file = "../results/samplesize_figure_nofilter.pdf",family="Lato")
plot(log10(pi_dat_nona$n_orig),log10(pi_dat_nona$n_rep),
xlab="Original n",ylab="Replication n",col="darkgrey",bg=cols1[pi_dat_nona$val],pch=21,
xaxt="n",yaxt="n",cex=1.5)
axis(1,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
axis(2,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
abline(c(0,1),lwd=3)
dev.off()
## quartz_off_screen
## 2
## 21 replications fell BELOW their prediction interval
table(pi_dat_nona$cor_rep < pi_dat_nona$low)
##
## FALSE TRUE
## 71 21
## The 2 FALSE cases here are replications at or above the upper bound,
## i.e. ABOVE the prediction interval
table(pi_dat_nona$cor_rep < pi_dat_nona$high)
##
## FALSE TRUE
## 2 90
## Attach the replication test statistic type and the original df1, then keep
## only tests that map cleanly onto a single correlation: F with 1 numerator
## df, t, or r.
## NOTE(review): stat/df1 come from fdat while pi_dat was filtered from dat --
## this assumes the two are row-aligned (both 99 rows after filtering); verify.
pi_dat = cbind(pi_dat,stat=as.character(fdat$T_Test.Statistic..R.),df1 = fdat$T_df1..O.)
pi_dat = mutate(pi_dat, stat_index= (stat == "F" & df1 == 1) | stat == "t" | stat == "r")
pi_dat_filt = filter(pi_dat,stat_index > 0)
cols1 = c("grey","hotpink","dodgerblue")
## Same prediction-interval scatter as before, now on the filtered studies
plot(pi_dat_filt$cor_orig,pi_dat_filt$cor_rep,ylim=c(-1,1),xlim=c(0,1),
xlab="Original Effect",ylab="Replication Effect",col=cols1[pi_dat_filt$val],pch=19)
abline(c(0,1),lwd=3,lty=2)
abline(h=0,lwd=3,lty=2)
segments(pi_dat_filt$cor_orig,pi_dat_filt$low,pi_dat_filt$cor_orig,pi_dat_filt$high,col=cols1[pi_dat_filt$val],lwd=0.75)
legend(0.55,-0.5,col=c("grey","hotpink","dodgerblue"),pch=19,
legend=c("In prediction interval","Below prediction interval","Above prediction interval"))
Here we save the plot to a pdf
## Save the filtered prediction-interval figure to PDF
pdf(file = "../results/pi_figure_filter.pdf",family="Lato")
plot(pi_dat_filt$cor_orig,pi_dat_filt$cor_rep,ylim=c(-1,1),xlim=c(0,1),
xlab="Original Effect",ylab="Replication Effect",bg=cols1[pi_dat_filt$val], col="darkgrey",
pch=21,cex.lab=1.2,cex.axis=1.2,cex=1.2)
## Solid identity line, dashed zero line
abline(c(0,1),lwd=3)
abline(h=0,lwd=3,lty=2)
segments(pi_dat_filt$cor_orig,pi_dat_filt$low,pi_dat_filt$cor_orig,pi_dat_filt$high,col=cols1[pi_dat_filt$val],lwd=1)
legend(0.55,-0.5,pt.bg=c("grey","hotpink","dodgerblue"),col="darkgrey",pch=21,
legend=c("In prediction interval","Below prediction interval","Above prediction interval"),cex=1.2)
dev.off()
## quartz_off_screen
## 2
Now plot sample size colored the same way
## Sample-size comparison for the filtered studies, log10 axes with
## hand-picked tick labels
plot(log10(pi_dat_filt$n_orig),log10(pi_dat_filt$n_rep),
xlab="Original n",ylab="Replication n",col=cols1[pi_dat_filt$val],pch=19,
xaxt="n",yaxt="n")
axis(1,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
axis(2,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
abline(c(0,1),lwd=3)
Save the sample size plot in a pdf
## Save the filtered sample-size figure to PDF, this time with a legend
pdf(file = "../results/samplesize_figure_filter.pdf",family="Lato")
plot(log10(pi_dat_filt$n_orig),log10(pi_dat_filt$n_rep),
xlab="Original n",ylab="Replication n",col="darkgrey",bg=cols1[pi_dat_filt$val],pch=21,
xaxt="n",yaxt="n",cex=1.5)
axis(1,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
axis(2,at=log10(c(1,10,100,500,1000,5000,1e5,2.5e5)),
labels = c("1","10","100","500","1000","5000","100,000","250,000"),las=2)
abline(c(0,1),lwd=3)
## Legend positioned in data (log10) coordinates
legend(log10(9),log10(2000),pt.bg=c("grey","hotpink","dodgerblue"),col="darkgrey",pch=21,
legend=c("In prediction interval","Below prediction interval","Above prediction interval"),cex=1.2)
dev.off()
## quartz_off_screen
## 2
## Among the filtered studies, 20 replications fell BELOW their interval
table(pi_dat_filt$cor_rep < pi_dat_filt$low)
##
## FALSE TRUE
## 53 20
## The 2 FALSE cases here are replications at or above the upper bound,
## i.e. ABOVE the prediction interval
table(pi_dat_filt$cor_rep < pi_dat_filt$high)
##
## FALSE TRUE
## 2 71
set.seed(35353)
## Simulate `nproject` replication p-values for one study, assuming the
## original effect size is the truth and the replication's sample size
## sets the sampling noise. Returns a vector of `nproject` p-values.
get_ps <- function(cor_orig, cor_rep, n_orig, n_rep,nproject){
  # Missing replication N: fall back to the original N, then to the
  # median original sample size (55).
  n_use <- ifelse(is.na(n_rep), ifelse(is.na(n_orig), 55, n_orig), n_rep)
  # Missing original effect: fall back to the median original effect (0.3501).
  r_use <- ifelse(is.na(cor_orig), 0.3501, cor_orig)
  # NOTE(review): 0.5*log((1+x)/(1-x)) is atanh(x), so this expression is
  # atanh(tanh(r_use)), which equals r_use when the input is a raw
  # correlation (as at the live call site below). The commented-out call
  # passed Fisher z values instead; confirm which scale was intended.
  mu <- 0.5*log((1 + tanh(r_use))/(1 - tanh(r_use)))
  # Draw nproject Fisher z values centered at mu, SD = 1/sqrt(n - 3)
  z_sim <- rnorm(nproject, mu, 1/sqrt(n_use - 3))
  # Back-transform to R^2
  r2_sim <- tanh(z_sim)^2
  # Convert each R^2 to an F(1, n-2) statistic
  f_sim <- r2_sim/((1/(n_use - 2)) - r2_sim*(1/(n_use - 2)))
  # Upper-tail p-values for all simulated replications
  pf(f_sim, df1=1, df2=(n_use-2), lower.tail = FALSE)
}
#tmp <- get_ps(fish_orig[1], fish_rep[1], n_orig[1], n_rep[1])
## Simulate 100 p-values per study; the result is a 100 x n-studies matrix
## (rows = simulation draws, columns = studies).
## NOTE(review): the commented-out call above passed Fisher z values
## (fish_orig) but this live call passes raw correlations; inside get_ps the
## tanh/atanh pair then cancels -- confirm which scale was intended.
p.vals <- mapply(get_ps, pi_dat_filt$cor_orig, pi_dat_filt$cor_rep, pi_dat_filt$n_orig, pi_dat_filt$n_rep, nproject=100, SIMPLIFY=TRUE)
# For each simulation draw (row), the fraction of studies with P < 0.05;
# NOTE(review): margin 1 is per-draw across studies, not per-study as the
# original comment said -- the per-study counts are computed below
p_means <- apply(p.vals, 1, function(x){mean(x < 0.05)})
summary(p_means)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6575 0.7637 0.7945 0.7901 0.8116 0.8904
# Make the plot for the studies
## Per study (column): how many of the 100 simulated replications had P < 0.05
p_per_study <- apply(p.vals, 2, function(x){sum(x < 0.05)})
pdf(file = "../results/pval-replication.pdf",family="Lato")
# Code to make figures (assumes output is just # p vals < 0.05
df <- data.frame("cor_orig"=pi_dat_filt$cor_orig, "cor_rep"=pi_dat_filt$cor_rep, "psums"= p_per_study)
## 100-step color ramp: orange = few significant draws, purple = all significant
colfunc <- colorRampPalette(c("darkorange1", "purple3"))
cols <- colfunc(100)
## NOTE(review): if any psums were 0, cols[df$psums] would silently drop that
## element and misalign colors across points; in this run min(p_means) ~ 0.66
## so every psums >= 1 -- re-check if the simulation settings change.
plot(df$cor_orig, df$cor_rep, col=cols[df$psums], pch=19, cex=2, xlim=c(0,1), ylim=c(-1, 1), main="Effect sizes colored by fraction of P < 0.05 in replication", xlab="Original Effect Size", ylab="Replication Effect Size")
abline(0,1)
abline(h=0, lty="dashed")
## BUG FIX: seq(0,100,by=20) starts at 0 and R drops zero subscripts, so only
## 5 colors were supplied for the 6 legend labels and legend() recycled them,
## misaligning every swatch. Index from 1 so each percentage label gets its
## matching ramp color.
legend(0.65,0,col=cols[c(1, seq(20, 100, by = 20))],pch=19,cex=1.5,
legend=c("0%","20%","40%","60%","80%","100%"),title="% Rep. P < 0.05")
dev.off()
## quartz_off_screen
## 2