# Suppose we have population size and cost-per-sample without a risk score as follows
N=100000
k1=0.4
# Kernel width and variance for GP
k_width=5000
var_u=8000000
# Suppose we begin with k2() estimates at n-values
nset=c(10000,20000,30000)
# with cost-per-individual estimates
# (note that since empirical k2(n) is non-monotonic, it cannot be perfectly
# approximated with a power-law function)
k2=c(0.35,0.26,0.28)
# and associated error on those estimates
var_k2=c(0.02^2,0.01^2,0.03^2)
# We estimate theta from these three points
theta=powersolve(nset,k2,y_var=var_k2)$par
# We will estimate the posterior at these values of n
n=seq(1000,50000,length=1000)
# Mean and variance
p_mu=mu_fn(n,nset=nset,k2=k2,var_k2 = var_k2, N=N,k1=k1,theta=theta,
k_width=k_width,var_u=var_u)
p_var=psi_fn(n,nset=nset,N=N,var_k2 = var_k2,k_width=k_width,var_u=var_u)
# Plot
plot(0,xlim=range(n),ylim=c(20000,60000),type="n",
xlab="Training/holdout set size",
ylab="Total cost (= num. cases)")
lines(n,p_mu,col="blue")
lines(n,p_mu - 3*sqrt(p_var),col="red")
lines(n,p_mu + 3*sqrt(p_var),col="red")
points(nset,k1*nset + k2*(N-nset),pch=16,col="purple")
lines(n,k1*n + powerlaw(n,theta)*(N-n),lty=2)
segments(nset,k1*nset + (k2 - 3*sqrt(var_k2))*(N-nset),
nset,k1*nset + (k2 + 3*sqrt(var_k2))*(N-nset))
legend("topright",
c(expression(mu(n)),
expression(mu(n) %+-% 3*sqrt(psi(n))),
"prior(n)",
"d",
"3SD(d|n)"),
lty=c(1,1,2,NA,NA),lwd=c(1,1,1,NA,NA),pch=c(NA,NA,NA,16,124),
pt.cex=c(NA,NA,NA,1,1),
col=c("blue","red","black","purple","black"),bg="white")
Run the code above in your browser using DataLab