# Set seed.
set.seed(24015)
# Kernel width and Gaussian process variance
kw0=5000
vu0=1e7
# Include legend on plots or not; inclusion can obscure plot elements on small figures
inc_legend=FALSE
# Suppose we have population size and cost-per-sample without a risk score as follows
N=100000
k1=0.4
# Suppose that true values of a,b,c are given by
theta_true=c(10000,1.2,0.2)
theta_lower=c(1,0.5,0.1) # lower bounds for estimating theta
theta_upper=c(20000,2,0.5) # upper bounds for estimating theta
# We start with five random holdout set sizes (nset0),
# with corresponding cost-per-individual estimates k2_0 derived
# with various errors var_k2_0
nstart=10
vwmin=0.001; vwmax=0.005
nset0=round(runif(nstart,1000,N/2))
var_k2_0=runif(nstart,vwmin,vwmax)
k2_0=rnorm(nstart,mean=powerlaw(nset0,theta_true),sd=sqrt(var_k2_0))
# We estimate theta from these three points
theta0=powersolve(nset0,k2_0,y_var=var_k2_0,lower=theta_lower,upper=theta_upper,init=theta_true)$par
# We will estimate the posterior at these values of n
n=seq(1000,N,length=1000)
# Mean and variance
p_mu=mu_fn(n,nset=nset0,k2=k2_0,var_k2 = var_k2_0, N=N,k1=k1,theta=theta0,k_width=kw0,var_u=vu0)
p_var=psi_fn(n,nset=nset0,N=N,var_k2 = var_k2_0,k_width=kw0,var_u=vu0)
# Plot
yrange=c(-30000,100000)
plot(0,xlim=range(n),ylim=yrange,type="n",
xlab="Training/holdout set size",
ylab="Total cost (= num. cases)")
lines(n,p_mu,col="blue")
lines(n,p_mu - 3*sqrt(p_var),col="red")
lines(n,p_mu + 3*sqrt(p_var),col="red")
points(nset0,k1*nset0 + k2_0*(N-nset0),pch=16,col="purple")
lines(n,k1*n + powerlaw(n,theta0)*(N-n),lty=2)
lines(n,k1*n + powerlaw(n,theta_true)*(N-n),lty=3,lwd=3)
if (inc_legend) {
legend("topright",
c(expression(mu(n)),
expression(mu(n) %+-% 3*sqrt(psi(n))),
"prior(n)",
"True",
"d"),
lty=c(1,1,2,3,NA),lwd=c(1,1,1,3,NA),pch=c(NA,NA,NA,NA,16),pt.cex=c(NA,NA,NA,NA,1),
col=c("blue","red","black","purple"),bg="white")
}
## Add line corresponding to recommended new point. This is slow.
nn=seq(1000,N,length=20)
exp_imp <- next_n(nn,nset=nset0,k2=k2_0,var_k2 = var_k2_0, N=N,k1=k1,nmed=10,
lower=theta_lower,upper=theta_upper)
abline(v=nn[which.min(exp_imp)])
Run the code above in your browser using DataLab