# Artificial population of 50 568 business establishments and 5 business
# sectors (data from Valliant, R., Dever, J. A., & Kreuter, F. (2013).
# Practical tools for designing and weighting survey samples. Springer.
# https://doi.org/10.1007/978-1-4614-6449-5, Example 5.2 pages 133-9)
# See also
# file: Code 5.3 constrOptim.example.R
Nh <- c(6221, 11738, 4333, 22809, 5467) # stratum sizes
ch <- c(120, 80, 80, 90, 150) # stratum-specific cost of surveying
# Revenues
mh.rev <- c(85, 11, 23, 17, 126) # mean revenue
Sh.rev <- c(170.0, 8.8, 23.0, 25.5, 315.0) # standard deviation revenue
# Employees
mh.emp <- c(511, 21, 70, 32, 157) # mean number of employees
Sh.emp <- c(255.50, 5.25, 35.00, 32.00, 471.00) # std. dev. employees
# Proportion of estabs claiming research credit
ph.rsch <- c(0.8, 0.2, 0.5, 0.3, 0.9)
# Proportion of estabs with offshore affiliates
ph.offsh <- c(0.06, 0.03, 0.03, 0.21, 0.77)
budget <- 300000 # overall available budget
n.min <- 100 # minimum stratum-specific sample size
# Examples
#----------------------------------------------------------------------------
# Example 1: Minimization of the variation of estimates for revenue subject
# to cost restrictions and precision restrictions to the coefficient of
# variation of estimates for the proportion of businesses with offshore
# affiliates.
l <- rep(n.min, 5) # minimum sample size per stratum
u <- Nh # maximum sample size per stratum
C <- rbind(ch,
ch * c(-1, -1, -1, 0, 0))
c <- c(budget, # Maximum overall survey budget
- 0.5 * budget) # Minimum overall budget for strata 1-3
# We require at maximum 5 % relative standard error for estimates of
# proportion of businesses with offshore affiliates
A <- matrix(ph.offsh * (1 - ph.offsh) * Nh**3/(Nh - 1)/sum(Nh * ph.offsh)**2,
nrow = 1)
a <- sum(ph.offsh * (1 - ph.offsh) * Nh**2/(Nh - 1)
)/sum(Nh * ph.offsh)**2 + 0.05**2
D <- matrix(Sh.rev**2 * Nh**2, nrow = 1) # objective variance components
d <- sum(Sh.rev**2 * Nh) # finite population correction
opts <- list(sense = "max_precision",
f = NULL, df = NULL, Hf = NULL,
init_w = 1,
mc_cores = 1L, pm_tol = 1e-05,
max_iters = 100L, print_pm = FALSE)
sol <- mosalloc(D = D, d = d, A = A, a = a, C = C, c = c, l = l, u = u,
opts = opts)
# Check solution statement of the internal solver to verify feasibility
sol$Ecosolver$Ecoinfostring # [1] "Optimal solution found"
# Check constraints
c(C[1, ] %*% sol$n) # [1] 3e+05
c(C[2, ] %*% sol$n) # [1] -150000
c(sqrt(A %*% (1 / sol$n) - A %*% (1 / Nh))) # 5 % rel. std. err.
#----------------------------------------------------------------------------
# Example 2: Minimization of the maximum relative variation of estimates for
# the total revenue, the number of employee, the number of businesses claimed
# research credit, and the number of businesses with offshore affiliates
# subject to cost restrictions
l <- rep(n.min, 5) # minimum sample size ber stratum
u <- Nh # maximum sample size per stratum
C <- rbind(ch, ch * c(-1, -1, -1, 0, 0))
c <- c(budget, - 0.5 * budget)
A <- NULL # no precision constraint
a <- NULL # no precision constraint
# Precision components (Variance / Totals^2) for multidimensional objective
D <- rbind(Sh.rev**2 * Nh**2/sum(Nh * mh.rev)**2,
Sh.emp**2 * Nh**2/sum(Nh * mh.emp)**2,
ph.rsch * (1 - ph.rsch) * Nh**3/(Nh - 1)/sum(Nh * ph.rsch)**2,
ph.offsh * (1 - ph.offsh) * Nh**3/(Nh - 1)/sum(Nh * ph.offsh)**2)
d <- as.vector(D %*% (1 / Nh)) # finite population correction
opts <- list(sense = "max_precision",
f = NULL, df = NULL, Hf = NULL,
init_w = 1,
mc_cores = 1L, pm_tol = 1e-05,
max_iters = 100L, print_pm = FALSE)
sol <- mosalloc(D = D, d = d, C = C, c = c, l = l, u = u, opts = opts)
# Obtain optimal objective value
sol$J # [1] 0.0017058896 0.0004396972 0.0006428475 0.0017058896
# Obtain corresponding normal vector
sol$Normal # [1] 6.983113e-01 1.337310e-11 1.596167e-11 3.016887e-01
# => Revenue and offshore affiliates are dominating the solution with a
# ratio of approximately 2:1 (sol$Normal[1] / sol$Normal[4])
#----------------------------------------------------------------------------
# Example 3: Example 2 with preference weighting
w <- c(1, 3.85, 3.8, 1.3) # preference weighting
l <- rep(n.min, 5) # minimum sample size ber stratum
u <- Nh # maximum sample size per stratum
C <- rbind(ch, ch * c(-1, -1, -1, 0, 0))
c <- c(budget, - 0.5 * budget)
A <- NULL # no precision constraint
a <- NULL # no precision constraint
D <- rbind(Sh.rev**2 * Nh**2/sum(Nh * mh.rev)**2,
Sh.emp**2 * Nh**2/sum(Nh * mh.emp)**2,
ph.rsch * (1 - ph.rsch) * Nh**3/(Nh - 1)/sum(Nh * ph.rsch)**2,
ph.offsh * (1 - ph.offsh) * Nh**3/(Nh - 1)/sum(Nh * ph.offsh)**2)
d <- as.vector(D %*% (1 / Nh))
opts <- list(sense = "max_precision",
f = NULL, df = NULL, Hf = NULL,
init_w = w,
mc_cores = 1L, pm_tol = 1e-05,
max_iters = 100L, print_pm = FALSE)
mosalloc(D = D, d = d, C = C, c = c, l = l, u = u, opts = opts)
#----------------------------------------------------------------------------
# Example 4: Example 2 with multiple preference weightings for simultaneous
# evaluation
w <- matrix(c(1.0, 1.0, 1.0, 1.0, # matrix of preference weightings
1.0, 3.9, 3.9, 1.3,
0.8, 4.2, 4.8, 1.5,
1.2, 3.5, 4.8, 2.0,
2.0, 1.0, 1.0, 2.0), 5, 4, byrow = TRUE)
w <- w / w[,1] # rescale w (ensure the first weighting to be one)
l <- rep(n.min, 5) # minimum sample size ber stratum
u <- Nh # maximum sample size per stratum
C <- rbind(ch, ch * c(-1, -1, -1, 0, 0))
c <- c(budget, - 0.5 * budget)
A <- NULL # no precision constraint
a <- NULL # no precision constraint
D <- rbind(Sh.rev**2 * Nh**2/sum(Nh * mh.rev)**2,
Sh.emp**2 * Nh**2/sum(Nh * mh.emp)**2,
ph.rsch * (1 - ph.rsch) * Nh**3/(Nh - 1)/sum(Nh * ph.rsch)**2,
ph.offsh * (1 - ph.offsh) * Nh**3/(Nh - 1)/sum(Nh * ph.offsh)**2)
d <- as.vector(D %*% (1 / Nh))
opts <- list(sense = "max_precision",
f = NULL, df = NULL, Hf = NULL,
init_w = w,
mc_cores = 1L, pm_tol = 1e-05,
max_iters = 100L, print_pm = FALSE)
sols <- mosalloc(D = D, d = d, C = C, c = c, l = l, u = u, opts = opts)
lapply(sols, function(sol){sol$Qbounds})
#----------------------------------------------------------------------------
# Example 5: Example 2 where a weighted sum scalarization of the objective
# components is minimized
l <- rep(n.min, 5) # minimum sample size ber stratum
u <- Nh # maximum sample size per stratum
C <- matrix(ch, nrow = 1)
c <- budget
A <- NULL # no precision constraint
a <- NULL # no precision constraint
# Objective variance components
D <- rbind(Sh.rev**2 * Nh**2/sum(Nh * mh.rev)**2,
Sh.emp**2 * Nh**2/sum(Nh * mh.emp)**2,
ph.rsch * (1 - ph.rsch) * Nh**3/(Nh - 1)/sum(Nh * ph.rsch)**2,
ph.offsh * (1 - ph.offsh) * Nh**3/(Nh - 1)/sum(Nh * ph.offsh)**2)
d <- as.vector(D %*% (1 / Nh)) # finite population correction
# Simple weighted sum as decision functional
wss <- c(1, 1, 0.5, 0.5) # preference weighting (weighted sum scalarization)
Dw <- wss %*% D
dw <- as.vector(Dw %*% (1 / Nh))
opts <- list(sense = "max_precision",
f = NULL, df = NULL, Hf = NULL,
init_w = 1,
mc_cores = 1L, pm_tol = 1e-05,
max_iters = 1000L, print_pm = FALSE)
# Solve weighted sum scalarization (WSS) via mosalloc
sol_wss <- mosalloc(D = Dw, d = dw, C = C, c = c, l = l, u = u, opts = opts)
# Obtain optimal objective values
J <- D %*% (1 / sol_wss$n) - d
# Reconstruct solution via a weighted Chebyshev minimization
wcm <- J[1] / J
opts = list(sense = "max_precision",
f = NULL, df = NULL, Hf = NULL,
init_w = matrix(wcm, 1),
mc_cores = 1L, pm_tol = 1e-05,
max_iters = 1000L, print_pm = FALSE)
sol_wcm <- mosalloc(D = D, d = d, C = C, c = c, l = l, u = u, opts = opts)
# Compare solutions
rbind(t(J), sol_wcm$J)
# [,1] [,2] [,3] [,4]
# [1,] 0.00155645 0.0004037429 0.0005934474 0.001327165
# [2,] 0.00155645 0.0004037429 0.0005934474 0.001327165
rbind(sol_wss$n, sol_wcm$n)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 582.8247 236.6479 116.7866 839.5988 841.4825
# [2,] 582.8226 236.6475 116.7871 839.5989 841.4841
rbind(wss, sol_wcm$Normal / sol_wcm$Normal[1])
# [,1] [,2] [,3] [,4]
#wss 1 1.0000000 0.5000000 0.5000000
# 1 0.9976722 0.4997552 0.4997462
#----------------------------------------------------------------------------
# Example 6: Example 1 with two subpopulations and a p-norm as decision
# functional
l <- rep(n.min, 5) # minimum sample size per stratum
u <- Nh # maximum sample size per stratum
C <- rbind(ch, ch * c(-1, -1, -1, 0, 0))
c <- c(budget, - 0.5 * budget)
# At maximum 5 % relative standard error for estimates of proportion of
# businesses with offshore affiliates
A <- matrix(ph.offsh * (1 - ph.offsh) * Nh**3/(Nh - 1)/sum(Nh * ph.offsh)**2,
nrow = 1)
a <- sum(ph.offsh * (1 - ph.offsh) * Nh**2/(Nh - 1)
)/sum(Nh * ph.offsh)**2 + 0.05**2
D <- rbind((Sh.rev**2 * Nh**2)*c(0,0,1,1,0),
(Sh.rev**2 * Nh**2)*c(1,1,0,0,1))# objective variance components
d <- as.vector(D %*% (1 / Nh)) # finite population correction
# p-norm solution
p <- 5 # p-norm
opts <- list(sense = "max_precision",
f = function(x) sum(x**p),
df = function(x) p * x**(p - 1),
Hf = function(x) diag(p * (p - 1) * x**(p - 2)),
init_w = 1,
mc_cores = 1L, pm_tol = 1e-05,
max_iters = 1000L, print_pm = TRUE)
sol <- mosalloc(D = D, d = d, C = C, c = c, l = l, u = u, opts = opts)
c(sol$Normal/sol$dfJ)/mean(c(sol$Normal/sol$dfJ))
# [1] 0.9999972 1.0000028
#----------------------------------------------------------------------------
# Example 7: Example 2 with p-norm as decision functional and only one
# overall cost constraint
l <- rep(n.min, 5) # minimum sample size ber stratum
u <- Nh # maximum sample size per stratum
C <- matrix(ch, nrow = 1)
c <- budget
A <- NULL # no precision constraint
a <- NULL # no precision constraint
# Objective precision components
D <- rbind(Sh.rev**2 * Nh**2/sum(Nh * mh.rev)**2,
Sh.emp**2 * Nh**2/sum(Nh * mh.emp)**2,
ph.rsch * (1 - ph.rsch) * Nh**3/(Nh - 1)/sum(Nh * ph.rsch)**2,
ph.offsh * (1 - ph.offsh) * Nh**3/(Nh - 1)/sum(Nh * ph.offsh)**2)
d <- as.vector(D %*% (1 / Nh)) # finite population correction
# p-norm solution
p <- 5 # p-norm
opts <- list(sense = "max_precision",
f = function(x) sum(x**p),
df = function(x) p * x**(p - 1),
Hf = function(x) diag(p * (p - 1) * x**(p - 2)),
init_w = 1,
mc_cores = 1L, pm_tol = 1e-05,
max_iters = 1000L, print_pm = TRUE)
sol <- mosalloc(D = D, d = d, C = C, c = c, l = l, u = u, opts = opts)
c(sol$Normal/sol$dfJ)/mean(c(sol$Normal/sol$dfJ))
# [1] 1.0014362 0.9780042 1.0197807 1.0007789
#----------------------------------------------------------------------------
# Example 8: Minimization of sample sizes subject to precision constraints
l <- rep(n.min, 5) # minimum sample size ber stratum
u <- Nh # maximum sample size per stratum
# We require at maximum 4.66 % relative standard error for the estimate of
# total revenuee, 5 % for the number of employees, 3 % for the proportion of
# businesses claiming research credit, and 3 % for the proportion of
# businesses with offshore affiliates
A <- rbind(Sh.rev**2 * Nh**2/sum(Nh * mh.rev)**2,
Sh.emp**2 * Nh**2/sum(Nh * mh.emp)**2,
ph.rsch * (1 - ph.rsch) * Nh**3/(Nh - 1)/sum(Nh * ph.rsch)**2,
ph.offsh * (1 - ph.offsh) * Nh**3/(Nh - 1)/sum(Nh * ph.offsh)**2)
a <- as.vector(A%*%(1 / Nh) + c(0.0466, 0.05, 0.03, 0.03)**2)
# We do not consider any additional sample size or cost constraints
C <- NULL # no cost constraint
c <- NULL # no cost constraint
# Since we minimize the sample size, we define D and d as follows:
D <- matrix(1, nrow = 1, ncol = length(Nh)) # objective cost components
d <- as.vector(0) # vector of possible fixed cost
opts <- list(sense = "min_cost", # Sense of optimization is survey cost
f = NULL,
df = NULL,
Hf = NULL,
init_w = 1,
mc_cores = 1L, pm_tol = 1e-05,
max_iters = 100L, print_pm = TRUE)
sol <- mosalloc(D = D, d = d, A = A, a = a, l = l, u = u, opts = opts)
sum(sol$n) # [1] 2843.219
sol$J # [1] 2843.219
#----------------------------------------------------------------------------
#----------------------------------------------------------------------------
# Note: Sample size optimization for two-stage cluster sampling can be
# reduced to the structure of optimal stratified random samplin when
# considering expected costs. Therefore, mosalloc() can handle such
# designs. A benefit is that mosalloc() allows relatively complex
# sample size restrictions such as box constraints for subsampling.
# Optimal sample sizes at secondary stages have to be reconstructed
# from sol$n.
#
# Example 9: Optimal number of primary sampling units (PSU) and secondary
# sampling units (SSU) in 2-stage cluster sampling.
set.seed(1234)
pop <- data.frame(value = rnorm(100, 100, 35),
cluster = sample(1:4, 100, replace = TRUE))
CI <- 36 # Sampling cost per PSU
CII <- 10 # Average sampling cost per SSU
NI <- 4 # Number of PSUs
NII <- table(pop$cluster) # PSU/cluster sizes
S2I <- var(by(pop$value, pop$cluster, sum)) # between PSU variance
S2II <- by(pop$value, pop$cluster, var) # within PSU variances
D <- matrix(c(NI**2 * S2I - NI * sum(NII * S2II), NI * NII**2 * S2II), 1)
d <- as.vector(D %*% (1 / c(NI, NI * NII))) # = NI * S2I
C <- cbind(c(CI, rep(2, NI), -NII),
rbind(rep(CII / NI, 4), -diag(4), diag(4)))
c <- as.vector(c(500, rep(0, 8)))
l <- c(2, rep(4, 4))
u <- c(NI, NI * NII)
opts <- list(sense = "max_precision",
f = NULL,
df = NULL,
Hf = NULL,
init_w = 1,
mc_cores = 1L, pm_tol = 1e-05,
max_iters = 100L, print_pm = TRUE)
sol <- mosalloc(D = D, d = d, C = C, c = c, l = l, u = u, opts = opts)
# Optimum number of clusters to be drawn
sol$n[1] # [1] 2.991551
# Optimum number of elements to be drawn within clusters
sol$n[-1] / sol$n[1] # [1] 12.16454 11.60828 15.87949 12.80266
Run the code above in your browser using DataLab