# NOT RUN {
# Compare CV MSEs and their SEs under 3 linear models
# for a stratified sample and a one-stage cluster sample,
# using data from the `survey` package.
# `cv.svydesign()` is called with an explicit `surveyCV::` prefix so this
# example also works when run as a standalone script, where the surveyCV
# package has not been attached.
library(survey)
data("api", package = "survey")

# Stratified sample: no clustering (`id = ~1`), strata given by `stype`,
# sampling weights `pw`, and finite population correction `fpc`.
dstrat <- svydesign(
  id = ~1, strata = ~stype, weights = ~pw, data = apistrat,
  fpc = ~fpc
)

# 5-fold CV over three nested models, each adding one more predictor.
surveyCV::cv.svydesign(
  formulae = c(
    "api00~ell",
    "api00~ell+meals",
    "api00~ell+meals+mobility"
  ),
  design_object = dstrat, nfolds = 5
)
# One-stage cluster sample: clusters are identified by `dnum`, with
# sampling weights `pw` and finite population correction `fpc`.
# `apiclus1` comes from the `api` data loaded above, and `svydesign()`
# from the attached `survey` package.
dclus1 <- svydesign(id = ~dnum, weights = ~pw, data = apiclus1, fpc = ~fpc)

# Same three candidate models as for the stratified sample, again with
# 5 folds. The `surveyCV::` prefix lets the call resolve even when the
# surveyCV package is not attached.
surveyCV::cv.svydesign(
  formulae = c(
    "api00~ell",
    "api00~ell+meals",
    "api00~ell+meals+mobility"
  ),
  design_object = dclus1, nfolds = 5
)
# Compare CV MSEs and their SEs under 3 linear models
# for a stratified cluster sample with clusters nested within strata.
# The dataset is requested from surveyCV by name, so it loads even when
# that package is not attached.
data("NSFG_data", package = "surveyCV")
# `splines` supplies `ns()`, which is used inside the model formulas below.
library(splines)

# `nest = TRUE`: cluster ids (`SECU`) are treated as nested within `strata`.
NSFG.svydes <- svydesign(
  id = ~SECU, strata = ~strata, nest = TRUE,
  weights = ~wgt, data = NSFG_data
)

# 4-fold CV over natural-spline fits of income on age with 2, 3, and 4 df.
surveyCV::cv.svydesign(
  formulae = c(
    "income ~ ns(age, df = 2)",
    "income ~ ns(age, df = 3)",
    "income ~ ns(age, df = 4)"
  ),
  design_object = NSFG.svydes, nfolds = 4
)
# Logistic regression example, using the same stratified cluster sample
# (the `NSFG.svydes` design built above; `ns()` needs `splines` attached);
# instead of CV MSE, we calculate CV binary cross-entropy loss,
# where (as with MSE) lower values indicate better fitting models.
# (NOTE: na.rm = TRUE is not usually ideal;
# it's used below purely for convenience, to keep the example short,
# but a thorough analysis would look for better ways to handle the
# missing data.)
# The `surveyCV::` prefix lets the call resolve even when the surveyCV
# package is not attached.
surveyCV::cv.svydesign(
  formulae = c(
    "KnowPreg ~ ns(age, df = 1)",
    "KnowPreg ~ ns(age, df = 2)",
    "KnowPreg ~ ns(age, df = 3)"
  ),
  design_object = NSFG.svydes, nfolds = 4,
  method = "logistic", na.rm = TRUE
)
# }
# Run the code above in your browser using DataLab