# Check that aregImpute can almost exactly estimate missing values when
# there is a perfect nonlinear relationship between two variables
# Fit restricted cubic splines with 4 knots for x1 and x2, keeping x3 linear
library(Hmisc)   # provides aregImpute, fit.mult.impute, impute.transcan, naclus, reformM
set.seed(3)
x1 <- rnorm(200)
x2 <- x1^2
x3 <- runif(200)
m <- 30
x2[1:m] <- NA
a <- aregImpute(~x1+x2+I(x3), n.impute=5, nk=4, match='closest')
a
matplot(x1[1:m]^2, a$imputed$x2)
abline(a=0, b=1, lty=2)
x1[1:m]^2
a$imputed$x2
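# Optional numeric check (not in the original example): mean absolute error of
# the imputations against the known true values x1[1:m]^2; expected to be near
# zero here because x2 is an exact function of x1
mean(abs(a$imputed$x2 - x1[1:m]^2))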
# Multiple imputation and estimation of variances and covariances of
# regression coefficient estimates accounting for imputation
# Example 1: large sample size, much missing data, no overlap in
# NAs across variables
x1 <- factor(sample(c('a','b','c'),1000,TRUE))
x2 <- (x1=='b') + 3*(x1=='c') + rnorm(1000,0,2)
x3 <- rnorm(1000)
y <- x2 + 1*(x1=='c') + .2*x3 + rnorm(1000,0,2)
orig.x1 <- x1[1:250]
orig.x2 <- x2[251:350]
x1[1:250] <- NA
x2[251:350] <- NA
d <- data.frame(x1,x2,x3,y)
# Find the value of nk that yields the best-validating imputation models
# tlinear=FALSE means not to force the target variable to be linear
f <- aregImpute(~y + x1 + x2 + x3, nk=c(0,3:5), tlinear=FALSE,
                data=d, B=10)   # normally B=75
f
# Try forcing the target variable (x1, then x2) to be linear while allowing
# predictors to be nonlinear (tlinear=TRUE is the default, so it could also
# be given explicitly)
f <- aregImpute(~y + x1 + x2 + x3, nk=c(0,3:5), data=d, B=10)
f
## Not run:
# # Use 100 imputations to better check against individual true values
# f <- aregImpute(~y + x1 + x2 + x3, n.impute=100, data=d)
# f
# par(mfrow=c(2,1))
# plot(f)
# # modecat: the modal (most frequent) category of a vector of imputed values
# modecat <- function(u) {
#   tab <- table(u)
#   as.numeric(names(tab)[tab==max(tab)][1])
# }
# table(orig.x1,apply(f$imputed$x1, 1, modecat))
# par(mfrow=c(1,1))
# plot(orig.x2, apply(f$imputed$x2, 1, mean))
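# # Optionally add a line of perfect agreement, as in the first example
# abline(a=0, b=1, lty=2)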
# fmi <- fit.mult.impute(y ~ x1 + x2 + x3, lm, f,
#                        data=d)
# sqrt(diag(vcov(fmi)))
# fcc <- lm(y ~ x1 + x2 + x3)
# summary(fcc) # SEs are larger than from mult. imputation
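# # Optional side-by-side comparison of the two sets of standard errors
# # (both fits use the same coefficient names)
# cbind(MI=sqrt(diag(vcov(fmi))), CC=sqrt(diag(vcov(fcc))))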
# ## End(Not run)
## Not run:
# # Example 2: Very discriminating imputation models,
# # x1 and x2 have some NAs on the same rows, smaller n
# set.seed(5)
# x1 <- factor(sample(c('a','b','c'),100,TRUE))
# x2 <- (x1=='b') + 3*(x1=='c') + rnorm(100,0,.4)
# x3 <- rnorm(100)
# y <- x2 + 1*(x1=='c') + .2*x3 + rnorm(100,0,.4)
# orig.x1 <- x1[1:20]
# orig.x2 <- x2[18:23]
# x1[1:20] <- NA
# x2[18:23] <- NA
# #x2[21:25] <- NA
# d <- data.frame(x1,x2,x3,y)
# n <- naclus(d)
# plot(n); naplot(n) # Show patterns of NAs
# # 100 imputations to study them; normally use 5 or 10
# f <- aregImpute(~y + x1 + x2 + x3, n.impute=100, nk=0, data=d)
# par(mfrow=c(2,3))
# plot(f, diagnostics=TRUE, maxn=2)
# # Note: diagnostics=TRUE makes graphs similar to those made by:
# # r <- range(f$imputed$x2, orig.x2)
# # for(i in 1:6) {  # use 1:2 to mimic maxn=2
# #   plot(1:100, f$imputed$x2[i,], ylim=r,
# #        ylab=paste("Imputations for Obs.", i))
# #   abline(h=orig.x2[i], lty=2)
# # }
#
# table(orig.x1,apply(f$imputed$x1, 1, modecat))
# par(mfrow=c(1,1))
# plot(orig.x2, apply(f$imputed$x2, 1, mean))
#
#
# fmi <- fit.mult.impute(y ~ x1 + x2, lm, f,
#                        data=d)
# sqrt(diag(vcov(fmi)))
# fcc <- lm(y ~ x1 + x2)
# summary(fcc) # SEs are larger than from mult. imputation
# ## End(Not run)
## Not run:
# # Study the relationship between the smoothing parameter of the weighting
# # function (the multiplier of the mean absolute distance of transformed
# # predicted values, used in the tricube weighting function) and the
# # standard deviation of the multiple imputations. SDs are computed from
# # average variances across subjects. match="closest" is the same as
# # match="weighted" with a small value of fweighted.
# # This example also shows a problem with predicted mean matching: it
# # almost always gives the same imputed values when there is only one
# # predictor (regression coefficients change over the multiple imputations,
# # but the predicted values are virtually 1-1 functions of each other).
#
# set.seed(23)
# x <- runif(200)
# y <- x + runif(200, -.05, .05)
# r <- resid(lsfit(x,y))
# rmse <- sqrt(sum(r^2)/(200-2)) # sqrt of residual MSE
#
# y[1:20] <- NA
# d <- data.frame(x,y)
# f <- aregImpute(~ x + y, n.impute=10, match='closest', data=d)
# # As an aside, here is how to create a completed dataset for imputation
# # number 3, as fit.mult.impute would do automatically. In this degenerate
# # case, changing 3 to any of 1-2 or 4-10 will not alter the results.
# imputed <- impute.transcan(f, imputation=3, data=d, list.out=TRUE,
#                            pr=FALSE, check=FALSE)
# sd <- sqrt(mean(apply(f$imputed$y, 1, var)))
#
# ss <- c(0, .01, .02, seq(.05, 1, length=20))
# sds <- ss; sds[1] <- sd
#
# for(i in 2:length(ss)) {
#   f <- aregImpute(~ x + y, n.impute=10, fweighted=ss[i])
#   sds[i] <- sqrt(mean(apply(f$imputed$y, 1, var)))
# }
#
# plot(ss, sds, xlab='Smoothing Parameter', ylab='SD of Imputed Values',
#      type='b')
# abline(v=.2, lty=2) # default value of fweighted
# abline(h=rmse, lty=2) # root MSE of residuals from linear regression
# ## End(Not run)
## Not run:
# # Do a similar experiment for the Titanic dataset
# getHdata(titanic3)
# h <- lm(age ~ sex + pclass + survived, data=titanic3)
# rmse <- summary(h)$sigma
# set.seed(21)
# f <- aregImpute(~ age + sex + pclass + survived, n.impute=10,
#                 data=titanic3, match='closest')
# sd <- sqrt(mean(apply(f$imputed$age, 1, var)))
#
# ss <- c(0, .01, .02, seq(.05, 1, length=20))
# sds <- ss; sds[1] <- sd
#
# for(i in 2:length(ss)) {
#   f <- aregImpute(~ age + sex + pclass + survived, data=titanic3,
#                   n.impute=10, fweighted=ss[i])
#   sds[i] <- sqrt(mean(apply(f$imputed$age, 1, var)))
# }
#
# plot(ss, sds, xlab='Smoothing Parameter', ylab='SD of Imputed Values',
#      type='b')
# abline(v=.2, lty=2) # default value of fweighted
# abline(h=rmse, lty=2) # root MSE of residuals from linear regression
# ## End(Not run)
d <- data.frame(x1=rnorm(50), x2=c(rep(NA, 10), runif(40)),
                x3=c(runif(4), rep(NA, 11), runif(35)))
reformM(~ x1 + x2 + x3, data=d)
reformM(~ x1 + x2 + x3, data=d, nperm=2)
# Give the result (or one of the results, if nperm > 1) as the first
# argument to aregImpute
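# A minimal sketch of that usage (nk=0 keeps this small example linear):
fi <- aregImpute(reformM(~ x1 + x2 + x3, data=d), data=d, n.impute=5, nk=0)
fi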