if (FALSE) {
#############################################################################
# EXAMPLE 1: Simulated example linear regression with interaction effects
#############################################################################
# The interaction model stats::lm( Y ~ X + Z + X:Z) is of substantive interest.
# There can be missing values in all variables.
data(data.mb01)
dat <- data.mb01$missing
#******************************************
# Model 1: ML approach
#--- specify models
# define integration nodes
xnodes <- seq(-4,4,len=11) # nodes for X
ynodes <- seq(-10,10,len=13)
# nodes for Y. These ynodes are not really necessary for this dataset because
# Y has no missing values.
# define model for dependent variable Y
dep <- list("model"="linreg", "formula"=Y ~ X*Z, "nodes"=ynodes )
# model P(X|Z)
ind_X <- list( "model"="linreg", "formula"=X ~ Z, "nodes"=xnodes )
# all models for covariates
ind <- list( "X"=ind_X )
#--- estimate model with numerical integration
mod1 <- mdmb::frm_em(dat=dat, dep=dep, ind=ind )
summary(mod1)
# extract some informations
coef(mod1)
vcov(mod1)
logLik(mod1)
#******************************************
# Model 2: Fully Bayesian approach / Multiple Imputation
#--- define models
dep <- list("model"="linreg", "formula"=Y ~ X*Z )
ind_X <- list( "model"="linreg", "formula"=X ~ Z )
ind_Z <- list( "model"="linreg", "formula"=Z ~ 1 )
ind <- list( "X"=ind_X, Z=ind_Z)
#--- estimate model
mod2 <- mdmb::frm_fb(dat, dep, ind, burnin=200, iter=1000)
summary(mod2)
#* plot parameters
plot(mod2)
#--- create list of multiply imputed datasets
datlist <- mdmb::frm2datlist(mod2)
# convert into object of class mids
imp2 <- miceadds::datlist2mids(datlist)
# estimate linear model on multiply imputed datasets
mod2c <- with(imp2, stats::lm( Y ~ X*Z ) )
summary( mice::pool(mod2c) )
#******************************************
# Model 3: Multiple imputation in jomo package
library(jomo)
# impute with substantive model containing interaction effects
formula <- Y ~ X*Z
imp <- jomo::jomo.lm( formula=formula, data=dat, nburn=100, nbetween=100)
# convert to object of class mids
datlist <- miceadds::jomo2mids( imp )
# estimate linear model
mod3 <- with(datlist, lm( Y ~ X*Z ) )
summary( mice::pool(mod3) )
#############################################################################
# EXAMPLE 2: Simulated example logistic regression with interaction effects
#############################################################################
# Interaction model within a logistic regression Y ~ X + Z + X:Z
# Y and Z are dichotomous variables.
# attach data
data(data.mb02)
dat <- data.mb02$missing
#******************************************
# Model 1: ML approach
#--- specify model
# define nodes
xnodes <- seq(-5,5,len=15) # X - normally distributed variable
ynodes <- c(0,1) # Y and Z dichotomous variable
# model P(Y|X,Z) for dependent variable
dep <- list("model"="logistic", "formula"=Y ~ X*Z, "nodes"=ynodes )
# model P(X|Z)
ind_X <- list( "model"="linreg", "formula"=X ~ Z, "nodes"=xnodes )
# model P(Z)
ind_Z <- list( "model"="logistic", "formula"=Z ~ 1, "nodes"=ynodes )
ind <- list( "Z"=ind_Z, "X"=ind_X )
#--- estimate model with numerical integration
mod1 <- mdmb::frm_em(dat=dat, dep=dep, ind=ind )
summary(mod1)
#******************************************
# Model 2: Fully Bayesian approach
#--- specify model
dep <- list("model"="logistic", "formula"=Y ~ X*Z )
ind_X <- list( "model"="linreg", "formula"=X ~ Z )
ind_Z <- list( "model"="logistic", "formula"=Z ~ 1 )
ind <- list( "Z"=ind_Z, "X"=ind_X )
#--- Bayesian estimation
mod2 <- mdmb::frm_fb(dat=dat, dep=dep, ind=ind, burnin=500, iter=1000 )
summary(mod2)
#############################################################################
# EXAMPLE 3: Confirmatory factor analysis
#############################################################################
# A latent variable can be considered as missing data and the 'frm_em' function
# is used to estimate the latent variable model.
#--- simulate data
N <- 1000
set.seed(91834)
# latent variable
theta <- stats::rnorm(N)
# simulate items
y1 <- 1.5 + 1*theta + stats::rnorm(N, sd=.7 )
y2 <- 1.9 + .7*theta + stats::rnorm(N, sd=1 )
y3 <- .9 + .7*theta + stats::rnorm(N, sd=.2 )
dat <- data.frame(y1,y2,y3)
dat$theta <- NA
#******************************************
# Model 1: ML approach
#--- define model
nodes <- seq(-4,4,len=21)
ind_y1 <- list("model"="linreg", "formula"=y1 ~ offset(1*theta),
"nodes"=nodes )
ind_y2 <- list( "model"="linreg", "formula"=y2 ~ theta, "nodes"=nodes,
"coef_inits"=c(NA,1) )
ind_y3 <- list( "model"="linreg", "formula"=y3 ~ theta, "nodes"=nodes,
"coef_inits"=c(1,1) )
dep <- list( "model"="linreg", "formula"=theta ~ 0, "nodes"=nodes )
ind <- list( "y1"=ind_y1, "y2"=ind_y2, "y3"=ind_y3)
#*** estimate model with mdmb::frm_em
mod1 <- mdmb::frm_em(dat, dep, ind)
summary(mod1)
#*** estimate model in lavaan
library(lavaan)
lavmodel <- "
theta=~ 1*y1 + y2 + y3
theta ~~ theta
"
mod1b <- lavaan::cfa( model=lavmodel, data=dat )
summary(mod1b)
# compare likelihood
logLik(mod1)
logLik(mod1b)
#############################################################################
# EXAMPLE 4: Rasch model
#############################################################################
#--- simulate data
set.seed(91834)
N <- 500
# latent variable
theta0 <- theta <- stats::rnorm(N)
# number of items
I <- 7
dat <- sirt::sim.raschtype( theta, b=seq(-1.5,1.5,len=I) )
colnames(dat) <- paste0("I",1:I)
dat$theta <- NA
#******************************************
# Model 1: ML approach
#--- define model
nodes <- seq(-4,4,len=13)
dep <- list("model"="linreg", "formula"=theta ~ 0, "nodes"=nodes )
ind <- list()
for (ii in 1:I){
ind_ii <- list( "model"="logistic", formula=
stats::as.formula( paste0("I",ii, " ~ offset(1*theta)") ) )
ind[[ii]] <- ind_ii
}
names(ind) <- colnames(dat)[-(I+1)]
#--- estimate Rasch model with mdmb::frm_em
mod1 <- mdmb::frm_em(dat, dep, ind )
summary(mod1)
#--- estmate Rasch model with sirt package
library(sirt)
mod2 <- sirt::rasch.mml2( dat[,-(I+1)], theta.k=nodes, use.freqpatt=FALSE)
summary(mod2)
#** compare estimated parameters
round(cbind(coef(mod1), c( mod2$sd.trait, -mod2$item$thresh[ seq(I,1)] ) ), 3)
#############################################################################
# EXAMPLE 5: Regression model with measurement error in covariates
#############################################################################
#--- simulate data
set.seed(768)
N <- 1000
# true score
theta <- stats::rnorm(N)
# heterogeneous error variance
var_err <- stats::runif(N, .5, 1)
# simulate observed score
x <- theta + stats::rnorm(N, sd=sqrt(var_err) )
# simulate outcome
y <- .3 + .7 * theta + stats::rnorm( N, sd=.8 )
dat0 <- dat <- data.frame( y=y, x=x, theta=theta )
#*** estimate model with true scores (which are unobserved in real datasets)
mod0 <- stats::lm( y ~ theta, data=dat0 )
summary(mod0)
#******************************************
# Model 1: Model-based approach
#--- specify model
dat$theta <- NA
nodes <- seq(-4,4,len=15)
dep <- list( "model"="linreg", "formula"=y ~ theta, "nodes"=nodes,
"coef_inits"=c(NA, .4 ) )
ind <- list()
ind[["theta"]] <- list( "model"="linreg", "formula"=theta ~ 1,
"nodes"=nodes )
ind[["x"]] <- list( "model"="linreg", "formula"=x ~ 0 + offset(theta),
"nodes"=nodes )
# assumption of heterogeneous known error variance
ind[["x"]]$sigma_fixed <- sqrt( var_err )
#--- estimate regression model
mod1 <- mdmb::frm_em(dat, dep, ind )
summary(mod1)
#******************************************
# Model 2: Fully Bayesian estimation
#--- specify model
dep <- list( "model"="linreg", "formula"=y ~ theta )
ind <- list()
ind[["theta"]] <- list( "model"="linreg", "formula"=theta ~ 1 )
ind[["x"]] <- list( "model"="linreg", "formula"=x ~ 0 + offset(theta) )
# assumption of heterogeneous known error variance
ind[["x"]]$sigma_fixed <- sqrt( var_err )
data_init <- dat
data_init$theta <- dat$x
# estimate model
mod2 <- mdmb::frm_fb(dat, dep, ind, burnin=200, iter=1000, data_init=data_init)
summary(mod2)
plot(mod2)
#############################################################################
# EXAMPLE 6: Non-normally distributed covariates:
# Positive values with Box-Cox transformation
#############################################################################
# simulate data with chi-squared distributed covariate from
# regression model Y ~ X
set.seed(876)
n <- 1500
df <- 2
x <- stats::rchisq( n, df=df )
x <- x / sqrt( 2*df)
y <- 0 + 1*x
R2 <- .25 # explained variance
y <- y + stats::rnorm(n, sd=sqrt( (1-R2)/R2 * 1) )
dat0 <- dat <- data.frame( y=y, x=x )
# simulate missing responses
prop_miss <- .5
cor_miss <- .7
resp_tend <- cor_miss*(dat$y-mean(y) )/ stats::sd(y) +
stats::rnorm(n, sd=sqrt( 1 - cor_miss^2) )
dat[ resp_tend < stats::qnorm( prop_miss ), "x" ] <- NA
summary(dat)
#-- complete data
mod0 <- stats::lm( y ~ x, data=dat0 )
summary(mod0)
#-- listwise deletion
mod1 <- stats::lm( y ~ x, data=dat )
summary(mod1)
# normal distribution assumption for frm
# define models
dep <- list("model"="linreg", "formula"=y ~ x )
# normally distributed data
ind_x1 <- list( "model"="linreg", "formula"=x ~ 1 )
# Box-Cox normal distribution
ind_x2 <- list( "model"="bctreg", "formula"=x ~ 1,
nodes=c( seq(0.05, 3, len=31), seq( 3.5, 9, by=.5 ) ) )
ind1 <- list( "x"=ind_x1 )
ind2 <- list( "x"=ind_x2 )
#--- incorrect normal distribution assumption
mod1 <- mdmb::frm_em(dat=dat, dep=dep, ind=ind1 )
summary(mod1)
#--- model chi-square distribution of predictor with Box-Cox transformation
mod2 <- mdmb::frm_em(dat=dat, dep=dep, ind=ind2 )
summary(mod2)
#############################################################################
# EXAMPLE 7: Latent interaction model
#############################################################################
# A latent interaction model Y ~ FX + FZ is of interest. Y is directly observed,
# FX and FZ are both indirectly observed by three items
#--- simulate data
N <- 1000
set.seed(987)
# latent variable
FX <- stats::rnorm(N)
FZ <- stats::rnorm(N)
# simulate items
x1 <- 1.5 + 1*FX + stats::rnorm(N, sd=.7 )
x2 <- 1.9 + .7*FX + stats::rnorm(N, sd=1 )
x3 <- .9 + .7*FX + stats::rnorm(N, sd=.2 )
z1 <- 1.5 + 1*FZ + stats::rnorm(N, sd=.7 )
z2 <- 1.9 + .7*FZ + stats::rnorm(N, sd=1 )
z3 <- .9 + .7*FZ + stats::rnorm(N, sd=.2 )
dat <- data.frame(x1,x2,x3,z1,z2,z3)
dat$FX <- NA
dat$FZ <- NA
dat$y <- 2 + .5*FX + .3*FZ + .4*FX*FZ + rnorm( N, sd=1 )
# estimate interaction model with ML
#--- define model
nodes <- seq(-4,4,len=11)
ind_x1 <- list("model"="linreg", "formula"=x1 ~ offset(1*FX),
"nodes"=nodes )
ind_x2 <- list( "model"="linreg", "formula"=x2 ~ FX, "nodes"=nodes,
"coef_inits"=c(NA,1) )
ind_x3 <- list( "model"="linreg", "formula"=x3 ~ FX, "nodes"=nodes,
"coef_inits"=c(1,1) )
ind_FX <- list( "model"="linreg", "formula"=FX ~ 0, "nodes"=nodes )
ind_z1 <- list("model"="linreg", "formula"=z1 ~ offset(1*FZ),
"nodes"=nodes )
ind_z2 <- list( "model"="linreg", "formula"=z2 ~ FZ, "nodes"=nodes,
"coef_inits"=c(NA,1) )
ind_z3 <- list( "model"="linreg", "formula"=z3 ~ FZ, "nodes"=nodes,
"coef_inits"=c(1,1) )
ind_FZ <- list( "model"="linreg", "formula"=FZ ~ 0 + FX, "nodes"=nodes )
ind <- list( "x1"=ind_x1, "x2"=ind_x2, "x3"=ind_x3, "FX"=ind_FX,
"z1"=ind_z1, "z2"=ind_z2, "z3"=ind_z3, "FX"=ind_FZ )
dep <- list( "model"="linreg", formula=y ~ FX+FZ+FX*FZ, "coef_inits"=c(1,.2,.2,0) )
#*** estimate model with mdmb::frm_em
mod1 <- mdmb::frm_em(dat, dep, ind)
summary(mod1)
#############################################################################
# EXAMPLE 8: Non-ignorable data in Y
#############################################################################
# regression Y ~ X in which Y is missing depending Y itself
library(mvtnorm)
cor_XY <- .4 # correlation between X and Y
prop_miss <- .5 # missing proportion
cor_missY <- .7 # correlation with missing propensity
N <- 3000 # sample size
#----- simulate data
set.seed(790)
Sigma <- matrix( c(1, cor_XY, cor_XY, 1), 2, 2 )
mu <- c(0,0)
dat <- mvtnorm::rmvnorm( N, mean=mu, sigma=Sigma )
colnames(dat) <- c("X","Y")
dat <- as.data.frame(dat)
#-- generate missing responses on Y depending on Y itself
y1 <- dat$Y
miss_tend <- cor_missY * y1 + rnorm(N, sd=sqrt( 1 - cor_missY^2) )
dat$Y[ miss_tend < quantile( miss_tend, prop_miss ) ] <- NA
#--- ML estimation under assumption of ignorability
nodes <- seq(-5,5,len=15)
dep <- list("model"="linreg", "formula"=Y ~ X, "nodes"=nodes )
ind_X <- list( "model"="linreg", "formula"=X ~ 1, "nodes"=nodes )
ind <- list( "X"=ind_X )
mod1 <- mdmb::frm_em(dat=dat, dep=dep, ind=ind)
summary(mod1)
#--- ML estimation under assumption with specifying a model for non-ignorability
# for response indicator resp_Y
dat$resp_Y <- 1* ( 1 - is.na(dat$Y) )
dep <- list("model"="linreg", "formula"=Y ~ X, "nodes"=nodes )
ind_X <- list( "model"="linreg", "formula"=X ~ 1, "nodes"=nodes )
ind_respY <- list( "model"="logistic", "formula"=resp_Y ~ Y, "nodes"=nodes )
ind <- list( "X"=ind_X, "resp_Y"=ind_respY )
mod2 <- mdmb::frm_em(dat=dat, dep=dep, ind=ind)
summary(mod2)
#############################################################################
# EXAMPLE 9: Ordinal variables: Graded response model
#############################################################################
#--- simulate data
N <- 2000
set.seed(91834)
# latent variable
theta <- stats::rnorm(N)
# simulate items
y1 <- 1*theta + stats::rnorm(N)
y2 <- .7*theta + stats::rnorm(N)
y3 <- .7*theta + stats::rnorm(N)
# discretize variables
y1 <- as.numeric( cut( y1, breaks=c(-Inf, -.5, 0.4, 1, Inf ) ) ) - 1
y2 <- as.numeric( cut( y2, breaks=c(-Inf, 0.3, 1, Inf ) ) ) - 1
y3 <- as.numeric( cut( y3, breaks=c(-Inf, .2, Inf ) ) ) - 1
# define dataset
dat <- data.frame(y1,y2,y3)
dat$theta <- NA
#******************************************
# Model 1: Fully Bayesian estimation
#--- define model
ind_y1 <- list( "model"="oprobit", "formula"=y1 ~ offset(1*theta) )
ind_y2 <- list( "model"="oprobit", "formula"=y2 ~ theta )
ind_y3 <- list( "model"="oprobit", "formula"=y3 ~ theta )
dep <- list( "model"="linreg", "formula"=theta ~ 0 )
ind <- list( "y1"=ind_y1, "y2"=ind_y2, "y3"=ind_y3)
# initial data
data_init <- dat
data_init$theta <- as.numeric( scale(dat$y1) ) + stats::rnorm(N, sd=.4 )
#-- estimate model
iter <- 3000; burnin <- 1000
mod1 <- mdmb::frm_fb(dat=dat, dep=dep, ind=ind, data_init=data_init,
iter=iter, burnin=burnin)
summary(mod1)
plot(mod1)
#############################################################################
# EXAMPLE 10: Imputation for missig predictors in models with interaction
# effects in multilevel regression models
#############################################################################
library(miceadds)
data(data.mb04, package="mdmb")
dat <- data.mb04
#*** model specification
mcmc_iter <- 4 # number of MCMC iterations for model parameter sampling
model_formula <- y ~ cwc(x, idcluster) + gm(x, idcluster) + w + w*cwc(x, idcluster) +
w*gm(x, idcluster) + ( 1 + cwc(x, idcluster) | idcluster)
dep <- list("model"="mlreg", "formula"=model_formula,
R_args=list(iter=mcmc_iter, outcome="normal") )
ind_x <- list( "model"="mlreg", "formula"=x ~ w + (1|idcluster), R_args=list(iter=mcmc_iter),
sampling_level="idcluster" )
# group means of x are involved in the outcome model. Therefore, Metropolis-Hastings
# sampling of missing values in x should be conducted at the level of clusters,
# i.e. specifying sampling_level
ind <- list("x"=ind_x)
# --- estimate model
mod1 <- mdmb::frm_fb(dat, dep, ind, aggregation=TRUE)
# argument aggregation is necessary because group means are involved in regression formulas
#-------------
#*** imputation of a continuous level-2 variable w
# create artificially some missings on w
dat[ dat$idcluster %%3==0, "w" ] <- NA
# define level-2 model with argument variable_level
ind_w <- list( "model"="linreg", "formula"=w ~ 1, "variable_level"="idcluster" )
ind <- list( x=ind_x, w=ind_w)
#* conduct imputations
mod2 <- mdmb::frm_fb(dat, dep, ind, aggregation=TRUE)
summary(mod2)
#--- Model 1 with user-defined prior distributions for covariance matrices
model_formula <- y ~ cwc(x, idcluster) + gm(x, idcluster) + w + w*cwc(x, idcluster) +
w*gm(x, idcluster) + ( 1 + cwc(x, idcluster) | idcluster)
# define scale degrees of freedom (nu) and scale matrix (S) for inverse Wishart distribution
psi_nu0_list <- list( -3 )
psi_S0_list <- list( diag(0,2) )
dep <- list("model"="mlreg", "formula"=model_formula,
R_args=list(iter=mcmc_iter, outcome="normal",
psi_nu0_list=psi_nu0_list, psi_S0_list=psi_S0_list ) )
# define nu and S parameters for covariate model
psi_nu0_list <- list( .4 )
psi_S0_list <- list( matrix(.2, nrow=1, ncol=1) )
ind_x <- list( "model"="mlreg", "formula"=x ~ w + (1|idcluster),
R_args=list(iter=mcmc_iter, psi_nu0_list=psi_nu0_list,
psi_S0_list=psi_S0_list),
sampling_level="idcluster" )
ind <- list("x"=ind_x)
# --- estimate model
mod3 <- mdmb::frm_fb(dat, dep, ind, aggregation=TRUE)
#############################################################################
# EXAMPLE 11: Bounded variable combined with Yeo-Johnson transformation
#############################################################################
#*** simulate data
set.seed(876)
n <- 1500
x <- mdmb::ryjt_scaled( n, location=-.2, shape=.8, lambda=.9, probit=TRUE)
R2 <- .25 # explained variance
y <- 1*x + stats::rnorm(n, sd=sqrt( (1-R2)/R2 * stats::var(x)) )
dat0 <- dat <- data.frame( y=y, x=x )
# simulate missing responses
prop_miss <- .5
cor_miss <- .7
resp_tend <- cor_miss*(dat$y-mean(y) )/ stats::sd(y) +
stats::rnorm(n, sd=sqrt( 1 - cor_miss^2) )
dat[ resp_tend < stats::qnorm(prop_miss), "x" ] <- NA
summary(dat)
#*** define models
dep <- list("model"="linreg", "formula"=y ~ x )
# distribution according to Yeo-Johnson transformation
ind_x1 <- list( "model"="yjtreg", "formula"=x ~ 1 )
# distribution according to Probit Yeo-Johnson transformation
ind_x2 <- list( "model"="yjtreg", "formula"=x ~ 1, R_args=list("probit"=TRUE ) )
ind1 <- list( "x"=ind_x1 )
ind2 <- list( "x"=ind_x2 )
#--- complete data
mod0 <- stats::lm( y~x, data=dat0)
summary(mod0)
#--- Yeo-Johnson normal distribution (for unbounded variables)
mod1 <- mdmb::frm_em(dat=dat, dep=dep, ind=ind1 )
summary(mod1)
#--- Probit Yeo-Johnson normal distribution (for bounded variable on (0,1))
mod2 <- mdmb::frm_em(dat=dat, dep=dep, ind=ind2)
summary(mod2)
#--- same model, but MCMC estimation
mod3 <- mdmb::frm_fb(dat, dep, ind=ind2, burnin=2000, iter=5000)
summary(mod3)
plot(mod3)
#############################################################################
# EXAMPLE 12: Yeo-Johnson transformation with estimated degrees of freedom
#############################################################################
#*** simulate data
set.seed(876)
n <- 1500
x <- mdmb::ryjt_scaled( n, location=-.2, shape=.8, lambda=.9, df=10 )
R2 <- .25 # explained variance
y <- 1*x + stats::rnorm(n, sd=sqrt( (1-R2)/R2 * stats::var(x)) )
dat0 <- dat <- data.frame( y=y, x=x )
# simulate missing responses
prop_miss <- .5
cor_miss <- .7
resp_tend <- cor_miss*(dat$y-mean(y) )/ stats::sd(y) +
stats::rnorm(n, sd=sqrt( 1-cor_miss^2) )
dat[ resp_tend < stats::qnorm(prop_miss), "x" ] <- NA
summary(dat)
#*** define models
dep <- list("model"="linreg", "formula"=y ~ x )
# specify distribution with estimated degrees of freedom
ind_x <- list( "model"="yjtreg", "formula"=x ~ 1, R_args=list(est_df=TRUE ) )
ind <- list( "x"=ind_x )
#--- Yeo-Johnson t distribution
mod1 <- mdmb::frm_fb(dat=dat, dep=dep, ind=ind, iter=3000, burnin=1000 )
summary(mod1)
}
Run the code above in your browser using DataLab