# NOT RUN {
## `dat` is an artificial dataset that is lazy-loaded to illustrate the GIM
## method. It contains:
##   - a continuous outcome y,
##   - four covariates x1, x2, x3, x4 (x4 is character),
##   - a binary outcome d.
head(dat)

## Internal data: the first 500 samples, all columns.
dat0 <- dat[seq_len(500), ]

## Three external datasets, each keeping only the columns its working
## model(s) need. dat2 and dat3 overlap on rows 2001-2500 (500 samples).
dat1 <- dat[501:1500, c("y", "x1", "x2")]
dat2 <- dat[1501:2500, c("y", "x1", "x3", "x4")]
dat3 <- dat[2001:3000, c("y", "x3", "x4")]
## Formulas of the four working models fitted on the external datasets.
## They are kept as character strings.
form1 <- "y ~ I(x1 < 0) + I(x2 > 0)"
form2 <- "y ~ x3 + x4"
form3 <- 'y ~ I(x4 == "a")' # single quotes because the formula contains "
form4 <- "y ~ sqrt(x3)"
## Working models 3 and 4 are both fitted on dat3, so there are four working
## models in total and nsample is a 4 x 4 matrix.
## The values mirror how the external data were cut from `dat`:
##   - diagonal: 1000 samples behind each working model;
##   - models 2 vs 3 and 2 vs 4: 500 (the overlap between dat2 and dat3);
##   - models 3 vs 4: 1000 (same dataset);
##   - model 1 shares no samples with the others.
## The matrix is symmetric, so the row-wise layout below equals a
## column-wise fill of the same values.
nsample <- matrix(
  c(
    1000,    0,    0,    0,
       0, 1000,  500,  500,
       0,  500, 1000, 1000,
       0,  500, 1000, 1000
  ),
  nrow = 4,
  ncol = 4,
  byrow = TRUE
)
## Fit each working model on its external dataset and keep the coefficient
## table of the lm summary (one row per term; column 1 is the estimate used
## further down). coef() is used rather than `$coef`, which only worked
## through partial matching of the `coefficients` element.
fit1 <- coef(summary(lm(form1, data = dat1)))
fit2 <- coef(summary(lm(form2, data = dat2)))
fit3 <- coef(summary(lm(form3, data = dat3))) ## <-- dat3 is used twice
fit4 <- coef(summary(lm(form4, data = dat3))) ## <-- dat3 is used twice
## Keep character columns as character (not factor) in the data frames built
## below; this is already the default from R 4.0.0 onwards.
options(stringsAsFactors = FALSE)

## Summary information passed to gim: one entry per working model, each
## holding the model formula (`form`) and a data frame (`info`) with the
## names (`var`) and estimates (`bet`) of the coefficients that are available.
model <- list(
  ## model 1: partial information is available (second coefficient row only)
  list(
    form = form1,
    info = data.frame(var = rownames(fit1)[2], bet = fit1[2, 1])
  ),
  ## model 2: the intercept is provided, but the estimate of one covariate
  ## is missing (only the first two coefficient rows are passed on)
  list(
    form = form2,
    info = data.frame(var = rownames(fit2)[1:2], bet = fit2[1:2, 1])
  ),
  ## model 3: second coefficient row only
  list(
    form = form3,
    info = data.frame(var = rownames(fit3)[2], bet = fit3[2, 1])
  ),
  ## model 4: second coefficient row only
  list(
    form = form4,
    info = data.frame(var = rownames(fit4)[2], bet = fit4[2, 1])
  )
)
## Full model of interest, fitted on the internal data dat0 while
## integrating the summary information in `model`.
form <- "y ~ I(x1 < 0) + I(x1 > 1) + x2 * x4 + log(x3) - 1"
fit <- gim(form, "gaussian", dat0, model, nsample)

## Inspect the gim fit.
summary(fit)
coef(fit)
confint(fit)

## For comparison: the same model estimated from the internal data alone.
fit0 <- lm(form, data = dat0)
summary(fit0)
# By default, the covariates of the internal data (dat0 here) are used as
# the reference in gim, which assumes that the external and internal
# populations are the same. Passing ref = dat0 explicitly should therefore
# reproduce the fit above.
# Note: fit1 is reused here; the lm coefficient table it held earlier has
# already been copied into `model`.
fit1 <- gim(form, "gaussian", dat0, model, nsample, ref = dat0)
# Compare with a numeric tolerance instead of exact `==` on doubles.
isTRUE(all.equal(coef(fit), coef(fit1))) # TRUE
# If an additional reference sample is available, and it comes from the same
# internal population that dat0 was sampled from, gim can use it.
add.ref <- dat[3001:3500, ]
add.ref$y <- NA ## <-- the outcome is unavailable in the reference sample
ref <- rbind(dat0, add.ref)
fit2 <- gim(form, "gaussian", dat0, model, nsample, ref = ref)

# If the external population differs from the internal population, a
# reference for the summary data specified in `model` must be provided.
## <-- as an example, assume ext.ref is different from dat0
ext.ref <- dat[3501:4000, ]
fit3 <- gim(form, "gaussian", dat0, model, nsample, ref = ext.ref)
# }
# Run the code above in your browser using DataLab