importData: Import data, define various nodes, define dummies for factor columns and define OData R6 object

Description

Import data, define various nodes, define dummies for factor columns and define OData R6 object

Usage

importData(data, ID = "Subject_ID", t_name = "time_period", covars, CENS = "C", TRT = "A", MONITOR = "N", OUTCOME = "Y", noCENScat = 0L, verbose = getOption("stremr.verbose"))

Arguments

data

Input data in long format. Can be a data.frame or a data.table with named columns, containing the time-varying covariates (covars), the right-censoring event indicator(s) (CENS), the exposure variable(s) (TRT), the monitoring process variable(s) (MONITOR) and the survival OUTCOME variable (OUTCOME).

Unique subject identifier column name in data.

t_name

The name of the time/period variable in data.

covars

Vector of names with time varying and baseline covariates in data. This argument does not need to be specified, by default all variables that are not in ID, t, CENS, TRT, MONITOR and OUTCOME will be considered as covariates.

CENS

Column name of the censoring variable(s) in data. Each separate variable specified in CENS can be either binary (0/1 valued integer) or categorical (integer). For binary indicators of CENSoring, the value of 1 indicates the CENSoring or end of follow-up event (this cannot be changed). For categorical CENSoring variables, by default the value of 0 indicates no CENSoring / continuation of follow-up and other values indicate different reasons for CENSoring. Use the argument noCENScat to change the reference (continuation of follow-up) category from default 0 to any other value. (NOTE: Changing noCENScat has zero effect on coding of the binary CENSoring variables, those have to always use 1 to code the CENSoring event). Note that factors are not allowed in CENS.

TRT

A column name in data for the exposure/treatment variable(s).

MONITOR

A column name in data for the indicator(s) of monitoring events.

OUTCOME

A column name in data for the survival OUTCOME variable name, code as 1 for the outcome event.

noCENScat

The level (integer) that indicates CONTINUATION OF FOLLOW-UP for ALL censoring variables. Defaults is 0. Use this to modify the default reference category (no CENSoring / continuation of follow-up) for variables specifed in CENS.

verbose

Set to TRUE to print messages on status and information to the console. Turn this on by default using options(stremr.verbose=TRUE).

Value

...

Examples

Run this code

options(stremr.verbose = TRUE)
require("data.table")
set_all_stremr_options(fit.package = "speedglm", fit.algorithm = "glm")

# ----------------------------------------------------------------------
# Simulated Data
# ----------------------------------------------------------------------
data(OdataNoCENS)
OdataDT <- as.data.table(OdataNoCENS, key=c(ID, t))

# define lagged N, first value is always 1 (always monitored at the first time point):
OdataDT[, ("N.tminus1") := shift(get("N"), n = 1L, type = "lag", fill = 1L), by = ID]
OdataDT[, ("TI.tminus1") := shift(get("TI"), n = 1L, type = "lag", fill = 1L), by = ID]

# ----------------------------------------------------------------------
# Define intervention (always treated):
# ----------------------------------------------------------------------
OdataDT[, ("TI.set1") := 1L]
OdataDT[, ("TI.set0") := 0L]

# ----------------------------------------------------------------------
# Import Data
# ----------------------------------------------------------------------
OData <- importData(OdataDT, ID = "ID", t = "t", covars = c("highA1c", "lastNat1", "N.tminus1"),
                    CENS = "C", TRT = "TI", MONITOR = "N", OUTCOME = "Y.tplus1")

# ----------------------------------------------------------------------
# Model the Propensity Scores
# ----------------------------------------------------------------------
gform_CENS <- "C ~ highA1c + lastNat1"
gform_TRT = "TI ~ CVD + highA1c + N.tminus1"
gform_MONITOR <- "N ~ 1"
stratify_CENS <- list(C=c("t < 16", "t == 16"))

# ----------------------------------------------------------------------
# Fit Propensity Scores
# ----------------------------------------------------------------------
OData <- fitPropensity(OData, gform_CENS = gform_CENS,
                        gform_TRT = gform_TRT,
                        gform_MONITOR = gform_MONITOR,
                        stratify_CENS = stratify_CENS)

# ----------------------------------------------------------------------
# IPW Ajusted KM or Saturated MSM
# ----------------------------------------------------------------------
require("magrittr")
AKME.St.1 <- getIPWeights(OData, intervened_TRT = "TI.set1") %>%
             survNPMSM(OData) %$%
             IPW_estimates
AKME.St.1

# ----------------------------------------------------------------------
# Bounded IPW
# ----------------------------------------------------------------------
IPW.St.1 <- getIPWeights(OData, intervened_TRT = "TI.set1") %>%
             survDirectIPW(OData)
IPW.St.1[]

# ----------------------------------------------------------------------
# IPW-MSM for hazard
# ----------------------------------------------------------------------
wts.DT.1 <- getIPWeights(OData = OData, intervened_TRT = "TI.set1", rule_name = "TI1")
wts.DT.0 <- getIPWeights(OData = OData, intervened_TRT = "TI.set0", rule_name = "TI0")
survMSM_res <- survMSM(list(wts.DT.1, wts.DT.0), OData, t_breaks = c(1:8,12,16)-1,)
survMSM_res$St

# ----------------------------------------------------------------------
# Sequential G-COMP
# ----------------------------------------------------------------------
t.surv <- c(0:15)
Qforms <- rep.int("Q.kplus1 ~ CVD + highA1c + N + lastNat1 + TI + TI.tminus1", (max(t.surv)+1))
params = list(fit.package = "speedglm", fit.algorithm = "glm")

## Not run: 
# gcomp_est <- fitSeqGcomp(OData, t_periods = t.surv, intervened_TRT = "TI.set1",
#                           Qforms = Qforms, params_Q = params, stratifyQ_by_rule = FALSE)
# gcomp_est[]
# ## End(Not run)
# ----------------------------------------------------------------------
# TMLE
# ----------------------------------------------------------------------
## Not run: 
# tmle_est <- fitTMLE(OData, t_periods = t.surv, intervened_TRT = "TI.set1",
#                     Qforms = Qforms, params_Q = params, stratifyQ_by_rule = TRUE)
# tmle_est[]
# ## End(Not run)

# ----------------------------------------------------------------------
# Running IPW-Adjusted KM with optional user-specified weights:
# ----------------------------------------------------------------------
addedWts_DT <- OdataDT[, c("ID", "t"), with = FALSE]
addedWts_DT[, new.wts := sample.int(10, nrow(OdataDT), replace = TRUE)/10]
survNP_res_addedWts <- survNPMSM(wts.DT.1, OData, weights = addedWts_DT)

# ----------------------------------------------------------------------
# Multivariate Propensity Score Regressions
# ----------------------------------------------------------------------
gform_CENS <- "C + TI + N ~ highA1c + lastNat1"
OData <- fitPropensity(OData, gform_CENS = gform_CENS, gform_TRT = gform_TRT,
                        gform_MONITOR = gform_MONITOR)

# ----------------------------------------------------------------------
# Fitting Propensity scores with Random Forests:
# ----------------------------------------------------------------------
## Not run: 
# set_all_stremr_options(fit.package = "h2o", fit.algorithm = "randomForest")
# require("h2o")
# h2o::h2o.init(nthreads = -1)
# gform_CENS <- "C ~ highA1c + lastNat1"
# OData <- fitPropensity(OData, gform_CENS = gform_CENS,
#                         gform_TRT = gform_TRT,
#                         gform_MONITOR = gform_MONITOR,
#                         stratify_CENS = stratify_CENS)
# 
# # For Gradient Boosting machines:
# set_all_stremr_options(fit.package = "h2o", fit.algorithm = "gbm")
# # Use `H2O-3` distributed implementation of GLM
# set_all_stremr_options(fit.package = "h2o", fit.algorithm = "glm")
# # Use Deep Neural Nets:
# set_all_stremr_options(fit.package = "h2o", fit.algorithm = "deeplearning")
# ## End(Not run)

# ----------------------------------------------------------------------
# Fitting different models with different algorithms
# Fine tuning modeling with optional tuning parameters.
# ----------------------------------------------------------------------
## Not run: 
# params_TRT = list(fit.package = "h2o", fit.algorithm = "gbm", ntrees = 50,
#     learn_rate = 0.05, sample_rate = 0.8, col_sample_rate = 0.8,
#     balance_classes = TRUE)
# params_CENS = list(fit.package = "speedglm", fit.algorithm = "glm")
# params_MONITOR = list(fit.package = "speedglm", fit.algorithm = "glm")
# OData <- fitPropensity(OData,
#             gform_CENS = gform_CENS, stratify_CENS = stratify_CENS, params_CENS = params_CENS,
#             gform_TRT = gform_TRT, params_TRT = params_TRT,
#             gform_MONITOR = gform_MONITOR, params_MONITOR = params_MONITOR)
# ## End(Not run)

# ----------------------------------------------------------------------
# Running TMLE based on the previous fit of the propensity scores.
# Also applying Random Forest to estimate the sequential outcome model
# ----------------------------------------------------------------------
## Not run: 
# t.surv <- c(0:5)
# Qforms <- rep.int("Q.kplus1 ~ CVD + highA1c + N + lastNat1 + TI + TI.tminus1", (max(t.surv)+1))
# params_Q = list(fit.package = "h2o", fit.algorithm = "randomForest",
#                 ntrees = 100, learn_rate = 0.05, sample_rate = 0.8,
#                 col_sample_rate = 0.8, balance_classes = TRUE)
# tmle_est <- fitTMLE(OData, t_periods = t.surv, intervened_TRT = "TI.set1",
#             Qforms = Qforms, params_Q = params_Q,
#             stratifyQ_by_rule = TRUE)
# ## End(Not run)

## Not run: 
# t.surv <- c(0:5)
# Qforms <- rep.int("Q.kplus1 ~ CVD + highA1c + N + lastNat1 + TI + TI.tminus1", (max(t.surv)+1))
# params_Q = list(fit.package = "h2o", fit.algorithm = "randomForest",
#                 ntrees = 100, learn_rate = 0.05, sample_rate = 0.8,
#                 col_sample_rate = 0.8, balance_classes = TRUE)
# tmle_est <- fitTMLE(OData, t_periods = t.surv, intervened_TRT = "TI.set1",
#             Qforms = Qforms, params_Q = params_Q,
#             stratifyQ_by_rule = FALSE)
# ## End(Not run)

Run the code above in your browser using DataLab