# \donttest{
# Loads some data (fictional Stroke data from the package 'causalPAF')
# In this example, we use a small data set called 'strokedata_smallSample' consisting of 5,000
# rows of fictional patient data. For more accurate results, a larger data set is available
# called 'strokedata'which contains 16,623 rows of fictional patient data. The methodology
# applied in the 'causalPAF' package is more accurate the larger the dataset. To use the larger
# 'strokedata' dataset, simply call
# stroke_reduced <- strokedata
stroke_reduced <- strokedata_smallSample
# Just shortening the name of a variable, "apob_apoa", to "apb" so the R code
# in document example is not truncated.
stroke_reduced$apb <- stroke_reduced$apob_apoa
# The data should contain a column of weights for case control matching.
# strokedata$weights
# Weigths are not needed for cohort/cross sectional designs.
# The data should have reference levels of all risk factors already set.
# This can be done as follows but has already been applied to the data so is not run here:
# levels(stroke_reduced$htnadmbp) <- c(0, 1)
# stroke_reduced$subhtn <- factor(stroke_reduced$subhtn,levels=c(1, 2))
# levels(stroke_reduced$nevfcur) <- c(1, 2)
# stroke_reduced$global_stress2 <- factor(stroke_reduced$global_stress2,levels=c(1,2))
# levels(stroke_reduced$whrs2tert) <- c(1, 2, 3)
# levels(stroke_reduced$phys) <- c(2, 1)
# levels(stroke_reduced$alcohfreqwk) <- c(1, 2, 3)
# stroke_reduced$dmhba1c2 <- factor(stroke_reduced$dmhba1c2,levels=c(1,2))
# stroke_reduced$cardiacrfcat <- factor(stroke_reduced$cardiacrfcat,levels=c(1,2))
# levels(stroke_reduced$ahei3tert) <- c(3,2,1)
# levels(stroke_reduced$apob_apoatert) <- c(1,2,3)
# The 'causalPAF' package assumes the data is either complete case data or that missing data
# analysis has already been performed.
# Next, define the causal structure or directed acyclic graph (DAG) of the causal Bayesian
# network defined by the data. We list the parents of each exposure or risk factor or outcome
# in a vector as follows:
# Note it is important that the order at which the variables are defined is such that all
# parents of that variable are defined before it. Please refer to the figure of the causal
# Bayesian network (with both direct and indirect effects) defined earlier as an example of this
# order.
in_phys <- c("subeduc","moteduc","fatduc")
in_ahei <- c("subeduc","moteduc","fatduc")
in_nevfcur <- c("subeduc","moteduc","fatduc")
in_alcohfreqwk <- c("subeduc","moteduc","fatduc")
in_global_stress2 <- c("subeduc","moteduc","fatduc")
in_subhtn <- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
"global_stress2")
in_apob_apoa <- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
"global_stress2")
in_whr <- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
"global_stress2")
# Note splines can be fitted within the causal structure as shown below especially if splines
# are to be used in the fitted models.
# It is important that splines of parent variables are "typed" or "spelt" consistently
# (including spaces) throughout as 'causalPAF' can fit models automatically provided variables are
# spelt consistently. Also if a parent variable is a spline it should be defined in spline
# format in all occurences of the parent variable.
in_cardiacrfcat <- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
"global_stress2",
"ns(apb,knots=quantile(apb,c(.25,.5,.75)),Boundary.knots=quantile(apb,c(.001,.95)))",
"ns(whr,df=5)","subhtn")
in_dmhba1c2 <- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
"global_stress2",
"ns(apb,knots=quantile(apb,c(.25,.5,.75)),Boundary.knots=quantile(apb,c(.001,.95)))",
"ns(whr,df=5)","subhtn")
in_case <- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
"global_stress2",
"ns(apb,knots=quantile(apb,c(.25,.5,.75)),Boundary.knots=quantile(apb,c(.001,.95)))",
"ns(whr,df=5)","subhtn","cardiacrfcat","dmhba1c2")
# Then we define a two dimensional list consisting of
# 1. inlist i.e. a list of the parents of each variable of interest corresponding to its column
# name in the data. Splines should be included here if they are to be modelled as splines.
# 2. outlist i.e. a list of each variable of interest corresponding to its column name in the
# data. Splines should not be input here, only the column names of the variables of interest in
# the data.
# Again the order is such that each variable is defined after all its parents.
in_out <- list(inlist=list(in_phys,in_ahei,in_nevfcur,in_alcohfreqwk,in_global_stress2,
in_subhtn,in_apob_apoa,in_whr,in_cardiacrfcat,in_dmhba1c2,in_case),
outlist=c("phys","ahei3tert","nevfcur","alcohfreqwk","global_stress2","subhtn",
"apb","whr","cardiacrfcat","dmhba1c2","case"))
# If splines are to be used for variables listed in in_out$outlist, then the splines should be
# defined in the same order as variables appear in in_out$outlist as follows. It is necessary to
# list variables in in_out$outlist without splines if no spline is to be applied.
# It is important that Splines_outlist is defined in the following format
# list(c("splinename1","splinename2","splinename3")) for the package to be applied correctly.
# And Splines_outlist should not be an empty list(). If there are no splines it should be
# defined the same as in_out[[2]] and in the same order as variables defined in_out[[2]].
Splines_outlist = list( c("phys","ahei3tert","nevfcur","alcohfreqwk","global_stress2","subhtn",
"ns(apb,knots=quantile(apb,c(.25,.5,.75)),Boundary.knots=quantile(apb,c(.001,.95)))",
"ns(whr,df=5)","cardiacrfcat","dmhba1c2","case") )
# To fit these models to case control data, one needs to perform weighted maximum likelihood
# estimation to imitate estimation using a random sample from the population. We chose weights
# of 0.0035 (for each case) and 0.9965 (for each control), reflective of a yearly incidence of
# first ischemic stroke of 0.35%, or 3.5 strokes per 1,000 individuals. These weights were
# chosen according to average incidences across country, age, group and gender within
# INTERSTROKE according to the global burden of disease.
w <- rep(1,nrow(stroke_reduced))
w[stroke_reduced$case==0] <- 0.9965
w[stroke_reduced$case==1] <- 0.0035
# It is important to assign stroke_reduced$weights to the updated weights defined in w.
# Otherwise if stroke_reduced$weights <- w is not set, the alternative weights supplied in the
# fictional data will be used. In this case, we want to use weigths as defined in w.
stroke_reduced$weights <- w
#The checkMarkovDAG() function in the 'causalPAF' package should be used before running
# causalPAFplot() to ensure:
#1. The causal Markov condition holds for the causal structure defined in the variable in_out.
#2. The variables in in_out are listed in the order so that no variable is defined before a
# parent or direct cause. Note: if this order does not hold, checkMarkovDAG() will automatically
# reorder the variables in, in_out, provided it is a Markov DAG.
#The causal analysis requires that the causal structure is a Markov DAG. The Causal Markov (CM)
# condition states that, conditional on the set of all its direct causes, a node is independent
# of all variables which are not direct causes or direct effects of that node. In the event that
# the structure of a Bayesian network accurately depicts causality, the two conditions are
# equivalent. However, a network may accurately embody the Markov condition without depicting
# causality, in which case it should not be assumed to embody the causal Markov condition.
# in_out is as defined above and input into this code.
if(checkMarkovDAG(in_out)$IsMarkovDAG & !checkMarkovDAG(in_out)$Reordered){
print("Your in_out DAG is a Markov DAG.")
} else if( checkMarkovDAG(in_out)$IsMarkovDAG & checkMarkovDAG(in_out)$Reordered ) {
in_out <- checkMarkovDAG(in_out)[[2]]
print("Your in_out DAG is a Markov DAG.The checkMarkovDAG function has reordered your
in_out list so that all parent variables come before descendants.")
} else{ print("Your ``in_out'' list is not a Bayesian Markov DAG so the methods in the
'causalPAF' package cannot be applied for non Markov DAGs.")}
# The pointEstimate() function evaluates Point Estimates for Total PAF, Direct PAF, Indirect PAF
# and Path Specific PAF for a user inputted number of integral simulations. There is no bootstrap
# applied in this fucntion.
# Since bootstraps are not applied, the pointEstimate() function will run quicker than the
# alternative causalPAFplot() function which calculates bootstrap estimates which can take
# longer to run.
pointEstimate(dataframe = stroke_reduced,
exposure="phys",
mediator=c("subhtn","apb","whr"),
response="case",
response_model_mediators = list(),
response_model_exposure = list(),
in_outArg = in_out,
Splines_outlist = Splines_outlist,
splinesDefinedIn_in_outDAG = TRUE,
model_listArg = list(),
weights = w,
NumSimulation = 3,
addCustom = TRUE,
custom = "regionnn7*ns(eage,df=5)+esex*ns(eage,df=5)")
# The causalPAFplot() function will perform Pathway Specific Population Attributable Fraction
# (PSPAF) calculations and output results based on an exposure, mediators and response input
# by the user according to the columns names of these variables defined in the dataframe.
# Setting model_listArg, response_model_mediators and response_model_exposure by default to an
# empty list will instruct the 'causalPAF' package to fit these models automatically based on the
# causal DAG supplied in the in _outArg. Alternatively the user can supply their custom fitted,
# model_listpop, response_model_mediators and response_model_exposure which should be consistent
# with the causal structure.
# Note we fit a custom interaction for the outcome (or case or response) regression
# ( custom = "regionnn7*ns(eage,df=5)+esex*ns(eage,df=5)") ). Care should be taken that the
# customised regression should not contain variables that might affect the causal interpretation
# of the regression e.g. in this case we have used baseline confounders (i.e. regionn, eage and
# esex) with interactions and splines. In general, using baseline confounders in custom should
# not affect any causal interpretations whereas using variables far ``downstream'' might block
# causal pathways. The user is required to apply discretion in using ``addCustom'' and
# ``Custom'' in ensuring a causal interpretation remains. If no customisation is required the
# user can input addCustom = FALSE and custom = "" which is the default setting.
# Finally we call the causalPAFplot function for the pathway specific PAF calculations as
# follows:
# For greater accuracy a larger number of bootstraps (e.g. 200) and larger number of simulations
# (e.g. 1000) should be run. However, this will increase the run time greatly.
causalPAFplot(dataframe = stroke_reduced,
exposure="phys",
mediator=c("subhtn","apb","whr"),
response="case",
response_model_mediators = list(),
response_model_exposure = list(),
in_outArg = in_out,
Splines_outlist = Splines_outlist,
splinesDefinedIn_in_outDAG = TRUE,
model_listArg = list(),
weights = w,
NumBootstrap = 2,
NumSimulation = 2,
plot = "bar",
fill= "skyblue",
colour="orange",
addCustom = TRUE,
custom = "regionnn7*ns(eage,df=5)+esex*ns(eage,df=5)")
# The causalPAFplot function below has response_model_mediators, response_model_exposure and
# model_listArg pre fit. This allows the user to apply customised regressions instead of the
# default setting above, where the 'causalPAF' R package fitted these regressions automatically
# based on the causalDAG defined in in_outArg.
# Libraries must be loaded if fitting models outside of the 'causalPAF' R package.
library(MASS)
library(splines)
# Next we fit the, response_model_mediators and response_model_exposure, models outside of the
# 'causalPAF' package as an input into the package.
# It is important that response_vs_mediator is a list and it must be the same length as the
# parameter, mediator, i.e. length( response_vs_mediator ) == length( mediator). In this
# example, mediator=c("subhtn","apb","whr") so length( mediator) is 3, so we create a list
# with three models for "subhtn","apb" and "whr" respectively in that order. Note in this
# example, the model is the same for each mediator, but it must still be input 3 times within
# the list as follows:
response_vs_mediator <- list(
glm("case ~ regionnn7*ns(eage,df=5)+esex*ns(eage,df=5) +subeduc+moteduc+ fatduc+ phys+
ahei3tert+ nevfcur+ alcohfreqwk+ global_stress2+ subhtn +
ns(apb, knots = quantile(apb,c(.25,0.5,0.75)),
Boundary.knots = quantile(apb,c(.001,0.95)))+
ns(whr,df=5)",data = stroke_reduced,family='binomial',w = stroke_reduced$weights ),
# "subhtn" mediator model
glm("case ~ regionnn7*ns(eage,df=5)+esex*ns(eage,df=5) + subeduc+ moteduc+ fatduc+ phys+
ahei3tert+ nevfcur+ alcohfreqwk+ global_stress2+ subhtn +
ns(apb, knots = quantile(apb,c(.25,0.5,0.75)),
Boundary.knots = quantile(apb,c(.001,0.95)))+
ns(whr,df=5)",data = stroke_reduced,family='binomial',w = stroke_reduced$weights ),
# "apob_apoa" mediator model name shoretd to "apb"
glm("case ~ regionnn7*ns(eage,df=5)+esex*ns(eage,df=5) + subeduc+ moteduc+ fatduc+ phys+
ahei3tert+ nevfcur+ alcohfreqwk+ global_stress2+ subhtn +
ns(apb, knots = quantile(apb,c(.25,0.5,0.75)),
Boundary.knots = quantile(apb,c(.001,0.95)))+
ns(whr,df=5)",data = stroke_reduced,family='binomial',w = stroke_reduced$weights ) )
# "whr" mediator model
# Next we fit a customised response_model_exposure model rather than allowing the package fit it
# automatically as shown previously. This must be a list of length 1.
response_vs_phys <- list(glm("case ~ regionnn7*ns(eage,df=5)+esex*ns(eage,df=5)+subeduc+moteduc+
fatduc+ phys+ ahei3tert+ nevfcur+ alcohfreqwk+ global_stress2",data = stroke_reduced,
family='binomial',w= stroke_reduced$weights) )
# model_listArg is a list of models fitted for each of the variables in in_out$outlist based on
# its parents given in in_out$inlist. By default this is set to an empty list. Alternatively the
# user can supply their custom fitted, model_listpop, which should be consistent with the causal
# structure. model_listArg is defined earlier in this tutorial.
# Note it is important that model_listArg is defined as a list and in the same order and length
# as the variables defined in in_outArg[[2]].
model_listArgFit <- list(glm(formula = phys ~ subeduc + regionnn7 * ns(eage, df = 5) +
esex * ns(eage, df = 5) + moteduc + fatduc, family = "binomial", data = stroke_reduced,
weights = weights), # model 1 phys
polr(formula = ahei3tert ~ subeduc + regionnn7 * ns(eage, df = 5) + esex * ns(eage, df = 5) +
moteduc + fatduc, data = stroke_reduced, weights = weights), # model 2 ahei3tert
glm(formula = nevfcur ~ subeduc + regionnn7 * ns(eage, df = 5) + esex * ns(eage, df = 5) +
moteduc + fatduc, family = "binomial",data = stroke_reduced, weights = weights),
# model 3 nevfcur
polr(formula = alcohfreqwk ~ subeduc + regionnn7 * ns(eage, df = 5) +esex * ns(eage, df = 5) +
moteduc + fatduc, data = stroke_reduced,weights = weights), # model 4 alcohfreqwk
glm(formula = global_stress2 ~ subeduc + regionnn7 * ns(eage,df = 5) + esex * ns(eage, df = 5) +
moteduc + fatduc, family = "binomial",data = stroke_reduced, weights = weights),
# model 5 global_stress2
glm(formula = subhtn ~ subeduc + regionnn7 * ns(eage, df = 5) +esex * ns(eage, df = 5) +
moteduc + fatduc + phys + ahei3tert +nevfcur + alcohfreqwk + global_stress2,family = "binomial",
data = stroke_reduced, weights = weights), # model 6 subhtn
lm(formula = apb ~ subeduc + regionnn7 * ns(eage, df = 5) +esex * ns(eage, df = 5) +
moteduc + fatduc + phys + ahei3tert +nevfcur + alcohfreqwk + global_stress2,
data = stroke_reduced,weights = weights), # model 7 apob_apoa name shorted to "apb"
lm(formula = whr ~ subeduc + regionnn7 * ns(eage, df = 5) + esex *ns(eage, df = 5) + moteduc +
fatduc + phys + ahei3tert +nevfcur + alcohfreqwk + global_stress2, data = stroke_reduced,
weights = weights), # model 8 whr
glm(formula = cardiacrfcat ~ subeduc + regionnn7 * ns(eage, df = 5) +esex * ns(eage, df = 5) +
moteduc + fatduc + phys + ahei3tert +nevfcur + alcohfreqwk + global_stress2 +
ns(apb, knots = quantile(apb,c(0.25, 0.5, 0.75)),
Boundary.knots = quantile(apb,c(0.001, 0.95))) + ns(whr, df = 5) + subhtn,
family = "binomial",data = stroke_reduced, weights = weights), # model 9 cardiacrfcat
glm(formula = dmhba1c2 ~ subeduc + regionnn7 * ns(eage, df = 5) +esex * ns(eage, df = 5) +
moteduc + fatduc + phys + ahei3tert +nevfcur + alcohfreqwk + global_stress2 +
ns(apb, knots = quantile(apb,c(0.25, 0.5, 0.75)),
Boundary.knots = quantile(apb,c(0.001, 0.95))) + ns(whr, df = 5) + subhtn,
family = "binomial",data = stroke_reduced, weights = weights), # model 10 dmhba1c2
glm(formula = case ~ subeduc + regionnn7 * ns(eage, df = 5) +esex * ns(eage, df = 5) + moteduc +
fatduc + phys + ahei3tert +nevfcur + alcohfreqwk + global_stress2 +
ns(apb, knots = quantile(apb,c(0.25, 0.5, 0.75)),
Boundary.knots = quantile(apb,c(0.001, 0.95))) + ns(whr, df = 5) + subhtn +
cardiacrfcat +dmhba1c2, family = "binomial", data = stroke_reduced, weights = weights)
# model 11 case
)
# For greater accuracy a larger number of bootstraps (e.g. 200) and larger number of simulations
# (e.g. 1000) should be run. However, this will increase the run time greatly.
causalPAFplot(dataframe = stroke_reduced,
exposure="phys",
mediator=c("subhtn","apb","whr"),
response="case",
response_model_mediators = response_vs_mediator,
response_model_exposure = response_vs_phys,
in_outArg = in_out,
Splines_outlist = Splines_outlist,
splinesDefinedIn_in_outDAG = TRUE,
model_listArg = model_listArgFit,
weights = w,
NumBootstrap = 2,
NumSimulation = 2,
plot = "bar",
fill= "skyblue",
colour ="orange" )
# }
Run the code above in your browser using DataLab