# ------------------------------------------------------------------------------ #
# Mother's stress and child illness:
# Investigate whether mother's stress is (Granger) causal of child illness.
# The 'htb' model is fit using previous observations of mother's stress to predict
# the child's illness at a given time point, but not mother's stress at that
# time point.
#
# Predictor variables are classified as "historical" and/or "concurrent".
#
# A predictor is "historical" if its prior realizations can be used to predict
# the outcome.
#
# A predictor is "concurrent" if its realization at the same time point as the
# outcome can be used to predict the outcome at that time point.
#
# A predictor can be both "concurrent" and "historical"; the status of each
# predictor is set via the 'vh' and 'vc' elements of the 'control' argument
# to 'htb' (if not set, these are determined automatically).
# ------------------------------------------------------------------------------ #
data(mscm)
mscm=as.data.frame(na.omit(mscm))
# -- set concurrent and historical predictors
historical_predictors=match(c("illness","stress"),names(mscm))
concurrent_predictors=which(names(mscm)!="stress")
control=list(vh=historical_predictors,vc=concurrent_predictors,
ntrees=200,family="bernoulli",cvfold=10,lambda=.1)
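# -- quick check (base R only): which columns are treated as historical and
# -- which as concurrent under the settings above
names(mscm)[historical_predictors]
names(mscm)[concurrent_predictors]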
# -- boosted tree fit with bernoulli family (logistic loss)
ff=htb(x=mscm,id=mscm$id,time=mscm$day,yindx="illness",control=control)
# cross-validated negative log-likelihood
plot(1:ff$ntrees,ff$cv_error,type="l",
xlab="iterations",ylab="cross-validation error")
# -- variable importance table
vi=varimp_htb(ff)
vi
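# -- the top-ranked rows of the importance table are used below to choose
# -- which partial dependence plots to draw
head(vi,4)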
# -- plot partial dependence (with +/-2 approximate standard errors)
par(mfrow=c(2,2))
for(k in 1:4)
partdep_htb(ff,xindx=as.character(vi$Predictor[k]))
par(mfrow=c(1,1))
# -- Standard errors are based on cross-validation runs (a delete-d jackknife
# -- estimator over subjects, with d = number-of-subjects/cvfold).
# -- Increasing nfold (which defaults to cvfold) gives less noisy
# -- standard error estimates ...
control$nfold=50
ff=htb(x=mscm,id=mscm$id,time=mscm$day,yindx="illness",control=control)
par(mfrow=c(2,2))
for(k in 1:4)
partdep_htb(ff,xindx=as.character(vi$Predictor[k]))
par(mfrow=c(1,1))
# ------------------------------------ #
# Data with irregular observation times
# ------------------------------------ #
data(cd4)
control=list(cvfold=10,lambda=.1,nsplit=3,ntrees=200)
ff=htb(x=cd4,id=cd4$id,time=cd4$time,yindx="count",control=control)
vi=varimp_htb(ff)
# -- partial dependence for top 4 predictors (with +/-2 SE estimates)
par(mfrow=c(2,2))
for(k in 1:4)
pd=partdep_htb(ff,xindx=as.character(vi$Predictor[k]))
par(mfrow=c(1,1))
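# -- inspect the object returned by partdep_htb for the last plotted predictor
# -- (structure only; the exact contents depend on partdep_htb's return value)
str(pd)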
# -- cv error
plot(1:ff$control$ntrees,ff$cv_error,xlab="boosting iteration",
ylab="cross-validated mse",type="l")
# -- boosting iteration selected by cross-validation
abline(v=ff$nboost_cv,lty=2)
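# -- cross-validated error at the selected iteration (base R lookup, assuming
# -- 'nboost_cv' indexes into the 'cv_error' vector plotted above)
ff$cv_error[ff$nboost_cv]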
## By default, the number of delta values (the 'eta_1' parameter described above) is 20;
## this can be set via the 'ndelta' element of 'control'.
control$ndelta=50
ff=htb(x=cd4,id=cd4$id,time=cd4$time,yindx="count",control=control)
points(1:ff$control$ntrees,ff$cv_error,type="l",lty=2)
## note: differences in cv_error can be due (in part) to randomness
## in out-of-sample selection
# the grid of delta values
ff$control$delta
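# -- the grid length should match the 'ndelta' setting above (base R check)
length(ff$control$delta)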
# ------------------------------------------ #
# Boston Housing data (not longitudinal)
# ------------------------------------------ #
# library(htree)
library(mlbench)
data(BostonHousing)
dat=as.data.frame(na.omit(BostonHousing))
# omitting arguments 'time' and 'id' assumes rows are iid
control=list(cvfold=10,ntrees=500)
h=htb(x=dat,yindx="medv",control=control)
# -- plot cross-validated mean-squared error
plot(1:h$ntrees,h$cv_error,type="l",xlab="Boosting iterations",
ylab="MSE",main="Cross-validated Mean-squared error")
# -- variable importance table
vi=varimp_htb(h,nperm=20)
vi
# -- partial dependence of top 4 predictors with +/-2 S.E.
par(mfrow=c(2,2))
for(k in 1:4)
partdep_htb(h,xindx=as.character(vi$Predictor[k]))
par(mfrow=c(1,1))
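# -- iteration with the smallest cross-validated MSE, and its value (base R)
which.min(h$cv_error)
min(h$cv_error)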