# NOT RUN {
# }
# NOT RUN {
# ----------------------------------------------------------------------------------------- ##
# Mother's stress on child illness:
# Investigate whether mother's stress is (Granger) causal for child illness
# 'hrf' model is fit using previous observations of mother's stress to predict
# child's illness at given time point, but not mother's stress at that time point
#
# Predictor variables are classified into "historical" and "concurrent"
#
# A predictor is "historical" if its prior realizations can be used to predict
# the outcome.
#
# A predictor is "concurrent" if its realization at the same timepoint as the outcome
# can be used to predict the outcome at that timepoint
#
# A predictor can be both "concurrent" and "historical", the status of the predictors
# can be set by the 'vh' and 'vc' arguments of 'hrf'.
# (if not set these are automatically determined)
#
# ------------------------------------------------------------------------------------------- ##
data(mscm)
mscm=as.data.frame(na.omit(mscm))
# -- set concurrent and historical predictors
historical_predictors=match(c("stress","illness"),names(mscm))
concurrent_predictors=which(names(mscm)!="stress")
control=list(vh=historical_predictors,vc=concurrent_predictors)
# -- fit historical random forest
# (NOTE: response is 0/1 so a regression tree is
# the same as a classification tree with Gini-index splitting)
ff=hrf(x=mscm,id=mscm$id,time=mscm$day,yindx="illness",control=control)
# out-of-bag error
plot(1:length(ff$error),ff$error,type="l",main="OOB error",xlab="forest size",ylab="mse")
# .. larger nodesize works slightly better
control$nodesize=20
ff=hrf(x=mscm,id=mscm$id,time=mscm$day,yindx="illness",control=control)
points(1:length(ff$error),ff$error,type="l",col="blue")
# -- variable importance table
vi=varimp_hrf(ff)
vi
# -- fit historical random forest with 'se=TRUE'
control$se=TRUE
ff=hrf(x=mscm,id=mscm$id,time=mscm$day,yindx="illness",control=control)
# -- partial dependence for top 4 predictors (with +/-2 SE estimates)
par(mfrow=c(2,2))
for(k in 1:4)
pd=partdep_hrf(ff,xindx=as.character(vi$Predictor[k]))
par(mfrow=c(1,1))
## -- Classification trees
## setting classify=TRUE builds classification tree (gini-impurity node splitting)
control$classify=TRUE
## ... standard error estimation not implemented .. turn off bootstrapping
control$se=FALSE
ff=hrf(x=mscm,id=mscm$id,time=mscm$day,yindx="illness",control=control)
# -- plot oob classification error
plot(1:length(ff$error),ff$error,type="l",xlab="forest size",ylab="oob classification error")
abline(mean(mscm$illness),0,lty=2) ## error of constant model
p=predict_hrf(ff)
## variable importance table (model error measured by gini-impurity)
vi=varimp_hrf(ff)
vi
# -------------------------------- #
# Data w/irregular observation times
# ------------------------------- #
data(cd4)
control=list(se=TRUE)
ff=hrf(x=cd4,id=cd4$id,time=cd4$time,yindx="count",control=control)
vi=varimp_hrf(ff)
# -- partial dependence for top 4 predictors (with +/-2 SE estimates)
par(mfrow=c(2,2))
for(k in 1:4)
pd=partdep_hrf(ff,xindx=as.character(vi$Predictor[k]))
par(mfrow=c(1,1))
plot(1:length(ff$error),ff$error,xlab="forest size",ylab="oob mse",type="l")
## by default, the number of delta values (parameter 'eta_1' above) is 20
## can set this using 'ndelta'
control$ndelta=50
control$se=FALSE # -- turning off bootstrapping ..
ff=hrf(x=cd4,id=cd4$id,time=cd4$time,yindx="count",control=control)
points(1:length(ff$error),ff$error,type="l",lty=2)
# the grid of delta values
ff$control$delta
# --------------------------------------- ##
# Boston Housing data (not longitudinal)
# --------------------------------------- ##
# library(htree)
library(mlbench)
library(randomForest)
data(BostonHousing)
dat=as.data.frame(na.omit(BostonHousing))
## omitting arguments time/id assumes rows are iid
control=list(ntrees=500,sample_fraction=.5,nodesize=1)
h=hrf(x=dat,yindx="medv",control=control)
## randomForest comparison
## (by default, randomForest samples with replacement, while hrf samples without)
r=randomForest(medv~.,data=dat,replace=F,sampsize=ceiling(.5*nrow(dat)),nodesize=1)
## plot oob-error for both
plot(1:length(r$mse),r$mse,type="l",ylim=c(min(r$mse,h$error),max(r$mse,h$error)),
main="BostonHousing",xlab="forest size",ylab="out-of-bag mse")
points(1:length(h$error),h$error,type="l",col="blue")
## -- variable importance table
vi=varimp_hrf(h)
vi
## -- partial dependence plots with approximate 95<!-- % C.I -->
control$se=TRUE
h=hrf(x=dat,yindx="medv",control=control)
par(mfrow=c(2,2))
for(k in 1:4)
pd=partdep_hrf(h,xindx=as.character(vi$Predictor[k]))
par(mfrow=c(1,1))
# }
Run the code above in your browser using DataLab