## ------------------------------------------------------------
## toy example - needed to pass CRAN test
## ------------------------------------------------------------
## mtcars regression: mpg is the response and every other column of
## mtcars is offered as a predictor (the "." on the formula's right side).
## This is the only example placed before the \donttest marker below.
## NOTE(review): ntree = 1 presumably requests a single tree so this
## call stays fast - confirm against the varpro documentation.
o <- varpro(mpg ~ ., mtcars, ntree = 1)
# \donttest{
## ------------------------------------------------------------
## classification example: iris
## ------------------------------------------------------------
## apply varpro to the iris data: Species is the response, all
## remaining iris columns are predictors
## NOTE(review): max.tree = 5 presumably limits how many trees are used
## by the procedure - confirm against the varpro documentation.
o <- varpro(Species ~ ., iris, max.tree = 5)
## call the importance function on the fitted object and print the results
print(importance(o))
## ------------------------------------------------------------
## regression example: boston housing
## ------------------------------------------------------------
## BostonHousing ships with the mlbench package
data(BostonHousing, package = "mlbench")
## fit varpro with medv as the response and all other columns as features
o <- varpro(medv ~ ., BostonHousing)
## keep the importance values in imp and show them in one step
print(imp <- importance(o))
## the same information through the get.vimp accessor,
## first with the default output, then with pretty = FALSE
print(get.vimp(o))
print(get.vimp(o, pretty = FALSE))
## importance again, this time asking for the plot as well
importance(o, plot.it = TRUE)
## ------------------------------------------------------------
## regression example: boston housing illustrating hot-encoding
## ------------------------------------------------------------
## BostonHousing ships with the mlbench package
data(BostonHousing, package = "mlbench")
## work on a copy in which five features are turned into factors;
## lstat, nox and rm are rescaled and rounded first so that they
## collapse to a small number of levels
Boston <- transform(
  BostonHousing,
  zn    = factor(zn),
  chas  = factor(chas),
  lstat = factor(round(0.2 * lstat)),
  nox   = factor(round(20 * nox)),
  rm    = factor(round(rm))
)
## fit varpro on the factor-heavy data, then print the importance
o <- varpro(medv ~ ., Boston)
print(importance(o))
## top variables selected from the fit
get.topvars(o)
## importance mapped back onto the original (pre-encoding) features
print(get.orgvimp(o))
## the same mapping, requested with pretty = FALSE (all variables)
print(get.orgvimp(o, pretty = FALSE))
## ------------------------------------------------------------
## regression example: friedman 1
## ------------------------------------------------------------
## simulate 1000 observations with mlbench.friedman1, coerce the result
## to a data frame and regress y on all remaining columns
o <- varpro(y~., data.frame(mlbench::mlbench.friedman1(1000)))
print(importance(o))
## ------------------------------------------------------------
## example without guided tree-splitting
## ------------------------------------------------------------
## same pattern on mlbench.friedman2 data; split.weight = FALSE is what
## switches the guided splitting off, nodesize is set to 10
o <- varpro(y~., data.frame(mlbench::mlbench.friedman2(1000)),
nodesize = 10, split.weight = FALSE)
print(importance(o))
## ------------------------------------------------------------
## regression example: all noise
## ------------------------------------------------------------
## 100 x 50 block of independent standard normal features
x <- rnorm(100 * 50)
dim(x) <- c(100, 50)
## response drawn independently of every feature
y <- rnorm(100)
## fit on pure noise and print the resulting importance
o <- varpro(y ~ ., data.frame(y = y, x = x))
print(importance(o))
## ------------------------------------------------------------
## multivariate regression example: boston housing
## ------------------------------------------------------------
## two responses (lstat and nox) are modelled jointly; all other
## BostonHousing columns are predictors
data(BostonHousing, package = "mlbench")
## using rfsrc multivariate formula call
## (no print(): the value is auto-printed when run at top level)
importance(varpro(Multivar(lstat, nox) ~., BostonHousing))
## using cbind multivariate formula call (same two responses)
importance(varpro(cbind(lstat, nox) ~., BostonHousing))
##----------------------------------------------------------------
## class imbalanced problem
##
## - simulation example using the caret R-package
## - creates imbalanced data by randomly sampling the class 1 values
##
##----------------------------------------------------------------
## caret is optional: test for it without attaching it to the search
## path (and without the warning library() raises when it is missing),
## then reach its simulator through caret::
if (requireNamespace("caret", quietly = TRUE)) {
  ## experimental settings
  n <- 5000   ## rows simulated before down-sampling
  q <- 20     ## number of noise variables
  ir <- 6     ## class 1 is reduced to 1/ir of its simulated size
  f <- as.formula(Class ~ .)
  ## simulate the data and recode the two-level Class factor as 0/1
  d <- caret::twoClassSim(n, linearVars = 15, noiseVars = q)
  d$Class <- factor(as.numeric(d$Class) - 1)
  ## create the minority class: keep every class 0 row and a random
  ## subset of the class 1 rows
  idx.0 <- which(d$Class == 0)
  idx.1 <- which(d$Class == 1)
  ## sample positions rather than values so a length-one idx.1 is not
  ## reinterpreted by sample() as the range 1:idx.1; floor() makes the
  ## integer sample size explicit instead of relying on truncation
  idx.1 <- idx.1[sample.int(length(idx.1), floor(length(idx.1) / ir))]
  d <- d[c(idx.0, idx.1), , drop = FALSE]
  ## shuffle the rows; sample.int() is also safe for a zero-row frame,
  ## where 1:nrow(d) would yield c(1, 0)
  d <- d[sample.int(nrow(d)), , drop = FALSE]
  ## varpro call
  print(importance(varpro(f, d)))
}
## ------------------------------------------------------------
## survival example: pbc
## ------------------------------------------------------------
## pbc comes from randomForestSRC; the response is the survival pair
## (days, status), all remaining columns are predictors
data(pbc, package = "randomForestSRC")
o <- varpro(Surv(days, status)~., pbc)
print(importance(o))
## ------------------------------------------------------------
## pbc survival with rmst (restricted mean survival time)
## functional of interest is RMST at 500 days
## ------------------------------------------------------------
## same data and formula as above (reloaded so the section is
## self-contained); only the rmst argument is added
data(pbc, package = "randomForestSRC")
o <- varpro(Surv(days, status)~., pbc, rmst = 500)
print(importance(o))
## ------------------------------------------------------------
## pbc survival with rmst vector
## variable importance is a list for each rmst value
## ------------------------------------------------------------
## two horizons are requested at once: 500 and 1000 days
data(pbc, package = "randomForestSRC")
o <- varpro(Surv(days, status)~., pbc, rmst = c(500, 1000))
print(importance(o))
## ------------------------------------------------------------
## survival example with more variables
## ------------------------------------------------------------
## peakVO2 comes from randomForestSRC; response is (ttodead, died)
data(peakVO2, package = "randomForestSRC")
o <- varpro(Surv(ttodead, died)~., peakVO2)
## plot.it = TRUE requests the importance plot; the returned values
## are kept in imp and printed separately
imp <- importance(o, plot.it = TRUE)
print(imp)
## ------------------------------------------------------------
## high dimensional survival example
## ------------------------------------------------------------
## vdv comes from randomForestSRC; response is (Time, Censoring)
data(vdv, package = "randomForestSRC")
o <- varpro(Surv(Time, Censoring)~., vdv)
print(importance(o))
## ------------------------------------------------------------
## high dimensional survival example without sparse option
## ------------------------------------------------------------
## identical call to the previous section except for sparse = FALSE
data(vdv, package = "randomForestSRC")
o <- varpro(Surv(Time, Censoring)~., vdv, sparse = FALSE)
print(importance(o))
## ----------------------------------------------------------------------
## high dimensional survival example using different split-weight methods
## ----------------------------------------------------------------------
data(vdv, package = "randomForestSRC")
f <- as.formula(Surv(Time, Censoring) ~ .)
## fit and print once per split-weight method, in this order:
##   "lasso"            - lasso only
##   "lasso vimp"       - lasso and vimp
##   "lasso vimp tree"  - lasso, vimp and shallow trees
invisible(lapply(
  c("lasso", "lasso vimp", "lasso vimp tree"),
  function(swm) {
    print(importance(varpro(f, vdv, split.weight.method = swm)))
  }
))
## ------------------------------------------------------------
## largish data (iowa housing data)
## to speed up calculations convert data to all real
## ------------------------------------------------------------
## first we roughly impute the data
data(housing, package = "randomForestSRC")
dta <- roughfix(housing)
## data.matrix() turns every column into a numeric one; wrapping the
## result in data.frame() gives varpro an all-numeric data frame
dta <- data.frame(data.matrix(dta))
## varpro call: SalePrice is the response, all other columns predictors
o <- varpro(SalePrice~., dta)
print(importance(o))
## ------------------------------------------------------------
## large data: illustrates different ways to improve speed
## ------------------------------------------------------------
## simulate n rows: a standard normal response y and p standard normal
## features drawn independently of it (so there is no signal to find)
n <- 25000
p <- 50
d <- data.frame(y = rnorm(n), x = matrix(rnorm(n * p), n))
## use large nodesize
## (system.time() wraps the fit so its timing is printed; the
## assignment inside it still creates o in the calling environment)
print(system.time(o <- varpro(y~., d, ntree = 100, nodesize = 200)))
print(importance(o))
## use large nodesize, smaller bootstrap
## (same call as above with sampsize = 100 added)
print(system.time(o <- varpro(y~., d, ntree = 100, nodesize = 200,
sampsize = 100)))
print(importance(o))
## ------------------------------------------------------------
## custom split-weights (hidden option)
## ------------------------------------------------------------
## BostonHousing ships with the mlbench package
data(BostonHousing, package = "mlbench")
## copy the data with five features recoded as factors; lstat, nox
## and rm are rescaled and rounded before the conversion
Boston <- transform(
  BostonHousing,
  zn    = factor(zn),
  chas  = factor(chas),
  lstat = factor(round(0.2 * lstat)),
  nox   = factor(round(20 * nox)),
  rm    = factor(round(rm))
)
## starting point: the default custom split-weights, a named real vector
swt <- get.splitweight.custom(medv ~ ., Boston)
## keep only the weights whose name matches at least one of these patterns
swt <- swt[Reduce(`|`, lapply(c("crim", "zn", "nox", "rm", "lstat"),
                              grepl, x = names(swt)))]
## raise every nox and lstat weight to 4
swt <- replace(swt, grepl("nox", names(swt)), 4)
swt <- replace(swt, grepl("lstat", names(swt)), 4)
## append one extra entry under the name "strange"
swt <- c(swt, strange = 99999)
cat("custom split-weight\n")
print(swt)
## fit varpro, handing over the custom split-weights
o <- varpro(medv ~ ., Boston,
            split.weight.custom = swt,
            verbose = TRUE,
            sparse = FALSE)
cat("varpro result\n")
print(importance(o))
print(get.vimp(o, pretty = FALSE))
print(get.orgvimp(o, pretty = FALSE))
# }
## Run the code above in your browser using DataLab