## Load package:
library("diversityForest")
## Set seed to make results reproducible:
set.seed(1234)
## Load the "ctg" data set:
data(ctg)
## Construct a multi forest:
model <- multifor(dependent.variable.name = "CLASS", data = ctg,
                  num.trees = 20)
# NOTE: num.trees = 20 as used above would be much too small for practical
# purposes. This small number of trees is used only to keep the runtime of
# the example short.
# The default is num.trees = 5000 for datasets with at most 5000 observations
# and num.trees = 1000 for larger datasets.
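## Expressed as code, that default rule corresponds to the following (a sketch
## of the rule stated above, not an internal function of the package):
num.trees.default <- if (nrow(ctg) <= 5000) 5000 else 1000
num.trees.default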
## The out-of-bag estimated Brier score (note that by default
## 'probability = TRUE' is used in 'multifor'):
model$prediction.error
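## For context, a sketch (not part of the original example): the Brier score of
## a naive baseline that always predicts the observed class frequencies. Note
## that Brier score conventions differ slightly (e.g., summing vs. averaging
## over classes), so this value may not be exactly comparable to the one above:
p.hat <- prop.table(table(ctg$CLASS))
y.mat <- sapply(names(p.hat), function(cl) as.numeric(ctg$CLASS == cl))
p.mat <- matrix(p.hat, nrow = nrow(ctg), ncol = length(p.hat), byrow = TRUE)
mean(rowSums((p.mat - y.mat)^2))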
## Inspect the multi-class and the discriminatory VIM values:
model$var.imp.multiclass
# --> Note that some variables have no multi-class VIM values. These are the
# variables with fewer unique values than there are outcome classes.
# See the "Details" section above.
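## A sketch (assuming variables without a multi-class VIM value are coded as
## NA in 'var.imp.multiclass'), listing the affected variables:
names(model$var.imp.multiclass)[is.na(model$var.imp.multiclass)]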
model$var.imp.discr
## Inspect the 5 variables with the largest multi-class VIM values and the
## 5 variables with the largest discriminatory VIM values:
sort(model$var.imp.multiclass, decreasing = TRUE)[1:5]
sort(model$var.imp.discr, decreasing = TRUE)[1:5]
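## Optionally, a quick base-graphics sketch of the top 5 multi-class VIM
## values (the dedicated 'plot.multifor' function mentioned below is the
## recommended visualization tool):
vim.top <- sort(model$var.imp.multiclass, decreasing = TRUE)[1:5]
barplot(vim.top, las = 2, main = "Top 5 multi-class VIM values")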
## Instead of passing the name of the outcome variable through the
## 'dependent.variable.name' argument as above, the formula interface can also
## be used. Below, we fit a multi forest with only the first five variables
## from the 'ctg' data set:
model <- multifor(CLASS ~ b + e + LBE + LB + AC, data = ctg, num.trees = 20)
## As expected, the out-of-bag estimated prediction error is much larger
## for this model:
model$prediction.error
## NOTE: Visual exploration of the results of the multi-class VIM analysis
## is crucial.
## Therefore, in practice the next step would be to apply the
## 'plot.multifor' function to the object 'model'.
# plot(model)
## Prediction:
# Randomly split the 'ctg' data set into training
# and test data:
data(ctg)
train.idx <- sample(nrow(ctg), floor(2/3 * nrow(ctg)))
ctg.train <- ctg[train.idx, ]
ctg.test <- ctg[-train.idx, ]
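# Optional sketch (not used below): with an imbalanced outcome such as
# 'CLASS', a stratified split keeps the class proportions similar in the
# training and test data:
train.idx.strat <- unlist(lapply(split(seq_len(nrow(ctg)), ctg$CLASS),
                                 function(idx) sample(idx, floor(2/3 * length(idx)))))
ctg.train.strat <- ctg[train.idx.strat, ]
ctg.test.strat <- ctg[-train.idx.strat, ]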
# Construct multi forest on training data:
# NOTE again: num.trees = 20 is much too small for practical purposes.
model_train <- multifor(dependent.variable.name = "CLASS", data = ctg.train,
importance = "none", probability = FALSE,
num.trees = 20)
# NOTE: Because we are only interested in prediction here, we skip the
# calculation of VIM values by setting importance = "none", which speeds
# up the computations.
# NOTE also: Because we want class label predictions here rather than class
# probability predictions, we specified 'probability = FALSE' above.
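# A sketch of how the speed-up from 'importance = "none"' could be checked
# (timings vary by machine; the second call uses the default importance
# setting and therefore computes the VIM values):
system.time(multifor(dependent.variable.name = "CLASS", data = ctg.train,
                     importance = "none", probability = FALSE, num.trees = 20))
system.time(multifor(dependent.variable.name = "CLASS", data = ctg.train,
                     probability = FALSE, num.trees = 20))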
# Predict class values of the test data:
pred.ctg <- predict(model_train, data = ctg.test)
# Compare predicted and true class values of the test data:
table(ctg.test$CLASS, pred.ctg$predictions)
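# A one-line summary (sketch): the test-set misclassification rate.
mean(pred.ctg$predictions != ctg.test$CLASS)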
## Repeat the analysis for class probability prediction
## (default 'probability = TRUE'):
model_train <- multifor(dependent.variable.name = "CLASS", data = ctg.train,
importance = "none", num.trees = 20)
# Predict class probabilities in the test data:
pred.ctg <- predict(model_train, data = ctg.test)
# The predictions are now a matrix of class probabilities:
head(pred.ctg$predictions)
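# A sketch of the test-set multi-class Brier score, assuming (as in 'ranger')
# that the columns of 'pred.ctg$predictions' are named after the outcome
# classes:
cls <- levels(ctg.test$CLASS)
y.test <- sapply(cls, function(cl) as.numeric(ctg.test$CLASS == cl))
mean(rowSums((pred.ctg$predictions[, cls] - y.test)^2))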
# Obtain class predictions by choosing the classes with the maximum predicted
# probabilities (the function 'which.is.max' chooses one class randomly if
# there are several classes with maximum probability):
library("nnet")
classes <- levels(ctg.train$CLASS)
pred_classes <- factor(classes[apply(pred.ctg$predictions, 1, which.is.max)],
                       levels = classes)
# Compare predicted and true class values of the test data:
table(ctg.test$CLASS, pred_classes)
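# A sketch: per-class recall, i.e., the diagonal of the row-normalized
# confusion table.
conf <- table(ctg.test$CLASS, pred_classes)
diag(prop.table(conf, margin = 1))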