## Not run:
## NOTE: use the option 'threads = 1' (disabling parallel processing) to speed up computation
## on small samples, since parallel processing is only useful for computationally
## intensive tasks
###### PART ONE : QUICK GUIDE
#### Classification
# data(iris)
# iris.ruf <- randomUniformForest(Species ~ ., data = iris, threads = 1)
## MODEL, PARAMETERS, STATISTICS:
# iris.ruf ## or print(iris.ruf)
## plot OOB error:
# plot(iris.ruf, threads = 1)
## print and plot (global) variable importance and some statistics about trees:
# summary(iris.ruf)
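## PREDICTION (a minimal sketch, assuming the generic predict() method accepts a data
## frame of the predictor columns): get predicted labels and compare them with the truth
# iris.pred <- predict(iris.ruf, iris[, -which(colnames(iris) == "Species")])
# table(iris.pred, iris$Species)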
#### Regression
## NOTE: when the formula interface is used, rows with missing values are automatically
## removed and dummy variables are built for categorical features
# data(airquality)
# ozone.ruf <- randomUniformForest(Ozone ~ ., data = airquality, threads = 1)
# ozone.ruf
## plot OOB error:
# plot(ozone.ruf, threads = 1)
## BAGGING:
# ozone.bagging.ruf <- randomUniformForest(Ozone ~ ., data = airquality,
# bagging = TRUE, threads = 1)
## Ensemble of totally randomized trees, i.e. a PURELY RANDOM FOREST:
# ozone.prf <- randomUniformForest(Ozone ~ ., data = airquality,
# randomfeature = TRUE, threads = 1)
#### Common case: use X as a matrix or data frame and Y as a response vector
#### Classification : iris data, training and testing
# data(iris)
## define random train and test samples; "Species" is the response vector:
# iris.train_test <- init_values(iris[,-which(colnames(iris) == "Species")], iris$Species,
# sample.size = 1/2)
## iris train and test samples:
# iris.train = iris.train_test$xtrain
# species.train = iris.train_test$ytrain
# iris.test = iris.train_test$xtest
# species.test = iris.train_test$ytest
## iris train and test modelling:
# iris.train_test.ruf <- randomUniformForest(iris.train, species.train,
# xtest = iris.test, ytest = species.test, threads = 1)
## view model and statistics:
# iris.train_test.ruf
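## TEST SET PREDICTIONS (a minimal sketch, assuming predict() returns class labels by
## default): confusion matrix and accuracy on the test sample
# species.pred <- predict(iris.train_test.ruf, iris.test)
# table(species.pred, species.test)
# mean(species.pred == species.test)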
## BALANCED SAMPLING : equal sample size for all labels
# iris.train_test.balancedsampling.ruf <- randomUniformForest(iris.train, species.train,
# xtest = iris.test, ytest = species.test, rebalancedsampling = TRUE, threads = 1)
###### PART TWO : SUMMARIZED CASE STUDIES
#### Classification : Wine Quality data
## http://archive.ics.uci.edu/ml/datasets/Wine+Quality
## We use the red wine quality file: the data have 1599 observations, 12 variables and 6 classes.
# data(wineQualityRed)
# wineQualityRed.data = wineQualityRed
## class and observations
# Y = wineQualityRed.data[,"quality"]
# X = wineQualityRed.data[, -which(colnames(wineQualityRed.data) == "quality")]
## First look: train a model with default parameters (and retrieve estimates);
## call it the standard model.
# wineQualityRed.std.ruf <- randomUniformForest(X, as.factor(Y), threads = 2)
# wineQualityRed.std.ruf
## GLOBAL VARIABLE IMPORTANCE:
# summary(wineQualityRed.std.ruf)
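## check the raw label counts (base R); some classes are rare, which motivates
## the merging below
# table(Y)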
## But some labels do not have enough observations to assess variable importance.
## Merge classes 3 and 4, and classes 7 and 8, to get enough data.
# Y[Y == 3] = 4
# Y[Y == 8] = 7
## make Y a factor, change the level names and get a summary
# Y = as.factor(Y)
# levels(Y) = c("3 or 4", "5", "6", "7 or 8")
# table(Y)
## learn a new model to get a better view of variable importance
## NOTE: Y is now a factor, so the model will treat the learning task as classification
# wineQualityRed.new.ruf <- randomUniformForest(X, Y)
# wineQualityRed.new.ruf
## global variable importance is more consistent
# summary(wineQualityRed.new.ruf)
## plot OOB error (needs some computation)
# plot(wineQualityRed.new.ruf, threads = 2)
## go deeper into assessing variable importance, using a high level of interactions
# importance.wineQualityRed <- importance(wineQualityRed.new.ruf, Xtest = X, maxInteractions = 6)
## VISUALIZING IMPORTANCE: global importance, interactions, importance based on interactions,
## importance based on labels, partial dependencies for all influential variables
## (loop over the prompt to get the other partial dependencies).
## Get more points using the option whichOrder = "all" (the default).
# plot(importance.wineQualityRed, Xtest = X, whichOrder = "first")
## LINKS BETWEEN OBSERVATIONS AND VARIABLES:
# featuresAndObs = as.data.frame(importance.wineQualityRed$localVariableImportance$obs)
# frequencyFeaturesIdx = grep("Frequency", colnames(featuresAndObs))
# featuresNames = apply(featuresAndObs[,-c(1,frequencyFeaturesIdx)], 2,
# function(Z) colnames(X)[Z])
# featuresAndObs[,-c(1,frequencyFeaturesIdx)] = featuresNames
# head(featuresAndObs)
## PARTIAL IMPORTANCE: look at some specific labels from a (very) local viewpoint.
## Which features matter for a very good wine (class "7 or 8")?
# pImportance.wineQualityRed.class7or8 <- partialImportance(X, importance.wineQualityRed,
# whichClass = "7 or 8", nLocalFeatures = 6)
## PARTIAL DEPENDENCIES: how does the response depend on the variables?
## Get them feature by feature, recalling partial dependence
## and considering the feature at the first order, assuming it is the most important one,
## at least for the class one needs to assess.
# pDependence.wineQualityRed.totalSulfurDioxide <- partialDependenceOverResponses(X,
# importance.wineQualityRed, whichFeature = "total.sulfur.dioxide",
# whichOrder = "first", outliersFilter = TRUE)
## see what happens for "alcohol" (ask for more points using the option 'whichOrder = "all"')
# pDependence.wineQualityRed.alcohol <- partialDependenceOverResponses(X,
# importance.wineQualityRed, whichFeature = "alcohol",
# whichOrder = "first", outliersFilter = TRUE)
#### Regression : Auto MPG
## http://archive.ics.uci.edu/ml/datasets/Auto+MPG
## 398 observations, 8 variables.
## Variable to predict: "mpg", miles per gallon.
# data(autoMPG)
# autoMPG.data = autoMPG
# Y = autoMPG.data[,"mpg"]
# X = autoMPG.data[,-which(colnames(autoMPG.data) == "mpg")]
## remove "car name" which is a variable with unique ID (car models)
# X = X[, -which(colnames(X) == "car name")]
## train the default model and get OOB evaluation
# autoMPG.ruf <- randomUniformForest(X, Y)
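## OPTIONAL: plot the OOB error and view global variable importance, as in the quick guide above
# plot(autoMPG.ruf, threads = 1)
# summary(autoMPG.ruf)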
## assess variable importance (ask for more points with the 'maxInteractions' option)
## NOTE: importance strongly depends on the 'ntree' and 'mtry' parameters
# importance.autoMPG <- importance(autoMPG.ruf, Xtest = X)
# plot(importance.autoMPG, Xtest = X)
## opening the way for EXTRAPOLATION (recalling partial dependencies and getting points)
## NOTE: points are predictions of the forest and not the training responses
# pDependence.autoMPG.weight <- partialDependenceOverResponses(X, importance.autoMPG,
# whichFeature = "weight", whichOrder = "all", outliersFilter = TRUE)
## visualize 'model year' again, as a discrete variable rather than a continuous one
# pDependence.autoMPG.modelyear <- partialDependenceOverResponses(X, importance.autoMPG,
# whichFeature = "model year", whichOrder = "all", maxClasses = 30)
## which features lead to a lower consumption (i.e. high mpg)?
# pImportance.autoMPG.high <- partialImportance(X, importance.autoMPG,
# threshold = mean(Y), thresholdDirection = "high", nLocalFeatures = 6)
## PARTIAL DEPENDENCIES BETWEEN COVARIATES: look at the dependence between "weight" and
## "acceleration", and at information about their interactions, relative to all other variables
# pDependence.autoMPG.weightAndAcceleration <-
# partialDependenceBetweenPredictors(X, importance.autoMPG, c("weight", "acceleration"),
# whichOrder = "all", perspective = FALSE, outliersFilter = TRUE)
## Visualize in 3D (look at the prompt to start the animation)
## NOTE: requires some computation
# pDependence.autoMPG.weightAndAcceleration <-
# partialDependenceBetweenPredictors(X, importance.autoMPG, c("weight", "acceleration"),
# whichOrder = "all", perspective = TRUE, outliersFilter = FALSE)