# Prepare data: split into training set (2/3) and test set (1/3)
data("iris", package = "datasets")
train <- c(1:34, 51:83, 101:133)
iris_train <- iris[train, ]
iris_test <- iris[-train, ]
# One case with missing data in train set, and another case in test set
iris_train[1, 1] <- NA
iris_test[25, 2] <- NA
iris_rf <- ml_rforest(data = iris_train, Species ~ .)
summary(iris_rf)
plot(iris_rf) # Useful to look at the effect of ntree=
# For such a relatively simple case, 50 trees are enough
iris_rf <- ml_rforest(data = iris_train, Species ~ ., ntree = 50)
summary(iris_rf)
predict(iris_rf) # Default type is class
predict(iris_rf, type = "membership")
predict(iris_rf, type = "both")
predict(iris_rf, type = "vote")
# Out-of-bag prediction (unbiased)
predict(iris_rf, method = "oob")
# Self-consistency (always very high for random forest, biased, do not use!)
confusion(iris_rf)
# This one is better
confusion(iris_rf, method = "oob") # Out-of-bag performances
# Cross-validation prediction is also a good choice when there is no test set
predict(iris_rf, method = "cv") # Idem: cvpredict(res)
# Cross-validation for performances estimation
confusion(iris_rf, method = "cv")
# Evaluation of performances using a separate test set
confusion(predict(iris_rf, newdata = iris_test), iris_test$Species)
# Regression using random forest (from ?randomForest)
set.seed(131) # Useful for reproducibility (use a different number each time)
ozone_rf <- ml_rforest(data = airquality, Ozone ~ ., mtry = 3,
importance = TRUE, na.action = na.omit)
summary(ozone_rf)
# Show "importance" of variables: higher value mean more important variables
round(randomForest::importance(ozone_rf), 2)
plot(na.omit(airquality)$Ozone, predict(ozone_rf))
abline(a = 0, b = 1)
# Unsupervised classification using random forest (from ?randomForest)
set.seed(17)
iris_urf <- ml_rforest(train = iris[, -5]) # Use only quantitative data
summary(iris_urf)
randomForest::MDSplot(iris_urf, iris$Species)
plot(stats::hclust(stats::as.dist(1 - iris_urf$proximity),
method = "average"), labels = iris$Species)
Run the code above in your browser using DataLab