# Prepare data: split into training set (2/3) and test set (1/3)
data("iris", package = "datasets")
train <- c(1:34, 51:83, 101:133)
iris_train <- iris[train, ]
iris_test <- iris[-train, ]
# One case with missing data in train set, and another case in test set
iris_train[1, 1] <- NA
iris_test[25, 2] <- NA
iris_lda <- ml_lda(data = iris_train, Species ~ .)
iris_lda
summary(iris_lda)
plot(iris_lda, col = as.numeric(response(iris_lda)) + 1)
# Prediction using a test set
predict(iris_lda, newdata = iris_test) # class (default type)
predict(iris_lda, type = "membership") # posterior probability
predict(iris_lda, type = "both") # both class and membership in a list
# Type projection
predict(iris_lda, type = "projection") # Projection on the LD axes
# Add test set items to the previous plot
points(predict(iris_lda, newdata = iris_test, type = "projection"),
col = as.numeric(predict(iris_lda, newdata = iris_test)) + 1, pch = 19)
# predict() and confusion() should be used on a separate test set
# for unbiased estimation (or using cross-validation, bootstrap, ...)
# Wrong, cf. biased estimation (so-called, self-consistency)
confusion(iris_lda)
# Estimation using a separate test set
confusion(predict(iris_lda, newdata = iris_test), iris_test$Species)
# Another dataset (binary predictor... not optimal for lda, just for test)
data("HouseVotes84", package = "mlbench")
house_lda <- ml_lda(data = HouseVotes84, na.action = na.omit, Class ~ .)
summary(house_lda)
confusion(house_lda) # Self-consistency (biased metrics)
print(confusion(house_lda), error.col = FALSE) # Without error column
# More complex formulas
# Exclude one or more variables
iris_lda2 <- ml_lda(data = iris, Species ~ . - Sepal.Width)
summary(iris_lda2)
# With calculation
iris_lda3 <- ml_lda(data = iris, Species ~ log(Petal.Length) +
log(Petal.Width) + I(Petal.Length/Sepal.Length))
summary(iris_lda3)
# Factor levels with missing items are allowed
ir2 <- iris[-(51:100), ] # No Iris versicolor in the training set
iris_lda4 <- ml_lda(data = ir2, Species ~ .)
summary(iris_lda4) # missing class
# Missing levels are reinjected in class or membership by predict()
predict(iris_lda4, type = "both")
# ... but, of course, the classifier is wrong for Iris versicolor
confusion(predict(iris_lda4, newdata = iris), iris$Species)
# Simpler interface, but more memory-effective
iris_lda5 <- ml_lda(train = iris[, -5], response = iris$Species)
summary(iris_lda5)
Run the code above in your browser using DataLab