# NOT RUN {
### Example 1: detect an obvious outlier
### (Random data from a standard normal distribution)
library(isotree)
set.seed(1)
n_rows <- 100
n_cols <- 2
X <- matrix(rnorm(n_rows * n_cols), nrow = n_rows)

### Will now add obvious outlier point (3, 3) to the data
X <- rbind(X, c(3, 3))

### Fit a small isolation forest model
iso <- isolation.forest(X, ntrees = 10, nthreads = 1)

### Check which row has the highest outlier score
pred <- predict(iso, X)
top_row <- which.max(pred)
cat("Point with highest outlier score: ",
    X[top_row, ], "\n")
### Example 2: plotting outlier regions
### This example shows predicted outlier score in a small
### grid, with a model fit to a bi-modal distribution. As can
### be seen, the extended model is able to detect high
### outlierness outside of both regions, without having false
### ghost regions of low-outlierness where there isn't any data
library(isotree)
oldpar <- par(mfrow = c(2, 2), mar = c(2.5, 2.2, 2, 2.5))

### Randomly-generated data from different distributions
set.seed(1)
group1 <- data.frame(x = rnorm(1000, mean = -1, sd = .4),
                     y = rnorm(1000, mean = -1, sd = .2))
group2 <- data.frame(x = rnorm(1000, mean = +1, sd = .2),
                     y = rnorm(1000, mean = +1, sd = .4))
X <- rbind(group1, group2)

### Add an obvious outlier which is within the 1d ranges
### (As an interesting test, remove and see what happens)
X <- rbind(X, c(-1, 1))

### Produce heatmaps
pts <- seq(-3, 3, by = .1)
space_d <- expand.grid(x = pts, y = pts)
### Draw a heatmap of predicted outlier scores over a square grid,
### overlaying the training points on top of it.
###   Z    : vector of scores, one per cell of `expand.grid(grid, grid)`
###   ttl  : plot title
###   grid : 1d grid coordinates (defaults to the global `pts`,
###          preserving the original behavior)
###   data : 2-column points to overlay (defaults to the global `X`)
### Called for its plotting side effect; returns nothing useful.
plot.space <- function(Z, ttl, grid = pts, data = X) {
    score_mat <- matrix(Z, nrow = length(grid))
    image(grid, grid, score_mat,
          col = rev(heat.colors(50)),
          main = ttl, cex.main = 1.4,
          xlim = c(-3, 3), ylim = c(-3, 3),
          xlab = "", ylab = "")
    ### Overlay the points on the same panel without clearing it
    par(new = TRUE)
    plot(data, type = "p", xlim = c(-3, 3), ylim = c(-3, 3),
         col = "#0000801A",
         axes = FALSE, main = "",
         xlab = "", ylab = "")
}
### Now try out different variations of the model

### Single-variable model
iso_simple <- isolation.forest(
    X, ndim = 1,
    ntrees = 100,
    nthreads = 1,
    prob_pick_pooled_gain = 0,
    prob_pick_avg_gain = 0)
Z1 <- predict(iso_simple, space_d)
plot.space(Z1, "Isolation Forest")

### Extended model
iso_ext <- isolation.forest(
    X, ndim = 2,
    ntrees = 100,
    nthreads = 1,
    prob_pick_pooled_gain = 0,
    prob_pick_avg_gain = 0)
Z2 <- predict(iso_ext, space_d)
plot.space(Z2, "Extended Isolation Forest")

### SCiForest
iso_sci <- isolation.forest(
    X, ndim = 2,
    ntrees = 100,
    nthreads = 1,
    prob_pick_pooled_gain = 0,
    prob_pick_avg_gain = 1)
Z3 <- predict(iso_sci, space_d)
plot.space(Z3, "SCiForest")

### Fair-cut forest
iso_fcf <- isolation.forest(
    X, ndim = 2,
    ntrees = 100,
    nthreads = 1,
    prob_pick_pooled_gain = 1,
    prob_pick_avg_gain = 0)
Z4 <- predict(iso_fcf, space_d)
plot.space(Z4, "Fair-Cut Forest")

### Restore the graphics parameters saved earlier
par(oldpar)
### Example 3: calculating pairwise distances,
### with a short validation against euclidean dist.
library(isotree)

### Generate random data with 3 dimensions
set.seed(1)
n_rows <- 100
n_cols <- 3
X <- matrix(rnorm(n_rows * n_cols), nrow = n_rows, ncol = n_cols)

### Fit isolation forest model
iso <- isolation.forest(X, ntrees = 100, nthreads = 1)

### Calculate distances with the model
D_iso <- predict(iso, X, type = "dist")

### Check that it correlates with euclidean distance
D_euc <- dist(X, method = "euclidean")
cat(sprintf("Correlation with euclidean distance: %f\n",
            cor(D_euc, D_iso)))
### (Note that euclidean distance will never take
### any correlations between variables into account,
### which the isolation forest model can do)
# }
# NOT RUN {
### Example 4: imputing missing values
### (requires package MASS)
library(isotree)
### Generate correlated random data, then mask ~15% of the entries
### as NA so the imputations can be scored against the known truth
if (require("MASS")) {
    set.seed(1)
    S <- matrix(rnorm(5 * 5), nrow = 5)
    S <- t(S) %*% S   ### ensures a valid (PSD) covariance matrix
    mu <- rnorm(5)
    X <- MASS::mvrnorm(1000, mu, S)
    X_na <- X
    values_NA <- matrix(runif(1000 * 5) < .15, nrow = 1000)
    X_na[values_NA] <- NA

    ### Impute missing values with model
    iso <- isolation.forest(X_na,
        build_imputer = TRUE,
        prob_pick_pooled_gain = 1,
        ntry = 10)
    X_imputed <- predict(iso, X_na, type = "impute")
    cat(sprintf("MSE for imputed values w/model: %f\n",
        mean((X[values_NA] - X_imputed[values_NA])^2)))

    ### Compare against simple mean imputation.
    ### NOTE: means are computed from the observed entries only
    ### (`X_na` with na.rm = TRUE). The original used
    ### `apply(X, 2, mean)`, which leaked the true values of the
    ### masked entries into the baseline it was comparing against.
    X_means <- colMeans(X_na, na.rm = TRUE)
    X_imp_mean <- X_na
    for (cl in seq_len(5)) {
        X_imp_mean[values_NA[, cl], cl] <- X_means[cl]
    }
    cat(sprintf("MSE for imputed values w/means: %f\n",
        mean((X[values_NA] - X_imp_mean[values_NA])^2)))
}
# }
# NOT RUN {
# }
# NOT RUN {
#### A more interesting example
#### (requires package outliertree)
### Compare outliers returned by these different methods,
### and see why some of the outliers returned by the
### isolation forest could be flagged as outliers
if (require("outliertree")) {
    hypothyroid <- outliertree::hypothyroid

    ### Score every row with an isolation forest
    iso <- isolation.forest(hypothyroid, nthreads = 1)
    pred_iso <- predict(iso, hypothyroid)

    ### Flag outliers with outliertree for comparison
    otree <- outliertree::outlier.tree(
        hypothyroid,
        z_outlier = 6,
        pct_outliers = 0.02,
        outliers_print = 20,
        nthreads = 1)

    ### Now compare against the top
    ### outliers from isolation forest
    head(hypothyroid[order(pred_iso, decreasing = TRUE), ], 20)
}
# }
# Run the code above in your browser using DataCamp Workspace