# Generate synthetic data: three normal clusters in two dimensions.
# The clusters have different shapes and orientations, and 20% of the
# observations are afterwards replaced by uniformly distributed outliers.

# Base clusters: 300 standard-normal points, 100 per cluster.
# The seed is fixed so that the example is reproducible.
set.seed(1)
Z1 <- c(rnorm(100, 0), rnorm(100, 0), rnorm(100, 0))
Z2 <- rnorm(300)
X <- matrix(0, ncol = 2, nrow = 300)
X[, 1] <- Z1
X[, 2] <- Z2
# Ground-truth label (1, 2 or 3) of every row of X.
true.cluster <- rep(c(1, 2, 3), each = 100)

# Rotate, expand and translate the base clusters.
theta <- pi / 3
# 2 x 2 rotation matrix built from theta.
aux1 <- matrix(c(cos(theta), -sin(theta), sin(theta), cos(theta)), nrow = 2)
# Axis scaling: sqrt(4) * diag(1, 1/4) = diag(2, 0.5).
aux2 <- sqrt(4) * diag(c(1, 1 / 4))
# NOTE: B is computed here but is not used by the rest of this script.
B <- aux1 %*% aux2 %*% t(aux1)

# Cluster 3: scale, rotate, then shift every point by (5, 2).
X[true.cluster == 3, ] <- X[true.cluster == 3, ] %*% aux2 %*% aux1 +
  matrix(c(5, 2), byrow = TRUE, nrow = 100, ncol = 2)

# Cluster 2: stretch the second coordinate by a factor of 5.
X[true.cluster == 2, 2] <- X[true.cluster == 2, 2] * 5

# Cluster 1: shrink the second coordinate to a tenth, then shift every
# point by (-5, -1).
X[true.cluster == 1, 2] <- X[true.cluster == 1, 2] * 0.1
X[true.cluster == 1, ] <- X[true.cluster == 1, ] +
  matrix(c(-5, -1), byrow = TRUE, nrow = 100, ncol = 2)

# Generate 60 synthetic outliers (contamination level 20%).
# The rows to overwrite are picked at random; each coordinate of an outlier
# is drawn uniformly between twice the smallest and twice the largest entry
# of the uncontaminated X.
outliers <- sample(seq_len(nrow(X)), 60)
# One independent draw per matrix cell (2 per outlier). Supplying fewer
# values would make matrix() silently recycle them across the outliers.
X[outliers, ] <- matrix(
  runif(2 * length(outliers), 2 * min(X), 2 * max(X)),
  ncol = 2, nrow = length(outliers)
)
# Apply the algorithm: request K = 3 clusters with a cutoff of 0.999.
# NOTE(review): improvedktaucenters() is not defined in this script; it is
# presumably provided by the ktaucenters package, which must be attached
# before this line runs -- confirm.
robust <- improvedktaucenters(X, K = 3, cutoff = 0.999)

# Plot the results in two stacked panels; the previous graphical
# parameters are saved here and restored at the end.
oldpar <- par(mfrow = c(2, 1))

# Top panel: ground-truth labels, with the injected outliers in black.
# drop = FALSE keeps every subset a two-column matrix, so a subset made of a
# single row is still drawn as one (x, y) point.
plot(X, main = "Actual clusters")
for (j in seq_len(3)) {
  points(X[true.cluster == j, , drop = FALSE], pch = 19, col = j + 1)
}
points(X[outliers, , drop = FALSE], pch = 19, col = 1)

# Bottom panel: estimated labels (robust$cluster), with the observations
# listed in robust$outliers overplotted in the default colour.
plot(X, main = "Clusters estimation")
for (j in seq_len(3)) {
  points(X[robust$cluster == j, , drop = FALSE], pch = 19, col = j + 1)
}
points(X[robust$outliers, , drop = FALSE], pch = 19)

par(oldpar)
# Run the code above in your browser using DataLab