# NOT RUN {
# ===========================================================================
# Play random numbers. See speed.
# ===========================================================================
# Problem size for the synthetic benchmark.
N <- 5000L # Number of points.
d <- 500L # Dimensionality.
K <- 50L # Number of clusters.

# d x N matrix of normal-plus-uniform noise; each column is one point.
dat <- matrix(rnorm(N * d) + runif(N * d), nrow = d)

# kmeans++ initialization. The result is used below to pick columns of `dat`
# as the starting centroids.
centroidInd <- GMKMcharlie::KMppIni(
  X = dat,
  K = K,
  firstSelection = 1L,
  minkP = 2,
  stochastic = FALSE,
  seed = sample(1e9L, 1),
  maxCore = 2L,
  verbose = TRUE
)
centroid <- dat[, centroidInd]

# Time K-means under the Euclidean (Minkowski p = 2) distance.
system.time({
  rst <- GMKMcharlie::KM(
    X = dat,
    centroid = centroid,
    maxIter = 100,
    minkP = 2,
    maxCore = 2,
    verbose = TRUE
  )
})

# Time K-means under cosine dissimilarity. Every column is first rescaled to
# unit Euclidean length, and the initial centroids are re-extracted from the
# rescaled data using the same column indices.
dat <- apply(dat, 2, function(col) col / sum(col ^ 2) ^ 0.5)
centroid <- dat[, centroidInd]
system.time({
  rst2 <- GMKMcharlie::KM(
    X = dat,
    centroid = centroid,
    maxIter = 100,
    minkP = "cosine",
    maxCore = 2,
    verbose = TRUE
  )
})
# ===========================================================================
# Test against R's inbuilt kmeans()
# ===========================================================================
# The four numeric iris columns, transposed so that each column of `dat` is
# one observation (4 x 150), with dimnames stripped.
dat <- t(iris[1:4])
dimnames(dat) <- NULL

# Use kmeans++ initialization; the returned indices pick the columns of `dat`
# that serve as the three starting centroids.
centroidInd <- GMKMcharlie::KMppIni(
  X = dat, K = 3, firstSelection = 1L, minkP = 2, stochastic = FALSE,
  seed = sample(1e9L, 1), maxCore = 2L, verbose = TRUE
)
centroid <- dat[, centroidInd]

# Cluster with GMKMcharlie and keep, per cluster, the sorted member indices.
rst <- GMKMcharlie::KM(
  X = dat, centroid = centroid, maxIter = 100,
  minkP = 2, maxCore = 2, verbose = TRUE
)
rst <- lapply(rst, function(x) sort(x$clusterMember))

# Cluster with stats::kmeans from the same starting centroids. kmeans expects
# observations in rows, hence the transposes. iter.max is raised from its
# default of 10 to match maxIter above, so that both Lloyd runs get the same
# iteration budget and a non-converged kmeans cannot cause a false mismatch.
rst2 <- kmeans(
  x = t(dat), centers = t(centroid), iter.max = 100, algorithm = "Lloyd"
)

# Member indices per cluster, as an unnamed list of increasing integer
# vectors. split() always yields a list, whereas aggregate() would simplify
# its result to a matrix whenever all clusters happen to be the same size.
rst2 <- unname(split(seq_along(rst2$cluster), rst2$cluster))

# Clusters found by KM that do not appear among the kmeans clusters; an empty
# list means the two partitions agree.
setdiff(rst, rst2)
# }
# Run the code above in your browser using DataLab