Last chance! 50% off unlimited learning
Sale ends in
Imitation of the Python sklearn.datasets
functions.
Gen.cl.data(type=c("blobs", "moons", "circles"), N=100, noise=NULL,
shuffle=TRUE, bdim=2, bcenters=3, bnoise=1, bbox=c(-10, 10), cfactor=0.8)
'blobs' are Gaussian blobs; 'moons' are two interleaving half-circles; 'circles' are two embedded circles
Number of data points
Whether to randomize the output
Standard deviation of Gaussian noise applied to point positions
Dimensionality of 'blobs' dataset
Number of 'blobs' centers
Standard deviation of 'blobs' Gaussian noise: vector of length one or length equal to the number of centers
The bounding box within which blobs centers will be created
Scale factor between 'circles' (should be > 0 and < 1)
Alexey Shipunov
Algorihms were taken partly from Python 'scikit-learn' and from Github 'elbamos/clusteringdatasets'.
scikit.palette <- c("#377EB8", "#FF7F00", "#4DAF4A", "#F781BF", "#A65628", "#984EA3",
"#999999", "#E41A1C", "#DEDE00", "#000000")
palette(scikit.palette)
n.samples <- 500
## data
set.seed(21)
no.structure <- list(samples=cbind(runif(n.samples), runif(n.samples)),
labels=rep(1, n.samples))
noisy.circles <- Gen.cl.data(type="circles", N=n.samples, cfactor=0.5, noise=0.05)
noisy.moons <- Gen.cl.data(type="moons", N=n.samples, noise=0.05)
blobs <- Gen.cl.data(type="blobs", N=n.samples, noise=1)
## anisotropically distributed data
aniso <- Gen.cl.data(type="blobs", N=n.samples)
aniso$samples <- aniso$samples %*% rbind(c(0.6, -0.6), c(-0.4, 0.8))
## blobs with varied variances
varied <- Gen.cl.data(type="blobs", N=n.samples, bnoise=c(1, 2.5, 0.5))
set.seed(NULL)
## single example
plot(aniso$samples, col=aniso$labels, pch=19)
## all data objects example
## old.X11.options <- X11.options(width=6, height=6) # to make square cells
oldpar <- par(mfrow=c(2, 3), mar=c(1, 1, 3, 1))
for (n in c("noisy.circles", "noisy.moons", "no.structure",
"blobs", "aniso", "varied")) {
plot(get(n)$samples, col=get(n)$labels, pch=19, main=n, xlab="", ylab="",
xaxt="n", yaxt="n")
}
par(oldpar)
## X11.options <- old.X11.options
# \donttest{
## comparison of clustering techniques example
## old.X11.options <- X11.options(width=10, height=6) # to make square cells
oldpar <- par(mfrow=c(6, 10), mar=rep(0, 4), xaxt="n", yaxt="n")
COUNT <- 1
for (n in c("noisy.circles", "noisy.moons", "no.structure", "blobs", "aniso", "varied")) {
K <- 3
if (n %in% c("noisy.circles", "noisy.moons")) K <- 2
TITLE <- function(x) if (COUNT==1) { legend("topleft", legend=x, cex=1.25, bty="n") }
##
newlabels <- cutree(hclust(dist(get(n)$samples), method="ward.D2"), k=K)
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("Ward")
##
newlabels <- cutree(hclust(dist(get(n)$samples), method="average"), k=K)
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("UPGMA")
##
newlabels <- kmeans(round(get(n)$samples, 5), centers=K)$cluster
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("K-means")
##
newlabels <- cutree(as.hclust(cluster::diana(dist(get(n)$samples))), k=K) # slow
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("DIANA")
##
nn <- cluster::fanny(get(n)$samples, k=K) # a bit slow
dunn <- apply(nn$membership, 1, function(.x) (sum(.x^2) - 1/K) / (1 - 1/K))
fuzzy <- dunn < 0.05
plot(get(n)$samples[!fuzzy, ], col=nn$clustering[!fuzzy], pch=19)
points(get(n)$samples[fuzzy, ], col="black", pch=1)
TITLE("FANNY")
##
newlabels <- kernlab::specc(get(n)$samples, centers=K)
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("spectral")
##
nn <- apcluster::apclusterK(apcluster::negDistMat(), get(n)$samples, K=K) # very slow
newlabes <- apply(sapply(nn@clusters,
function(.y) 1:nrow(get(n)$samples) %in% .y), 1, which)
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("AP") # affinity propagation
##
## eps values taken out of scikit and 'dbscan::kNNdistplot() "knee"', 'minPts' default
EPS <- c(noisy.circles=0.3, noisy.moons=0.3, no.structure=0.3, blobs=1,
aniso=0.5, varied=1)
nn <- dbscan::dbscan(get(n)$samples, eps=EPS[n])
outliers <- nn$cluster == 0
plot(get(n)$samples[!outliers, ], col=nn$cluster[!outliers], pch=19)
points(get(n)$samples[outliers, ], col="black", pch=1)
TITLE("DBSCAN")
##
newlabels <- meanShiftR::meanShift(get(n)$samples, nNeighbors=10)$assignment
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("mean-shift")
##
library(mclust)
newlabels <- Mclust(get(n)$samples)$classification
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("Gaussian")
COUNT <- COUNT + 1
}
par(oldpar)
## X11.options <- old.X11.options
# }
# \donttest{
## comparison of linkages example
## old.X11.options <- X11.options(width=8, height=6) # to make square cells
oldpar <- par(mfrow=c(6, 8), mar=rep(0, 4), xaxt="n", yaxt="n")
COUNT <- 1
for (n in c("noisy.circles", "noisy.moons", "no.structure", "blobs", "aniso", "varied")) {
K <- 3 ; if (n %in% c("noisy.circles", "noisy.moons")) K <- 2
TITLE <- function(x) if (COUNT==1) { legend("topleft", legend=x, cex=1.25, bty="n") }
newlabels <- cutree(hclust(dist(get(n)$samples), method="ward.D2"), k=K)
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("Ward orig")
newlabels <- cutree(hclust(dist(get(n)$samples), method="ward.D"), k=K)
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("Ward")
newlabels <- cutree(hclust(dist(get(n)$samples), method="average"), k=K)
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("UPGMA")
newlabels <- cutree(hclust(dist(get(n)$samples), method="single"), k=K)
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("single")
newlabels <- cutree(hclust(dist(get(n)$samples), method="complete"), k=K)
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("complete")
newlabels <- cutree(hclust(dist(get(n)$samples), method="mcquitty"), k=K)
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("WPGMA")
newlabels <- cutree(hclust(dist(get(n)$samples), method="median"), k=K)
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("WPGMC")
newlabels <- cutree(hclust(dist(get(n)$samples), method="centroid"), k=K)
plot(get(n)$samples, col=newlabels, pch=19)
TITLE("UPGMC")
COUNT <- COUNT + 1
}
par(oldpar)
## X11.options <- old.X11.options
# }
palette("default")
Run the code above in your browser using DataLab