# Simple example (with one processor):
library(bigmemory)
library(biganalytics)  # provides bigkmeans()
x <- big.matrix(100000, 3, init=0, type="double")
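# Fill odd rows from N(0, 1) and even rows from N(5, 1), so the data
# contain two well-separated clusters: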
x[seq(1,100000,by=2),] <- rnorm(150000)
x[seq(2,100000,by=2),] <- rnorm(150000, 5, 1)
head(x)
ans <- bigkmeans(x, 1) # One cluster isn't always allowed,
                       # but it is convenient here.
ans$centers
ans$withinss
ans$size
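# With a single cluster, the center should match the column means: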
apply(x, 2, mean)
ans <- bigkmeans(x, 2, nstart=5) # Sequential multiple starts.
class(ans)
names(ans)
ans$centers
ans$withinss
ans$size
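# Cluster memberships are in ans$cluster; tabulating them should
# reproduce ans$size (assuming the usual kmeans-style return value):
table(ans$cluster)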
# To use a parallel backend, try something like the following,
# assuming you have at least 3 cores available on this machine.
# Each processor does incur memory overhead for the storage of
# cluster memberships.
library(doSNOW)
cl <- makeCluster(3, type="SOCK")
registerDoSNOW(cl)
ans <- bigkmeans(x, 2, nstart=5)
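# When finished with the parallel runs, stop the workers and re-register
# the sequential backend (registerDoSEQ() comes from foreach, which doSNOW
# loads); this keeps the timing comparison below sequential:
stopCluster(cl)
registerDoSEQ()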
# Both of the following run iteratively, but bigkmeans() has less
# memory overhead. Note that the gc() comparisons aren't completely
# fair, because the big.matrix objects aren't reflected in the gc()
# summary. But the savings are there.
gc(reset=TRUE)
time.new <- system.time(print(bigkmeans(x, 2, nstart=5)$centers))
gc()
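# Copy the data into an ordinary R matrix for the base kmeans() comparison: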
y <- x[,]
rm(x)
gc(reset=TRUE)
time.old <- system.time(print(kmeans(y, 2, nstart=5)$centers))
gc()
# The bigkmeans() centers should match the kmeans() centers, without
# the memory overhead and running more quickly.
time.new
time.old