# Simple example, run on a single processor so that the nws package does not
# have to be installed. The odd rows of a 100000 x 3 big.matrix are filled
# with N(0, 1) draws and the even rows with N(5, 1) draws.
x <- big.matrix(nrow = 100000, ncol = 3, type = "double", init = 0)
x[seq(from = 1, to = 100000, by = 2), ] <- rnorm(n = 150000)
x[seq(from = 2, to = 100000, by = 2), ] <- rnorm(n = 150000, mean = 5, sd = 1)
head(x)
# Two clusters, five random starts, evaluated one after another.
ans <- kmeans.big.matrix(x, 2, nstart = 5)
# Optional parallel run using NWS. This step is skipped when the nws package
# is not installed, so the rest of the example still runs on a machine
# without it. Replace the placeholder host name below with the host running
# your NWS server before trying it.
if (requireNamespace("nws", quietly = TRUE)) {
  library(nws)
  s <- sleigh(nwsHost = "yourhostname.xxx.yyy.zzz", workerCount = 2)
  # Stop the sleigh workers even if the k-means call fails.
  ans <- tryCatch(
    kmeans.big.matrix(x, 2, nstart = 5, parallel = "nws", nwssleigh = s),
    finally = stopSleigh(s)
  )
} else {
  message("Package 'nws' is not installed; skipping the parallel example.")
}
# Both fits below are iterative, but kmeans.big.matrix() carries less memory
# overhead. Note that the first gc() does not reflect the C++ memory used by
# the big.matrix; even so, the maximum memory used is only about 35 MB after
# kmeans.big.matrix().
gc(reset = TRUE)
time.new <- system.time(
  print(kmeans.big.matrix(x, 2, nstart = 5)$centers)
)
gc()
# Copy the contents out with x[, ] for the base kmeans() run that follows,
# then remove x.
y <- x[, ]
rm(x)
# By contrast, the regular kmeans() really burns through memory:
gc(reset = TRUE)
time.old <- system.time(
  print(kmeans(y, centers = 2, nstart = 5)$centers)
)
gc()
# The centers from kmeans.big.matrix() should match those from kmeans(), but
# be obtained with less memory overhead and in less time. The difference is
# not caused by the core of kmeans() (the algorithm is in C and
# well-implemented) but by the traditional C/R interface and the R code that
# manages the objects and nstart.
time.new
time.old
# Run the code above in your browser using DataLab