# functional upto limits of RAM of the hardware used
# consider removing small values of result to improve sparsity
X <- rSparseMatrix(1e6, 1e6, 1e7)
print(object.size(X), units = "auto") # 118 Mb
system.time(M <- cosSparse(X)) # might take half a minute, depending on hardware
print(object.size(M), units = "auto") # 587 Mb
M <- drop0(M, tol = 0.1) # remove small values
print(object.size(M), units = "auto") # reduced to 141 Mb
# Compare various weightings.
# with random data from a normal distribution there is almost no difference
#
# data from a normal distribution
# X <- rSparseMatrix(1e2, 1e2, 1e3)
# with heavily skewed data there is a strong difference!
X <- rSparseMatrix(1e2, 1e2, 1e3,
rand.x = function(n){round(rpois(1e3, 10), 2)})
w0 <- cosSparse(X, norm = norm2, weight = NULL)@x
wi <- cosSparse(X, norm = norm2, weight = idf)@x
ws <- cosSparse(X, norm = norm2, weight = isqrt)@x
pairs(~ w0 + wi + ws,
labels=c("no weighting","inverse
document
frequency","inverse
square root"))
Run the code above in your browser using DataLab