# \donttest{
# Load the dataset AML2004
data(AML2004)
data <- as.matrix(AML2004$Gene)
# Perform the clustering
result <- PerturbationClustering(data = data)
# Plot the result
condition = seq(unique(AML2004$Group[, 2]))
names(condition) <- unique(AML2004$Group[, 2])
plot(
prcomp(data)$x,
col = result$cluster,
pch = condition[AML2004$Group[, 2]],
main = "AML2004"
)
legend(
"bottomright",
legend = paste("Cluster ", sort(unique(result$cluster)), sep = ""),
fill = sort(unique(result$cluster))
)
legend("bottomleft", legend = names(condition), pch = condition)
# Change kmeans parameters
result <- PerturbationClustering(
data = data,
clusteringMethod = "kmeans",
clusteringOptions = list(
iter.max = 500,
nstart = 50
)
)
# Change to use pam
result <- PerturbationClustering(data = data, clusteringMethod = "pam")
# Change to use hclust
result <- PerturbationClustering(data = data, clusteringMethod = "hclust")
# Pass a user-defined clustering algorithm
result <- PerturbationClustering(data = data, clusteringFunction = function(data, k){
# this function must return a vector of cluster
kmeans(x = data, centers = k, nstart = k*10, iter.max = 2000)$cluster
})
# Use noise as the perturb method
result <- PerturbationClustering(data = data,
perturbMethod = "noise",
perturbOptions = list(noise = 0.3))
# or
result <- PerturbationClustering(data = data,
perturbMethod = "noise",
perturbOptions = list(noisePercent = 10))
# Change to use subsampling
result <- PerturbationClustering(data = data,
perturbMethod = "subsampling",
perturbOptions = list(percent = 90))
# Users can pass their own perturb method
result <- PerturbationClustering(data = data, perturbFunction = function(data){
rowNum <- nrow(data)
colNum <- ncol(data)
epsilon <-
matrix(
data = rnorm(rowNum * colNum, mean = 0, sd = 1.234),
nrow = rowNum,
ncol = colNum
)
list(
data = data + epsilon,
ConnectivityMatrixHandler = function(connectivityMatrix, ...) {
connectivityMatrix
},
MergeConnectivityMatrices = function(oldMatrix, newMatrix, iter, ...){
return((oldMatrix*(iter-1) + newMatrix)/iter)
}
)
})
# Clustering on simulation data
# Load necessary library
if (!require("mclust")) install.packages("mclust")
library(mclust)
library(irlba)
#Generate a simulated data matrix with the size of 50,000 x 5,000
sampleNum <- 50000 # Number of samples
geneNum <- 5000 # Number of genes
subtypeNum <- 3 # Number of subtypes
# Generate expression matrix
exprs <- matrix(rnorm(sampleNum*geneNum, 0, 1), nrow = sampleNum, ncol = geneNum)
rownames(exprs) <- paste0("S", 1:sampleNum) # Assign unique names for samples
# Generate subtypes
group <- sort(rep(1:subtypeNum, sampleNum/subtypeNum + 1)[1:sampleNum])
names(group) <- rownames(exprs)
# Make subtypes separate
for (i in 1:subtypeNum) {
exprs[group == i, 1:100 + 100*(i-1)] <- exprs[group == i, 1:100 + 100*(i-1)] + 2
}
# Plot the data
library(irlba)
exprs.pca <- irlba::prcomp_irlba(exprs, n = 2)$x
plot(exprs.pca, main = "PCA")
#Run PINSPlus clustering:
set.seed(1)
t1 <- Sys.time()
result <- PerturbationClustering(data = exprs.pca, ncore = 1)
t2 <- Sys.time()
#Print out the running time:
time<- t2-t1
#Print out the number of clusters:
result$k
#Get cluster assignment
subtype <- result$cluster
# Here we assess the clustering accurracy using Adjusted Rand Index (ARI).
#ARI takes values from -1 to 1 where 0 stands for a random clustering and 1
#stands for a perfect partition result.
if (!require("mclust")) install.packages("mclust")
library(mclust)
ari <- mclust::adjustedRandIndex(subtype, group)
#Plot the cluster assginments
colors <- as.numeric(as.character(factor(subtype)))
plot(exprs.pca, col = colors, main = "Cluster assigments for simulation data")
legend("topright", legend = paste("ARI:", ari))
legend("bottomright", fill = unique(colors),
legend = paste("Group ",
levels(factor(subtype)), ": ",
table(subtype)[levels(factor(subtype))], sep = "" )
)
# }
Run the code above in your browser using DataLab