# Example 1: Basic Mixed-Type Clustering
# NOTE(review): DIBmix() is not defined in this file; presumably it comes from
# an attached package -- confirm before running this script standalone.
set.seed(123)

# Number of simulated observations
n_obs <- 150

# Category labels for the nominal columns
edu_levels <- c("High School", "Bachelor", "Master", "PhD")
job_levels <- c("Full-time", "Part-time", "Unemployed")
# Category labels for the ordinal column, listed from lowest to highest
sat_levels <- c("Low", "Medium", "High")

# Build the mixed-type data set. Columns are simulated in the order they are
# listed, so the sequence of random draws is education, employment,
# satisfaction, income, age, experience.
data_mix <- data.frame(
  # Nominal variables, sampled with unequal class probabilities
  education = factor(
    sample(edu_levels, size = n_obs, replace = TRUE,
           prob = c(0.4, 0.3, 0.2, 0.1))
  ),
  employment = factor(
    sample(job_levels, size = n_obs, replace = TRUE,
           prob = c(0.6, 0.25, 0.15))
  ),
  # Ordered factor with equal sampling probabilities
  satisfaction = factor(
    sample(sat_levels, size = n_obs, replace = TRUE),
    levels = sat_levels,
    ordered = TRUE
  ),
  # Numeric variables
  income = rlnorm(n_obs, meanlog = 10, sdlog = 0.5), # log-normal draws
  age = rnorm(n_obs, mean = 35, sd = 10),            # normal draws
  experience = rpois(n_obs, lambda = 8)              # Poisson counts
)

# Fit the mixed-type clustering with 3 clusters and 5 starts
result_mix <- DIBmix(X = data_mix, ncl = 3, nstart = 5)

# Report the number of distinct cluster labels in the fitted partition
n_found <- length(unique(result_mix$Cluster))
print(paste("Number of clusters found:", n_found))

# Report the mutual information, rounded to three decimals
mi_rounded <- round(result_mix$MutualInfo, 3)
print(paste("Mutual Information:", mi_rounded))

# Count of observations per cluster label
table(result_mix$Cluster)
# Example 2: Comparing the cat_first parameter
# Both fits use the same data, number of clusters and number of starts; only
# the cat_first flag differs between them.

# Fit with cat_first = TRUE, i.e. prioritising the categorical variables
# (suited to data where those are the more informative ones)
result_cat_first <- DIBmix(
  X = data_mix,
  ncl = 3,
  cat_first = TRUE,
  nstart = 5
)

# Fit with cat_first = FALSE, for data where the continuous variables are the
# more informative ones (described as the default setting)
result_cont_first <- DIBmix(
  X = data_mix,
  ncl = 3,
  cat_first = FALSE,
  nstart = 5
)

# Compare the two partitions with the adjusted Rand index. mclust is an
# optional dependency, so the comparison is skipped when it is not installed.
if (requireNamespace("mclust", quietly = TRUE)) {
  ari <- mclust::adjustedRandIndex(
    result_cat_first$Cluster,
    result_cont_first$Cluster
  )
  print(paste("Agreement between approaches:", round(ari, 3)))
}

# Plots for the cat_first = TRUE fit
# Bar plot of cluster sizes
plot(result_cat_first, type = "sizes")
# Information-theoretic quantities plot
plot(result_cat_first, type = "info")
# Plot of evolution of beta
plot(result_cat_first, type = "beta")
# Simulated categorical data example
# Seed the generator so this example produces the same simulated data when it
# is run on its own, in line with the other simulated examples in this script
# that call set.seed(123) before drawing data.
set.seed(123)

# Number of simulated observations
n_cat <- 200

# Category labels for the ordinal column, listed from lowest to highest
ord_levels <- c("low", "medium", "high")

data_cat <- data.frame(
  # Nominal variables with three categories each
  Var1 = as.factor(sample(letters[1:3], n_cat, replace = TRUE)),
  Var2 = as.factor(sample(letters[4:6], n_cat, replace = TRUE)),
  # Ordinal variable stored as an ordered factor
  Var3 = factor(
    sample(ord_levels, n_cat, replace = TRUE),
    levels = ord_levels,
    ordered = TRUE
  )
)

# Perform hard clustering on categorical data with Deterministic IB
# NOTE(review): lambda = -1 presumably asks DIBmix to choose the categorical
# smoothing parameter itself -- confirm against the DIBmix documentation.
result_cat <- DIBmix(X = data_cat, ncl = 3, lambda = -1, nstart = 5)

# Print clustering results
print(result_cat$Cluster)    # Cluster assignments
print(result_cat$Entropy)    # Final entropy
print(result_cat$MutualInfo) # Mutual information
# Simulated continuous data example
set.seed(123)

# 200 observations on 5 features, all drawn from a standard normal
n_rows <- 200
n_feat <- 5
data_cont <- as.data.frame(
  matrix(rnorm(n_rows * n_feat), nrow = n_rows, ncol = n_feat)
)

# Perform hard clustering on continuous data with Deterministic IB
# NOTE(review): s = -1 presumably asks DIBmix to choose the continuous
# bandwidth itself -- confirm against the DIBmix documentation.
result_cont <- DIBmix(
  X = data_cont,
  ncl = 3,
  s = -1,
  nstart = 5
)

# Print the individual components of the fit
print(result_cont$Cluster)    # Cluster assignments
print(result_cont$Entropy)    # Final entropy
print(result_cont$MutualInfo) # Mutual information

# Print the fitted object, then its summary
print(result_cont)
summary(result_cont)
# Run the code above in your browser using DataLab