## Not run:
## 1 - the famous iris dataset
## load data
# data(iris)
## run unsupervised modelling, removing the labels:
## default options, letting the 'gap statistic' find the number of clusters;
## only 'baseModel' is changed, since "proximityThenDistance"
## is slightly more efficient here
# iris.rufUnsupervised = unsupervised.randomUniformForest(iris[,-5],
# baseModel = "proximityThenDistance", seed = 2014, threads = 1)
## view a summary
# iris.rufUnsupervised
## one may assess the gap statistic by calling the 'modifyClusters()' function,
## increasing or decreasing the number of clusters and watching the variations
## of the silhouette coefficient.
## For example, if 4 clusters are found (while we know there are 3):
# iris.rufUnsupervised2 = modifyClusters(iris.rufUnsupervised, decreaseBy = 1)
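## conversely, a sketch assuming 'modifyClusters()' also accepts
## an 'increaseBy' argument (as the comment above suggests),
## if too few clusters were found:
# iris.rufUnsupervised3 = modifyClusters(iris.rufUnsupervised, increaseBy = 1)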
## plot clusters
# plot(iris.rufUnsupervised)
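## the modified model (a sketch, assuming it keeps the same class as the
## original one) can be plotted the same way, for comparison:
# plot(iris.rufUnsupervised2)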
## 2 - Full example with details
## Wholesale customers data (UCI machine learning repository)
# URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/00292/"
# datasetName = "Wholesale%20customers%20data.csv"
# wholesaleCustomers = read.csv(paste(URL, datasetName, sep = ""))
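## a quick sanity check of the downloaded data (base R; 'Channel' and 'Region'
## are the categorical columns of this UCI dataset):
# dim(wholesaleCustomers)
# str(wholesaleCustomers)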
## modelling, letting the algorithm handle everything:
## categorical features, number of clusters, dimension reduction, visualization,
## variable importance, links between features and observations, ...
# wholesaleCustomers.rufUnsupervised = unsupervised.randomUniformForest(wholesaleCustomers,
# nodesize = 10, bagging = TRUE, ntree = 200, categoricalvariablesidx = "all")
## assess the quality of the clustering
## (and, if needed, change model parameters, e.g. 'baseModel' or 'endModel',
## then run the model again to get a better clustering, looking at the average
## silhouette or the distance between clusters)
# wholesaleCustomers.rufUnsupervised
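## as a sketch of such a re-run, assuming "MDSkMeans" is a valid value
## of the 'endModel' argument:
# wholesaleCustomers.rufUnsupervised2 = unsupervised.randomUniformForest(wholesaleCustomers,
# endModel = "MDSkMeans", nodesize = 10, bagging = TRUE, ntree = 200,
# categoricalvariablesidx = "all")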
## visualization: at first, clusters only
# plot(wholesaleCustomers.rufUnsupervised)
## but we may need more:
## get details, first turning the model into a supervised one
# wholesaleCustomers.rufSupervised = as.supervised(wholesaleCustomers.rufUnsupervised,
# wholesaleCustomers, bagging = TRUE, ntree = 200,
# nodesize = 10, categoricalvariablesidx = "all")
## is the learning efficient (using OOB evaluation)?
# wholesaleCustomers.rufSupervised
## get variable importance, leading to a full analysis and visualization,
## while limiting variable interactions to order 3
# wholesaleCustomers.importance = importance(wholesaleCustomers.rufSupervised,
# Xtest = wholesaleCustomers, maxInteractions = 3)
## a - visualize: features, interactions, partial dependencies, features in clusters
## NOTE: tile the windows in the R menu to see all plots. Loop over the prompt
## to see all matched partial dependencies
# plot(wholesaleCustomers.importance, Xtest = wholesaleCustomers)
## we get global variable importance (information gain), interactions, partial dependencies,
## and variable importance over labels. See the vignette for more details.
## b - more visualization: another look at 'variable importance over labels'
# featuresCluster1 = partialImportance(wholesaleCustomers, wholesaleCustomers.importance,
# whichClass = 1)
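## the same view can be drawn for any other cluster, e.g. the second one
## (assuming at least two clusters were found):
# featuresCluster2 = partialImportance(wholesaleCustomers, wholesaleCustomers.importance,
# whichClass = 2)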
## c - visualization: clusters and most important features
# plot(wholesaleCustomers.rufUnsupervised, importanceObject = wholesaleCustomers.importance)
## d - table: see individual links between observations and features;
## the table shows each observation with its associated features
## and their frequencies of occurrence
# featuresAndObs = as.data.frame(wholesaleCustomers.importance$localVariableImportance$obs)
# frequencyFeaturesIdx = grep("Frequency", colnames(featuresAndObs))
# featuresNames = apply(featuresAndObs[,-c(1,frequencyFeaturesIdx)], 2,
# function(Z) colnames(wholesaleCustomers)[Z])
# featuresAndObs[,-c(1,frequencyFeaturesIdx)] = featuresNames
# head(featuresAndObs)
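## a quick aggregate view (base R sketch): which features appear most often
## among the associated features of all observations
# sort(table(unlist(featuresAndObs[, -c(1, frequencyFeaturesIdx)])), decreasing = TRUE)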
## NOTE: since most features are in monetary units, one may assess clusters
## by looking at the sum of all features per cluster, turning the problem
## into a 'revenue per cluster and feature' one that can be linked
## with the clustering process and the visualization tools.
## first, merge outliers and retrieve clusters
# Class = mergeOutliers(wholesaleCustomers.rufUnsupervised)
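## and look at the cluster sizes:
# table(Class)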
## then add classes
# wholesaleCustomersClusterized = cbind(wholesaleCustomers, Class)
## finally compute revenues per cluster and feature.
## Note that this view may give more insights on how the algorithm clusters data.
# revenuePerClusterAndFeature =
# aggregate(wholesaleCustomersClusterized[,-c(1,2,9)], list(Class), sum)
## see results
# revenuePerClusterAndFeature
## revenue per cluster: showing where and how more effort might be spent...
# rowSums(revenuePerClusterAndFeature[,-1])
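## and, as a last sketch, the revenue share (in %) of each cluster:
# round(100 * rowSums(revenuePerClusterAndFeature[,-1])
# / sum(revenuePerClusterAndFeature[,-1]), 2)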