## Not run:
## A - generate a skewed dataset to learn
# set.seed(2014)
# n = 1000
# p = 100
# X = simulationData(n, p)
# X = fillVariablesNames(X)
# epsilon1 = runif(n,-1,1)
# epsilon2 = runif(n,-1,1)
# rule = 2*(X[,1]*X[,2] + X[,3]*X[,4]) + epsilon1*X[,5] + epsilon2*X[,6]
# Y = as.factor(ifelse(rule > (3*(mean(rule) - 4/3*epsilon1*X[,7])), "-","+"))
## distribution of labels
# distributionOfY = table(Y)
## ratio of majority/minority sample
# max(distributionOfY)/min(distributionOfY)
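## (optional check, not in the original example) class proportions,
## using base R's prop.table()
# prop.table(distributionOfY)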
## generate train and test sets, experiencing concept drift
# set.seed(2014)
# train_test = init_values(X, Y, sample.size = 5/10)
# X1 = train_test$xtrain
# Y1 = train_test$ytrain
# X2 = train_test$xtest
# Y2 = train_test$ytest
# table(Y1)
# table(Y2)
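## (optional) check that the imbalance persists in both subsets,
## mirroring the ratio computed on the full sample above
# max(table(Y1))/min(table(Y1))
# max(table(Y2))/min(table(Y2))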
## learn and predict with base model
# baseModel.ruf = randomUniformForest(X1, Y1, xtest = X2, ytest = Y2)
# baseModel.ruf
## reSMOTE the train sample: we choose 'increaseMinorityClassTo = 0.3'.
# X1Y1.resmoted = reSMOTE(X1, Y1, majorityClass = "-",
# conditionalTo = Y1, increaseMinorityClassTo = 0.3)
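## (optional) verify the new class distribution: the minority class should now
## account for roughly 30% of the resampled training set
# table(X1Y1.resmoted[,"Class"])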
## learn and predict with reSMOTEd model
# reSMOTEdModel.ruf = randomUniformForest(X1Y1.resmoted[,1:p], X1Y1.resmoted[,"Class"],
# xtest = X2, ytest = Y2)
## compare both models, looking at the AUC (and AUPR, if misclassified positive cases
## have a much higher cost than misclassified negative ones)
# baseModel.ruf
# reSMOTEdModel.ruf
## B - German credit dataset
# URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/"
# dataset = "german.data-numeric"
# germanCreditData = read.table(paste(URL, dataset, sep = ""))
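## (optional) sanity check: the numeric version of the dataset should have
## 1000 rows and 25 columns (24 attributes plus the label in column 25)
# dim(germanCreditData)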
# Y = germanCreditData[,25]
# X = germanCreditData[,-25]
# set.seed(2014)
# train_test = init_values(X, Y, sample.size = 5/10)
# X1 = train_test$xtrain
# Y1 = train_test$ytrain
# X2 = train_test$xtest
# Y2 = train_test$ytest
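## (optional) class distribution of the split: German credit is imbalanced,
## with class 1 (good) around 70% and class 2 (bad) around 30% of cases
# table(Y1)
# table(Y2)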
## 1 - default modelling: AUC around 0.67; AUPR around 0.56
# germanCredit.baseModel.ruf = randomUniformForest(X1, as.factor(Y1),
# xtest = X2, ytest = as.factor(Y2), ntree = 200, importance = FALSE)
# germanCredit.baseModel.ruf
## 2 - reSMOTEd modelling: AUC around 0.70; AUPR around 0.60;
## Note: reSMOTE is not consistent with OOB evaluation.
# p = ncol(X1)
# X1Y1.resmoted = reSMOTE(as.true.matrix(X1), as.factor(Y1), majorityClass = "1",
# conditionalTo = Y1, increaseMinorityClassTo = 0.6)
# X1.new = X1Y1.resmoted[,1:p]
# Y1.new = X1Y1.resmoted[,"Class"]
# germanCredit.reSMOTEd.ruf2 = randomUniformForest(X1.new, as.factor(Y1.new),
# xtest = X2, ytest = as.factor(Y2), ntree = 200, importance = FALSE)
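## print the reSMOTEd model to read off AUC and AUPR (line added for completeness)
# germanCredit.reSMOTEd.ruf2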
## 2.1 - try reSMOTE + Random Forests: AUC around 0.70; AUPR around 0.58;
# library(randomForest)
# germanCredit.reSMOTEd.rf = randomForest(X1.new, as.factor(Y1.new),
# xtest = X2, ytest = as.factor(Y2))
# statsRF = model.stats(as.factor(as.numeric(germanCredit.reSMOTEd.rf$test$pred)), as.factor(Y2))
## 3 - use benchmark: SMOTE, implemented in 'unbalanced' R package
# require(unbalanced)
## SMOTE: generates 'perc.over/100' new instances for each rare instance.
## 'perc.under/100' is the number of majority class instances kept for each smoted observation.
## Generate smoted observations so that the label distribution is similar
## (but with more examples) to the one produced by 'reSMOTE()'.
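## (illustration, not in the original example) a quick computation of the sample
## sizes these settings imply: perc.over = 400 creates 4 synthetic cases per
## minority case, and perc.under = 100 keeps 1 majority case per synthetic case
# n.minority = min(table(Y1))
# c(synthetic.minority = 4*n.minority, majority.kept = 4*n.minority)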
# XX = ubBalance(X1, as.factor(Y1-1), type = "ubSMOTE", perc.over = 400, perc.under = 100)
## 3.1 apply to randomUniformForest: AUC around 0.68; AUPR around 0.55;
# germanCredit.SMOTEd.ruf = randomUniformForest(XX$X, as.factor(XX$Y),
# xtest = X2, ytest = as.factor(Y2-1), ntree = 200, importance = FALSE, BreimanBounds = FALSE)
## 3.2 apply to randomForest: AUC around 0.68; AUPR around 0.57;
# germanCredit.SMOTEd.rf = randomForest(XX$X, as.factor(XX$Y),
# xtest = X2, ytest = as.factor(Y2-1))
# statsModel = model.stats(as.factor(as.numeric(germanCredit.SMOTEd.rf$test$pred)),
# as.factor(Y2))
## 4 - Stabilize: AUC around 0.71; AUPR around 0.62;
## a - generate a large number of minority class examples
## b - use rebalanced sampling in the model: equal size for each class,
## so that the size of the new training sample equals the size of the original one
## c - increase the minimum number of observations in terminal nodes (nodesize)
## d - increase the number of trees.
# X1Y1.resmoted = reSMOTE(as.true.matrix(X1), as.factor(Y1), majorityClass = "1",
# conditionalTo = Y1, increaseMinorityClassTo = 1)
# X1.new = X1Y1.resmoted[,1:p]
# Y1.new = X1Y1.resmoted[,"Class"]
# germanCredit.reSMOTEd.ruf2 = randomUniformForest(X1.new, as.factor(Y1.new),
# xtest = X2, ytest = as.factor(Y2), rebalancedsampling = c(250,250), nodesize = 10,
# ntree = 1000, importance = FALSE, OOB = FALSE)
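## print the stabilized model to read off AUC and AUPR (line added for completeness)
# germanCredit.reSMOTEd.ruf2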
## 4.1 - apply to Random Forests: AUC around 0.72; AUPR around 0.62;
# germanCredit.reSMOTEd.rf = randomForest(X1.new, as.factor(Y1.new),
# xtest = X2, ytest = as.factor(Y2), sampsize = c(250,250), nodesize = 10, ntree = 1000)
# statsModel = model.stats(as.factor(as.numeric(germanCredit.reSMOTEd.rf$test$pred)),
# as.factor(Y2))
## End(Not run)