# NOT RUN {
# CatBoost data generator
#
# Simulates an example data.table with a binary 0/1 target ("Classification")
# and 11 features derived from a Gaussian-copula term correlated with the
# target's latent uniform draw.
#
# @param N Number of rows to simulate (was hard-coded to 10000).
# @param Correl Correlation between the latent target draw and the feature
#   copula term (was hard-coded to 0.85).
# @return A data.table with columns Classification and
#   Independent_Variable1..Independent_Variable11.
dataGenH2O <- function(N = 10000, Correl = 0.85) {
  # Latent uniform draw; binarized into the target at the end
  data <- data.table::data.table(Classification = runif(N))
  data[, x1 := qnorm(Classification)]
  data[, x2 := runif(N)]
  # Shared copula term, computed once instead of once per feature:
  # x2 correlated with x1 at level Correl, mapped back to (0, 1) via pnorm()
  data[, prob := pnorm(Correl * x1 + sqrt(1 - Correl^2) * qnorm(x2))]
  data[, Independent_Variable1 := log(prob)]
  data[, Independent_Variable2 := prob]
  data[, Independent_Variable3 := exp(prob)]
  data[, Independent_Variable4 := exp(exp(prob))]
  data[, Independent_Variable5 := sqrt(prob)]
  data[, Independent_Variable6 := prob^0.10]
  data[, Independent_Variable7 := prob^0.25]
  data[, Independent_Variable8 := prob^0.75]
  data[, Independent_Variable9 := prob^2]
  data[, Independent_Variable10 := prob^4]
  # Bucket Independent_Variable2 into 5 categories; right = FALSE gives
  # [a, b) intervals, matching the original strict "<" threshold chain
  data[, Independent_Variable11 := cut(
    Independent_Variable2,
    breaks = c(-Inf, 0.2, 0.4, 0.6, 0.8, Inf),
    labels = c("A", "B", "C", "D", "E"),
    right = FALSE)]
  # Drop scratch columns and binarize the target at 0.5
  data[, c("x1", "x2", "prob") := NULL]
  data[, Classification := as.numeric(Classification > 0.5)]
  data
}
# Build the example data and fit a CatBoost model on it.
# NOTE(review): the target "Classification" is binary (0/1) yet is modeled
# as a regression with RMSE — presumably deliberate for this example, but
# a classifier would be the usual choice; confirm against the package docs.
data <- dataGenH2O()
TestModel <- RemixAutoML::AutoCatBoostRegression(
  data,
  TrainOnFull = FALSE,
  ValidationData = NULL,
  TestData = NULL,
  TargetColumnName = "Classification",
  FeatureColNames = 2:12,  # columns 2-12 are the Independent_Variable features (c() wrapper was redundant)
  PrimaryDateColumn = NULL,
  IDcols = NULL,
  MaxModelsInGrid = 3,
  task_type = "GPU",  # requires a GPU-enabled catboost build; use "CPU" otherwise
  eval_metric = "RMSE",
  Trees = 50,
  GridTune = FALSE,
  model_path = "C:/Users/aantico/Documents/Package/GUI_Package",  # NOTE(review): user-specific Windows path — change before running
  metadata_path = NULL,
  ModelID = "Adrian",
  NumOfParDepPlots = 15,
  ReturnModelObjects = TRUE,
  SaveModelObjects = TRUE,  # saved artifacts are reloaded by AutoLimeAid below via MLModelID
  PassInGrid = NULL)
# CatBoost Build Lime Model and Explanations
# Trains a LIME surrogate on the full training data and produces local
# explanations for the rows in EvalPredsData, reloading the CatBoost model
# saved above from MLModelPath under MLModelID "Adrian".
LimeOutput <- RemixAutoML::AutoLimeAid(
EvalPredsData = data[c(1,15)],  # data.table i-subset: rows 1 and 15 are the observations to explain
LimeTrainingData = data,
LimeBins = 10,
LimeIterations = 7500,
LimeNumFeatures = 0,  # 0 presumably means "use all features" — TODO confirm against AutoLimeAid docs
LimeModel = NULL,  # no prebuilt lime model; one is built and saved under LimeModelID
LimeModelPath = "C:/Users/aantico/Documents/Package/GUI_Package",  # NOTE(review): user-specific Windows path — must exist locally
LimeModelID = "AdrianLime",
MLModel = NULL,  # NULL => the ML model is loaded from MLModelPath / MLModelID
MLModelPath = "C:/Users/aantico/Documents/Package/GUI_Package",
MLMetaDataPath = NULL,
MLModelID = "Adrian",  # matches the ModelID used when the CatBoost model was saved
ModelType = "catboost",
NThreads = parallel::detectCores(),
MaxMem = "14G",
FeatureColumnNames = NULL,
IDcols = NULL,
FactorLevelsList = NULL,
TargetLevels = NULL,
OneHot = FALSE,
ReturnFeatures = TRUE,
TransformNumeric = FALSE,
BackTransNumeric = FALSE,
TargetColumnName = NULL,
TransformationObject = NULL,
TransID = NULL,
TransPath = NULL,
MDP_Impute = TRUE,  # MDP_* args are model-data-prep options: impute missing values,
MDP_CharToFactor = TRUE,  # convert character columns to factors,
MDP_RemoveDates = TRUE,  # drop date columns,
MDP_MissFactor = "0",  # replacement level for missing factor values,
MDP_MissNum = -1)  # and replacement value for missing numerics
# Plot lime objects
lime::plot_features(LimeOutput$LimeExplanations)
suppressWarnings(lime::plot_explanations(LimeOutput$LimeExplanations))
# H2O data generator
#
# Simulates an example data.table with a binary 0/1 target ("Classification")
# and 11 features derived from a Gaussian-copula term correlated with the
# target's latent uniform draw. (Identical to the CatBoost generator above;
# redefined here so each example section is self-contained.)
#
# @param N Number of rows to simulate (was hard-coded to 10000).
# @param Correl Correlation between the latent target draw and the feature
#   copula term (was hard-coded to 0.85).
# @return A data.table with columns Classification and
#   Independent_Variable1..Independent_Variable11.
dataGenH2O <- function(N = 10000, Correl = 0.85) {
  # Latent uniform draw; binarized into the target at the end
  data <- data.table::data.table(Classification = runif(N))
  data[, x1 := qnorm(Classification)]
  data[, x2 := runif(N)]
  # Shared copula term, computed once instead of once per feature
  data[, prob := pnorm(Correl * x1 + sqrt(1 - Correl^2) * qnorm(x2))]
  data[, Independent_Variable1 := log(prob)]
  data[, Independent_Variable2 := prob]
  data[, Independent_Variable3 := exp(prob)]
  data[, Independent_Variable4 := exp(exp(prob))]
  data[, Independent_Variable5 := sqrt(prob)]
  data[, Independent_Variable6 := prob^0.10]
  data[, Independent_Variable7 := prob^0.25]
  data[, Independent_Variable8 := prob^0.75]
  data[, Independent_Variable9 := prob^2]
  data[, Independent_Variable10 := prob^4]
  # Bucket Independent_Variable2 into 5 categories; right = FALSE gives
  # [a, b) intervals, matching the original strict "<" threshold chain
  data[, Independent_Variable11 := cut(
    Independent_Variable2,
    breaks = c(-Inf, 0.2, 0.4, 0.6, 0.8, Inf),
    labels = c("A", "B", "C", "D", "E"),
    right = FALSE)]
  # Drop scratch columns and binarize the target at 0.5
  data[, c("x1", "x2", "prob") := NULL]
  data[, Classification := as.numeric(Classification > 0.5)]
  data
}
# Build the example data and fit an H2O distributed-random-forest binary
# classifier on the 0/1 "Classification" target.
data <- dataGenH2O()
TestModel <- RemixAutoML::AutoH2oDRFClassifier(
data = data,
TrainOnFull = FALSE,
ValidationData = NULL,
TestData = NULL,
TargetColumnName = "Classification",
FeatureColNames = setdiff(names(data),"Classification"),  # every column except the target
eval_metric = "auc",
Trees = 50,
GridTune = FALSE,
MaxMem = "32G",  # H2O cluster memory — size this to the local machine
NThreads = max(1, parallel::detectCores()-2),  # leave two cores free, floor of 1
MaxModelsInGrid = 10,
model_path = "C:/Users/aantico/Desktop/Retention Analytics",  # NOTE(review): user-specific Windows path — change before running
metadata_path = NULL,
ModelID = "Adrian",
NumOfParDepPlots = 10,
ReturnModelObjects = TRUE,
SaveModelObjects = TRUE,  # saved artifacts are reloaded by AutoLimeAid below via MLModelID
IfSaveModel = "standard",
H2OShutdown = TRUE)  # shut the H2O cluster down after training
# H2O Build Lime Model and Explanations
# Trains a LIME surrogate and explains rows 1 and 15, reloading the H2O DRF
# model saved above from MLModelPath under MLModelID "Adrian".
LimeOutput <- RemixAutoML::AutoLimeAid(
  EvalPredsData = data[c(1,15)],  # data.table i-subset: rows 1 and 15 are the observations to explain
  LimeTrainingData = data,
  LimeBins = 10,
  LimeIterations = 7500,
  TargetType = "classification",  # fixed: was "regression", but the model being explained (AutoH2oDRFClassifier) is a classifier — consistent with the XGBoost classifier example below
  LimeNumFeatures = 0,  # 0 presumably means "use all features" — TODO confirm against AutoLimeAid docs
  LimeModel = NULL,  # no prebuilt lime model; one is built and saved under LimeModelID
  LimeModelPath = "C:/Users/aantico/Desktop/Retention Analytics",  # NOTE(review): user-specific Windows path — must exist locally
  LimeModelID = "AdrianLime",
  MLModel = NULL,  # NULL => the ML model is loaded from MLModelPath / MLModelID
  MLModelPath = "C:/Users/aantico/Desktop/Retention Analytics",
  MLMetaDataPath = NULL,
  MLModelID = "Adrian",  # matches the ModelID used when the H2O model was saved
  ModelType = "h2o",
  NThreads = parallel::detectCores(),
  MaxMem = "14G",
  FeatureColumnNames = NULL,
  IDcols = NULL,
  FactorLevelsList = NULL,
  TargetLevels = NULL,
  OneHot = FALSE,
  ReturnFeatures = TRUE,
  TransformNumeric = FALSE,
  BackTransNumeric = FALSE,
  TargetColumnName = NULL,
  TransformationObject = NULL,
  TransID = NULL,
  TransPath = NULL,
  MDP_Impute = TRUE,  # MDP_* args are model-data-prep options
  MDP_CharToFactor = TRUE,
  MDP_RemoveDates = TRUE,
  MDP_MissFactor = "0",
  MDP_MissNum = -1)
# Plot lime objects
lime::plot_features(LimeOutput$LimeExplanations)
suppressWarnings(lime::plot_explanations(LimeOutput$LimeExplanations))
# XGBoost create data function
#
# Simulates an example data.table with a binary 0/1 target ("Classification")
# and 11 features derived from a Gaussian-copula term correlated with the
# target's latent uniform draw. (Identical to the generators above; redefined
# here so each example section is self-contained.)
#
# @param N Number of rows to simulate (was hard-coded to 10000).
# @param Correl Correlation between the latent target draw and the feature
#   copula term (was hard-coded to 0.85).
# @return A data.table with columns Classification and
#   Independent_Variable1..Independent_Variable11.
dataGenXGBoost <- function(N = 10000, Correl = 0.85) {
  # Latent uniform draw; binarized into the target at the end
  data <- data.table::data.table(Classification = runif(N))
  data[, x1 := qnorm(Classification)]
  data[, x2 := runif(N)]
  # Shared copula term, computed once instead of once per feature
  data[, prob := pnorm(Correl * x1 + sqrt(1 - Correl^2) * qnorm(x2))]
  data[, Independent_Variable1 := log(prob)]
  data[, Independent_Variable2 := prob]
  data[, Independent_Variable3 := exp(prob)]
  data[, Independent_Variable4 := exp(exp(prob))]
  data[, Independent_Variable5 := sqrt(prob)]
  data[, Independent_Variable6 := prob^0.10]
  data[, Independent_Variable7 := prob^0.25]
  data[, Independent_Variable8 := prob^0.75]
  data[, Independent_Variable9 := prob^2]
  data[, Independent_Variable10 := prob^4]
  # Bucket Independent_Variable2 into 5 categories; right = FALSE gives
  # [a, b) intervals, matching the original strict "<" threshold chain
  data[, Independent_Variable11 := cut(
    Independent_Variable2,
    breaks = c(-Inf, 0.2, 0.4, 0.6, 0.8, Inf),
    labels = c("A", "B", "C", "D", "E"),
    right = FALSE)]
  # Drop scratch columns and binarize the target at 0.5
  data[, c("x1", "x2", "prob") := NULL]
  data[, Classification := as.numeric(Classification > 0.5)]
  data
}
# Build the example data and fit an XGBoost binary classifier on the
# 0/1 "Classification" target.
data <- dataGenXGBoost()
TestModel <- RemixAutoML::AutoXGBoostClassifier(
data,
TrainOnFull = FALSE,
ValidationData = NULL,
TestData = NULL,
TargetColumnName = "Classification",
FeatureColNames = 2:12,  # columns 2-12 are the Independent_Variable features
IDcols = NULL,
eval_metric = "auc",
Trees = 50,
GridTune = FALSE,
grid_eval_metric = "auc",
MaxModelsInGrid = 10,
NThreads = 8,  # NOTE(review): hard-coded thread count; other sections use parallel::detectCores()
TreeMethod = "hist",
model_path = "C:/Users/aantico/Desktop/Retention Analytics",  # NOTE(review): user-specific Windows path — change before running
metadata_path = NULL,
ModelID = "Adrian2",
NumOfParDepPlots = 3,
ReturnModelObjects = TRUE,
ReturnFactorLevels = TRUE,  # factor-level metadata needed when scoring/explaining later
SaveModelObjects = TRUE,  # saved artifacts are reloaded by AutoLimeAid below via MLModelID
PassInGrid = NULL)
# XGBoost Build Lime and Generate Output
# Trains a LIME surrogate and explains rows 1 and 15, reloading the XGBoost
# model saved above from MLModelPath under MLModelID "Adrian2".
LimeOutput <- RemixAutoML::AutoLimeAid(
EvalPredsData = data[c(1,15)],  # data.table i-subset: rows 1 and 15 are the observations to explain
LimeTrainingData = data,
LimeBins = 10,
TargetType = "classification",  # matches the classifier fit above
LimeIterations = 7500,
LimeNumFeatures = 0,  # 0 presumably means "use all features" — TODO confirm against AutoLimeAid docs
LimeModel = NULL,  # no prebuilt lime model; one is built and saved under LimeModelID
LimeModelPath = "C:/Users/aantico/Desktop/Retention Analytics",  # NOTE(review): user-specific Windows path — must exist locally
LimeModelID = "Adrian2Lime",
MLModel = NULL,  # NULL => the ML model is loaded from MLModelPath / MLModelID
MLModelPath = "C:/Users/aantico/Desktop/Retention Analytics",
MLMetaDataPath = NULL,
MLModelID = "Adrian2",  # matches the ModelID used when the XGBoost model was saved
ModelType = "xgboost",
NThreads = parallel::detectCores(),
MaxMem = "14G",
FeatureColumnNames = NULL,
IDcols = NULL,
FactorLevelsList = NULL,
TargetLevels = NULL,
OneHot = FALSE,
ReturnFeatures = TRUE,
TransformNumeric = FALSE,
BackTransNumeric = FALSE,
TargetColumnName = NULL,
TransformationObject = NULL,
TransID = NULL,
TransPath = NULL,
MDP_Impute = TRUE,  # MDP_* args are model-data-prep options
MDP_CharToFactor = TRUE,
MDP_RemoveDates = TRUE,
MDP_MissFactor = "0",
MDP_MissNum = -1)
# Plot lime objects
lime::plot_features(LimeOutput$LimeExplanations)
suppressWarnings(lime::plot_explanations(LimeOutput$LimeExplanations))
# }
# Run the code above in your browser using DataLab