## Not run:
# Create some dummy correlated data
data <- RemixAutoML::FakeDataGenerator(
  Correlation = 0.85,
  N = 1000,
  ID = 2,
  ZIP = 0,
  AddDate = FALSE,
  Classification = FALSE,
  MultiClass = FALSE)
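
# A quick look at the simulated data (a minimal sketch; it assumes
# FakeDataGenerator returns a data.frame-like object). The target
# column "Adrian" and the ID columns "IDcol_1" / "IDcol_2" are
# referenced by name in the modeling call below
str(data)
head(data)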
# Run function
TestModel <- RemixAutoML::AutoXGBoostRegression(
  # GPU or CPU: "hist" runs on CPU; use "gpu_hist" for GPU
  TreeMethod = "hist",
  NThreads = parallel::detectCores(),
  LossFunction = 'reg:squarederror',
  # Metadata arguments:
  # 'ModelID' is the name of the saved model object and is used
  #   to create part of the file names generated when saving
  # 'model_path' is where the minimal model objects for scoring
  #   will be stored
  # 'metadata_path' is where model evaluation and model
  #   interpretation files are saved; these objects are saved to
  #   model_path if metadata_path is NULL
  # Saved objects include:
  # 'ModelID_ValidationData.csv': the supplied or generated
  #   TestData with predicted values
  # 'ModelID_ROC_Plot.png' and 'ModelID_EvaluationPlot.png':
  #   the ROC curve and calibration plot
  # 'ModelID_VariableImportance.csv': the variable importance.
  #   This won't be saved to file if GrowPolicy "Depthwise" or
  #   "Lossguide" was used
  # 'ModelID_ExperimentGrid.csv': saved if GridTune = TRUE;
  #   results of all model builds, including parameter settings,
  #   bandit probabilities, and grid IDs
  # 'ModelID_EvaluationMetrics.csv': contains all confusion
  #   matrix measures across all thresholds
  model_path = normalizePath("./"),
  metadata_path = NULL,
  ModelID = "Test_Model_1",
  ReturnFactorLevels = TRUE,
  ReturnModelObjects = TRUE,
  SaveModelObjects = FALSE,
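  # Note: with SaveModelObjects = FALSE, nothing is written to
  # model_path / metadata_path in this run; the objects are only
  # returned in the output list via ReturnModelObjects = TRUE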
  # Data arguments:
  # 'TrainOnFull' = TRUE trains the model on 100 percent of your
  #   data, meaning no holdout data is used for evaluation
  # If ValidationData and TestData are NULL and TrainOnFull is
  #   FALSE, the data is split 70 / 20 / 10
  # 'PrimaryDateColumn' (not supplied in this call) is a date
  #   column in data that is meaningful when sorted; categorical
  #   treatment is enhanced when it is supplied
  # 'IDcols' are columns in your data that are not used for
  #   modeling but are returned with ValidationData
  data = data,
  TrainOnFull = FALSE,
  ValidationData = NULL,
  TestData = NULL,
  TargetColumnName = "Adrian",
  FeatureColNames = names(data)[!names(data) %in%
    c("IDcol_1", "IDcol_2", "Adrian")],
  IDcols = c("IDcol_1", "IDcol_2"),
  TransformNumericColumns = NULL,
  Methods = c("BoxCox", "Asinh", "Asin", "Log",
    "LogPlus1", "Sqrt", "Logit", "YeoJohnson"),
  # Model evaluation
  eval_metric = "rmse",
  NumOfParDepPlots = 3L,
  # Grid tuning arguments
  PassInGrid = NULL,
  GridTune = TRUE,
  grid_eval_metric = "mse",
  BaselineComparison = "default",
  MaxModelsInGrid = 10L,
  MaxRunsWithoutNewWinner = 20L,
  MaxRunMinutes = 24L * 60L,  # cap total tuning time at 24 hours
  Verbose = 1L,
  # The hyperparameter ranges below are sampled by the bandit
  # grid tuning. Set Trees to a single value if you are not grid
  # tuning; the others can be set to NULL
  Shuffles = 1L,
  Trees = seq(50L, 500L, 50L),
  eta = seq(0.05, 0.40, 0.05),
  max_depth = seq(4L, 16L, 2L),
  min_child_weight = seq(1.0, 10.0, 1.0),
  subsample = seq(0.55, 1.0, 0.05),
  colsample_bytree = seq(0.55, 1.0, 0.05))
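
# A minimal sketch of inspecting the returned objects. The exact
# element names in the returned list depend on your RemixAutoML
# version, so check names(TestModel) first; 'ValidationData' and
# 'EvaluationMetrics' below are assumed names, not guaranteed
names(TestModel)
# Holdout data with predictions appended (assumed element name)
ValData <- TestModel$ValidationData
# Evaluation metrics on the holdout data (assumed element name)
EvalMetrics <- TestModel$EvaluationMetrics
# For scoring new data, see RemixAutoML's scoring helpers, e.g.
# ?RemixAutoML::AutoXGBoostScoring (if present in your version)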
## End(Not run)