Learn R Programming

RemixAutoML (version 0.5.4)

CLTrainer: CLTrainer

Description

CLTrainer is a forecasting model for chain ladder style forecasting

Usage

CLTrainer(
  data,
  PartitionRatios = c(0.7, 0.2, 0.1),
  BaseFunnelMeasure = NULL,
  ConversionMeasure = NULL,
  ConversionRateMeasure = NULL,
  CohortPeriodsVariable = NULL,
  CalendarDate = NULL,
  CohortDate = NULL,
  TruncateDate = NULL,
  TimeUnit = c("day"),
  CalendarTimeGroups = c("day", "week", "month"),
  CohortTimeGroups = c("day", "week", "month"),
  TransformTargetVariable = TRUE,
  TransformMethods = c("Identity", "YeoJohnson"),
  AnomalyDetection = list(tstat_high = 3, tstat_low = -2),
  Jobs = c("Evaluate", "Train"),
  SaveModelObjects = TRUE,
  ModelID = "Segment_ID",
  ModelPath = NULL,
  MetaDataPath = NULL,
  TaskType = "CPU",
  NumGPUs = 1,
  DT_Threads = max(1L, parallel::detectCores()),
  EvaluationMetric = "RMSE",
  LossFunction = "RMSE",
  NumOfParDepPlots = 1L,
  MetricPeriods = 50L,
  CalendarVariables = c("wday", "mday", "yday", "week", "isoweek", "month", "quarter",
    "year"),
  HolidayGroups = c("USPublicHolidays", "EasterGroup", "ChristmasGroup",
    "OtherEcclesticalFeasts"),
  HolidayLookback = NULL,
  ImputeRollStats = -0.001,
  CohortHolidayLags = c(1L, 2L, 7L),
  CohortHolidayMovingAverages = c(3L, 7L),
  CalendarHolidayLags = c(1L, 2L, 7L),
  CalendarHolidayMovingAverages = c(3L, 7L),
  CalendarLags = list(day = c(1L, 7L, 21L), week = c(1L, 4L, 52L), month = c(1L, 6L,
    12L)),
  CalendarMovingAverages = list(day = c(1L, 7L, 21L), week = c(1L, 4L, 52L), month =
    c(1L, 6L, 12L)),
  CalendarStandardDeviations = NULL,
  CalendarSkews = NULL,
  CalendarKurts = NULL,
  CalendarQuantiles = NULL,
  CalendarQuantilesSelected = "q50",
  CohortLags = list(day = c(1L, 7L, 21L), week = c(1L, 4L, 52L), month = c(1L, 6L,
    12L)),
  CohortMovingAverages = list(day = c(1L, 7L, 21L), week = c(1L, 4L, 52L), month =
    c(1L, 6L, 12L)),
  CohortStandardDeviations = NULL,
  CohortSkews = NULL,
  CohortKurts = NULL,
  CohortQuantiles = NULL,
  CohortQuantilesSelected = "q50",
  PassInGrid = NULL,
  GridTune = FALSE,
  BaselineComparison = "default",
  MaxModelsInGrid = 25L,
  MaxRunMinutes = 180L,
  MaxRunsWithoutNewWinner = 10L,
  Trees = 3000L,
  Depth = seq(4L, 8L, 1L),
  LearningRate = seq(0.01, 0.1, 0.01),
  L2_Leaf_Reg = seq(1, 10, 1),
  RSM = c(0.8, 0.85, 0.9, 0.95, 1),
  BootStrapType = c("Bayesian", "Bernoulli", "Poisson", "MVS", "No"),
  GrowPolicy = c("SymmetricTree", "Depthwise", "Lossguide")
)

Arguments

data

data object

PartitionRatios

Requires three values for train, validation, and test data sets

BaseFunnelMeasure

E.g. "Leads". This should be a forward-looking variable: if you want to forecast ConversionMeasure two months into the future, you need two months of future values of BaseFunnelMeasure.

ConversionMeasure

E.g. "Conversions". Rate is derived as conversions over leads by cohort periods out

ConversionRateMeasure

Conversions over Leads for every cohort

CohortPeriodsVariable

Numeric. Numerical value of the number of periods since cohort base date.

CalendarDate

The name of your date column that represents the calendar date

CohortDate

The name of your date column that represents the cohort date

TruncateDate

NULL. Supply a date to represent the earliest point in time you want in your data. Filtering takes place before partitioning data so feature engineering can include as many non null values as possible.

TimeUnit

Base time unit of data. "days", "weeks", "months", "quarters", "years"

CalendarTimeGroups

TimeUnit value must be included. If you want to generate lags and moving averages in several time based aggregations, choose from "days", "weeks", "months", "quarters", "years".

CohortTimeGroups

TimeUnit value must be included. If you want to generate lags and moving averages in several time based aggregations, choose from "days", "weeks", "months", "quarters", "years".

TransformTargetVariable

TRUE or FALSE

TransformMethods

Choose from "Identity", "BoxCox", "Asinh", "Asin", "Log", "LogPlus1", "Logit", "YeoJohnson"

AnomalyDetection

Provide a named list. See examples

Jobs

Default is c("Evaluate", "Train")

SaveModelObjects

Set to TRUE to return all modeling objects to your environment

ModelID

A character string to name your model and output

ModelPath

Path to where you want your models saved

MetaDataPath

Path to where you want your metadata saved. If NULL, function will try ModelPath if it is not NULL.

TaskType

"GPU" or "CPU" for catboost training

NumGPUs

Number of GPUs you would like to utilize

DT_Threads

Number of threads to use for data.table. Default is max(1L, parallel::detectCores()), i.e. all detected cores

EvaluationMetric

This is the metric used inside catboost to measure performance on validation data during a grid-tune. "RMSE" is the default, but other options include: "MAE", "MAPE", "Poisson", "Quantile", "LogLinQuantile", "Lq", "NumErrors", "SMAPE", "R2", "MSLE", "MedianAbsoluteError".

LossFunction

Used in model training for model fitting. Select from 'RMSE', 'MAE', 'Quantile', 'LogLinQuantile', 'MAPE', 'Poisson', 'PairLogitPairwise', 'Tweedie', 'QueryRMSE'

NumOfParDepPlots

Number of partial dependence plots to return

MetricPeriods

Number of trees to build before the internal catboost eval step happens

CalendarVariables

"wday", "mday", "yday", "week", "isoweek", "month", "quarter", "year"

HolidayGroups

c("USPublicHolidays","EasterGroup","ChristmasGroup","OtherEcclesticalFeasts")

HolidayLookback

Number of days in range to compute number of holidays from a given date in the data. If NULL, the number of days are computed for you.

ImputeRollStats

Constant value to fill NA after running AutoLagRollStats()

CohortHolidayLags

Integer vector, e.g. c(1L, 2L, 7L)

CohortHolidayMovingAverages

Integer vector, e.g. c(3L, 7L)

CalendarHolidayLags

Integer vector, e.g. c(1L, 2L, 7L)

CalendarHolidayMovingAverages

Integer vector, e.g. c(3L, 7L)

CalendarLags

List of the form list("day" = c(1L, 7L, 21L), "week" = c(1L, 4L, 52L), "month" = c(1L, 6L, 12L))

CalendarMovingAverages

List of the form list("day" = c(1L, 7L, 21L), "week" = c(1L, 4L, 52L), "month" = c(1L, 6L, 12L))

CalendarStandardDeviations

List of the form list("day" = c(1L, 7L, 21L), "week" = c(1L, 4L, 52L), "month" = c(1L, 6L, 12L))

CalendarSkews

List of the form list("day" = c(1L, 7L, 21L), "week" = c(1L, 4L, 52L), "month" = c(1L, 6L, 12L))

CalendarKurts

List of the form list("day" = c(1L, 7L, 21L), "week" = c(1L, 4L, 52L), "month" = c(1L, 6L, 12L))

CalendarQuantiles

List of the form list("day" = c(1L, 7L, 21L), "week" = c(1L, 4L, 52L), "month" = c(1L, 6L, 12L))

CalendarQuantilesSelected

Supply a vector of "q5", "q10", "q15", "q20", "q25", "q30", "q35", "q40", "q45", "q50", "q55", "q60", "q65", "q70", "q75", "q80", "q85", "q90", "q95"

CohortLags

List of the form list("day" = c(1L, 7L, 21L), "week" = c(1L, 4L, 52L), "month" = c(1L, 6L, 12L))

CohortMovingAverages

List of the form list("day" = c(1L, 7L, 21L), "week" = c(1L, 4L, 52L), "month" = c(1L, 6L, 12L))

CohortStandardDeviations

List of the form list("day" = c(1L, 7L, 21L), "week" = c(1L, 4L, 52L), "month" = c(1L, 6L, 12L))

CohortSkews

List of the form list("day" = c(1L, 7L, 21L), "week" = c(1L, 4L, 52L), "month" = c(1L, 6L, 12L))

CohortKurts

List of the form list("day" = c(1L, 7L, 21L), "week" = c(1L, 4L, 52L), "month" = c(1L, 6L, 12L))

CohortQuantiles

List of the form list("day" = c(1L, 7L, 21L), "week" = c(1L, 4L, 52L), "month" = c(1L, 6L, 12L))

CohortQuantilesSelected

Supply a vector of "q5", "q10", "q15", "q20", "q25", "q30", "q35", "q40", "q45", "q50", "q55", "q60", "q65", "q70", "q75", "q80", "q85", "q90", "q95"

PassInGrid

Defaults to NULL. Pass in a single row of grid from a previous output as a data.table (they are collected as data.tables)

GridTune

Set to TRUE to run a grid tuning procedure. Set a number in MaxModelsInGrid to tell the procedure how many models you want to test.

BaselineComparison

Set to either "default" or "best". Default is to compare each successive model build to the baseline model using max trees (from function args). Best makes the comparison to the current best model.

MaxModelsInGrid

Number of models to test from grid options

MaxRunMinutes

Maximum number of minutes to let this run

MaxRunsWithoutNewWinner

Number of models built before calling it quits

Trees

Bandit grid partitioned. The maximum number of trees you want in your models

Depth

Bandit grid partitioned. Number, or vector for depth to test. For running grid tuning, a NULL value supplied will mean these values are tested seq(4L, 16L, 2L)

LearningRate

Bandit grid partitioned. Supply a single value for non-grid tuning cases. Otherwise, supply a vector for the LearningRate values to test. For running grid tuning, a NULL value supplied will mean these values are tested c(0.01,0.02,0.03,0.04)

L2_Leaf_Reg

Random testing. Supply a single value for non-grid tuning cases. Otherwise, supply a vector for the L2_Leaf_Reg values to test. For running grid tuning, a NULL value supplied will mean these values are tested seq(1.0, 10.0, 1.0)

RSM

CPU only. Random testing. Supply a single value for non-grid tuning cases. Otherwise, supply a vector for the RSM values to test. For running grid tuning, a NULL value supplied will mean these values are tested c(0.80, 0.85, 0.90, 0.95, 1.0)

BootStrapType

Random testing. Supply a single value for non-grid tuning cases. Otherwise, supply a vector for the BootStrapType values to test. For running grid tuning, a NULL value supplied will mean these values are tested c("Bayesian", "Bernoulli", "Poisson", "MVS", "No")

GrowPolicy

Random testing. NULL, character, or vector for GrowPolicy to test. For grid tuning, supply a vector of values. For running grid tuning, a NULL value supplied will mean these values are tested c("SymmetricTree", "Depthwise", "Lossguide")

Value

Saves metadata and models to files of your choice. Also returns metadata and models from the function. User specifies both options.

See Also

Other Population Dynamics Forecasting: CLForecast()

Examples

Run this code
# NOT RUN {
# Create simulated data
# ChainLadderData = TRUE asks the package's fake data generator for the
# chain ladder style data set used by this example.
data <- RemixAutoML::FakeDataGenerator(
  ChainLadderData = TRUE)

# Build model
# NOTE: the result of this call is not assigned; assign it to a variable
# if you want to keep the objects the function returns.
RemixAutoML::CLTrainer(

   # Data Arguments----
   data = data,
   # Three values: train, validation, and test shares
   PartitionRatios = c(0.70,0.20,0.10),
   BaseFunnelMeasure = "Leads",
   ConversionMeasure = "Appointments",
   ConversionRateMeasure = NULL,
   CohortPeriodsVariable = "CohortDays",
   CalendarDate = "CalendarDateColumn",
   CohortDate = "CohortDateColumn",
   TruncateDate = NULL,
   # NOTE(review): the Usage default is "day" while this example passes
   # "days" -- confirm which spelling the function expects.
   TimeUnit = "days",
   TransformTargetVariable = TRUE,
   TransformMethods = c("Identity","BoxCox","Asinh",
                        "Asin","LogPlus1","Logit",
                        "YeoJohnson"),
   AnomalyDetection = list(tstat_high = 3,
     tstat_low = -2),

   # MetaData Arguments----
   # NOTE(review): the Usage default is c("Evaluate", "Train") -- confirm
   # that the lowercase short forms below are accepted.
   Jobs = c("eval","train"),
   SaveModelObjects = TRUE,
   ModelID = "ModelTest",
   # Models are saved to the current working directory
   ModelPath = getwd(),
   # With NULL, the function will try ModelPath for the metadata
   MetaDataPath = NULL,
   # "GPU" or "CPU" for catboost training
   TaskType = "GPU",
   NumGPUs = 1,
   # Leave two cores free, but never request fewer than one thread
   DT_Threads = max(1L, parallel::detectCores() - 2L),
   EvaluationMetric = "RMSE",
   LossFunction = "RMSE",
   NumOfParDepPlots = 1L,
   MetricPeriods = 50L,

   # Feature Engineering Arguments----
   # Constant used to fill NA values after the lag / rolling stats step
   ImputeRollStats = -0.001,
   # The TimeUnit value must be included in both time group vectors
   CalendarTimeGroups = c("days","weeks","months"),
   CohortTimeGroups = c("days", "weeks"),
   CalendarVariables = c("wday","mday","yday","week",
                         "month","quarter","year"),
   HolidayGroups = c("USPublicHolidays","EasterGroup",
                     "ChristmasGroup","OtherEcclesticalFeasts"),
   # With NULL, the number of lookback days is computed for you
   HolidayLookback = NULL,
   CohortHolidayLags = c(1L,2L,7L),
   CohortHolidayMovingAverages = c(3L,7L),
   CalendarHolidayLags = c(1L,2L,7L),
   CalendarHolidayMovingAverages = c(3L,7L),
   # NOTE(review): the list names below are singular ("day", "week",
   # "month") while the time groups above are plural ("days", "weeks",
   # "months") -- confirm how the function matches them.
   CalendarLags = list("day" = c(1L,2L,7L,35L,42L),
                       "week" = c(5L,6L,10L,12L,25L,26L)),
   CalendarMovingAverages = list("day" = c(7L,14L,35L,42L),
                                 "week" = c(5L,6L,10L,12L,20L,24L),
                                 "month" = c(6L,12L)),
   CalendarStandardDeviations = NULL,
   CalendarSkews = NULL,
   CalendarKurts = NULL,
   CalendarQuantiles = NULL,
   CalendarQuantilesSelected = "q50",
   CohortLags = list("day" = c(1L,2L,7L,35L,42L),
                     "week" = c(5L,6L)),
   # NOTE(review): a "month" entry is supplied here although
   # CohortTimeGroups above lists only "days" and "weeks" -- confirm
   # whether it is used.
   CohortMovingAverages = list("day" = c(7L,14L,35L,42L),
                               "week" = c(5L,6L),
                               "month" = c(1L,2L)),
   CohortStandardDeviations = NULL,
   CohortSkews = NULL,
   CohortKurts = NULL,
   CohortQuantiles = NULL,
   CohortQuantilesSelected = "q50",

   # Grid Tuning Arguments----
   PassInGrid = NULL,
   # GridTune = FALSE: no grid tuning procedure is run in this example
   GridTune = FALSE,
   BaselineComparison = "default",
   MaxModelsInGrid = 25L,
   MaxRunMinutes = 180L,
   MaxRunsWithoutNewWinner = 10L,
   # Maximum number of trees (the Usage default is 3000L)
   Trees = 1000L,
   Depth = seq(4L,8L,1L),
   LearningRate = seq(0.01,0.10,0.01),
   L2_Leaf_Reg = seq(1.0,10.0,1.0),
   RSM = c(0.80,0.85,0.90,0.95,1.0),
   BootStrapType = c("Bayesian","Bernoulli","Poisson","MVS","No"),
   GrowPolicy = c("SymmetricTree","Depthwise","Lossguide"))
# }

Run the code above in your browser using DataLab