AutoCARMA_QA: AutoCARMA_QA

Description

AutoCARMA_QA

Usage

AutoCARMA_QA(
  ModelName = "catboost",
  FeatureGridTune = FALSE,
  MaxMem_ = "28G",
  NThreads_ = max(1, parallel::detectCores() - 2),
  TreeMethod__ = "hist",
  TestRows = "ALL",
  DataPath = "C:/Users/Bizon/Documents/GitHub/QA_DataSets",
  dataForecastX = "CARMA-WALMART-2GroupVars_FC.csv",
  dataX = "OneGroup-Eval-Walmart.csv",
  XREGSX = "CARMA-WALMART-2GroupVars-XREGS_2Var.csv",
  TargetColumnName_ = "Weekly_Sales",
  DateColumnName_ = "Date",
  HierarchGroups_ = c("Store", "Dept"),
  GroupVariables_ = c("Store", "Dept"),
  TimeUnit_ = "week",
  TimeGroups_ = c("week", "month", "quarter"),
  ZeroPadSeries_ = NULL,
  DataTruncate_ = FALSE,
  SplitRatios_ = c(1 - 3/143, 3/143),
  PartitionType_ = "timeseries",
  TrainOnFull_ = FALSE,
  FC_Periods_ = 4,
  EvalMetric_ = "RMSE",
  GridTune_ = FALSE,
  GridEvalMetric_ = "mae",
  ModelCount_ = 5,
  TaskType_ = "GPU",
  Timer_ = TRUE,
  TargetTransformation_ = TRUE,
  Difference_ = TRUE,
  CalendarVariables_ = TRUE,
  HolidayVariable_ = TRUE,
  HolidayLags_ = 1,
  HolidayMovingAverages_ = 1:2,
  Lags_ = c(1:5),
  MA_Periods_ = c(1:5),
  SD_Periods_ = c(2:5),
  Skew_Periods_ = c(3:5),
  Kurt_Periods_ = c(4:5),
  Quantile_Periods_ = c(3:5),
  Quantiles_Selected_ = c("q5", "q95"),
  FourierTerms_ = 4,
  TimeTrendVariable_ = TRUE,
  NTrees_ = 150,
  DebugMode_ = TRUE,
  OptionsWarn = 1
)

Arguments

ModelName

Choose from 'catboost', 'h2odrf', 'h2ogbm', 'h2oglm', 'h2oautoml', 'xgboost'

FeatureGridTune

Set to TRUE to run only in evaluation mode, as opposed to TrainOnFull mode, which does not return model performance measures

MaxMem_

= "28G"

NThreads_

= max(1, parallel::detectCores() - 2)

TreeMethod__

= "hist" or "gpu_hist" for xgboost carma

TestRows

= "ALL" to run all tests (see example for all tests), or a numeric vector with the row numbers from the test list (see example)

DataPath

In quotes, provide the file path to where your data is stored

dataForecastX

= "RawDataXREG.csv" Use quotes. # Be aware that grouped data and using XREGS_ requires that your joining group variables have the same name. MUST SUPPLY VALUE

dataX

= "RawDataXREG.csv" Use quotes. # Be aware that grouped data and using XREGS_ requires that your joining group variables have the same name. MUST SUPPLY VALUE

XREGSX

= "XREG.csv" Use quotes. # data.table with ONLY 3 COLUMN TYPES: 1: - GroupVariables_ and DateColumnName_ join-by variables with matching join column names and data types compared to data_ and; 2 - features - needs to exist for all historical periods matching data_ along with a sufficient amount of data to cover the forecast period as defined by FC_Periods_. OR Supply NULL to arg.

TargetColumnName_

= "Weekly_Sales" # WalmartData target column name.

DateColumnName_

= "Date" # Name of data_ date column name.

HierarchGroups_

= c("Store","Dept") # NULL otherwise

GroupVariables_

= c("Store","Dept") #

TimeUnit_

= "week" # Choices include "1min", "5min", "10min", "15min", "30min", "hour", "day", "week", "month", "quarter", "year"

TimeGroups_

= c("week","month","quarter") # These tell GDL to build GDL features along the time aggregation dimension

ZeroPadSeries_

= c('NULL', 'all', 'inner') ZeroPadSeries: choose "all", "inner", or NULL. 'all' grows missing dates by group to the largest of all groups' size. 'inner' fills in series by using the group level's own max and min values (versus filling all group levels to the max value of the group level with the widest time gap)

DataTruncate_

= FALSE # TRUE will truncate all rows where GDL columns produced a -1 (remove all rows where ID < max(rolling stats)). FALSE otherwise.

SplitRatios_

= c(1 - 3 / 143, 3 / 143) # If you have GroupVariables_ then base it on the number of records in a group, like the default

PartitionType_

= "timeseries" # always time series for this function. Place holder for other time series options down the road.

TrainOnFull_

= FALSE # Set to TRUE to put in Forecast mode. FALSE to put in Evaluation mode. Forecast mode generates forecasts based on a model built using all of data_, and no evaluation metrics are collected when set to TRUE. Evaluation mode will build a forecast for your validation periods, as specified in SplitRatios_, and collect the holdout metrics and other evaluation objects, but will not produce a future forecast beyond the max date of data_.

FC_Periods_

= 4 # Self explanatory

EvalMetric_

= "RMSE" # "RMSE" only with catboost 17.5

GridTune_

= FALSE # NEEDS TO BE UPDATED ONCE BANDIT GRID TUNING WORKS.

GridEvalMetric_

= "mae" # 'poisson', 'mae', 'mape', 'mse', 'msle', 'kl', 'cs', 'r2'. If metric computation fails then no output is generated in final metric evaluation data.table

ModelCount_

= 5 # NEEDS TO BE UPDATED ONCE BANDIT GRID TUNING WORKS.

TaskType_

= "GPU" # Set to "CPU" to train on CPU versus GPU. Must supply a value.

Timer_

= TRUE # Print out the forecast step the function is currently working on. If it errors on the first run scoring the model then it is likely a very different error than if it has printed "Forecasting 1:"

TargetTransformation_

= TRUE # Set to TRUE to have every available numeric transformation compete for best normalization fit to normal distribution

Difference_

= TRUE # The I in ARIMA. Works for single series and grouped series a.k.a. panel data.

CalendarVariables_

= TRUE # This TURNS ON procedure to create numeric calendar variables that your TimeUnit_ directs. FALSE otherwise.

HolidayVariable_

= TRUE # This TURNS ON procedure to create a numeric holiday count variable. FALSE otherwise.

HolidayLags_

= 1 # Supply a numeric vector of lag periods

HolidayMovingAverages_

= c(1:2) # Supply a numeric vector of Moving Average periods

Lags_

= c(1:5) # Numeric vector of lag periods

MA_Periods_

= c(1:5) # Numeric vector of moving average periods

SD_Periods_

= c(2:5) # Numeric vector of rolling standard deviation periods

Skew_Periods_

= c(3:5) # Numeric vector of rolling skewness periods

Kurt_Periods_

= c(4:5) # Numeric vector of rolling kurtosis periods

Quantile_Periods_

= c(3:5) # Numeric vector of rolling quantile periods

Quantiles_Selected_

= c("q5","q95") # Select the quantiles you want calculated. "q5", "q10", ..., "q95".

FourierTerms_

= 4 # (TECHNICALLY FOURIER PAIRS) Hierarchy grouping (full group variable interaction set) is run by default (MAKE INTO OPTION). Uses parallelization to loop through the unique set of all GroupVariables levels and computes Fourier terms as if each group level were a single series; this is done for all groups and is parallelized.

TimeTrendVariable_

= TRUE # Set to TRUE to have a sequence created from 1 to nrow by group or single series

NTrees_

= 150 # Number of trees to have trained. Can be 10000 or more depending on group level size.

DebugMode_

= TRUE # When TRUE it will print every comment section header line. When it crashes, you can get a printout of the last N steps that were run, depending on the print max limit.

OptionsWarn

Set to 1 to print warnings immediately to screen versus after a function finishes; 2 to kill processes if a warning occurs. See options(warn = )