# NOT RUN {
# Set up your output file path for saving results as a .csv
Path <- 'C:/YourPathHere'
# Run on GPU or CPU (some options in the grid tuning force usage of CPU for some runs)
TaskType = 'GPU'
# Define number of CPU threads to allow data.table to utilize
data.table::setDTthreads(percent = max(1L, parallel::detectCores()-2L))
# Load data
data <- data <- data.table::fread('https://www.dropbox.com/s/2str3ek4f4cheqi/walmart_train.csv?dl=1')
# Ensure series have no missing dates (also remove series with more than 25% missing values)
data <- RemixAutoML::TimeSeriesFill(
data,
DateColumnName = 'Date',
GroupVariables = c('Store','Dept'),
TimeUnit = 'weeks',
FillType = 'maxmax',
MaxMissingPercent = 0.25,
SimpleImpute = TRUE)
# Set negative numbers to 0
data <- data[, Weekly_Sales := data.table::fifelse(Weekly_Sales < 0, 0, Weekly_Sales)]
# Remove IsHoliday column
data[, IsHoliday := NULL]
# Create xregs (this is the include the categorical variables instead of utilizing only the interaction of them)
xregs <- data[, .SD, .SDcols = c('Date', 'Store', 'Dept')]
# Change data types
data[, ':=' (Store = as.character(Store), Dept = as.character(Dept))]
xregs[, ':=' (Store = as.character(Store), Dept = as.character(Dept))]
# Subset data so we have an out of time sample
data1 <- data.table::copy(data[, ID := 1L:.N, by = c('Store','Dept')][ID <= 125L][, ID := NULL])
data[, ID := NULL]
# Define values for SplitRatios and FCWindow Args
N1 <- data1[, .N, by = c('Store','Dept')][1L, N]
N2 <- xregs[, .N, by = c('Store','Dept')][1L, N]
# Setup Grid Tuning & Feature Tuning data.table using a cross join of vectors
Tuning <- data.table::CJ(
TimeWeights = c('None',0.999),
MaxTimeGroups = c('weeks','months'),
TargetTransformation = c('TRUE','FALSE'),
Difference = c('TRUE','FALSE'),
HoldoutTrain = c(6,18),
Langevin = c('TRUE','FALSE'),
NTrees = c(2500,5000),
Depth = c(6,9),
RandomStrength = c(0.75,1),
L2_Leaf_Reg = c(3.0,4.0),
RSM = c(0.75,'NULL'),
GrowPolicy = c('SymmetricTree','Lossguide','Depthwise'),
BootStrapType = c('Bayesian','MVS','No'))
# Remove options that are not compatible with GPU (skip over this otherwise)
Tuning <- Tuning[Langevin == 'TRUE' | (Langevin == 'FALSE' & RSM == 'NULL' & BootStrapType %in% c('Bayesian','No'))]
# Randomize order of Tuning data.table
Tuning <- Tuning[order(runif(.N))]
# Load grid results and remove rows that have already been tested
if(file.exists(file.path(Path, 'Walmart_CARMA_Metrics.csv'))) {
Metrics <- data.table::fread(file.path(Path, 'Walmart_CARMA_Metrics.csv'))
temp <- data.table::rbindlist(list(Metrics,Tuning), fill = TRUE)
temp <- unique(temp, by = c(4:(ncol(temp)-1)))
Tuning <- temp[is.na(RunTime)][, .SD, .SDcols = names(Tuning)]
rm(Metrics,temp)
}
# Define the total number of runs
TotalRuns <- Tuning[,.N]
# Kick off feature + grid tuning
for(Run in seq_len(TotalRuns)) {
# Print run number
for(zz in seq_len(100)) print(Run)
# Use fresh data for each run
xregs_new <- data.table::copy(xregs)
data_new <- data.table::copy(data1)
# Timer start
StartTime <- Sys.time()
# Run carma system
CatBoostResults <- RemixAutoML::AutoCatBoostCARMA(
# data args
data = data_new,
TimeWeights = if(Tuning[Run, TimeWeights] == 'None') NULL else as.numeric(Tuning[Run, TimeWeights]),
TargetColumnName = 'Weekly_Sales',
DateColumnName = 'Date',
HierarchGroups = NULL,
GroupVariables = c('Store','Dept'),
TimeUnit = 'weeks',
TimeGroups = if(Tuning[Run, MaxTimeGroups] == 'weeks') 'weeks' else if(Tuning[Run, MaxTimeGroups] == 'months') c('weeks','months') else c('weeks','months','quarters'),
# Production args
TrainOnFull = TRUE,
SplitRatios = c(1 - Tuning[Run, HoldoutTrain] / N2, Tuning[Run, HoldoutTrain] / N2),
PartitionType = 'random',
FC_Periods = N2-N1,
TaskType = TaskType,
NumGPU = 1,
Timer = TRUE,
DebugMode = TRUE,
# Target variable transformations
TargetTransformation = as.logical(Tuning[Run, TargetTransformation]),
Methods = c('YeoJohnson', 'BoxCox', 'Asinh', 'Log', 'LogPlus1', 'Sqrt', 'Asin', 'Logit'),
Difference = as.logical(Tuning[Run, Difference]),
NonNegativePred = TRUE,
RoundPreds = FALSE,
# Calendar-related features
CalendarVariables = c('week','wom','month','quarter'),
HolidayVariable = c('USPublicHolidays'),
HolidayLookback = NULL,
HolidayLags = c(1,2,3),
HolidayMovingAverages = c(2,3),
# Lags, moving averages, and other rolling stats
Lags = if(Tuning[Run, MaxTimeGroups] == 'weeks') c(1,2,3,4,5,8,9,12,13,51,52,53) else if(Tuning[Run, MaxTimeGroups] == 'months') list('weeks' = c(1,2,3,4,5,8,9,12,13,51,52,53), 'months' = c(1,2,6,12)) else list('weeks' = c(1,2,3,4,5,8,9,12,13,51,52,53), 'months' = c(1,2,6,12), 'quarters' = c(1,2,3,4)),
MA_Periods = if(Tuning[Run, MaxTimeGroups] == 'weeks') c(2,3,4,5,8,9,12,13,51,52,53) else if(Tuning[Run, MaxTimeGroups] == 'months') list('weeks' = c(2,3,4,5,8,9,12,13,51,52,53), 'months' = c(2,6,12)) else list('weeks' = c(2,3,4,5,8,9,12,13,51,52,53), 'months' = c(2,6,12), 'quarters' = c(2,3,4)),
SD_Periods = NULL,
Skew_Periods = NULL,
Kurt_Periods = NULL,
Quantile_Periods = NULL,
Quantiles_Selected = NULL,
# Bonus features
AnomalyDetection = NULL,
XREGS = xregs_new,
FourierTerms = 0,
TimeTrendVariable = TRUE,
ZeroPadSeries = NULL,
DataTruncate = FALSE,
# ML grid tuning args
GridTune = FALSE,
PassInGrid = NULL,
ModelCount = 5,
MaxRunsWithoutNewWinner = 50,
MaxRunMinutes = 60*60,
# ML evaluation output
PDFOutputPath = NULL,
SaveDataPath = NULL,
NumOfParDepPlots = 0L,
# ML loss functions
EvalMetric = 'RMSE',
EvalMetricValue = 1,
LossFunction = 'RMSE',
LossFunctionValue = 1,
# ML tuning args
NTrees = Tuning[Run, NTrees],
Depth = Tuning[Run, Depth],
L2_Leaf_Reg = Tuning[Run, L2_Leaf_Reg],
LearningRate = 0.03,
Langevin = as.logical(Tuning[Run, Langevin]),
DiffusionTemperature = 10000,
RandomStrength = Tuning[Run, RandomStrength],
BorderCount = 254,
RSM = if(Tuning[Run, RSM] == 'NULL') NULL else as.numeric(Tuning[Run, RSM]),
GrowPolicy = Tuning[Run, GrowPolicy],
BootStrapType = Tuning[Run, BootStrapType],
ModelSizeReg = 0.5,
FeatureBorderType = 'GreedyLogSum',
SamplingUnit = 'Group',
SubSample = NULL,
ScoreFunction = 'Cosine',
MinDataInLeaf = 1)
# Timer End
EndTime <- Sys.time()
# Prepare data for evaluation
Results <- CatBoostResults$Forecast
data.table::setnames(Results, 'Weekly_Sales', 'bla')
Results <- merge(Results, data, by = c('Store','Dept','Date'), all = FALSE)
Results <- Results[is.na(bla)][, bla := NULL]
# Create totals and subtotals
Results <- data.table::groupingsets(
x = Results,
j = list(Predictions = sum(Predictions), Weekly_Sales = sum(Weekly_Sales)),
by = c('Date', 'Store', 'Dept'),
sets = list(c('Date', 'Store', 'Dept'), c('Store', 'Dept'), 'Store', 'Dept', 'Date'))
# Fill NAs with 'Total' for totals and subtotals
for(cols in c('Store','Dept')) Results[, eval(cols) := data.table::fifelse(is.na(get(cols)), 'Total', get(cols))]
# Add error measures
Results[, Weekly_MAE := abs(Weekly_Sales - Predictions)]
Results[, Weekly_MAPE := Weekly_MAE / Weekly_Sales]
# Weekly results
Weekly_MAPE <- Results[, list(Weekly_MAPE = mean(Weekly_MAPE)), by = list(Store,Dept)]
# Monthly results
temp <- data.table::copy(Results)
temp <- temp[, Date := lubridate::floor_date(Date, unit = 'months')]
temp <- temp[, lapply(.SD, sum), by = c('Date','Store','Dept'), .SDcols = c('Predictions', 'Weekly_Sales')]
temp[, Monthly_MAE := abs(Weekly_Sales - Predictions)]
temp[, Monthly_MAPE := Monthly_MAE / Weekly_Sales]
Monthly_MAPE <- temp[, list(Monthly_MAPE = mean(Monthly_MAPE)), by = list(Store,Dept)]
# Collect metrics for Total (feel free to switch to something else or no filter at all)
Metrics <- data.table::data.table(
RunNumber = Run,
Total_Weekly_MAPE = Weekly_MAPE[Store == 'Total' & Dept == 'Total', Weekly_MAPE],
Total_Monthly_MAPE = Monthly_MAPE[Store == 'Total' & Dept == 'Total', Monthly_MAPE],
Tuning[Run],
RunTime = EndTime - StartTime)
# Append to file (not overwrite)
data.table::fwrite(Metrics, file = file.path(Path, 'Walmart_CARMA_Metrics.csv'), append = TRUE)
# Remove objects (clear space before new runs)
rm(CatBoostResults, Results, temp, Weekly_MAE, Weekly_MAPE, Monthly_MAE, Monthly_MAPE)
# Garbage collection because of GPU
gc()
}
# }
Run the code above in your browser using DataLab