# NOT RUN {
# Load data
data <- data.table::fread(
  "https://www.dropbox.com/s/2str3ek4f4cheqi/walmart_train.csv?dl=1")

# Ensure series have no missing dates (also remove series with more than 25%
# missing values)
data <- RemixAutoML::TimeSeriesFill(
  data,
  DateColumnName = "Date",
  GroupVariables = c("Store", "Dept"),
  TimeUnit = "weeks",
  FillType = "maxmax",
  MaxMissingPercent = 0.25,
  SimpleImpute = TRUE)

# Set negative numbers to 0 (`:=` updates `data` by reference, so the result
# does not need to be assigned back)
data[, Weekly_Sales := data.table::fifelse(Weekly_Sales < 0, 0, Weekly_Sales)]

# Remove IsHoliday column
data[, IsHoliday := NULL]

# Change data types: store the grouping columns as character
data[, `:=`(Store = as.character(Store), Dept = as.character(Dept))]

# Create xregs: a separate table holding the categorical variables on their
# own, so they can be included individually instead of only through their
# interaction. Taken after the type change so both tables share column types.
# NOTE(review): xregs is not used by the forecast call below while it passes
# XREGS = NULL - confirm whether it was meant to be supplied there.
xregs <- data[, .SD, .SDcols = c("Date", "Store", "Dept")]
# Build forecast: fit the CARMA model with H2O's distributed random forest
# ("drf") and score a 4-period horizon
Results <- RemixAutoML::AutoH2OCARMA(

  # Data Artifacts
  AlgoType = "drf",
  ExcludeAlgos = NULL,
  data = data,
  TargetColumnName = "Weekly_Sales",
  DateColumnName = "Date",
  HierarchGroups = NULL,
  GroupVariables = "Dept",
  TimeUnit = "week",
  TimeGroups = c("weeks", "months"),

  # Data Wrangling Features
  ZeroPadSeries = NULL,
  DataTruncate = FALSE,
  SplitRatios = c(1 - 10 / 138, 10 / 138),
  PartitionType = "random",

  # Productionize
  FC_Periods = 4L,
  TrainOnFull = FALSE,
  EvalMetric = "RMSE",
  GridTune = FALSE,
  ModelCount = 5,
  MaxMem = "28G",
  NThreads = parallel::detectCores(),
  Timer = TRUE,

  # Target Transformations
  TargetTransformation = FALSE,
  Methods = c(
    "BoxCox", "Asinh", "Asin", "Log",
    "LogPlus1", "Sqrt", "Logit", "YeoJohnson"),
  Difference = FALSE,
  NonNegativePred = FALSE,
  RoundPreds = FALSE,

  # Features
  AnomalyDetection = NULL,
  HolidayLags = 1:7,
  HolidayMovingAverages = 2:7,
  Lags = list(weeks = 1:4, months = 1:3),
  MA_Periods = list(weeks = 2:8, months = 6:12),
  SD_Periods = NULL,
  Skew_Periods = NULL,
  Kurt_Periods = NULL,
  Quantile_Periods = NULL,
  Quantiles_Selected = NULL,
  XREGS = NULL,
  FourierTerms = 2L,
  CalendarVariables = c("week", "wom", "month", "quarter", "year"),
  HolidayVariable = c(
    "USPublicHolidays", "EasterGroup",
    "ChristmasGroup", "OtherEcclesticalFeasts"),
  TimeTrendVariable = TRUE,
  NTrees = 1000L,
  DebugMode = TRUE)
# Replace the MSE value with its square root. The table is bound by reference
# and `:=` updates it in place, so the copy stored inside Results changes too.
UpdateMetrics <- Results$ModelInformation$EvaluationMetrics
UpdateMetrics[Metric == "MSE", MetricValue := sqrt(MetricValue)]
print(UpdateMetrics)

# Get final number of trees actually used
Results$Model@model$model_summary$number_of_internal_trees

# Inspect performance by group, ordered by each metric in turn
GroupMetrics <- Results$ModelInformation$EvaluationMetricsByGroup
GroupMetrics[order(-R2_Metric)]
GroupMetrics[order(MAE_Metric)]
GroupMetrics[order(MSE_Metric)]
GroupMetrics[order(MAPE_Metric)]
# }
# Run the code above in your browser using DataLab