# NOT RUN {
library(h2o)
h2o.init()
# Import a sample binary outcome train/test set
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
# Identify predictors and response
y <- "response"
x <- setdiff(names(train), y)
# For binary classification, response should be a factor
train[, y] <- as.factor(train[, y])
test[, y] <- as.factor(test[, y])
# Number of CV folds
nfolds <- 5
# Train & Cross-validate a GBM
my_gbm <- h2o.gbm(x = x,
y = y,
training_frame = train,
distribution = "bernoulli",
ntrees = 10,
max_depth = 3,
min_rows = 2,
learn_rate = 0.2,
nfolds = nfolds,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
seed = 1)
# Train & Cross-validate a RF
my_rf <- h2o.randomForest(x = x,
y = y,
training_frame = train,
ntrees = 50,
nfolds = nfolds,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
seed = 1)
# Train a stacked ensemble using the GBM and RF above
ensemble <- h2o.stackedEnsemble(x = x,
y = y,
training_frame = train,
model_id = "my_ensemble_binomial",
base_models = list(my_gbm, my_rf))
# }
Run the code above in your browser using DataLab