## create some data
N <- 1000
X1 <- runif(N)
X2 <- runif(N)
X3 <- factor(sample(letters[1:4],N,replace=TRUE))
mu <- c(-1,0,1,2)[as.numeric(X3)]
p <- 1/(1+exp(-(sin(3*X1) - 4*X2 + mu)))
Y <- rbinom(N,1,p)
# random weights if you want to experiment with them
w <- rexp(N)
w <- N*w/sum(w)
data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3)
# \donttest{
# takes longer, but num_trees=3000 preferable
train_params <-
training_params(num_trees = 3000,
shrinkage = 0.001,
bag_fraction = 0.5,
num_train = N/2,
id=seq_len(nrow(data)),
min_num_obs_in_node = 10,
interaction_depth = 3,
num_features = 3)
# }
# for the example to run quickly, num_trees=100
train_params <-
training_params(num_trees = 100,
shrinkage = 0.001,
bag_fraction = 0.5,
num_train = N/2,
id=seq_len(nrow(data)),
min_num_obs_in_node = 10,
interaction_depth = 3,
num_features = 3)
# fit initial model
gbm1 <- gbmt(Y~X1+X2+X3, # formula
data=data, # dataset
weights=w,
var_monotone=c(0,0,0), # -1: monotone decrease,
# +1: monotone increase,
# 0: no monotone restrictions
distribution=gbm_dist("Bernoulli"),
train_params = train_params,
cv_folds=5, # do 5-fold cross-validation
is_verbose = FALSE) # don't print progress
# plot the performance
# returns out-of-bag estimated best number of trees
best.iter.oob <- gbmt_performance(gbm1,method="OOB")
plot(best.iter.oob)
print(best.iter.oob)
# returns 5-fold cv estimate of best number of trees
best.iter.cv <- gbmt_performance(gbm1,method="cv")
plot(best.iter.cv)
print(best.iter.cv)
# returns test set estimate of best number of trees
best.iter.test <- gbmt_performance(gbm1,method="test")
plot(best.iter.cv)
print(best.iter.test)
best.iter <- best.iter.test
# plot variable influence
summary(gbm1,num_trees=1) # based on first tree
summary(gbm1,num_trees=best.iter) # based on estimated best number of trees
# create marginal plots
# plot variable X1,X2,X3 after "best" iterations
oldpar <- par(no.readonly = TRUE)
par(mfrow=c(1,3))
plot(gbm1,1,best.iter)
plot(gbm1,2,best.iter)
plot(gbm1,3,best.iter)
par(mfrow=c(1,1))
plot(gbm1,1:2,best.iter) # contour plot vars 1 & 2 after "best" num iterations
plot(gbm1,2:3,best.iter) # lattice plot vars 2 & 3 after "best" num iterations
# 3-way plot
plot(gbm1,1:3,best.iter)
# print the first and last trees
print(pretty_gbm_tree(gbm1,1))
print(pretty_gbm_tree(gbm1, gbm1$params$num_trees))
par(oldpar) # reset graphics options to previous settings
Run the code above in your browser using DataLab