# NOT RUN {
# Example script: predicting with xgboost models (binary classification,
# multiclass classification and a random-forest-like ensemble).
# Requires the 'xgboost' package to be attached, e.g. library(xgboost).
# NOTE(review): `ntreelimit` may be deprecated in newer xgboost releases in
# favour of `iterationrange` -- confirm against the installed version.

## binary classification:

data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")
train <- agaricus.train
test <- agaricus.test

bst <- xgboost(
  data = train$data, label = train$label, max_depth = 2, eta = 0.5,
  nthread = 2, nrounds = 5, objective = "binary:logistic"
)
# use all trees by default
pred <- predict(bst, test$data)
# use only the 1st tree
pred1 <- predict(bst, test$data, ntreelimit = 1)

# Predicting tree leafs:
# the result is an nsamples X ntrees matrix
pred_leaf <- predict(bst, test$data, predleaf = TRUE)
str(pred_leaf)

# Predicting feature contributions to predictions:
# the result is an nsamples X (nfeatures + 1) matrix
pred_contr <- predict(bst, test$data, predcontrib = TRUE)
str(pred_contr)
# verify that contributions' sums are equal to log-odds of predictions
# (up to float precision):
summary(rowSums(pred_contr) - qlogis(pred))
# for the 1st record, let's inspect its features that had non-zero
# contribution to prediction:
contr1 <- pred_contr[1, ]
contr1 <- contr1[-length(contr1)]    # drop BIAS
contr1 <- contr1[contr1 != 0]        # drop non-contributing features
contr1 <- contr1[order(abs(contr1))] # order by contribution magnitude
# widen the left margin so the feature labels fit, then restore it afterwards
old_mar <- par("mar")
par(mar = old_mar + c(0, 7, 0, 0))
barplot(contr1, horiz = TRUE, las = 2,
        xlab = "contribution to prediction in log-odds")
par(mar = old_mar)

## multiclass classification in iris dataset:

lb <- as.numeric(iris$Species) - 1
num_class <- 3
set.seed(11)
bst <- xgboost(
  data = as.matrix(iris[, -5]), label = lb,
  max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
  objective = "multi:softprob", num_class = num_class
)
# predict for softmax returns num_class probability numbers per case:
pred <- predict(bst, as.matrix(iris[, -5]))
str(pred)
# reshape it to a num_class-columns matrix
pred <- matrix(pred, ncol = num_class, byrow = TRUE)
# convert the probabilities to softmax labels
pred_labels <- max.col(pred) - 1
# the following should result in the same error as seen in the last iteration
sum(pred_labels != lb) / length(lb)

# compare that to the predictions from softmax:
set.seed(11)
bst <- xgboost(
  data = as.matrix(iris[, -5]), label = lb,
  max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
  objective = "multi:softmax", num_class = num_class
)
pred <- predict(bst, as.matrix(iris[, -5]))
str(pred)
all.equal(pred, pred_labels)
# prediction from using only 5 iterations should result
# in the same error as seen in iteration 5:
pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit = 5)
sum(pred5 != lb) / length(lb)

## random forest-like model of 25 trees for binary classification:

# single source of truth for the forest size, used for training and evaluation
num_trees <- 25L
set.seed(11)
bst <- xgboost(
  data = train$data, label = train$label, max_depth = 5,
  nthread = 2, nrounds = 1, objective = "binary:logistic",
  num_parallel_tree = num_trees, subsample = 0.6, colsample_bytree = 0.1
)
# Inspect the prediction error vs number of trees:
lb <- test$label
dtest <- xgb.DMatrix(test$data, label = lb)
# vapply() guarantees one numeric error rate per tree count
err <- vapply(seq_len(num_trees), function(n) {
  pred_n <- predict(bst, dtest, ntreelimit = n)
  sum((pred_n > 0.5) != lb) / length(lb)
}, numeric(1))
plot(err, type = "l", ylim = c(0, 0.1), xlab = "#trees")
# }
# Run the code above in your browser using DataCamp Workspace