XGBoostDeployment: Deploy a production-ready predictive XGBoost model

Description

This step allows one to

Automatically load a saved model from XGBoostDevelopment
Run the model against test data to generate predictions
Push these predictions to SQL Server or CSV

Usage

XGBoostDeployment(type, df, grainCol,
predictedCol, impute, debug, cores, modelName)

Arguments

type

The type of model (must be multiclass)

df

Dataframe whose columns are used for new predictions. Its structure should match the development dataframe as closely as possible: the number of columns, their names and types, the grain column, and the predicted column must be the same.

grainCol

The dataframe's column that has IDs pertaining to the grain

predictedCol

Column that you want to predict.

impute

For training df, set all-column imputation to T or F. If T, this uses values calculated in development. F leads to removal of rows containing NULLs and is not recommended.

debug

Provides the user extended output to the console, in order to monitor the calculations throughout. Use T or F.

cores

Number of cores you'd like to use. Defaults to 2.

modelName

Optional string specifying the model name. If a model name was used in the development step, you must supply the same name here so that the matching saved model is loaded.

Format

An object of class R6ClassGenerator of length 24.

Methods

The above describes params for initializing a new XGBoostDeployment class with $new(). Individual methods are documented below.

<code>$new()</code>

Initializes a new XGBoost deployment class using the parameters saved in p, documented above. This method loads, cleans, and prepares data for generating predictions. Usage: $new(p)

<code>$deploy()</code>

Generate new predictions and prepare the output dataframe. Usage: $deploy()

<code>$getPredictions()</code>

Return the grain and predictions for each class. Usage: $getPredictions()

<code>$getOutDf()</code>

Returns a dataframe containing the grain column, the top 3 probabilities for each row, and the classes associated with those probabilities. Usage: $getOutDf()

Examples

Run this code

# NOT RUN {
#### Example: train on a CSV dataset, then deploy the saved model ####
ptm <- proc.time()
library(healthcareai)

# 1. Load data. Categorical columns should be characters.
# system.file() only locates the sample file shipped with the package;
# in your own work, drop it and pass 'path/file' to read.csv() directly.
csvPath <- system.file(
  "extdata",
  "dermatology_multiclass_data.csv",
  package = "healthcareai"
)
df <- read.csv(
  file = csvPath,
  header = TRUE,
  stringsAsFactors = FALSE,
  na.strings = c("NULL", "NA", "", "?")
)

str(df) # check the types of columns
dfDeploy <- df[347:366, ] # reserve 20 rows for deploy step.

# 2. Develop and save model (saving is automatic)
set.seed(42)
p <- SupervisedModelDevelopmentParams$new()
p$df <- df
p$type <- "multiclass"
p$impute <- TRUE
p$grainCol <- "PatientID"
p$predictedCol <- "target"
p$debug <- FALSE
p$cores <- 1
# xgb_params must be a list containing every entry shown below.
# Feel free to tune the values, but leave objective and eval_metric
# exactly as they are.
p$xgb_params <- list(
  objective = "multi:softprob",
  eval_metric = "mlogloss",
  max_depth = 6, # max depth of each learner
  eta = 0.1, # learning rate
  silent = 0, # 0 prints running messages, 1 silences them
  nthread = 2 # number of processors to use
)

# Train the model
xgbDev <- XGBoostDevelopment$new(p)
xgbDev$run()

## 3. Load saved model (automatic) and use DEPLOY to generate predictions.
p2 <- SupervisedModelDeploymentParams$new()
p2$type <- "multiclass"
p2$df <- dfDeploy
p2$grainCol <- "PatientID"
p2$predictedCol <- "target"
p2$impute <- TRUE
p2$debug <- FALSE

# Deploy the model to make new predictions
xgbDeploy <- XGBoostDeployment$new(p2)
xgbDeploy$deploy()

# Fetch the output dataframe, ready for CSV or SQL
outDf <- xgbDeploy$getOutDf()
head(outDf)

# To write the predictions to SQLite instead:
# sqliteFile <- system.file("extdata",
#                           "unit-test.sqlite",
#                           package = "healthcareai")
# writeData(SQLiteFileName = sqliteFile,
#           df = outDf,
#           tableName = "dermatologyDeployMulticlassBASE")

# To write to CSV (or JSON, MySQL, etc), use plain R syntax:
# write.csv(outDf, 'path/predictionsfile.csv')

# To get the raw per-class predictions:
# rawPredictions <- xgbDeploy$getPredictions()

# If you have known labels, check your prediction accuracy like this:
# caret::confusionMatrix(true_label,
#                        predicted_label,
#                        mode = "everything")

elapsed <- proc.time() - ptm
print(elapsed)

# }
# NOT RUN {
#### Example pulling from CSV and writing to SQL server ####
# This example is self-contained: it attaches the package and starts its own
# timer, so it can be run without first running any other example.
ptm <- proc.time()
library(healthcareai)

# This example requires you to first create a table in SQL Server
# If you prefer to not use SAMD, execute this in SSMS to create output table:
# CREATE TABLE [dbo].[dermatologyDeployClassificationBASE](
# [BindingID] [int] NULL,[BindingNM] [varchar](255) NULL,
# [LastLoadDTS] [datetime2](7) NULL,
# [PatientID] [decimal](38, 0) NULL,
# [PredictedProb1] [decimal](38, 2) NULL,
# [PredictedClass1] [varchar](255) NULL,
# [PredictedProb2] [decimal](38, 2) NULL,
# [PredictedClass2] [varchar](255) NULL,
# [PredictedProb3] [decimal](38, 2) NULL,
# [PredictedClass3] [varchar](255) NULL)

# 1. Load data. Categorical columns should be characters.
# system.file() only locates the sample file shipped with the package.
csvfile <- system.file("extdata",
                       "dermatology_multiclass_data.csv",
                       package = "healthcareai")

# Replace csvfile with 'path/file'
df <- read.csv(file = csvfile,
               header = TRUE,
               stringsAsFactors = FALSE,
               na.strings = c("NULL", "NA", "", "?"))

str(df) # check the types of columns
dfDeploy <- df[347:366, ] # reserve 20 rows for deploy step.

# 2. Develop and save model (saving is automatic)
set.seed(42)
p <- SupervisedModelDevelopmentParams$new()
p$df <- df
p$type <- "multiclass"
p$impute <- TRUE
p$grainCol <- "PatientID"
p$predictedCol <- "target"
p$debug <- FALSE
p$cores <- 1
# xgb_params must be a list with all of these things in it.
# if you would like to tweak parameters, go for it!
# Leave objective and eval_metric as they are.
p$xgb_params <- list("objective" = "multi:softprob",
                     "eval_metric" = "mlogloss",
                     "max_depth" = 6, # max depth of each learner
                     "eta" = 0.1, # learning rate
                     "silent" = 0, # 0 prints running messages, 1 silences them
                     "nthread" = 2) # number of processors to use

# Run model
boost <- XGBoostDevelopment$new(p)
boost$run()

## 3. Load saved model (automatic) and use DEPLOY to generate predictions.
p2 <- SupervisedModelDeploymentParams$new()
p2$type <- "multiclass"
p2$df <- dfDeploy
p2$grainCol <- "PatientID"
p2$predictedCol <- "target"
p2$impute <- TRUE
p2$debug <- FALSE

# Deploy model to make new predictions
boostD <- XGBoostDeployment$new(p2)
boostD$deploy()

# Get output dataframe for csv or SQL
outDf <- boostD$getOutDf()
head(outDf)

# Save the output to SQL server.
# The table named below must already exist (see the CREATE TABLE statement
# at the top of this example).
connection.string <- "
driver={SQL Server};
server=localhost;
database=SAM;
trusted_connection=true
"
writeData(MSSQLConnectionString = connection.string,
          df = outDf,
          tableName = "dermatologyDeployClassificationBASE")

# Get raw predictions if you want
# rawPredictions <- boostD$getPredictions()

# Report the elapsed time for this example only
print(proc.time() - ptm)
# }

Run the code above in your browser using DataLab