# NOT RUN {
#### Classification example using CSV data ####

## 1. Load the package and the data ----
ptm <- proc.time() # Start a timer; elapsed time is printed at the end
library(healthcareai)

# setwd("C:/Yourscriptlocation/Useforwardslashes") # Uncomment if using csv

# Locate the sample CSV that ships with healthcareai. In your own work you
# can delete this call and pass "path/file" to read.csv() instead.
csvfile <- system.file(
  "extdata",
  "HCRDiabetesClinical.csv",
  package = "healthcareai"
)

# Treat the strings "NULL", "NA" and "" as missing values when reading.
df <- read.csv(
  csvfile,
  header = TRUE,
  na.strings = c("NULL", "NA", "")
)

# Only one ID column (i.e., PatientEncounterID) is needed, so drop PatientID.
df[["PatientID"]] <- NULL

# Copy rows 951-1000 to act as fake production data in step 3.
# df itself is left unchanged and is used in full for training.
dfDeploy <- df[951:1000, ]

## 2. Train and save the model using DEVELOP ----
print("Historical, development data:")
str(df)

set.seed(42) # Fixed seed so repeated runs are comparable

p <- SupervisedModelDevelopmentParams$new()
p$df <- df
p$type <- "classification"
p$impute <- TRUE
p$grainCol <- "PatientEncounterID"
p$predictedCol <- "ThirtyDayReadmitFLG"
p$debug <- FALSE
p$cores <- 1

# Run random forest
RandomForest <- RandomForestDevelopment$new(p)
RandomForest$run()

## 3. Load the saved model and use DEPLOY to generate predictions ----
print("Fake production data:")
str(dfDeploy)

p2 <- SupervisedModelDeploymentParams$new()
p2$type <- "classification"
p2$df <- dfDeploy
p2$grainCol <- "PatientEncounterID"
p2$predictedCol <- "ThirtyDayReadmitFLG"
p2$impute <- TRUE
p2$debug <- FALSE
p2$cores <- 1

dL <- RandomForestDeployment$new(p2)
dL$deploy()

# Pull the output data frame from the deployment object and preview it.
dfOut <- dL$getOutDf()
head(dfOut)

# Write to CSV (or JSON, MySQL, etc.) using plain R syntax
# write.csv(dfOut, "path/predictionsfile.csv")

print(proc.time() - ptm)
# }
# NOT RUN {
#### Classification example using SQL Server data ####
# This example requires an output table to exist in SQL Server first.
# If you prefer not to use SAMD, execute this in SSMS to create that table:
# CREATE TABLE dbo.HCRDeployClassificationBASE(
#   BindingID float, BindingNM varchar(255), LastLoadDTS datetime2,
#   PatientEncounterID int, <--change to match inputID
#   PredictedProbNBR decimal(38, 2),
#   Factor1TXT varchar(255), Factor2TXT varchar(255), Factor3TXT varchar(255)
# )

## 1. Load the package and the data ----
ptm <- proc.time() # Start a timer; elapsed time is printed at the end
library(healthcareai)

# Connection details for a local SQL Server instance (edit for your server).
connection.string <- "
driver={SQL Server};
server=localhost;
database=SAM;
trusted_connection=true
"

# Select the grain column, the predictors and the outcome column.
query <- "
SELECT
[PatientEncounterID] --Only need one ID column for random forest
,[SystolicBPNBR]
,[LDLNBR]
,[A1CNBR]
,[GenderFLG]
,[ThirtyDayReadmitFLG]
FROM [SAM].[dbo].[HCRDiabetesClinical]
"

df <- selectData(connection.string, query)

# Copy rows 951-1000 to act as fake production data in step 3.
# df itself is left unchanged and is used in full for training.
dfDeploy <- df[951:1000, ]

## 2. Train and save the model using DEVELOP ----
print("Historical, development data:")
str(df)

set.seed(42) # Fixed seed so repeated runs are comparable

p <- SupervisedModelDevelopmentParams$new()
p$df <- df
p$type <- "classification"
p$impute <- TRUE
p$grainCol <- "PatientEncounterID"
p$predictedCol <- "ThirtyDayReadmitFLG"
p$debug <- FALSE
p$cores <- 1

# Run random forest
RandomForest <- RandomForestDevelopment$new(p)
RandomForest$run()

## 3. Load the saved model and use DEPLOY to generate predictions ----
print("Fake production data:")
str(dfDeploy)

p2 <- SupervisedModelDeploymentParams$new()
p2$type <- "classification"
p2$df <- dfDeploy
p2$grainCol <- "PatientEncounterID"
p2$predictedCol <- "ThirtyDayReadmitFLG"
p2$impute <- TRUE
p2$debug <- FALSE
p2$cores <- 1

dL <- RandomForestDeployment$new(p2)
dL$deploy()
dfOut <- dL$getOutDf()

# Write the predictions to the SQL Server table described at the top.
writeData(
  df = dfOut,
  tableName = "HCRDeployClassificationBASE",
  MSSQLConnectionString = connection.string
)

print(proc.time() - ptm)
# }
# NOT RUN {
# }
# NOT RUN {
#### Regression example using SQL Server data ####
# This example requires an output table to exist in SQL Server first.
# If you prefer not to use SAMD, execute this in SSMS to create that table:
# CREATE TABLE dbo.HCRDeployRegressionBASE(
#   BindingID float, BindingNM varchar(255), LastLoadDTS datetime2,
#   PatientEncounterID int, <--change to match inputID
#   PredictedValueNBR decimal(38, 2),
#   Factor1TXT varchar(255), Factor2TXT varchar(255), Factor3TXT varchar(255)
# )

## 1. Load the package and the data ----
ptm <- proc.time() # Start a timer; elapsed time is printed at the end
library(healthcareai)

# Connection details for a local SQL Server instance (edit for your server).
connection.string <- "
driver={SQL Server};
server=localhost;
database=SAM;
trusted_connection=true
"

# Select the grain column plus the clinical columns used by the model.
query <- "
SELECT
[PatientEncounterID] --Only need one ID column for random forest
,[SystolicBPNBR]
,[LDLNBR]
,[A1CNBR]
,[GenderFLG]
,[ThirtyDayReadmitFLG]
FROM [SAM].[dbo].[HCRDiabetesClinical]
"

df <- selectData(connection.string, query)

# Copy rows 951-1000 to act as fake production data in step 3.
# df itself is left unchanged and is used in full for training.
dfDeploy <- df[951:1000, ]

## 2. Train and save the model using DEVELOP ----
print("Historical, development data:")
str(df)

set.seed(42) # Fixed seed so repeated runs are comparable

p <- SupervisedModelDevelopmentParams$new()
p$df <- df
p$type <- "regression"
p$impute <- TRUE
p$grainCol <- "PatientEncounterID"
p$predictedCol <- "A1CNBR" # Numeric outcome for the regression model
p$debug <- FALSE
p$cores <- 1

# Run random forest
RandomForest <- RandomForestDevelopment$new(p)
RandomForest$run()

## 3. Load the saved model and use DEPLOY to generate predictions ----
# You won't know the response in production, so remove it from the deploy set.
dfDeploy[["A1CNBR"]] <- NULL

print("Fake production data:")
str(dfDeploy)

p2 <- SupervisedModelDeploymentParams$new()
p2$type <- "regression"
p2$df <- dfDeploy
p2$grainCol <- "PatientEncounterID"
p2$predictedCol <- "A1CNBR"
p2$impute <- TRUE
p2$debug <- FALSE
p2$cores <- 1

dL <- RandomForestDeployment$new(p2)
dL$deploy()
dfOut <- dL$getOutDf()

# Write the predictions to the SQL Server table described at the top.
writeData(
  df = dfOut,
  tableName = "HCRDeployRegressionBASE",
  MSSQLConnectionString = connection.string
)

print(proc.time() - ptm)
# }
# NOT RUN {
#### Classification example pulling from CSV and writing to SQLite ####

## 1. Load the package and the data ----
ptm <- proc.time() # Start a timer; elapsed time is printed at the end
library(healthcareai)

# Locate the sample CSV and SQLite files that ship with healthcareai.
# You can delete these system.file() calls in your own work.
csvfile <- system.file(
  "extdata",
  "HCRDiabetesClinical.csv",
  package = "healthcareai"
)
sqliteFile <- system.file(
  "extdata",
  "unit-test.sqlite",
  package = "healthcareai"
)

# Read in the CSV; replace csvfile with "path/file".
# The strings "NULL", "NA" and "" are treated as missing values.
df <- read.csv(
  csvfile,
  header = TRUE,
  na.strings = c("NULL", "NA", "")
)

# Only one ID column (i.e., PatientEncounterID) is needed, so drop PatientID.
df[["PatientID"]] <- NULL

# Copy rows 951-1000 to act as fake production data in step 3.
# df itself is left unchanged and is used in full for training.
dfDeploy <- df[951:1000, ]

## 2. Train and save the model using DEVELOP ----
print("Historical, development data:")
str(df)

set.seed(42) # Fixed seed so repeated runs are comparable

p <- SupervisedModelDevelopmentParams$new()
p$df <- df
p$type <- "classification"
p$impute <- TRUE
p$grainCol <- "PatientEncounterID"
p$predictedCol <- "ThirtyDayReadmitFLG"
p$debug <- FALSE
p$cores <- 1

# Run random forest
RandomForest <- RandomForestDevelopment$new(p)
RandomForest$run()

## 3. Load the saved model and use DEPLOY to generate predictions ----
print("Fake production data:")
str(dfDeploy)

p2 <- SupervisedModelDeploymentParams$new()
p2$type <- "classification"
p2$df <- dfDeploy
p2$grainCol <- "PatientEncounterID"
p2$predictedCol <- "ThirtyDayReadmitFLG"
p2$impute <- TRUE
p2$debug <- FALSE
p2$cores <- 1

dL <- RandomForestDeployment$new(p2)
dL$deploy()
dfOut <- dL$getOutDf()

# Write the predictions to a table in the SQLite file located above.
writeData(
  df = dfOut,
  tableName = "HCRDeployClassificationBASE",
  SQLiteFileName = sqliteFile
)

print(proc.time() - ptm)
#### Regression example pulling from CSV and writing to SQLite ####

## 1. Load the package and the data ----
ptm <- proc.time() # Start a timer; elapsed time is printed at the end
library(healthcareai)

# Locate the sample CSV and SQLite files that ship with healthcareai.
# You can delete these system.file() calls in your own work.
csvfile <- system.file(
  "extdata",
  "HCRDiabetesClinical.csv",
  package = "healthcareai"
)
sqliteFile <- system.file(
  "extdata",
  "unit-test.sqlite",
  package = "healthcareai"
)

# Read in the CSV; replace csvfile with "path/file".
# The strings "NULL", "NA" and "" are treated as missing values.
df <- read.csv(
  csvfile,
  header = TRUE,
  na.strings = c("NULL", "NA", "")
)

# Only one ID column (i.e., PatientEncounterID) is needed, so drop PatientID.
df[["PatientID"]] <- NULL

# Copy rows 951-1000 to act as fake production data in step 3.
# df itself is left unchanged and is used in full for training.
dfDeploy <- df[951:1000, ]

## 2. Train and save the model using DEVELOP ----
print("Historical, development data:")
str(df)

set.seed(42) # Fixed seed so repeated runs are comparable

p <- SupervisedModelDevelopmentParams$new()
p$df <- df
p$type <- "regression"
p$impute <- TRUE
p$grainCol <- "PatientEncounterID"
p$predictedCol <- "A1CNBR" # Numeric outcome for the regression model
p$debug <- FALSE
p$cores <- 1

# Run random forest
RandomForest <- RandomForestDevelopment$new(p)
RandomForest$run()

## 3. Load the saved model and use DEPLOY to generate predictions ----
# You won't know the response in production, so remove it from the deploy set.
dfDeploy[["A1CNBR"]] <- NULL

print("Fake production data:")
str(dfDeploy)

p2 <- SupervisedModelDeploymentParams$new()
p2$type <- "regression"
p2$df <- dfDeploy
p2$grainCol <- "PatientEncounterID"
p2$predictedCol <- "A1CNBR"
p2$impute <- TRUE
p2$debug <- FALSE
p2$cores <- 1

dL <- RandomForestDeployment$new(p2)
dL$deploy()
dfOut <- dL$getOutDf()

# Write the predictions to a table in the SQLite file located above.
writeData(
  df = dfOut,
  tableName = "HCRDeployRegressionBASE",
  SQLiteFileName = sqliteFile
)

print(proc.time() - ptm)
#### Identify factors that could benefit outcomes: getProcessVariablesDf ####
#############################################################################
# getProcessVariablesDf() identifies opportunities for improved outcomes at
# the grain level. It is important that the variables ("modifiableVariables")
# and values ("variableLevels") used in this function are under the control
# of the care management process. The best use case for this function is a
# "natural experiment" where otherwise similar groups had different
# treatments applied to them, and that treatment is the modifiable variable
# of interest.

# This example shows how to use the getProcessVariablesDf() function, using
# another readmission-prediction model. In this example systolic blood pressure
# is converted into a categorical variable to demonstrate functionality.
csvfile <- system.file("extdata",
                       "HCRDiabetesClinical.csv",
                       package = "healthcareai")

# Replace csvfile with 'path/file'
df <- read.csv(file = csvfile,
               header = TRUE,
               na.strings = c("NULL", "NA", ""))
df$PatientID <- NULL # Remove extra ID

# Convert systolic blood pressure from a numeric variable to a categorical
# variable with 5 categories: normal, pre-hypertension, stage 1 hypertension,
# stage 2 hypertension, and hypertensive crisis.
# Bins are left-closed (right = FALSE): <120, [120, 140), [140, 160),
# [160, 180) and >=180. Every bin is computed from the full column name
# SystolicBPNBR, so nothing depends on `$` partial matching. as.character()
# keeps the new column a character vector; missing readings stay NA.
df$SystolicBP <- as.character(cut(
  df$SystolicBPNBR,
  breaks = c(-Inf, 120, 140, 160, 180, Inf),
  labels = c("Normal", "Pre-hypertensive", "Stage_1", "Stage_2", "Crisis"),
  right = FALSE
))
df$SystolicBPNBR <- NULL

# Save a dataframe for validation later on
dfDeploy <- df[951:1000, ]

## Develop and Deploy the model
set.seed(42)
p <- SupervisedModelDevelopmentParams$new()
p$df <- df
p$type <- "classification"
p$impute <- TRUE
p$grainCol <- "PatientEncounterID"
p$predictedCol <- "ThirtyDayReadmitFLG"
p$debug <- FALSE
p$cores <- 1

RandomForest <- RandomForestDevelopment$new(p)
RandomForest$run()

p2 <- SupervisedModelDeploymentParams$new()
p2$type <- "classification"
p2$df <- dfDeploy
p2$grainCol <- "PatientEncounterID"
p2$predictedCol <- "ThirtyDayReadmitFLG"
p2$impute <- TRUE
p2$debug <- FALSE
p2$cores <- 1

dL <- RandomForestDeployment$new(p2)
dL$deploy()

## Get predicted outcome changes using getProcessVariablesDf

# Categorical variables can simply be listed as modifiableVariables and all
# factor levels will be used for comparison purposes. The dataframe
# generated from the code below will consider all possible blood pressure
# categories.
dL$getProcessVariablesDf(modifiableVariables = c("SystolicBP"))

# By default, the function returns predictions for all rows, but we can
# restrict to specific rows using the grainColumnIDs parameter
dL$getProcessVariablesDf(modifiableVariables = c("SystolicBP"),
                         grainColumnIDs = c(954, 965, 996))

# The variableLevels parameter can be used to limit which factor levels are
# considered (for categorical variables). The dataframe generated from the
# code below will only make comparisons with normal BP and pre-hypertensive
dL$getProcessVariablesDf(
  modifiableVariables = c("SystolicBP"),
  variableLevels = list(SystolicBP = c("Normal", "Pre-hypertensive"))
)

# The variableLevels parameter can also be used to allow predictions for
# numeric variables, by providing specific target values of the numeric
# variable to make comparisons to. In the code below, the predictions will be
# compared to those for an A1C of 5.6
dL$getProcessVariablesDf(modifiableVariables = c("A1CNBR"),
                         variableLevels = list(A1CNBR = c(5.6)))

# The repeatedFactors parameter allows one to get multiple predictions
# for the same variable. For example, reducing A1C to 5.0 might most improve
# a patient's risk, but reducing A1C to 5.5 is likely to also reduce the risk
# and that change might be more impactful than altering the patient's blood
# pressure. When repeatedFactors is TRUE, both those results will
# be included. If repeatedFactors were FALSE, only the most beneficial
# value of A1C would be included.
dL$getProcessVariablesDf(
  modifiableVariables = c("SystolicBP", "A1CNBR"),
  variableLevels = list(SystolicBP = c("Normal", "Pre-hypertensive"),
                        A1CNBR = c(5.0, 5.5, 6, 6.5)),
  repeatedFactors = TRUE
)

# The numTopFactors parameter allows one to set the maximum number of
# predictions to display (with the default being 3)
dL$getProcessVariablesDf(
  modifiableVariables = c("SystolicBP", "A1CNBR"),
  variableLevels = list(SystolicBP = c("Normal", "Pre-hypertensive"),
                        A1CNBR = c(5.0, 5.5, 6, 6.5)),
  repeatedFactors = TRUE,
  numTopFactors = 5
)

# If greater values of the predicted variable are preferable, setting
# smallerBetter to FALSE will identify the factors that most increase
# the value of the outcome variable. In this case, the deltas will be
# positive, corresponding to an increased risk
dL$getProcessVariablesDf(modifiableVariables = c("SystolicBP"),
                         smallerBetter = FALSE)
# }
# Run the code above in your browser using DataLab