library(dplyr)
library(ConfusionTableR)
library(parsnip)
library(rsample)
library(recipes)
library(ranger)
library(workflows)
# Load the bundled thyroid data set and work on a copy of it
data("thyroid_disease")
td <- thyroid_disease
# The outcome has to be a factor for a classification model
td[["ThryroidClass"]] <- as.factor(td[["ThryroidClass"]])
# Inspect the structure to confirm the outcome is now a factor
str(td)
# Keep only fully observed rows (a more advanced imputation step could be
# used here instead)
td <- td[complete.cases(td), ]
# Discard the referral source column
td <- dplyr::select(td, -ref_src)
# Share of each outcome class, to gauge class imbalance
class_imbalance <- prop.table(table(td[["ThryroidClass"]]))
class_imbalance
# Split the data: 75% for training, the remainder for testing
set.seed(123)
split <- td %>%
  rsample::initial_split(prop = 0.75)
train_data <- split %>%
  rsample::training()
test_data <- split %>%
  rsample::testing()
# Create the preprocessing recipe: drop zero-variance predictors, then
# centre and scale the remaining numeric predictors.
# step_zv() runs first because normalising a constant column would divide
# by a standard deviation of zero. Normalisation is limited to numeric
# predictors so a non-numeric column cannot make the step fail.
# No seed is set here: defining a recipe draws no random numbers.
td_recipe <-
  recipe(ThryroidClass ~ ., data = train_data) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors())
# Model specification: random forest classifier backed by ranger
set.seed(123)
rf_mod <- parsnip::set_mode(
  parsnip::set_engine(parsnip::rand_forest(), "ranger"),
  "classification"
)
# Assemble the workflow: model first, then the preprocessing recipe
td_wf <- workflows::workflow()
td_wf <- workflows::add_model(td_wf, rf_mod)
td_wf <- workflows::add_recipe(td_wf, td_recipe)
# Train the whole workflow on the training partition
set.seed(123)
td_rf_fit <- fit(td_wf, data = train_data)
# Pull the fitted parsnip model out of the trained workflow
td_fitted <- extract_fit_parsnip(td_rf_fit)
# Score the held-out test set with the fitted workflow to gauge performance
class_pred <- predict(td_rf_fit, test_data)
td_preds <- test_data %>%
  bind_cols(class_pred)
# The confusion matrix is built from factors, so coerce both columns
# defensively before comparing them
td_preds$.pred_class <- as.factor(td_preds$.pred_class)
td_preds$ThryroidClass <- as.factor(td_preds$ThryroidClass)
# Evaluate the predictions with ConfusionTableR.
# Argument order matters: the predicted labels go first and the ground-truth
# labels second, i.e. binary_class_cm(train_labels, truth_labels, ...), the
# same data/reference order used by caret::confusionMatrix(). Passing them
# the other way round swaps sensitivity with precision and specificity with
# negative predictive value.
cm <- ConfusionTableR::binary_class_cm(
  td_preds$.pred_class,
  td_preds$ThryroidClass,
  positive = "sick"
)
# View the confusion matrix
cm$confusion_matrix
# View the record-level (single row) summary of the metrics
cm$record_level_cm
# Run the code above in your browser using DataLab