library(dplyr)
library(ConfusionTableR)
# Load the heart disease example data set
data(heartdisease)
# Print a structural overview of the raw data before any conversion
glimpse(heartdisease)
# Recode the HeartDisease label column as a factor
hd <- mutate(heartdisease, HeartDisease = as.factor(HeartDisease))
# Confirm that the label column is now a factor
is.factor(hd[["HeartDisease"]])
# Dummy encoding
# Keep only the character (categorical) columns.
# select(where(...)) replaces the superseded scoped verb select_if().
hd_cat <- hd %>%
  dplyr::select(where(is.character))
# Columns to dummy encode
cols <- c("RestingECG", "Angina", "Sex")
# Dummy encode using dummy_encoder() from the ConfusionTableR package;
# remove_original = TRUE asks it to leave the source columns out of the result
coded <- ConfusionTableR::dummy_encoder(hd_cat, cols, remove_original = TRUE)
# Keep a subset of the indicator columns, renaming the Angina_Y and Sex_F
# indicators to Angina and Sex
coded <- coded %>%
  select(RestingECG_ST, RestingECG_LVH, Angina = Angina_Y, Sex = Sex_F)
# Remove the columns we have encoded from the original data frame.
# drop = FALSE keeps the result a data frame even if only one column is left
# (without it, a plain data.frame would collapse to a bare vector).
hd_one <- hd[, !(names(hd) %in% cols), drop = FALSE]
# Bind the encoded categorical columns on to the remaining columns
hd_final <- bind_cols(coded, hd_one)
# Output the final encoded data frame for the ML task
glimpse(hd_final)
# Run the code above in your browser using DataLab