# NOT RUN {
set.seed(45796)
# Two tables feed the model:
# - df, the model table: it holds the outcome (survived), and we want exactly
#   one prediction per row of df.
# - meds, the detail table: each row (patient) in df may have zero, one, or
#   many observations (drugs) in meds, and each observation may carry an
#   associated value (dose).
df <- tibble::tibble(
  patient = sprintf("Z%d", sample.int(10, 5)),
  age = sample(20:80, size = 5),
  survived = sample(c("N", "Y"), size = 5, replace = TRUE, prob = c(1, 2))
)
# Simulate ten medication records: each one is assigned to a patient drawn
# (with replacement) from df and gets a randomly drawn drug and dose.
meds <- tibble::tibble(
  patient = sample(df[["patient"]], size = 10, replace = TRUE),
  drug = sample(
    c(
      "Quinapril", "Vancomycin", "Ibuprofen",
      "Paclitaxel", "Epinephrine", "Dexamethasone"
    ),
    size = 10,
    replace = TRUE
  ),
  dose = sample(c(100, 250), size = 10, replace = TRUE)
)
# Find the three drugs most likely to be good predictors of survival
get_best_levels(
  d = df,
  longsheet = meds,
  id = patient,
  groups = drug,
  outcome = survived,
  n_levels = 3
)
# Pick the four drugs likely to make the best features and attach them to df
# as new columns. The "fill", "fun", and "missing_fill" arguments are handed
# on to `pivot`; with fill = dose and fun = sum, each new feature is the total
# dose of that drug given to the patient.
new_df <- add_best_levels(
  d = df,
  longsheet = meds,
  id = patient,
  groups = drug,
  outcome = survived,
  n_levels = 4,
  fill = dose,
  fun = sum,
  missing_fill = 0
)
new_df
# new_df remembers which medications were added to df: their names are kept in
# its best_levels attribute. That matters at deployment time, because
# predictions require the same columns the model was trained on. To add those
# levels to a deployment data frame, give the "levels" argument of
# add_best_levels any one of: the models trained on new_df, new_df itself, or
# the character vector of levels to add.
deployment_df <- tibble::tibble(
  patient = "p6",
  age = 30
)
deployment_meds <- tibble::tibble(
  patient = c("p6", "p6"),
  drug = c("Vancomycin", "Vancomycin"),
  dose = c(100, 250)
)
# Now, even though Vancomycin is the only drug that appears in
# deployment_meds, because we pass new_df to "levels", we get all the columns
# needed to make predictions on a model trained on new_df.
# "fill", "fun", and "missing_fill" are given the same values used when
# new_df was built, so each deployment feature is computed the same way as its
# training counterpart (total dose per drug) instead of relying on whatever
# aggregation is the default.
add_best_levels(
  d = deployment_df,
  longsheet = deployment_meds,
  id = patient,
  groups = drug,
  levels = new_df,
  fill = dose,
  fun = sum,
  missing_fill = 0
)
# }
# Run the code above in your browser using DataLab