# Attach packages
library(groupdata2)
library(dplyr)
set.seed(1)
# Create data frame
df <- data.frame(
  "participant" = factor(rep(c("1", "2", "3", "4", "5", "6"), 3)),
  "age" = rep(sample(c(1:100), 6), 3),
  "diagnosis" = factor(rep(c("a", "b", "a", "a", "b", "b"), 3)),
  "score" = sample(c(1:100), 3 * 6)
)
df <- df %>% arrange(participant)
df$session <- rep(c("1", "2", "3"), 6)
# Using fold()
## Without balancing
set.seed(1)
df_folded <- fold(data = df, k = 3)
# Check the balances of the various columns
# As we have not used balancing in `fold()`
# we should not expect it to be amazingly balanced
df_folded %>%
  dplyr::ungroup() %>%
  summarize_balances(
    group_cols = ".folds",
    num_cols = c("score", "age"),
    cat_cols = "diagnosis",
    id_cols = "participant"
  )
## With balancing
set.seed(1)
df_folded <- fold(
  data = df,
  k = 3,
  cat_col = "diagnosis",
  num_col = 'score',
  id_col = 'participant'
)
# Now the balance should be better
# although it may be difficult to get a good balance
# the 'score' column when also balancing on 'diagnosis'
# and keeping all rows per participant in the same fold
df_folded %>%
  dplyr::ungroup() %>%
  summarize_balances(
    group_cols = ".folds",
    num_cols = c("score", "age"),
    cat_cols = "diagnosis",
    id_cols = "participant"
  )
# Comparing multiple grouping columns
# Create 3 fold column that only balance "score"
set.seed(1)
df_folded <- fold(
  data = df,
  k = 3,
  num_fold_cols = 3,
  num_col = 'score'
)
# Summarize all three grouping cols at once
(summ <- df_folded %>%
  dplyr::ungroup() %>%
  summarize_balances(
    group_cols = paste0(".folds_", 1:3),
    num_cols = c("score")
  )
)
# Extract the across-group standard deviations
# The group column with the lowest standard deviation(s)
# is the most balanced group column
summ %>% ranked_balances()
Run the code above in your browser using DataLab