# Build a reproducible example data set with unevenly populated categories
set.seed(456)
n <- 5000
# Pool of labels: six upper-case categories with steeply decreasing counts,
# followed by 700 labels drawn at random from the lower-case letters g..z
categories <- c(
  rep(
    c("A", "B", "C", "D", "E", "F"),
    times = c(1500, 1000, 800, 500, 300, 200)
  ),
  sample(letters[7:26], 700, replace = TRUE)
)
# Resample the pool with replacement to obtain the observed feature values
feature <- sample(categories, size = n, replace = TRUE)
# Event probability per observation: 0.3 by default, raised to 0.5 for
# categories C/D and to 0.7 for categories A/B
target_probs <- rep(0.3, length(feature))
target_probs[feature %in% c("C", "D")] <- 0.5
target_probs[feature %in% c("A", "B")] <- 0.7
# Draw one Bernoulli outcome per observation from those probabilities
target <- rbinom(n, size = 1, prob = target_probs)
# Run the sliding window binning, leaving every optional argument at its default
result <- ob_categorical_swb(feature = feature, target = target)
# Display only the "bin", "woe", "iv" and "count" components of the result
print(result[c("bin", "woe", "iv", "count")])
# Same binning, but with explicit limits passed as min_bins = 4 and max_bins = 6
result_strict <- ob_categorical_swb(feature, target, min_bins = 4, max_bins = 6)
# Missing-value scenario: copy the feature with 100 randomly chosen
# positions overwritten by NA
feature_with_na <- replace(feature, sample.int(length(feature), 100), NA)
# Bin the NA-containing feature against the unchanged target
result_na <- ob_categorical_swb(feature_with_na, target)
# Run the code above in your browser using DataLab