# \donttest{
# Example 1: Basic greedy merge binning
set.seed(123)
n_obs <- 1500
# Simulate customer types with varying risk
customer_types <- c(
"Premium", "Gold", "Silver", "Bronze",
"Basic", "Trial"
)
risk_probs <- c(0.02, 0.05, 0.10, 0.15, 0.22, 0.35)
cat_feature <- sample(customer_types, n_obs,
replace = TRUE,
prob = c(0.10, 0.15, 0.25, 0.25, 0.15, 0.10)
)
bin_target <- sapply(cat_feature, function(x) {
rbinom(1, 1, risk_probs[which(customer_types == x)])
})
# Apply greedy merge binning
result_gmb <- ob_categorical_gmb(
cat_feature,
bin_target,
min_bins = 3,
max_bins = 4
)
# Display results
print(data.frame(
Bin = result_gmb$bin,
WoE = round(result_gmb$woe, 3),
IV = round(result_gmb$iv, 4),
Count = result_gmb$count,
EventRate = round(result_gmb$count_pos / result_gmb$count, 3)
))
cat("\nTotal IV:", round(result_gmb$total_iv, 4), "\n")
cat("Converged:", result_gmb$converged, "\n")
cat("Iterations:", result_gmb$iterations, "\n")
# Example 2: Comparing speed with exact methods
set.seed(456)
n_obs <- 3000
# High cardinality feature
regions <- paste0("Region_", sprintf("%02d", 1:25))
cat_feature_hc <- sample(regions, n_obs, replace = TRUE)
bin_target_hc <- rbinom(n_obs, 1, 0.12)
# Greedy approach (fast)
time_gmb <- system.time({
result_gmb_hc <- ob_categorical_gmb(
cat_feature_hc,
bin_target_hc,
min_bins = 3,
max_bins = 6,
max_n_prebins = 20
)
})
# Dynamic programming (exact but slower)
time_dp <- system.time({
result_dp_hc <- ob_categorical_dp(
cat_feature_hc,
bin_target_hc,
min_bins = 3,
max_bins = 6,
max_n_prebins = 20
)
})
cat("\nPerformance comparison (high cardinality):\n")
cat(" GMB time:", round(time_gmb[3], 3), "seconds\n")
cat(" DP time:", round(time_dp[3], 3), "seconds\n")
cat(" Speedup:", round(time_dp[3] / time_gmb[3], 1), "x\n")
cat("\n GMB IV:", round(result_gmb_hc$total_iv, 4), "\n")
cat(" DP IV:", round(result_dp_hc$total_iv, 4), "\n")
# Example 3: Convergence behavior
set.seed(789)
n_obs_conv <- 1000
# Feature with natural groupings
education <- c("PhD", "Master", "Bachelor", "HighSchool", "NoHighSchool")
cat_feature_conv <- sample(education, n_obs_conv,
replace = TRUE,
prob = c(0.05, 0.15, 0.35, 0.30, 0.15)
)
bin_target_conv <- sapply(cat_feature_conv, function(x) {
probs <- c(0.02, 0.05, 0.08, 0.15, 0.25)
rbinom(1, 1, probs[which(education == x)])
})
# Test different convergence thresholds
thresholds <- c(1e-3, 1e-6, 1e-9)
for (thresh in thresholds) {
result_conv <- ob_categorical_gmb(
cat_feature_conv,
bin_target_conv,
min_bins = 2,
max_bins = 4,
convergence_threshold = thresh
)
cat(sprintf(
"\nThreshold %.0e: %d iterations, converged=%s\n",
thresh, result_conv$iterations, result_conv$converged
))
}
# Example 4: Handling rare categories
set.seed(321)
n_obs_rare <- 2000
# Simulate with many rare categories
products <- c(paste0("Common_", 1:5), paste0("Rare_", 1:15))
product_probs <- c(rep(0.15, 5), rep(0.01, 15))
cat_feature_rare <- sample(products, n_obs_rare,
replace = TRUE,
prob = product_probs
)
bin_target_rare <- rbinom(n_obs_rare, 1, 0.10)
result_gmb_rare <- ob_categorical_gmb(
cat_feature_rare,
bin_target_rare,
min_bins = 3,
max_bins = 5,
bin_cutoff = 0.03 # Aggressive rare category merging
)
cat("\nRare category handling:\n")
cat(" Original categories:", length(unique(cat_feature_rare)), "\n")
cat(" Final bins:", length(result_gmb_rare$bin), "\n")
# Count merged rare categories
for (i in seq_along(result_gmb_rare$bin)) {
n_merged <- length(strsplit(result_gmb_rare$bin[i], "%;%")[[1]])
if (n_merged > 1) {
cat(sprintf(" Bin %d: %d categories merged\n", i, n_merged))
}
}
# Example 5: Imbalanced dataset robustness
set.seed(555)
n_obs_imb <- 1200
# Highly imbalanced target (2% event rate)
cat_feature_imb <- sample(c("A", "B", "C", "D", "E"),
n_obs_imb,
replace = TRUE
)
bin_target_imb <- rbinom(n_obs_imb, 1, 0.02)
result_gmb_imb <- ob_categorical_gmb(
cat_feature_imb,
bin_target_imb,
min_bins = 2,
max_bins = 3
)
cat("\nImbalanced dataset:\n")
cat(" Event rate:", round(mean(bin_target_imb), 4), "\n")
cat(" Total events:", sum(bin_target_imb), "\n")
cat(" Bins created:", length(result_gmb_imb$bin), "\n")
cat(" WoE range:", sprintf(
"[%.2f, %.2f]\n",
min(result_gmb_imb$woe),
max(result_gmb_imb$woe)
))
# }
Run the code above in your browser using DataLab