# \donttest{
# Example 1: Basic monotonic binning with guaranteed WoE ordering
set.seed(42)
n_obs <- 1500
# Simulate risk ratings with natural monotonic relationship
ratings <- c("AAA", "AA", "A", "BBB", "BB", "B", "CCC")
default_probs <- c(0.01, 0.02, 0.05, 0.10, 0.20, 0.35, 0.50)
cat_feature <- sample(ratings, n_obs,
replace = TRUE,
prob = c(0.05, 0.10, 0.20, 0.25, 0.20, 0.15, 0.05)
)
bin_target <- sapply(cat_feature, function(x) {
rbinom(1, 1, default_probs[which(ratings == x)])
})
# Apply MBA algorithm
result_mba <- ob_categorical_mba(
cat_feature,
bin_target,
min_bins = 3,
max_bins = 5
)
# Display results with guaranteed monotonic WoE
print(data.frame(
Bin = result_mba$bin,
WoE = round(result_mba$woe, 3),
IV = round(result_mba$iv, 4),
Count = result_mba$count,
EventRate = round(result_mba$count_pos / result_mba$count, 3)
))
cat("\nMonotonicity check (WoE differences):\n")
woe_diffs <- diff(result_mba$woe)
cat(" Differences:", paste(round(woe_diffs, 4), collapse = ", "), "\n")
cat(" All positive (increasing):", all(woe_diffs >= -1e-10), "\n")
cat(" Total IV:", round(result_mba$total_iv, 4), "\n")
cat(" Converged:", result_mba$converged, "\n")
# Example 2: Comparison with non-monotonic methods
set.seed(123)
n_obs_comp <- 2000
sectors <- c("Tech", "Health", "Finance", "Manufacturing", "Retail")
cat_feature_comp <- sample(sectors, n_obs_comp, replace = TRUE)
bin_target_comp <- rbinom(n_obs_comp, 1, 0.15)
# MBA (strictly monotonic)
result_mba_comp <- ob_categorical_mba(
cat_feature_comp, bin_target_comp,
min_bins = 3, max_bins = 4
)
# Standard binning (may not be monotonic)
result_std_comp <- ob_categorical_cm(
cat_feature_comp, bin_target_comp,
min_bins = 3, max_bins = 4
)
cat("\nMonotonicity comparison:\n")
cat(
" MBA WoE differences:",
paste(round(diff(result_mba_comp$woe), 4), collapse = ", "), "\n"
)
cat(" MBA monotonic:", all(diff(result_mba_comp$woe) >= -1e-10), "\n")
cat(
" Std WoE differences:",
paste(round(diff(result_std_comp$woe), 4), collapse = ", "), "\n"
)
cat(" Std monotonic:", all(diff(result_std_comp$woe) >= -1e-10), "\n")
# Example 3: Bayesian smoothing with sparse data
set.seed(789)
n_obs_sparse <- 400
# Small sample with rare categories
categories <- c("A", "B", "C", "D", "E", "F")
cat_probs <- c(0.30, 0.25, 0.20, 0.15, 0.07, 0.03)
cat_feature_sparse <- sample(categories, n_obs_sparse,
replace = TRUE,
prob = cat_probs
)
bin_target_sparse <- rbinom(n_obs_sparse, 1, 0.08) # 8% event rate
result_mba_sparse <- ob_categorical_mba(
cat_feature_sparse,
bin_target_sparse,
min_bins = 2,
max_bins = 4,
bin_cutoff = 0.02
)
cat("\nBayesian smoothing (sparse data):\n")
cat(" Sample size:", n_obs_sparse, "\n")
cat(" Events:", sum(bin_target_sparse), "\n")
cat(" Final bins:", length(result_mba_sparse$bin), "\n\n")
# Show how smoothing prevents extreme WoE values
for (i in seq_along(result_mba_sparse$bin)) {
cat(sprintf(
" Bin %d: events=%d/%d, WoE=%.3f (smoothed)\n",
i,
result_mba_sparse$count_pos[i],
result_mba_sparse$count[i],
result_mba_sparse$woe[i]
))
}
# Example 4: High cardinality with pre-binning
set.seed(456)
n_obs_hc <- 3000
# Simulate ZIP codes (high cardinality)
zips <- paste0("ZIP_", sprintf("%04d", 1:50))
cat_feature_hc <- sample(zips, n_obs_hc, replace = TRUE)
bin_target_hc <- rbinom(n_obs_hc, 1, 0.12)
result_mba_hc <- ob_categorical_mba(
cat_feature_hc,
bin_target_hc,
min_bins = 4,
max_bins = 6,
max_n_prebins = 20,
bin_cutoff = 0.01
)
cat("\nHigh cardinality performance:\n")
cat(" Original categories:", length(unique(cat_feature_hc)), "\n")
cat(" Final bins:", length(result_mba_hc$bin), "\n")
cat(
" Largest merged bin contains:",
max(sapply(strsplit(result_mba_hc$bin, "%;%"), length)), "categories\n"
)
# Verify monotonicity in high-cardinality case
woe_monotonic <- all(diff(result_mba_hc$woe) >= -1e-10)
cat(" WoE monotonic:", woe_monotonic, "\n")
# Example 5: Convergence behavior
set.seed(321)
n_obs_conv <- 1000
business_sizes <- c("Micro", "Small", "Medium", "Large", "Enterprise")
cat_feature_conv <- sample(business_sizes, n_obs_conv, replace = TRUE)
bin_target_conv <- rbinom(n_obs_conv, 1, 0.18)
# Test different convergence thresholds
thresholds <- c(1e-3, 1e-6, 1e-9)
for (thresh in thresholds) {
result_conv <- ob_categorical_mba(
cat_feature_conv,
bin_target_conv,
min_bins = 2,
max_bins = 4,
convergence_threshold = thresh,
max_iterations = 50
)
cat(sprintf("\nThreshold %.0e:\n", thresh))
cat(" Final bins:", length(result_conv$bin), "\n")
cat(" Total IV:", round(result_conv$total_iv, 4), "\n")
cat(" Converged:", result_conv$converged, "\n")
cat(" Iterations:", result_conv$iterations, "\n")
# Check monotonicity preservation
monotonic <- all(diff(result_conv$woe) >= -1e-10)
cat(" Monotonic:", monotonic, "\n")
}
# Example 6: Missing value handling
set.seed(555)
cat_feature_na <- cat_feature
na_indices <- sample(n_obs, 75) # 5% missing
cat_feature_na[na_indices] <- NA
result_mba_na <- ob_categorical_mba(
cat_feature_na,
bin_target,
min_bins = 3,
max_bins = 5
)
# Locate NA bin
na_bin_idx <- grep("NA", result_mba_na$bin)
if (length(na_bin_idx) > 0) {
cat("\nMissing value treatment:\n")
cat(" NA bin:", result_mba_na$bin[na_bin_idx], "\n")
cat(" NA count:", result_mba_na$count[na_bin_idx], "\n")
cat(
" NA event rate:",
round(result_mba_na$count_pos[na_bin_idx] /
result_mba_na$count[na_bin_idx], 3), "\n"
)
cat(" NA WoE:", round(result_mba_na$woe[na_bin_idx], 3), "\n")
cat(
" Monotonicity preserved:",
all(diff(result_mba_na$woe) >= -1e-10), "\n"
)
}
# }
Run the code above in your browser using DataLab