# \donttest{
# Example 1: Basic JEDI optimization
set.seed(42)
n_obs <- 1500
# Simulate employment types with risk gradient
employment <- c(
"Permanent", "Contract", "Temporary", "SelfEmployed",
"Unemployed", "Student", "Retired"
)
risk_rates <- c(0.03, 0.08, 0.15, 0.12, 0.35, 0.25, 0.10)
cat_feature <- sample(employment, n_obs,
replace = TRUE,
prob = c(0.35, 0.20, 0.15, 0.12, 0.08, 0.06, 0.04)
)
bin_target <- sapply(cat_feature, function(x) {
rbinom(1, 1, risk_rates[which(employment == x)])
})
# Apply JEDI algorithm
result_jedi <- ob_categorical_jedi(
cat_feature,
bin_target,
min_bins = 3,
max_bins = 5
)
# Display results
print(data.frame(
Bin = result_jedi$bin,
WoE = round(result_jedi$woe, 3),
IV = round(result_jedi$iv, 4),
Count = result_jedi$count,
EventRate = round(result_jedi$count_pos / result_jedi$count, 3)
))
cat("\nTotal IV (jointly optimized):", round(result_jedi$total_iv, 4), "\n")
cat("Converged:", result_jedi$converged, "\n")
cat("Iterations:", result_jedi$iterations, "\n")
# Example 2: Method comparison (JEDI vs alternatives)
set.seed(123)
n_obs_comp <- 2000
departments <- c(
"Sales", "IT", "HR", "Finance", "Operations",
"Marketing", "Legal", "R&D"
)
cat_feature_comp <- sample(departments, n_obs_comp, replace = TRUE)
bin_target_comp <- rbinom(n_obs_comp, 1, 0.12)
# JEDI (joint optimization)
result_jedi_comp <- ob_categorical_jedi(
cat_feature_comp, bin_target_comp,
min_bins = 3, max_bins = 4
)
# IVB (exact DP)
result_ivb_comp <- ob_categorical_ivb(
cat_feature_comp, bin_target_comp,
min_bins = 3, max_bins = 4
)
# GMB (greedy)
result_gmb_comp <- ob_categorical_gmb(
cat_feature_comp, bin_target_comp,
min_bins = 3, max_bins = 4
)
cat("\nMethod comparison (Total IV):\n")
cat(
" JEDI:", round(result_jedi_comp$total_iv, 4),
"- converged:", result_jedi_comp$converged, "\n"
)
cat(
" IVB:", round(result_ivb_comp$total_iv, 4),
"- converged:", result_ivb_comp$converged, "\n"
)
cat(
" GMB:", round(result_gmb_comp$total_iv, 4),
"- converged:", result_gmb_comp$converged, "\n"
)
# Example 3: Bayesian smoothing with sparse data
set.seed(789)
n_obs_sparse <- 400
# Small sample with rare events
categories <- c("A", "B", "C", "D", "E", "F", "G")
cat_probs <- c(0.25, 0.20, 0.18, 0.15, 0.12, 0.07, 0.03)
cat_feature_sparse <- sample(categories, n_obs_sparse,
replace = TRUE,
prob = cat_probs
)
bin_target_sparse <- rbinom(n_obs_sparse, 1, 0.05) # 5% event rate
result_jedi_sparse <- ob_categorical_jedi(
cat_feature_sparse,
bin_target_sparse,
min_bins = 2,
max_bins = 4,
bin_cutoff = 0.02
)
cat("\nBayesian smoothing (sparse data):\n")
cat(" Sample size:", n_obs_sparse, "\n")
cat(" Total events:", sum(bin_target_sparse), "\n")
cat(" Event rate:", round(mean(bin_target_sparse), 4), "\n")
cat(" Bins created:", length(result_jedi_sparse$bin), "\n\n")
# Show how smoothing prevents extreme WoE values
for (i in seq_along(result_jedi_sparse$bin)) {
cat(sprintf(
" Bin %d: events=%d/%d, WoE=%.3f (smoothed)\n",
i,
result_jedi_sparse$count_pos[i],
result_jedi_sparse$count[i],
result_jedi_sparse$woe[i]
))
}
# Example 4: Violation detection and repair
set.seed(456)
n_obs_viol <- 1200
# Create feature with non-monotonic risk pattern
risk_categories <- c(
"VeryLow", "Low", "MediumHigh", "Medium", # Intentional non-monotonic
"High", "VeryHigh"
)
actual_risks <- c(0.02, 0.05, 0.20, 0.12, 0.25, 0.40) # MediumHigh > Medium
cat_feature_viol <- sample(risk_categories, n_obs_viol, replace = TRUE)
bin_target_viol <- sapply(cat_feature_viol, function(x) {
rbinom(1, 1, actual_risks[which(risk_categories == x)])
})
result_jedi_viol <- ob_categorical_jedi(
cat_feature_viol,
bin_target_viol,
min_bins = 3,
max_bins = 5,
max_iterations = 50
)
cat("\nViolation detection and repair:\n")
cat(" Original categories:", length(unique(cat_feature_viol)), "\n")
cat(" Final bins:", length(result_jedi_viol$bin), "\n")
cat(" Iterations to convergence:", result_jedi_viol$iterations, "\n")
cat(" Monotonicity achieved:", result_jedi_viol$converged, "\n\n")
# Check final WoE monotonicity
woe_diffs <- diff(result_jedi_viol$woe)
cat(
" WoE differences between bins:",
paste(round(woe_diffs, 3), collapse = ", "), "\n"
)
cat(" All positive (monotonic):", all(woe_diffs >= -1e-6), "\n")
# Example 5: High cardinality performance
set.seed(321)
n_obs_hc <- 3000
# Simulate product categories (high cardinality)
products <- paste0("Product_", sprintf("%03d", 1:50))
cat_feature_hc <- sample(products, n_obs_hc, replace = TRUE)
bin_target_hc <- rbinom(n_obs_hc, 1, 0.08)
# Measure JEDI performance
time_jedi_hc <- system.time({
result_jedi_hc <- ob_categorical_jedi(
cat_feature_hc,
bin_target_hc,
min_bins = 4,
max_bins = 7,
max_n_prebins = 20,
bin_cutoff = 0.02
)
})
cat("\nHigh cardinality performance:\n")
cat(" Original categories:", length(unique(cat_feature_hc)), "\n")
cat(" Final bins:", length(result_jedi_hc$bin), "\n")
cat(" Execution time:", round(time_jedi_hc[3], 3), "seconds\n")
cat(" Total IV:", round(result_jedi_hc$total_iv, 4), "\n")
cat(" Converged:", result_jedi_hc$converged, "\n")
# Show merged categories
for (i in seq_along(result_jedi_hc$bin)) {
n_merged <- length(strsplit(result_jedi_hc$bin[i], "%;%")[[1]])
if (n_merged > 1) {
cat(sprintf(" Bin %d: %d categories merged\n", i, n_merged))
}
}
# Example 6: Convergence behavior
set.seed(555)
n_obs_conv <- 1000
education_levels <- c(
"Elementary", "HighSchool", "Vocational",
"Bachelor", "Master", "PhD"
)
cat_feature_conv <- sample(education_levels, n_obs_conv,
replace = TRUE,
prob = c(0.10, 0.30, 0.20, 0.25, 0.12, 0.03)
)
bin_target_conv <- rbinom(n_obs_conv, 1, 0.15)
# Test different convergence thresholds
thresholds <- c(1e-3, 1e-6, 1e-9)
for (thresh in thresholds) {
result_conv <- ob_categorical_jedi(
cat_feature_conv,
bin_target_conv,
min_bins = 2,
max_bins = 4,
convergence_threshold = thresh,
max_iterations = 100
)
cat(sprintf("\nThreshold %.0e:\n", thresh))
cat(" Final bins:", length(result_conv$bin), "\n")
cat(" Total IV:", round(result_conv$total_iv, 4), "\n")
cat(" Converged:", result_conv$converged, "\n")
cat(" Iterations:", result_conv$iterations, "\n")
}
# Example 7: Missing value handling
set.seed(999)
cat_feature_na <- cat_feature
na_indices <- sample(n_obs, 75) # 5% missing
cat_feature_na[na_indices] <- NA
result_jedi_na <- ob_categorical_jedi(
cat_feature_na,
bin_target,
min_bins = 3,
max_bins = 5
)
# Locate NA bin
na_bin_idx <- grep("NA", result_jedi_na$bin)
if (length(na_bin_idx) > 0) {
cat("\nMissing value treatment:\n")
cat(" NA bin:", result_jedi_na$bin[na_bin_idx], "\n")
cat(" NA count:", result_jedi_na$count[na_bin_idx], "\n")
cat(
" NA event rate:",
round(result_jedi_na$count_pos[na_bin_idx] /
result_jedi_na$count[na_bin_idx], 3), "\n"
)
cat(" NA WoE:", round(result_jedi_na$woe[na_bin_idx], 3), "\n")
cat(" NA IV contribution:", round(result_jedi_na$iv[na_bin_idx], 4), "\n")
}
# }
Run the code above in your browser using DataLab