# Example: binning a skewed categorical feature with ob_categorical_udt().
# NOTE(review): ob_categorical_udt() is not defined in this script; it is
# presumably supplied by an already-attached package -- confirm before running.

# Generate sample data with a skewed category distribution
set.seed(789)
n <- 3000

# Category pool: three dominant levels ("X1", "X2", "X3") plus a long tail of
# rare single-letter levels drawn from "D".."T" (LETTERS[4:20])
categories <- c(
  rep("X1", 1200), rep("X2", 800), rep("X3", 400),
  sample(LETTERS[4:20], 600, replace = TRUE)
)
feature <- sample(categories, n, replace = TRUE)

# Target probabilities in three tiers based on category importance:
#   "X*" levels   -> 0.7
#   "D", "E", "F" -> 0.5
#   remaining     -> 0.3
# The middle tier is keyed on "D".."F" because the tail levels start at "D";
# a test for "A".."C" could never match any level and would leave the 0.5
# tier unused.
probs <- ifelse(
  startsWith(feature, "X"), 0.7,
  ifelse(feature %in% LETTERS[4:6], 0.5, 0.3)
)
target <- rbinom(n, 1, prob = probs)

# Perform user-defined technique binning with default settings
result <- ob_categorical_udt(feature, target)
print(result[c("bin", "woe", "iv", "count")])

# Adjust parameters for finer control
result_custom <- ob_categorical_udt(
  feature = feature,
  target = target,
  min_bins = 2,
  max_bins = 7,
  bin_cutoff = 0.03
)

# Handling missing values: blank out 150 randomly chosen observations and
# bin again (the target vector is left untouched)
feature_with_na <- feature
feature_with_na[sample(length(feature_with_na), 150)] <- NA
result_na <- ob_categorical_udt(feature_with_na, target)
# Run the code above in your browser using DataLab