# NOT RUN {
# load example data (Bank clients with/without a term deposit - see ?bank_td for details)
data("bank_td")
library(dplyr)
# prepare data for training model for binomial target has_td and train models
train_index = sample(seq(1, nrow(bank_td)),size = 0.5*nrow(bank_td) ,replace = FALSE)
train = bank_td[train_index,c('has_td','duration','campaign','pdays','previous','euribor3m')]
test = bank_td[-train_index,c('has_td','duration','campaign','pdays','previous','euribor3m')]
#train logistic regression model with stats package
glm.model <- glm(has_td ~.,family=binomial(link='logit'),data=train)
#score model
prob_no.term.deposit <- stats::predict(glm.model,newdata=train,type='response')
prob_term.deposit <- 1-prob_no.term.deposit
#set number of ntiles
ntiles = 10
# determine cutoffs
cutoffs = c(stats::quantile(prob_term.deposit,probs = seq(0,1,1/ntiles),na.rm = TRUE))
#calculate ntile values
ntl_term.deposit <- (ntiles+1)-as.numeric(cut(prob_term.deposit,breaks=cutoffs,include.lowest=TRUE))
ntl_no.term.deposit <- (ntiles+1)-ntl_term.deposit
# create scored data frame
scores_and_ntiles <- train %>%
select(has_td) %>%
mutate(model_label=factor('logistic regression'),
dataset_label=factor('train data'),
y_true=factor(has_td),
prob_term.deposit = prob_term.deposit,
prob_no.term.deposit = prob_no.term.deposit,
ntl_term.deposit = ntl_term.deposit,
ntl_no.term.deposit = ntl_no.term.deposit) %>%
select(-has_td)
# add test data
#score model on test data
prob_no.term.deposit <- stats::predict(glm.model,newdata=test,type='response')
prob_term.deposit <- 1-prob_no.term.deposit
#set number of ntiles
ntiles = 10
# determine cutoffs
cutoffs = c(stats::quantile(prob_term.deposit,probs = seq(0,1,1/ntiles),na.rm = TRUE))
#calculate ntile values
ntl_term.deposit <- (ntiles+1)-as.numeric(cut(prob_term.deposit,breaks=cutoffs,include.lowest=TRUE))
ntl_no.term.deposit <- (ntiles+1)-ntl_term.deposit
scores_and_ntiles <- scores_and_ntiles %>%
rbind(
test %>%
select(has_td) %>%
mutate(model_label=factor('logistic regression'),
dataset_label=factor('test data'),
y_true=factor(has_td),
prob_term.deposit = prob_term.deposit,
prob_no.term.deposit = prob_no.term.deposit,
ntl_term.deposit = ntl_term.deposit,
ntl_no.term.deposit = ntl_no.term.deposit) %>%
select(-has_td)
)
plot_input <- plotting_scope(prepared_input = scores_and_ntiles,scope='compare_datasets')
plot_cumgains()
# }
Run the code above in your browser using DataLab