### dontrun is used when the execution of the example requires some computational effort.
### simple regression (with a formula) example.
x1=rnorm(200,100,20); x2=rnorm(200,100,20)
y=0.7*sin(x1/(25*pi))+0.3*sin(x2/(25*pi))
M=fit(y~x1+x2,model="mlpe")
new1=rnorm(100,100,20); new2=rnorm(100,100,20)
ynew=0.7*sin(new1/(25*pi))+0.3*sin(new2/(25*pi))
P=predict(M,data.frame(x1=new1,x2=new2,y=rep(NA,100)))
print(mmetric(ynew,P,"MAE"))
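# a hedged visualization sketch (assuming the ynew and P objects above):
# Regression Error Characteristic (REC) curve for the mlpe predictions:
mgraph(ynew,P,graph="REC",main="mlpe REC curve",Grid=10)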
### simple classification example.
data(iris)
M=fit(Species~.,iris,model="rpart")
plot(M@object); text(M@object) # show model
P=predict(M,iris)
print(mmetric(iris$Species,P,"CONF"))
print(mmetric(iris$Species,P,"ALL"))
mgraph(iris$Species,P,graph="ROC",TC=2,main="versicolor ROC",
baseline=TRUE,leg="Versicolor",Grid=10)
M2=fit(Species~.,iris,model="ctree")
plot(M2@object) # show model
P2=predict(M2,iris)
print(mmetric(iris$Species,P2,"CONF"))
# ctree with different setup:
# (ctree_control is from the party package)
M3=fit(Species~.,iris,model="ctree",controls = party::ctree_control(testtype="MonteCarlo"))
plot(M3@object) # show model
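# an optional hedged sketch: persist and reload a fitted rminer model using
# the savemodel and loadmodel rminer functions (the file name is illustrative):
savemodel(M3,"ctree_iris.model") # save model M3 to a file
M3b=loadmodel("ctree_iris.model") # reload it
print(mmetric(iris$Species,predict(M3b,iris),"ACC"))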
### simple binary classification example with cv.glmnet and xgboost
data(sa_ssin_2)
H=holdout(sa_ssin_2$y,ratio=2/3)
# cv.glmnet:
M=fit(y~.,sa_ssin_2[H$tr,],model="cv.glmnet",task="cla") # pure classes
P=predict(M,sa_ssin_2[H$ts,])
cat("1st prediction, class:",as.character(P[1]),"\n")
cat("Confusion matrix:\n")
print(mmetric(sa_ssin_2[H$ts,]$y,P,"CONF")$conf)
M2=fit(y~.,sa_ssin_2[H$tr,],model="cv.glmnet") # probabilities
P2=predict(M2,sa_ssin_2[H$ts,])
L=M2@levels
cat("1st prediction, prob:",L[1],"=",P2[1,1],",",L[2],"=",P2[1,2],"\n")
cat("Confusion matrix:\n")
print(mmetric(sa_ssin_2[H$ts,]$y,P2,"CONF")$conf)
cat("AUC of ROC curve:\n")
print(mmetric(sa_ssin_2[H$ts,]$y,P2,"AUC"))
M3=fit(y~.,sa_ssin_2[H$tr,],model="cv.glmnet",nfolds=3) # use 3 folds instead of 10
plot(M3@object) # show cv.glmnet object
P3=predict(M3,sa_ssin_2[H$ts,])
# xgboost:
M4=fit(y~.,sa_ssin_2[H$tr,],model="xgboost",verbose=1) # nrounds=2, show rounds:
P4=predict(M4,sa_ssin_2[H$ts,])
print(mmetric(sa_ssin_2[H$ts,]$y,P4,"AUC"))
M5=fit(y~.,sa_ssin_2[H$tr,],model="xgboost",nrounds=3,verbose=1) # nrounds=3, show rounds:
P5=predict(M5,sa_ssin_2[H$ts,])
print(mmetric(sa_ssin_2[H$ts,]$y,P5,"AUC"))
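# a hedged visualization sketch (assuming P5 and the holdout split above):
# ROC curve for the last xgboost model (TC=2 selects the 2nd class):
mgraph(sa_ssin_2[H$ts,]$y,P5,graph="ROC",TC=2,baseline=TRUE,Grid=10,
       main="xgboost ROC")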
### classification example with discrete classes, probabilities and holdout
data(iris)
H=holdout(iris$Species,ratio=2/3)
M=fit(Species~.,iris[H$tr,],model="ksvm",task="class")
M1=fit(Species~.,iris[H$tr,],model="lssvm") # default task="class" is assumed
M2=fit(Species~.,iris[H$tr,],model="ksvm",task="prob")
P=predict(M,iris[H$ts,]) # classes
P1=predict(M1,iris[H$ts,]) # classes
P2=predict(M2,iris[H$ts,]) # probabilities
print(mmetric(iris$Species[H$ts],P,"CONF"))
print(mmetric(iris$Species[H$ts],P1,"CONF"))
print(mmetric(iris$Species[H$ts],P2,"CONF"))
print(mmetric(iris$Species[H$ts],P,"CONF",TC=1))
print(mmetric(iris$Species[H$ts],P2,"CONF",TC=1))
print(mmetric(iris$Species[H$ts],P2,"AUC"))
### exploration of some rminer classification models:
models=c("lda","naiveBayes","kknn","randomForest","cv.glmnet","xgboost")
for(m in models)
{ cat("model:",m,"\n")
M=fit(Species~.,iris[H$tr,],model=m)
P=predict(M,iris[H$ts,])
print(mmetric(iris$Species[H$ts],P,"AUC")[[1]])
}
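# a hedged extension of the loop above: store each AUC value so that the best
# of the explored models can be reported (same fit/predict calls as before):
auc=numeric(length(models))
for(i in 1:length(models))
{ M=fit(Species~.,iris[H$tr,],model=models[i])
  P=predict(M,iris[H$ts,])
  auc[i]=mmetric(iris$Species[H$ts],P,"AUC")[[1]]
}
cat("best model:",models[which.max(auc)],"AUC:",round(max(auc),3),"\n")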
### classification example with hyperparameter selection
### note: for regression, similar code can be used
### SVM
data(iris)
# large list of SVM configurations:
# SVM with kpar="automatic" sigma rbfdot kernel estimation and default C=1:
# note: each execution can lead to different M@mpar due to sigest stochastic nature:
M=fit(Species~.,iris,model="ksvm")
print(M@mpar) # model hyperparameters/arguments
# same thing, explicit use of mparheuristic:
M=fit(Species~.,iris,model="ksvm",search=list(search=mparheuristic("ksvm")))
print(M@mpar) # model hyperparameters
# SVM with C=3, sigma=2^-7
M=fit(Species~.,iris,model="ksvm",C=3,kpar=list(sigma=2^-7))
print(M@mpar)
# SVM with different kernels:
M=fit(Species~.,iris,model="ksvm",kernel="polydot",kpar="automatic")
print(M@mpar)
# fit already has a scale argument, thus the only way to fix the scale of "tanhdot"
# is to use the special search argument with the "none" method:
s=list(smethod="none",search=list(scale=2,offset=2))
M=fit(Species~.,iris,model="ksvm",kernel="tanhdot",search=s)
print(M@mpar)
# heuristic: 10 grid search values for sigma, rbfdot kernel (fdebug is used only for more verbose output):
s=list(search=mparheuristic("ksvm",10)) # advised "heuristic10" usage
M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
print(M@mpar)
# same thing, using the older search="heuristic10", which works for fewer rminer models:
M=fit(Species~.,iris,model="ksvm",search="heuristic10",fdebug=TRUE)
print(M@mpar)
# identical search written in a different and more explicit way:
s=list(search=2^seq(-15,3,2))
M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
print(M@mpar)
# uniform design "UD" for sigma and C, rbfdot kernel, two level of grid searches,
# under exponential (2^x) search scale:
M=fit(Species~.,iris,model="ksvm",search="UD",fdebug=TRUE)
print(M@mpar)
M=fit(Species~.,iris,model="ksvm",search="UD1",fdebug=TRUE)
print(M@mpar)
M=fit(Species~.,iris,model="ksvm",search=2^seq(-15,3,2),fdebug=TRUE)
print(M@mpar)
# now the more powerful search argument is used for modeling SVM:
# grid 3 x 3 search:
s=list(smethod="grid",search=list(sigma=2^c(-15,-5,3),C=2^c(-5,0,15)),convex=0,
metric="AUC",method=c("kfold",3,12345))
print(s)
M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
print(M@mpar)
# identical search with different argument smethod="matrix"
s$smethod="matrix"
s$search=list(sigma=rep(2^c(-15,-5,3),times=3),C=rep(2^c(-5,0,15),each=3))
print(s)
M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
print(M@mpar)
# search for best kernel (only works for kpar="automatic"):
s=list(smethod="grid",search=list(kernel=c("rbfdot","laplacedot","polydot","vanilladot")),
convex=0,metric="AUC",method=c("kfold",3,12345))
print(s)
M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
print(M@mpar)
# search for the best parameters of "rbfdot" or "laplacedot" (which use the same kpar):
s$search=list(kernel=c("rbfdot","laplacedot"),sigma=2^seq(-15,3,5))
print(s)
M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
print(M@mpar)
### randomForest
# search for mtry and ntree
s=list(smethod="grid",search=list(mtry=c(1,2,3),ntree=c(100,200,500)),
convex=0,metric="AUC",method=c("kfold",3,12345))
print(s)
M=fit(Species~.,iris,model="randomForest",search=s,fdebug=TRUE)
print(M@mpar)
### rpart
# simpler way to tune cp in 0.01 to 0.9 (10 searches):
s=list(search=mparheuristic("rpart",n=10,lower=0.01,upper=0.9),method=c("kfold",3,12345))
M=fit(Species~.,iris,model="rpart",search=s,fdebug=TRUE)
print(M@mpar)
# same thing but with more lines of code
# note: this code can be adapted to tune other rpart parameters,
# while mparheuristic only tunes cp
# a vector list needs to be used for the search$search parameter
lcp=vector("list",10) # 10 grid values for the complexity cp
names(lcp)=rep("cp",10) # same cp name
scp=seq(0.01,0.9,length.out=10) # 10 values from 0.01 to 0.9
for(i in 1:10) lcp[[i]]=scp[i] # cycle needed due to [[]] notation
s=list(smethod="grid",search=list(control=lcp),
convex=0,metric="AUC",method=c("kfold",3,12345))
M=fit(Species~.,iris,model="rpart",search=s,fdebug=TRUE)
print(M@mpar)
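# a hedged sketch of the adaptation mentioned above: tune cp and minsplit
# together via a list of rpart::rpart.control objects (values are illustrative,
# assuming fit forwards each element as the rpart control argument, as done
# with ctree_control in the ctree example below):
g=expand.grid(cp=c(0.01,0.1,0.5),minsplit=c(10,20))
lctrl=vector("list",nrow(g))
for(i in 1:nrow(g)) lctrl[[i]]=rpart::rpart.control(cp=g$cp[i],minsplit=g$minsplit[i])
s=list(smethod="grid",search=list(control=lctrl),
       convex=0,metric="AUC",method=c("kfold",3,12345))
M=fit(Species~.,iris,model="rpart",search=s,fdebug=TRUE)
print(M@mpar)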
### ctree
# simpler way to tune mincriterion in the range 0.1 to 0.99:
mint=c("kfold",3,123) # internal validation method
s=list(search=mparheuristic("ctree",n=8,lower=0.1,upper=0.99),method=mint)
M=fit(Species~.,iris,model="ctree",search=s,fdebug=TRUE)
print(M@mpar)
# same thing but with more lines of code
# note: this code can be adapted to tune other ctree parameters,
# while mparheuristic only tunes mincriterion
# a vector list needs to be used for the search$search parameter
lmc=vector("list",9) # 9 grid values for the mincriterion
smc=seq(0.1,0.99,length.out=9)
for(i in 1:9) lmc[[i]]=party::ctree_control(mincriterion=smc[i])
s=list(smethod="grid",search=list(controls=lmc),method=mint,convex=0)
M=fit(Species~.,iris,model="ctree",search=s,fdebug=TRUE)
print(M@mpar)
### some MLP fitting examples:
# simplest use:
M=fit(Species~.,iris,model="mlpe")
print(M@mpar)
# same thing, with explicit use of mparheuristic:
M=fit(Species~.,iris,model="mlpe",search=list(search=mparheuristic("mlpe")))
print(M@mpar) # hidden nodes and number of ensemble mlps
# setting some nnet parameters:
M=fit(Species~.,iris,model="mlpe",size=3,decay=0.1,maxit=100,rang=0.9)
print(M@mpar) # mlpe hyperparameters
# MLP with a 5-value grid search; fdebug is only used to add verbose output in the console:
s=list(search=mparheuristic("mlpe",n=5)) # 5 searches for size
print(s) # show search
M=fit(Species~.,iris,model="mlpe",search=s,fdebug=TRUE)
print(M@mpar)
# previous searches used a random holdout (seed=NULL), now a fixed seed (123) is used:
s=list(smethod="grid",search=mparheuristic("mlpe",n=5),convex=0,metric="AUC",
method=c("holdout",2/3,123))
print(s)
M=fit(Species~.,iris,model="mlpe",search=s,fdebug=TRUE)
print(M@mpar)
# faster and greedy grid search:
s$convex=1;s$search=list(size=0:9)
print(s)
M=fit(Species~.,iris,model="mlpe",search=s,fdebug=TRUE)
print(M@mpar)
# 2-level grid with a total of 5 searches
# note of caution: some "2L" ranges may lead to non-integer (e.g., 1.3) values at
# the 2nd level search, and some R functions crash if non-integer values are used
# for integer parameters.
s$smethod="2L";s$convex=0;s$search=list(size=c(4,8,12))
print(s)
M=fit(Species~.,iris,model="mlpe",search=s,fdebug=TRUE)
print(M@mpar)
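# a hedged sketch (values are illustrative): joint grid over size and decay,
# reusing the grid search structure shown for randomForest above:
s2=list(smethod="grid",search=list(size=c(2,4,8),decay=c(0,0.1)),convex=0,
        metric="AUC",method=c("holdout",2/3,123))
M=fit(Species~.,iris,model="mlpe",search=s2,fdebug=TRUE)
print(M@mpar)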
# testing of all 17 rminer classification methods:
model=c("naive","ctree","cv.glmnet","rpart","kknn","ksvm","lssvm","mlp","mlpe",
"randomForest","xgboost","bagging","boosting","lda","multinom","naiveBayes","qda")
inputs=ncol(iris)-1
ho=holdout(iris$Species,2/3,seed=123) # 2/3 for training and 1/3 for testing
Y=iris[ho$ts,ncol(iris)]
for(i in 1:length(model))
{
cat("i:",i,"model:",model[i],"\n")
search=list(search=mparheuristic(model[i])) # rminer default values
M=fit(Species~.,data=iris[ho$tr,],model=model[i],search=search,fdebug=TRUE)
P=predict(M,iris[ho$ts,])
cat("predicted ACC:",round(mmetric(Y,P,metric="ACC"),1),"\n")
}
### example of an error (warning) generated using fit:
data(iris)
# size needs to be a positive integer, thus 0.1 leads to an error:
M=fit(Species~.,iris,model="mlp",size=0.1)
print(M@object)
### exploration of some rminer regression models:
data(sa_ssin)
H=holdout(sa_ssin$y,ratio=2/3,seed=12345)
models=c("lm","mr","ctree","mars","cubist","cv.glmnet","xgboost","rvm")
for(m in models)
{ cat("model:",m,"\n")
M=fit(y~.,sa_ssin[H$tr,],model=m)
P=predict(M,sa_ssin[H$ts,])
print(mmetric(sa_ssin$y[H$ts],P,"MAE"))
}
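# a hedged visualization sketch (assuming the last M and P from the loop,
# i.e., the "rvm" model): regression scatter plot of observed vs predicted:
mgraph(sa_ssin$y[H$ts],P,graph="RSC",Grid=10,main="rvm scatter plot")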
# testing of all 18 rminer regression methods:
model=c("naive","ctree","cv.glmnet","rpart","kknn","ksvm","mlp","mlpe",
"randomForest","xgboost","cubist","lm","mr","mars","pcr","plsr","cppls","rvm")
# note: in this example, default values are considered for the hyperparameters.
# better results can be achieved by tuning the hyperparameters via improved usage
# of the search argument (via the mparheuristic function or custom code)
data(iris)
ir2=iris[,1:4] # predict iris "Petal.Width"
names(ir2)[ncol(ir2)]="y" # change output name
inputs=ncol(ir2)-1
ho=holdout(ir2$y,2/3,seed=123) # 2/3 for training and 1/3 for testing
Y=ir2[ho$ts,ncol(ir2)]
for(i in 1:length(model))
{
cat("i:",i,"model:",model[i],"\n")
search=list(search=mparheuristic(model[i])) # rminer default values
M=fit(y~.,data=ir2[ho$tr,],model=model[i],search=search,fdebug=TRUE)
P=predict(M,ir2[ho$ts,])
cat("predicted MAE:",round(mmetric(Y,P,metric="MAE"),1),"\n")
}
### regression example with hyperparameter selection:
data(sa_ssin)
# some SVM experiments:
# default SVM:
M=fit(y~.,data=sa_ssin,model="svm")
print(M@mpar)
# SVM with (Cherkassky and Ma, 2004) heuristics to set C and epsilon:
M=fit(y~.,data=sa_ssin,model="svm",C=NA,epsilon=NA)
print(M@mpar)
# SVM with Uniform Design set sigma, C and epsilon:
M=fit(y~.,data=sa_ssin,model="ksvm",search="UD",fdebug=TRUE)
print(M@mpar)
# feature selection based on sensitivity analysis:
M=fit(y~.,data=sa_ssin,model="ksvm",search=list(search=mparheuristic("ksvm",n=5)),feature="sabs")
print(M@mpar)
print(M@attributes) # selected attributes (1, 2 and 3 are the relevant inputs)
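# a hedged follow-up: the rminer Importance function can quantify input
# relevance via sensitivity analysis (the default "1D-SA" method is assumed):
SA=Importance(M,data=sa_ssin)
print(round(SA$imp,2)) # relative importance of each input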
# example that shows how transform works:
M=fit(y~.,data=sa_ssin,model="mr") # linear regression
P=predict(M,data.frame(x1=-1000,x2=0,x3=0,x4=0,y=NA)) # P should be negative
print(P)
M=fit(y~.,data=sa_ssin,model="mr",transform="positive")
P=predict(M,data.frame(x1=-1000,x2=0,x3=0,x4=0,y=NA)) # P is not negative
print(P)
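# a hedged variant, assuming "logpositive" is also a valid transform value
# (log transform of the output combined with the positive truncation):
M=fit(y~.,data=sa_ssin,model="mr",transform="logpositive")
P=predict(M,data.frame(x1=-1000,x2=0,x3=0,x4=0,y=NA))
print(P)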
### pure classification example with a generic R (not rminer default) model ###
### nnet is adopted here but virtually ANY fitting function/package could be used:
# since the default nnet prediction is to provide probabilities, there is
# a need to create this "wrapping" function that returns the predicted classes:
predictclass=function(object,newdata)
{ predict(object,newdata,type="class") }
# list with a fit and predict function:
# nnet::nnet (package::function)
model=list(fit=nnet::nnet,predict=predictclass,name="nnet")
data(iris)
# note that size is not a fit argument, thus it is sent directly to nnet:
M=fit(Species~.,iris,model=model,size=3,task="class")
P=predict(M,iris)
print(P)
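# a similar hedged sketch for regression: wrap stats::lm in the same generic
# list interface (the fitlm/predictlm names are illustrative):
fitlm=function(formula,data,...) lm(formula,data)
predictlm=function(object,newdata) predict(object,newdata)
modelr=list(fit=fitlm,predict=predictlm,name="lm wrapper")
data(sa_ssin)
MR=fit(y~.,sa_ssin,model=modelr,task="reg")
PR=predict(MR,sa_ssin)
print(mmetric(sa_ssin$y,PR,"MAE"))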
### multiple models: automl and ensembles
data(iris)
d=iris
names(d)[ncol(d)]="y" # change output name
inputs=ncol(d)-1
metric="AUC"
# consult the help of mparheuristic for more automl and ensemble examples:
#
# automatic machine learning (automl) with 5 distinct models and a "SE" ensemble.
# the single models are tuned with 10 internal hyperparameter searches,
# except ksvm, which uses 13 searches via "UD".
# fit performs an internal validation:
sm=mparheuristic(model="automl3",n=NA,task="prob", inputs= inputs )
method=c("kfold",3,123)
search=list(search=sm,smethod="auto",method=method,metric=metric,convex=0)
M=fit(y~.,data=d,model="auto",search=search,fdebug=TRUE)
P=predict(M,d)
# show leaderboard:
cat("> leaderboard models:",M@mpar$LB$model,"\n")
cat("> validation values:",round(M@mpar$LB$eval,4),"\n")
cat("best model is:",M@model,"\n")
cat(metric,"=",round(mmetric(d$y,P,metric=metric),2),"\n")
# average ensemble of 5 distinct models
# the single models are tuned with 1 (heuristic) hyperparameter search
sm2=mparheuristic(model="automl",n=NA,task="prob",inputs=inputs)
method=c("kfold",3,123)
search2=list(search=sm2,smethod="auto",method=method,metric=metric,convex=0)
M2=fit(y~.,data=d,model="AE",search=search2,fdebug=TRUE)
P2=predict(M2,d)
cat("best model is:",M2@model,"\n")
cat(metric,"=",round(mmetric(d$y,P2,metric=metric),2),"\n")
# example with an invalid model exclusion:
# in this case, randomForest produces an error and warning
# thus it is excluded from the leaderboard
sm=mparheuristic(model="automl3",n=NA,task="prob", inputs= inputs )
method=c("holdout",2/3,123)
search=list(search=sm,smethod="auto",method=method,metric=metric,convex=0)
d2=d
#
d2[,2]=as.factor(1:150) # force randomForest error
M=fit(y~.,data=d2,model="auto",search=search,fdebug=TRUE)
P=predict(M,d2)
# show leaderboard:
cat("> leaderboard models:",M@mpar$LB$model,"\n")
cat("> validation values:",round(M@mpar$LB$eval,4),"\n")
cat("best model is:",M@model,"\n")
cat(metric,"=",round(mmetric(d$y,P,metric=metric),2),"\n")