Usage

train(x, ...)

## S3 method for class 'default':
train(x, y,
      method = "rf",
      preProcess = NULL,
      ...,
      weights = NULL,
      metric = ifelse(is.factor(y), "Accuracy", "RMSE"),
      maximize = ifelse(metric == "RMSE", FALSE, TRUE),
      trControl = trainControl(),
      tuneGrid = NULL,
      tuneLength = 3)

## S3 method for class 'formula':
train(form, data, ..., weights, subset, na.action, contrasts = NULL)
Arguments

x: a data frame or matrix of predictors, with samples in rows and features in columns.

y: a numeric or factor vector containing the outcome for each sample.

form: a formula of the form y ~ x1 + x2 + ...

data: a data frame from which the variables specified in formula are preferentially to be taken.

method: a string specifying which classification or regression model to use (e.g. ada, avNNet, bag, bagEarth, bagFDA, bayesglm, bdk, blackboost, ...). The full set of models, and the tuning parameters for each, is enumerated in the Details section below.

...: arguments passed to the underlying classification or regression routine (such as randomForest). Errors will occur if values for tuning parameters are passed here.

weights: a numeric vector of case weights (only used by models that accept them).

metric: a string specifying the summary metric used to select the optimal model.

maximize: a logical: should the metric be maximized or minimized?

preProcess: a string vector defining the pre-processing of the predictor data. Current possibilities are center, scale, spatialSign, pca, ica, and knnImpute. See trainControl. (NOTE: If given, this argument must be named.)

trControl: a list of values that define how this function acts. See trainControl. (NOTE: If given, this argument must be named.)

tuneGrid: a data frame with possible tuning values. Alternatively, a function can be passed to tuneGrid with arguments called len; its output should match the output of createGrid. (NOTE: If given, this argument must be named.)

tuneLength: an integer denoting the number of levels for each tuning parameter that should be generated by createGrid. (NOTE: If given, this argument must be named.)

Value

A list of class train containing, among other components: preProcess (either NULL or an object of class preProcess) and resample (the resampled performance measures, or NULL; the returnResamp argument of trainControl controls how much of the resampled results are saved). The times component records execution times: everything is for the entire call to train, final for the final model fit and, optionally, prediction for the time to predict new samples (see trainControl).

Details

train can be used to tune models by picking the complexity parameters that are associated with the optimal resampling statistics. For a particular model, a grid of parameters (if any) is created and the model is trained on slightly different data for each candidate combination of tuning parameters. Across each data set, the performance on held-out samples is calculated and the mean and standard deviation are summarized for each combination. The combination with the optimal resampling statistic is chosen as the final model and the entire training set is used to fit a final model.

A variety of models are currently available. The lists below enumerate the models, the corresponding values of the method argument, and the complexity parameters used by train.
Bagging
  bag: vars (dual use)
  bagEarth: nprune, degree (dual use)
  bagFDA: degree, nprune (classification only)
  logicBag: ntrees, nleaves (dual use)
  treebag: no tuning parameters

Bayesian Methods
  nb: fL, usekernel (classification only)
  vbmpRadial: estimateTheta (classification only)

Boosted Trees
  ada: iter, maxdepth, nu (classification only)
  blackboost: maxdepth, mstop (dual use)
  bstTree: nu, maxdepth, mstop (dual use)
  C5.0: winnow, model, trials (classification only)
  gbm: interaction.depth, n.trees, shrinkage (dual use)

Boosting (Non-Tree)
  bstLs: mstop, nu (dual use)
  bstSm: nu, mstop (dual use)
  gamboost: prune, mstop (dual use)
  glmboost: prune, mstop (dual use)
  logitBoost: nIter (classification only)

Elastic Net
  glmnet: alpha, lambda (dual use)

Flexible Discriminant Analysis (MARS basis)
  fda: nprune, degree (classification only)

Gaussian Processes
  gaussprLinear: no tuning parameters
  gaussprPoly: degree, scale (dual use)
  gaussprRadial: sigma (dual use)

Generalized Additive Model
  gam: select, method (dual use)
  gamLoess: degree, span (dual use)
  gamSpline: df (dual use)

Generalized Linear Model
  glm: no tuning parameters
  bayesglm: no tuning parameters
  glmStepAIC: no tuning parameters

Heteroscedastic Discriminant Analysis
  hda: newdim, lambda, gamma (classification only)

High Dimensional Discriminant Analysis
  hdda: model, threshold (classification only)

Independent Component Regression
  icr: n.comp (regression only)

K Nearest Neighbor
  knn: k (dual use)

Learning Vector Quantization
  lvq: size, k (classification only)

Linear Discriminant Analysis
  lda: no tuning parameters
  lda2: dimen (classification only)
  Linda: no tuning parameters
  rrlda: lambda, alpha (classification only)
  sda: diagonal (classification only)
  sddaLDA: no tuning parameters
  slda: no tuning parameters
  stepLDA: direction, maxvar (classification only)

Linear Least Squares
  leapBackward: nvmax (regression only)
  leapForward: nvmax (regression only)
  leapSeq: nvmax (regression only)
  lm: no tuning parameters
  lmStepAIC: no tuning parameters
  rlm: no tuning parameters

Logic Regression
  logforest: no tuning parameters
  logreg: treesize, ntrees (dual use)

Logistic Model Trees
  LMT: iter (classification only)

Logistic/Multinomial Regression
  multinom: decay (classification only)
  plr: cp, lambda (classification only)

Mixture Discriminant Analysis
  mda: subclasses (classification only)
  smda: R, lambda, NumVars (classification only)

Multivariate Adaptive Regression Spline
  earth: nprune, degree (dual use)
  gcvEarth: degree (dual use)

Nearest Shrunken Centroids
  pam: threshold (classification only)

Neural Networks
  avNNet: size, bag, decay (dual use)
  mlp: size (dual use)
  mlpWeightDecay: decay, size (dual use)
  neuralnet: layer1, layer2, layer3 (regression only)
  nnet: size, decay (dual use)
  pcaNNet: size, decay (dual use)
  qrnn: penalty, bag, n.hidden (regression only)

Partial Least Squares
  gpls: K.prov (classification only)
  kernelpls: ncomp (dual use)
  pls: ncomp (dual use)
  simpls: ncomp (dual use)
  spls: eta, kappa, K (dual use)
  widekernelpls: ncomp (dual use)

Penalized Discriminant Analysis
  pda: lambda (classification only)
  pda2: df (classification only)

Penalized Linear Models
  enet: fraction, lambda (regression only)
  foba: lambda, k (regression only)
  krlsPoly: lambda, degree (regression only)
  krlsRadial: sigma, lambda (regression only)
  lars: fraction (regression only)
  lars2: step (regression only)
  lasso: fraction (regression only)
  penalized: lambda1, lambda2 (regression only)
  relaxo: lambda, phi (regression only)
  ridge: lambda (regression only)

Principal Component Regression
  pcr: ncomp (regression only)

Projection Pursuit Regression
  ppr: nterms (regression only)

Quadratic Discriminant Analysis
  qda: no tuning parameters
  QdaCov: no tuning parameters
  sddaQDA: no tuning parameters
  stepQDA: maxvar, direction (classification only)

Radial Basis Function Networks
  rbf: size (dual use)
  rbfDDA: negativeThreshold (classification only)

Random Forests
  Boruta: mtry (dual use)
  cforest: mtry (dual use)
  ORFlog: mtry (classification only)
  ORFpls: mtry (classification only)
  ORFridge: mtry (classification only)
  ORFsvm: mtry (classification only)
  parRF: mtry (dual use)
  qrf: mtry (regression only)
  rf: mtry (dual use)
  rFerns: depth (classification only)
  RRF: mtry, coefReg, coefImp (dual use)
  RRFglobal: coefReg, mtry (dual use)

Recursive Partitioning
  C5.0Tree: no tuning parameters
  ctree: mincriterion (dual use)
  ctree2: maxdepth (dual use)
  evtree: alpha (dual use)
  J48: C (classification only)
  nodeHarvest: maxinter, mode (dual use)
  obliqueTree: variable.selection, oblique.splits (dual use)
  partDSA: cut.off.growth, MPD (dual use)
  rpart: cp (dual use)
  rpart2: maxdepth (dual use)

Regularized Discriminant Analysis
  rda: lambda, gamma (classification only)

Relevance Vector Machines
  rvmLinear: no tuning parameters
  rvmPoly: scale, degree (regression only)
  rvmRadial: sigma (regression only)

ROC Curves
  rocc: xgenes (classification only)

Rule-Based Models
  C5.0Rules: no tuning parameters
  cubist: committees, neighbors (regression only)
  JRip: NumOpt (classification only)
  M5: rules, pruned, smoothed (regression only)
  M5Rules: pruned, smoothed (regression only)
  OneR: no tuning parameters
  PART: pruned, threshold (classification only)

Self-Organizing Maps
  bdk: xdim, ydim, xweight, topo (dual use)
  xyf: xdim, ydim, topo, xweight (dual use)

Sparse Linear Discriminant Analysis
  PenalizedLDA: K, lambda (classification only)
  sparseLDA: lambda, NumVars (classification only)

Supervised Principal Components
  superpc: threshold, n.components (regression only)

Support Vector Machines
  lssvmRadial: sigma (classification only)
  svmLinear: C (dual use)
  svmPoly: degree, scale, C (dual use)
  svmRadial: C, sigma (dual use)
  svmRadialCost: C (dual use)
By default, the function createGrid is used to define the candidate values of the tuning parameters. The user can also specify their own. To do this, a data frame is created with a column for each tuning parameter in the model. The column names must be the same as those listed in the table above, with a leading dot. For example, ncomp would have the column heading .ncomp. This data frame can then be passed to train via the tuneGrid argument.
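For instance, here is a minimal sketch of a hand-made grid (assuming the pls model listed above and the built-in iris data; the object names are illustrative):

## A hand-made tuning grid for a PLS model; note the leading dot
## in the column name matching the ncomp parameter.
## (Assumes the pls package is installed.)
plsGrid <- data.frame(.ncomp = 1:3)
plsFit <- train(Sepal.Length ~ ., data = iris[, 1:4],
                method = "pls",
                tuneGrid = plsGrid)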
In some cases, models may require control arguments. These can be passed via the three dots argument. Note that some models can also specify tuning parameters in their control objects; if they are specified there, those values will be superseded by the ones given in the tuneGrid argument.
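As a sketch (using the rpart model listed above and the BostonHousing data that also appear in the examples below), a control object travels through the dots to the underlying fitting function:

library(rpart)
library(mlbench)
data(BostonHousing)
## rpart.control is not a tuning parameter, so it is passed through
## the three dots to the underlying rpart call.
rpartTune <- train(medv ~ ., data = BostonHousing,
                   method = "rpart",
                   control = rpart.control(minsplit = 30))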
The formula interface to train will always convert factor variables to dummy variables. For several models (rpart, rf, gbm, treebag, nb, J48, PART, JRip, OneR, ctree, cforest, bag, cubist, C5.0, C5.0Tree, C5.0Rules and custom), factor predictor variables can instead be passed directly to the underlying modeling function using the train(x, y) interface. In these cases, the models can treat factor variables differently than most (i.e. not as a decomposed set of dummy variables), as the sketch below illustrates.
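To make the difference concrete, here is a sketch using the BostonHousing data, whose chas column is a factor (the object names are illustrative):

library(mlbench)
data(BostonHousing)   # column 'chas' is a factor

## Formula interface: 'chas' is expanded into a dummy variable
dummyFit <- train(medv ~ ., data = BostonHousing, method = "rpart")

## x/y interface: 'chas' reaches rpart as a true factor
## (medv, the outcome, is column 14)
factorFit <- train(BostonHousing[, -14], BostonHousing$medv,
                   method = "rpart")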
The web page http://caret.r-forge.r-project.org/ has more details and examples related to this function.
train can be used with "explicit parallelism", where different resamples (e.g. cross-validation groups) and models can be split up and run on multiple machines or processors. By default, train will use a single processor on the host machine. As of version 4.99 of this package, the framework used for parallel processing is the foreach package. To run the resamples in parallel, the code for train does not change; prior to the call to train, a parallel backend is registered with foreach (see the examples below).
See Also

trainControl, update.train, modelLookup, createGrid, createFolds

Examples

#######################################
## Classification Example
data(iris)
TrainData <- iris[,1:4]
TrainClasses <- iris[,5]
knnFit1 <- train(TrainData, TrainClasses,
                 method = "knn",
                 preProcess = c("center", "scale"),
                 tuneLength = 10,
                 trControl = trainControl(method = "cv"))

knnFit2 <- train(TrainData, TrainClasses,
                 method = "knn",
                 preProcess = c("center", "scale"),
                 tuneLength = 10,
                 trControl = trainControl(method = "boot"))
library(MASS)
nnetFit <- train(TrainData, TrainClasses,
                 method = "nnet",
                 preProcess = "range",
                 tuneLength = 2,
                 trace = FALSE,
                 maxit = 100)
#######################################
## Regression Example
library(mlbench)
data(BostonHousing)
lmFit <- train(medv ~ . + rm:lstat,
               data = BostonHousing,
               method = "lm")
library(rpart)
rpartFit <- train(medv ~ .,
                  data = BostonHousing,
                  method = "rpart",
                  tuneLength = 9)
#######################################
## Example with a custom metric
madSummary <- function(data, lev = NULL, model = NULL) {
  out <- mad(data$obs - data$pred, na.rm = TRUE)
  names(out) <- "MAD"
  out
}
robustControl <- trainControl(summaryFunction = madSummary)
marsGrid <- expand.grid(.degree = 1,
                        .nprune = (1:10) * 2)

earthFit <- train(medv ~ .,
                  data = BostonHousing,
                  method = "earth",
                  tuneGrid = marsGrid,
                  metric = "MAD",
                  maximize = FALSE,
                  trControl = robustControl)
#######################################
## Parallel Processing Example via multicore package
## library(doMC)
## registerDoMC(2)
## NOTE: don't run models from RWeka when using
## multicore. The session will crash.
## The code for train() does not change:
set.seed(1)
usingMC <- train(medv ~ .,
                 data = BostonHousing,
                 method = "glmboost")
## or use:
## library(doMPI) or
## library(doSMP) and so on