# NOT RUN {
library("OpenML")
library("farff")
library("stringr")
library("stats")
library("data.table")
library("rpart")
library("catSplit")
# Example: encode the categorical variables of an OpenML dataset with
# catSplitEncoding() and merge the encodings back into the train/test sets.

# Download an example dataset from OpenML (requires network access)
datInfo <- getOMLDataSet(data.id = 41283, verbosity = 0)
targetVariable <- datInfo$target.features
dat <- datInfo$data
datasetName <- datInfo$desc$name

# Categorical variables are the factor columns, excluding the target variable
catVariables <- names(Filter(is.factor, dat))
catVariables <- catVariables[!(catVariables %in% targetVariable)]
problemType <- "classification"

# Split dat into train (75%) and test (25%) sets.
# The split is random; call set.seed() beforehand for a reproducible split.
smp_size <- floor(0.75 * nrow(dat))
train_ind <- sample(seq_len(nrow(dat)), size = smp_size)
# NOTE(review): as.data.frame.matrix() is called directly on a data-frame
# subset; kept as in the original example because replacing it with
# as.data.frame() may change column types -- confirm before changing.
train <- as.data.frame.matrix(dat[train_ind, ])
test <- as.data.frame.matrix(dat[-train_ind, ])

# Returns a list with 2 elements: the encoding frame for the train data and
# the encoding frame for the test data
train_and_test_cat <- catSplitEncoding(
  targetVariable = targetVariable,
  trainData = train,
  testData = test,
  problemType = problemType,
  datasetName = datasetName,
  catVariables = catVariables
)

# Extract the encoding frames themselves. `[[` is required here: `[` would
# return one-element sub-lists instead of the frames they contain.
trainCat <- train_and_test_cat[[1]]
testCat <- train_and_test_cat[[2]]

# Drop the original categorical variables from the train and test data
trainData <- train[!(names(train) %in% catVariables)]
testData <- test[!(names(test) %in% catVariables)]

# Merge encoding frames and the remaining original columns
train <- cbind(trainCat, trainData)
test <- cbind(testCat, testData)
# }
# NOT RUN {
# }
# Run the code above in your browser using DataLab