# NOT RUN {
# Example with cross validation and unemployment data
library(Ecdat)
library(mgcv)
data(UnempDur)
summary(UnempDur$spell)
# Extract subset of data
set.seed(635)
IDsample <- sample(1:dim(UnempDur)[1], 100)
UnempDurSubset <- UnempDur [IDsample, ]
head(UnempDurSubset)
range(UnempDurSubset$spell)
# Generate training and test data
set.seed(7550)
TrainIndices <- sample (x = 1:dim(UnempDurSubset) [1], size = 75)
TrainUnempDur <- UnempDurSubset [TrainIndices, ]
TestUnempDur <- UnempDurSubset [-TrainIndices, ]
# Convert to long format
LongTrain <- dataLong(dataShort = TrainUnempDur, timeColumn = "spell", eventColumn = "censor1")
LongTest <- dataLong(dataShort = TestUnempDur, timeColumn = "spell", eventColumn = "censor1")
# Convert factor to numeric for smoothing
LongTrain$timeInt <- as.numeric(as.character(LongTrain$timeInt))
LongTest$timeInt <- as.numeric(as.character(LongTest$timeInt))
######################################################################
# Estimate a generalized, additive model in discrete survival analysis
gamFit <- gam (formula = y ~ s(timeInt) + age + logwage, data = LongTrain, family = binomial())
# Estimate survival function of each person in the test data
oneMinusPredHaz <- 1 - predict(gamFit, newdata = LongTest, type = "response")
predSurv <- aggregate(oneMinusPredHaz ~ obj, data = LongTest, FUN = cumprod)
# Prediction error in first interval
tryPredErrDisc1 <- predErrCurve (timepoints = 1,
estSurvList = predSurv [[2]], testTime = TestUnempDur$spell,
testEvent=TestUnempDur$censor1, trainTime = TrainUnempDur$spell,
trainEvent=TrainUnempDur$censor1)
tryPredErrDisc1
# Prediction error of the 2. to 10. interval
tryPredErrDisc2 <- predErrCurve (timepoints = 2:10,
estSurvList = predSurv [[2]], testTime = TestUnempDur$spell,
testEvent = TestUnempDur$censor1, trainTime = TrainUnempDur$spell,
trainEvent = TrainUnempDur$censor1)
tryPredErrDisc2
plot(tryPredErrDisc2)
########################################
# Fit a random discrete survival forest
library(ranger)
LongTrainRF <- LongTrain
LongTrainRF$y <- factor(LongTrainRF$y)
rfFit <- ranger(formula = y ~ timeInt + age + logwage, data = LongTrainRF,
probability = TRUE)
# Estimate survival function of each person in the test data
oneMinusPredHaz <- 1 - predict(rfFit, data = LongTest)$predictions[, 2]
predSurv <- aggregate(oneMinusPredHaz ~ obj, data = LongTest, FUN = cumprod)
# Prediction error in first interval
tryPredErrDisc1 <- predErrCurve (timepoints = 1,
estSurvList = predSurv [[2]], testTime = TestUnempDur$spell,
testEvent = TestUnempDur$censor1, trainTime = TrainUnempDur$spell,
trainEvent = TrainUnempDur$censor1)
tryPredErrDisc1
# Prediction error of the 2. to 10. interval
tryPredErrDisc2 <- predErrCurve (timepoints = 2:10,
estSurvList = predSurv [[2]], testTime = TestUnempDur$spell,
testEvent = TestUnempDur$censor1, trainTime = TrainUnempDur$spell,
trainEvent = TrainUnempDur$censor1)
tryPredErrDisc2
plot(tryPredErrDisc2)
# }
Run the code above in your browser using DataLab