# Starting with nwtco, rename variables, convert some to factors, drop
# in.subcohort (which is used elsewhere for a different simulated dataset), etc.
library(survival, quietly=TRUE)
# Assemble the analysis data-frame from nwtco: one row per nwtco row, with
# descriptive column names and labelled factors. nwtco's in.subcohort column
# is deliberately not carried over.
cchsData <- with(nwtco, data.frame(
  id = seqno,
  # Both histology columns use the same two labels.
  localHistol = factor(instit, labels = c("favorable", "unfavorable")),
  centralLabHistol = factor(histol, labels = c("favorable", "unfavorable")),
  stage = factor(stage, labels = c("I", "II", "III", "IV")),
  study = factor(study, labels = c("NWTS-3", "NWTS-4")),
  isCase = as.logical(rel),
  time = edrel,
  ageAtDiagnosis = age / 12  # nwtco's age column is in months
))
# Define the intended sampling fractions for the two strata.
# Target proportion of each stratum to draw into the subcohort; the element
# names identify the strata.
samplingFractions <- setNames(c(0.05, 0.2), c("favorable", "unfavorable"))
# Select participants/rows to be in the subcohort by stratified simple random
# sampling.
# Flag the subcohort: within each localHistol stratum, draw
# round(fraction * stratum size) rows at random without replacement.
# The seed is fixed so that the same subcohort is selected on every run.
cchsData$inSubcohort <- logical(nrow(cchsData))  # all FALSE to begin with
set.seed(1)
for (stratumName in levels(cchsData$localHistol)) {
  inThisStratum <- cchsData$localHistol == stratumName
  stratumRows <- which(inThisStratum)
  # `[[` stops with an error if no sampling fraction is named for this
  # stratum, rather than passing NA on to the sampler.
  stratumSubcohortSize <-
    round(samplingFractions[[stratumName]] * length(stratumRows))
  # Index via sample.int() instead of calling sample() on the row numbers:
  # sample(x, ...) treats a length-one numeric x as 1:x, so a stratum with a
  # single row could otherwise have a row from another stratum selected. For
  # strata with more than one row this makes exactly the same draws.
  rowsToSetTrue <-
    stratumRows[sample.int(length(stratumRows), size = stratumSubcohortSize)]
  cchsData$inSubcohort[rowsToSetTrue] <- TRUE
}
# Change the sampling fractions to their exact values.
# Recompute the sampling fractions from the subcohort that was actually
# drawn: per stratum, (rows in the subcohort) / (rows in the full cohort).
stratumCohortSizes <- table(cchsData$localHistol)
stratumSubcohortSizes <- table(cchsData[cchsData$inSubcohort, "localHistol"])
# c() turns the resulting table into a plain vector.
samplingFractions <- c(stratumSubcohortSizes / stratumCohortSizes)
# Keep only the cases and the subcohort.
# Restrict the data to rows that are a case, in the subcohort, or both.
cchsData <- cchsData[with(cchsData, isCase | inSubcohort), ]
# Attach to every remaining row the sampling fraction of its localHistol
# stratum, looked up by stratum name.
cchsData$sampFrac <- with(
  cchsData,
  samplingFractions[match(localHistol, names(samplingFractions))]
)