cv.smtl: cross-validation function

Description

cv.smtl: cross-validation function

Usage

cv.smtl(
  y,
  X,
  study = NA,
  grid = NA,
  nfolds = NA,
  commonSupp = FALSE,
  multiTask = TRUE,
  lambda_1 = TRUE,
  lambda_2 = FALSE,
  lambda_z = TRUE,
  maxIter = 2500,
  LocSrch_skip = 1,
  LocSrch_maxIter = 10,
  messageInd = FALSE,
  independent.regs = FALSE
)

Value

A list

Arguments

y: A numeric outcome vector or matrix (for multi-label problems)
X: A design (feature) matrix
study: An integer vector specifying the task ID
grid: A dataframe with column names "s", "lambda_1", "lambda_2" and "lambda_z" (if commonSupp = FALSE) with tuning values
nfolds: An integer specifying number of CV folds
commonSupp: A boolean specifying whether the task models should have the same support
multiTask: A boolean only used if study/task indices are provided: used to distinguish between a Multi-Task Learning Tuning (TRUE) or Domain Generalization Tuning (FALSE)
lambda_1: An optional boolean: if a grid is not provided, then set to TRUE if you want an automatic grid to be generated with non-zero values for this hyperparameter
lambda_2: An optional boolean: if a grid is not provided, then set to TRUE if you want an automatic grid to be generated with non-zero values for this hyperparameter
lambda_z: An optional boolean: if a grid is not provided, then set to TRUE if you want an automatic grid to be generated with non-zero values for this hyperparameter
maxIter: An integer specifying the maximum number of coordinate descent iterations
LocSrch_skip: An integer specifying whether to use local search at every tuning value (set to 1), every other value (set to 2), every third (set to 3),...
LocSrch_maxIter: An integer specifying the maximum number of local search iterations
messageInd: A boolean (verbose) of whether to print messages
independent.regs: A boolean of whether models are completely independent (only set to TRUE for benchmarks)

Examples

Run this code


#####################################################################################
##### simulate data
#####################################################################################
# Simulates K sparse linear-regression tasks that share one stacked design
# matrix X, one stacked outcome vector y, and a vector of task labels.
set.seed(1) # fix the seed to get a reproducible result
K <- 4 # number of datasets (tasks)
p <- 100 # covariate dimension
s <- 5 # support size
q <- 7 # size of subset of covariates that can be non-zero for any task
n_k <- 50 # task sample size
# Total number of rows is the per-task sample size times the number of tasks,
# so that X, y and the task labels below all have matching lengths.
N <- n_k * K # full dataset sample size
X <- matrix(rnorm(N * p), nrow = N, ncol = p) # full design matrix
B <- matrix(1 + rnorm(K * (p + 1)), nrow = p + 1, ncol = K) # betas before making sparse
Z <- matrix(0, nrow = p, ncol = K) # matrix of supports
y <- numeric(N) # outcome vector (numeric, filled task by task below)

# randomly sample support to make betas sparse:
# each task gets exactly s non-zero coefficients among the first q covariates
for (j in seq_len(K)) {
  Z[seq_len(q), j] <- sample(c(rep(1, s), rep(0, q - s)), q, replace = FALSE)
}
B[-1, ] <- B[-1, ] * Z # make betas sparse and ensure all models have an intercept

task <- rep(seq_len(K), each = n_k) # vector of task labels (indices)

# iterate through and make each task specific dataset
for (j in seq_len(K)) {
  indx <- which(task == j) # indices of task
  e <- rnorm(n_k) # task-specific noise
  y[indx] <- B[1, j] + X[indx, ] %*% B[-1, j] + e
}

colnames(B) <- paste0("beta_", seq_len(K))
# Row 1 of B holds the intercept (see the use of B[1, j] above); the remaining
# p rows line up with the columns of X.
rownames(B) <- c("Intercept", paste0("X_", seq_len(p)))

print("Betas")
print(round(B[1:8, ], 2))
    
    ###########################
    # custom tuning grid
    ###########################
    # Four candidate settings: support sizes 4 and 5, each paired with a small
    # (0.01) and a larger (0.1) value used for both lambda_1 and lambda_z;
    # lambda_2 is held at zero throughout.
    grid <- data.frame(
      s = rep(c(4, 5), each = 2),
      lambda_1 = rep(c(0.01, 0.1), times = 2),
      lambda_2 = numeric(4),
      lambda_z = rep(c(0.01, 0.1), times = 2)
    )
    
    #################################################
    # cross validation with custom tuning grid
    ##################################################
# The whole example is wrapped in if (FALSE) so it never runs by default.
if (FALSE) {

  # The examples are quite time consuming, so they are additionally gated on
  # the AUTO_JULIA_INSTALL environment variable being exactly "true".
  # NOTE(review): the original comment here mentioned initialisation and
  # automatic installation "if necessary" (presumably of Julia) -- confirm
  # where that actually happens.
  if (identical(Sys.getenv("AUTO_JULIA_INSTALL"), "true")) {

    # tune over the user-supplied grid with 5-fold cross validation
    tn <- sMTL::cv.smtl(
      y = y,
      X = X,
      study = task,
      commonSupp = FALSE,
      grid = grid,
      nfolds = 5,
      multiTask = FALSE
    )

    # model fitting with the hyperparameters stored in tn$best.1se
    # NOTE(review): tuning above uses commonSupp = FALSE while this fit uses
    # commonSupp = TRUE -- confirm that the mismatch is intended.
    mod <- sMTL::smtl(
      y = y,
      X = X,
      study = task,
      s = tn$best.1se$s,
      commonSupp = TRUE,
      lambda_1 = tn$best.1se$lambda_1,
      lambda_z = tn$best.1se$lambda_z
    )

    ######################################################
    # cross validation with automatically generated grid
    #######################################################
    # No grid is supplied; the lambda_* flags choose which hyperparameters
    # get non-zero values in the automatically generated grid.
    tn <- sMTL::cv.smtl(
      y = y,
      X = X,
      study = task,
      commonSupp = FALSE,
      lambda_1 = TRUE,
      lambda_2 = FALSE, # was `lambda_w`, which is not an argument of cv.smtl
      lambda_z = TRUE,
      nfolds = 5,
      multiTask = FALSE
    )

    # model fitting with the hyperparameters stored in tn$best.1se
    # NOTE(review): same commonSupp mismatch as in the first fit -- confirm.
    mod <- sMTL::smtl(
      y = y,
      X = X,
      study = task,
      s = tn$best.1se$s,
      commonSupp = TRUE,
      lambda_1 = tn$best.1se$lambda_1,
      lambda_z = tn$best.1se$lambda_z
    )

    # show the first 8 rows of the fitted coefficient matrix
    print(round(mod$beta[1:8, ], 2))
  }
}

Run the code above in your browser using DataLab