# NOT RUN {
library(bigstep)
library(RcppEigen)
# data1
set.seed(1)
n <- 100
M <- 50
X <- matrix(rnorm(n*M), ncol=M)
colnames(X) <- 1:M
snp <- 10*(1:5)
y <- rowSums(X[, snp]) + rnorm(n)
# you can use classical criteria to such type of data:
selectModel(X, y, crit=aic)
model <- selectModel(X, y, crit=bic)
summary(fastLm(X[, model], y))
# data2
set.seed(1)
n <- 1e3
M <- 1e4
X <- matrix(rnorm(M*n), ncol=M)
colnames(X) <- 1:M
snp <- 1e3*(1:10)
y <- rowSums(X[, snp]) + rnorm(n, sd=2)
Q <- matrix(rnorm(n*5), n, 5) # additional variables
colnames(Q) <- -(1:5)
# single tests + multi-forward + multi-backward + stepwise using mBIC
selectModel(X, y, p=M)
selectModel(X, y, p=M, multif=FALSE) # only single tests + stepwise
selectModel(X, y, p=M, multif=FALSE, minpv=1) # only stepwise
# you can start with a model with variables from Q and force that
# 1st and 5th would not be removed
selectModel(X, y, Xm=Q, stay=c(1, 5), p=M, crit=mbic2)
# after reducing the size of matrix X (removing variables with
# p-value > minpv), save it to a file (you can use it in another
# model selection, it will be faster)
selectModel(X, y, p=M, file.out="Xshort")
Xs <- read.table("Xshort.txt", header=TRUE)
selectModel(Xs, y, p=M, minpv=1)
# selectModel(X, y, crit=bic) # bad idea...
# you can define your own criterion
# (it can depend on log-likelihood (loglik), the number of observations (n),
# the number of variables currently in a model (k), the matrix with variables
# currenly in a model (Xm) and constants)
myCrit <- function(loglik, k, n, c1=2, c2=4) {
-c1*loglik + 10*sqrt(k*c2)
}
selectModel(X, y, multif=FALSE, crit=myCrit, c1=0.18)
selectModel(X, y, multif=FALSE,
crit=function(loglik, k, n) -0.17*loglik + 10*sqrt(k*4))
# your criterion can depend on eg. the average correlation between variables
# currently in a model
myCrit2 <- function(loglik, Xm) {
n <- nrow(Xm)
k <- ncol(Xm)
m.cor <- 0
np <- 1
if (k > 1) {
pairs <- combn(1:k, 2)
np <- ncol(pairs)
for (i in 1:np) {
res <- suppressWarnings(abs(cor(Xm[, pairs[1, i]], Xm[, pairs[2, i]])))
if (is.na(res))
res <- 0
m.cor <- m.cor + res
}
}
-2*loglik + k*log(n) + n*k*m.cor/np
}
selectModel(X, y, multif=FALSE, crit=myCrit2)
# you can define your own fitting function
# (it can depend on X and y and should return the log-likelihood)
fitLinear2 <- function(X, y) {
model <- RcppEigen::fastLmPure(X, y, method=2)
rss <- sum(model$residuals^2)
n <- length(y)
loglik <- -n/2*log(rss/n)
return(loglik)
}
system.time(selectModel(X, y, p=M, fitFun=fitLinear))
system.time(selectModel(X, y, p=M, fitFun=fitLinear2))
# fastLmPure with method=2 is faster but sometimes gives errors
# data3 (big data)
library(bigmemory)
# if it is possible, set type="char", reading will be quicker
X <- read.big.matrix("X.txt", sep=" ", type="char", head=TRUE)
y <- read.table("Trait.txt")
Q <- read.table("Q.txt")
M <- ncol(X)
selectModel(X, y, p=M)
selectModel(X, y, p=M, minpv=0.001) # if you do not have time...
# data4
set.seed(1)
n <- 50
M <- 1e3
X <- matrix(runif(M*n), ncol=M)
colnames(X) <- 1:M
snp <- 1e2*(1:5)
mu <- rowSums(X[, snp])
y <- rpois(n, exp(mu))
selectModel(X, y, p=M, fitFun=fitLinear, multif=FALSE)
selectModel(X, y, p=M, fitFun=fitPoisson, multif=FALSE)
set.seed(1)
n <- 100
X <- matrix(runif(M*n, -5, 5), ncol=M)
colnames(X) <- 1:M
mu <- rowSums(X[, snp])
p <- 1/(1 + exp(-mu))
y <- rbinom(n, 1, p)
selectModel(X, y, p=M, fitFun=fitLogistic, multif=FALSE)
# }
# NOT RUN {
# }
Run the code above in your browser using DataLab