require(mvtnorm, quiet=TRUE)
############################################
# dimension v, number of observations n
v <- 5
n <- 200
simdata <- array( rmvnorm(n*v, mean=rep(0,v),
sigma = diag(rep(1,v))), c(n,v) )
#
# detect outliers with nominal sizes
# c(0.05,0.01,0.001)
#
sa <- 1. - ((1. - c(0.05,0.01,0.001))^(1./n))
results <- cerioli2010.fsrmcd.test( simdata,
signif.alpha=sa )
# count number of outliers detected for each
# significance level
colSums( results$outliers )
#############################################
# add some contamination to illustrate how to
# detect outliers using the fsrmcd test
# 10/200 = 5% contamination
simdata[ sample(n,10), ] <- array(
rmvnorm( 10*v, mean=rep(2,v), sigma = diag(rep(1,v))),
c(10,v)
)
results <- cerioli2010.fsrmcd.test( simdata,
signif.alpha=sa )
colMeans( results$outliers )
if (FALSE) {
#############################################
# example of how to ensure the size of the intersection test is correct
n.sim <- 5000
simdata <- array(
rmvnorm(n*v*n.sim, mean=rep(0,v), sigma=diag(rep(1,v))),
c(n,v,n.sim)
)
# in practice we'd do this using one of the parallel processing
# methods out there
sa <- 1. - ((1. - 0.01)^(1./n))
results <- apply( simdata, 3, function(dm) {
z <- cerioli2010.fsrmcd.test( dm,
signif.alpha=sa )
# true if outliers were detected in the data, false otherwise
any(z$outliers[,1,drop=TRUE])
})
# count the percentage of samples where outliers were detected;
# should be close to the significance level value used (0.01) in these
# samples for the intersection test.
mean(results)
}
Run the code above in your browser using DataLab