require(mvtnorm, quiet=TRUE)
############################################
# dimension v, number of observations n
v <- 5
n <- 200
simdata <- array( rmvnorm(n*v, mean=rep(0,v),
sigma = diag(rep(1,v))), c(n,v) )
# detect outliers
results <- cerioli2010.irmcd.test( simdata,
signif.gamma=c(0.05,0.01,0.001) )
# count number of outliers detected for each
# significance level
colSums( results$outliers )
#############################################
# add some contamination to illustrate how to
# detect outliers using the irmcd test
# 10/200 = 5% contamination
simdata[ sample(n,10), ] <- array(
rmvnorm( 10*v, mean=rep(2,v), sigma = diag(rep(1,v))),
c(10,v)
)
results <- cerioli2010.irmcd.test( simdata,
signif.gamma=0.01 )
mean( results$outliers[,1,drop=TRUE] )
#############################################
# banknote example from Cerioli (2010)
if (FALSE) {
require(rrcov) # for CovMcd
require(mclust) # banknote data set lives here
data(banknote, package="mclust")
# length, width of left edge, width of right edge,
# width of bottom edge, width of top edge, length
# of image diagonal, counterfeit (1=counterfeit)
bnk.gamma <- 0.01
# genuine banknotes
# classical mean and covariance
banknote.real <- banknote[ banknote[,"Status"]=="genuine", 2:7 ]
cov.cls <- CovClassic( banknote.real )
# 1 - (1 - 0.01)^(1/100) quantile of scaled-Beta distribution
# with m=100 and v=6
bnk.m <- nrow( banknote.real )
bnk.v <- ncol( banknote.real )
bnk.alpha <- 1. - ((1. - bnk.gamma)^(1./bnk.m))
cutoff.cls <- (bnk.m-1.)*(bnk.m-1.)*qbeta( 1. - bnk.alpha, bnk.v/2.,
(bnk.m - bnk.v - 1.)/2.)/bnk.m
# Figure 4 (left) in Cerioli (2010)
plot( getDistance( cov.cls ), xlab="Index number",
ylab="Squared Mahalanobis Distance", type="p",
ylim=c(0,45)
)
abline( h=cutoff.cls )
# reweighted MCD, maximum breakdown point case
cov.rob <- CovMcd( banknote.real,
alpha=floor((bnk.m + bnk.v + 1.)/2.)/bnk.m, nsamp="best" )
# cutoff using chi-squared individually
cutoff.rmcdind <- qchisq(1. - bnk.gamma, df=bnk.v)
# cufoff using simultaneous chi-square
cutoff.rmcdsim <- qchisq(1. - bnk.alpha, df=bnk.v)
# scaled-F cutoff using FSRMCD
# cutoff value is returned by critvalfcn for observations
# with weight=0
tmp.fsrmcd <- cerioli2010.fsrmcd.test( banknote.real,
signif.alpha=bnk.alpha )
cutoff.fsrmcd <- unique(tmp.fsrmcd$critvalfcn( bnk.alpha )[tmp.fsrmcd$weights==0])
# Figure 4 (right)
plot( getDistance( cov.rob ), xlab="Index number",
ylab="Squared Robust Reweighted Distance", type="p",
ylim=c(0,45)
)
abline( h=cutoff.rmcdind, lty="dotted" )
abline( h=cutoff.rmcdsim, lty="dashed" )
abline( h=cutoff.fsrmcd, lty="solid" )
legend( "topright", c("RMCD_ind","RMCD","FSRMCD"),
lty=c("dotted","dashed","solid") )
# forged banknotes
# classical mean and covariance
banknote.fake <- banknote[ banknote[,"Status"]=="counterfeit", 2:7 ]
cov.cls <- CovClassic( banknote.fake )
# 1 - (1 - 0.01)^(1/100) quantile of scaled-Beta distribution
# with m=100 and v=6
bnk.m <- nrow( banknote.fake )
bnk.v <- ncol( banknote.fake )
bnk.alpha <- 1. - ((1. - bnk.gamma)^(1./bnk.m))
cutoff.cls <- (bnk.m-1.)*(bnk.m-1.)*qbeta( 1. - bnk.alpha, bnk.v/2.,
(bnk.m - bnk.v - 1.)/2.)/bnk.m
# Figure 5 (left) in Cerioli (2010)
plot( getDistance( cov.cls ), xlab="Index number",
ylab="Squared Mahalanobis Distance", type="p",
ylim=c(0,45)
)
abline( h=cutoff.cls )
# reweighted MCD, maximum breakdown point case
cov.rob <- CovMcd( banknote.fake,
alpha=floor((bnk.m + bnk.v + 1.)/2.)/bnk.m, nsamp="best" )
# cutoff using chi-squared individually
cutoff.rmcdind <- qchisq(1. - bnk.gamma, df=bnk.v)
# scaled-F cutoff using FSRMCD
# cutoff value is returned by critvalfcn for observations
# with weight=0
tmp.fsrmcd <- cerioli2010.fsrmcd.test( banknote.fake,
signif.alpha=bnk.alpha )
cutoff.fsrmcd <- unique(tmp.fsrmcd$critvalfcn( bnk.alpha )[tmp.fsrmcd$weights==0])
cutoff.irmcd <- unique(tmp.fsrmcd$critvalfcn( bnk.gamma )[tmp.fsrmcd$weights==0])
# Figure 5 (right) in Cerioli (2010)
plot( getDistance( cov.rob ), xlab="Index number",
ylab="Squared robust reweighted Distance", type="p",
ylim=c(0,150)
)
abline( h=cutoff.rmcdind, lty="dotted" )
abline( h=cutoff.fsrmcd, lty="dashed" )
abline( h=cutoff.irmcd, lty="solid" )
legend( "topright", c("RMCD_ind","FSRMCD","IRMCD"),
lty=c("dotted","dashed","solid") )
}
#############################################
# example of how to ensure the size of the intersection test is correct
if (FALSE) {
n.sim <- 5000
simdata <- array(
rmvnorm(n*v*n.sim, mean=rep(0,v), sigma=diag(rep(1,v))),
c(n,v,n.sim)
)
# in practice we'd do this using one of the parallel processing
# methods out there
results <- apply( simdata, 3, function(dm) {
z <- cerioli2010.irmcd.test( dm,
signif.gamma=0.01 )
# true if outliers were detected in the data, false otherwise
any(z$outliers[,1,drop=TRUE])
})
# count the percentage of samples where outliers were detected;
# should be close to the significance level value used (0.01) in these
# samples for the intersection test
mean(results)
}
Run the code above in your browser using DataLab