set.seed(0)
x <- runif(10)
x[1] <- 2
group <- gl(2, 5)
CheckForOutliers(x, group, method = "dist")
CheckForOutliers(x, group, method = "logical")
CheckForOutliers(x, group, method = "idx")
graphics::par(mfrow = c(1, 2))
bg <- c(3, 2)[1 + CheckForOutliers(x, group, method = "logical")]
graphics::plot(x = as.numeric(group), y = x, pch = 21, cex = 3,
bg = bg, main = "n_sd=3", las = 1, xlim = c(0.5, 2.5))
bg <- c(3, 2)[1 + CheckForOutliers(x, group, n_sd = 4, method = "logical")]
graphics::plot(x = as.numeric(group), y = x, pch = 21, cex = 3,
bg = bg, main = "n_sd=4", las = 1, xlim = c(0.5, 2.5))
graphics::par(mfrow = c(1, 1))
# load raw data and sample description
raw <- MetabolomicsBasics::raw
sam <- MetabolomicsBasics::sam
# no missing data in this matrix
all(is.finite(raw))
# check for outliers (computing n-fold sd distance from group mean)
tmp <- apply(raw, 2, CheckForOutliers, group = sam$GT, method = "dist")
# plot a histogram of the observed distances
graphics::hist(tmp, breaks = seq(0, ceiling(max(tmp))), main = "n*SD from mean", xlab = "n")
# Calculate the amount of values exceeding five-sigma and compare with a standard gaussian
table(tmp > 5)
round(100 * sum(tmp > 5) / length(tmp), 2)
# \donttest{
gauss <- CheckForOutliers(x = rnorm(prod(dim(raw))), method = "dist")
sapply(1:5, function(i) {
data.frame("obs" = sum(tmp > i), "gauss" = sum(gauss > i))
})
# compare a PCA w/wo outliers
RestrictedPCA(
dat = raw, sam = sam, use.sam = sam$GT %in% c("Mo17", "B73"), group.col = "GT",
fmod = "GT+Batch+Order", P = 1, sign.col = "GT", legend.x = NULL, text.col = "Batch", medsd = TRUE
)
raw_filt <- raw
raw_filt[tmp > 3] <- NA
RestrictedPCA(
dat = raw_filt, sam = sam, use.sam = sam$GT %in% c("Mo17", "B73"), group.col = "GT",
fmod = "GT+Batch+Order", P = 1, sign.col = "GT", legend.x = NULL, text.col = "Batch", medsd = TRUE
)
# }
Run the code above in your browser using DataLab