summarize: Separates numeric from factor variables and returns a separate summary for each type of variable.

Description

The work is done by the functions summarizeNumerics and summarizeFactors. Please see the help pages for those functions for complete details.

Usage

summarize(dat, ...)

Arguments

dat

A data frame

...

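Optional arguments that are passed to summarizeNumerics and summarizeFactors. These may be used: maxLevels: the maximum number of levels that will be reported. alphaSort: if TRUE (default), the columns are re-organized in alphabetical order; if FALSE, the columns keep their original order. (The argument list is truncated here; see the help pages for summarizeNumerics and summarizeFactors for the full set of options.)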

Value

A list with 2 objects, numerics and factors. numerics is a matrix of summary information, while factors is a list of factor summaries.

Examples

Run this code

library(rockchalk)


## Build a reproducible example data frame that mixes factor and
## numeric columns, then summarize it in several ways.
set.seed(23452345)
N <- 100
## Two factor templates; sample() below draws N values from each
## with replacement.
x1 <- gl(12, 2, labels = LETTERS[1:12])
## NOTE(review): 13 labels (LETTERS[12:24]) are supplied for n = 8
## levels -- confirm this mismatch is intended.
x2 <- gl(8, 3, labels = LETTERS[12:24])
x1 <- sample(x = x1, size=N, replace = TRUE)
x2 <- sample(x = x2, size=N, replace = TRUE)
## Numeric columns drawn from several distributions
z1 <- rnorm(N)
a1 <- rnorm(N, mean = 1.2, sd = 1.7)
## Poisson counts whose rate varies row by row with a1
a2 <- rpois(N, lambda = 10 + a1)
## Gamma draws; positional arguments are shape = 0.5, rate = 4
a3 <- rgamma(N, 0.5, 4)
b1 <- rnorm(N, mean = 1.3, sd = 1.4)
## Columns are deliberately interleaved (numeric, factor, numeric, ...)
dat <- data.frame(z1, a1, x2, a2, x1, a3, b1)
## Base R summary, for comparison with summarize() below
summary(dat)


## Default call: numeric and factor summaries together
summarize(dat)


## The two worker functions can also be called directly
summarizeNumerics(dat)
summarizeFactors(dat, maxLevels = 5)

## Turn off the default alphabetical re-ordering of the columns
summarize(dat, alphaSort = FALSE)

## digits is forwarded through ...; presumably it controls the number
## of digits reported -- see the summarizeNumerics help page
summarize(dat, digits = 6, alphaSort = FALSE)

summarize(dat, digits = 22, alphaSort = FALSE)

## Report at most 2 levels per factor
summarize(dat, maxLevels = 2)

## Keep the result: a list with elements numerics and factors
datsumm <- summarize(dat)

datsumm$numerics
datsumm[[1]]  ## same: gets numerics

datsumm$factors
datsumm[[2]]  ## same: gets factors


## Use numerics output to make plots. First,
## transpose gives varnames x summary stat matrix
datsummNT <- t(datsumm$numerics)
## Convert to a data frame so the summary statistics can be
## accessed by name with $ (datsummNT$mean, datsummNT$var)
datsummNT <- as.data.frame(datsummNT)

## Scatterplot: one point per variable, variance against mean
plot(datsummNT$mean, datsummNT$var, xlab = "The Means",
    ylab = "The Variances")

## Same axes, but type = "n" draws no points; the variable names
## are then written at the point locations instead.
plot(datsummNT$mean, datsummNT$var, xlab = "The Means",
    ylab = "The Variances", type = "n")
text(datsummNT$mean, datsummNT$var, labels = rownames(datsummNT))

## Here's a little plot wrinkle.  Note variable names are 'out to the
## edge' of the plot. If names are longer they don't stay inside
## figure. See?

## Make the variable names longer. First print the current row names,
## then replace them. One name is needed per row of datsummNT (the
## example data has five numeric variables).
rownames(datsummNT)
rownames(datsummNT) <- c("boring var", "var with long name",
    "tedious name var", "stupid varname", "buffoon not baboon")
## Empty plot (type = "n"), then write the long names at the points
plot(datsummNT$mean, datsummNT$var, xlab = "The Means",
    ylab = "The Variances", type = "n")
text(datsummNT$mean, datsummNT$var, labels = rownames(datsummNT),
    cex = 0.8)
## That's no good. Names across the edges

## We could brute force the names outside the edges like this.
## par() returns the previous setting when a parameter is changed,
## so keep it and restore it afterwards rather than assuming the
## earlier value of xpd was FALSE.
oldPar <- par(xpd = TRUE)
text(datsummNT$mean, datsummNT$var, labels = rownames(datsummNT),
    cex = 0.8)
## but that is not much better
par(oldPar)

## Here is one fix. Make the unused space inside the plot larger by
## making xlim and ylim bigger.  I use the magRange function from
## rockchalk to easily expand range to 1.2 times its current size.
## Otherwise, long variable names do not fit inside plot.  magRange
## could be asymmetric if we want, but this use is symmetric.

## Print the current row names, then set the long names (the same
## five names used earlier in this script).
rownames(datsummNT)
rownames(datsummNT) <- c("boring var", "var with long name",
    "tedious name var", "stupid varname", "buffoon not baboon")
## Empty plot (type = "n") with both axis ranges widened by magRange
plot(datsummNT$mean, datsummNT$var, xlab = "The Means",
    ylab = "The Variances", type = "n", xlim = magRange(datsummNT$mean,
        1.2), ylim = magRange(datsummNT$var, 1.2))
## Write the names at the (mean, var) locations, slightly shrunk
text(datsummNT$mean, datsummNT$var, labels = rownames(datsummNT),
    cex = 0.8)

## Here's another little plot wrinkle.  If we don't do that to keep
## the names in bounds, we need some fancy footwork.  Note when a
## point is near the edge, I make sure the text prints toward the
## center of the graph.
plot(datsummNT$mean, datsummNT$var, xlab = "The Means",
    ylab = "The Variances")
## calculate label positions. This is not as fancy as it could be.  If
##  there were lots of variables, we'd have to get smarter about
##  positioning labels on above, below, left, or right.
## A variable whose mean is above the average of the means gets
## pos = 2 (label to the left of the point); the others get pos = 4
## (label to the right), so labels point toward the middle.
labelPos <- ifelse(datsummNT$mean - mean(datsummNT$mean,
    na.rm = TRUE) > 0, 2, 4)
text(datsummNT$mean, datsummNT$var, labels = rownames(datsummNT),
    cex = 0.8, pos = labelPos)



## A second data frame: numeric x, a 50-level factor y (each level
## appears twice), integer z cycling through 1:4, and a 2-level
## factor ab.
x <- data.frame(x = rnorm(N), y = gl(50, 2), z = rep(1:4,
    25), ab = gl(2, 50))

summarize(x)
## Allow up to 15 levels to be reported for each factor
summarize(x, maxLevels = 15)

sumry <- summarize(x)
sumry[[1]]  ##another way to get the numerics output
sumry[[2]]  ##another way to get the factors output

## Replace dat (this overwrites the earlier dat) with a data frame
## that has one numeric column and three factors.  animal is built
## from two independent runif(N) draws: "cow" when the first is
## below 0.2, otherwise "pig" when the second is below 0.5,
## otherwise "duck".
dat <- data.frame(x = rnorm(N), y = gl(50, 2), z = factor(rep(1:4,
    25), labels = c("A", "B", "C", "D")), animal = factor(ifelse(runif(N) <
    0.2, "cow", ifelse(runif(N) < 0.5, "pig", "duck"))))

summarize(dat)

## If you have internet access, uncomment and run the lines below

## dat <- read.table(url("http://pj.freefaculty.org/guides/stat/DataSets/USNewsCollege/USNewsCollege.csv"),
## sep = ",")

## colnames(dat) <- c("fice", "name", "state", "private", "avemath",
##                    "aveverb", "avecomb", "aveact", "fstmath",
##                    "trdmath", "fstverb", "trdverb", "fstact",
##                    "trdact", "numapps", "numacc", "numenr",
##                    "pctten", "pctquart", "numfull", "numpart",
##                    "instate", "outstate", "rmbrdcst", "roomcst",
##                    "brdcst", "addfees", "bookcst", "prsnl",
##                    "pctphd", "pctterm", "stdtofac", "pctdonat",
##                    "instcst", "gradrate")

## dat$private <- factor(dat$private, labels = c("public",
##                                    "private"))
## sumry <- summarize(dat, digits = 2)
## sumry

## sumry[[1]]
## sumry[[2]]

## summarize(dat[, c("fice", "name", "private", "fstverb",
##                   "avemath")], digits = 4)

Run the code above in your browser using DataLab