# NOT RUN {
## Let's start with some statistical programming
v <- iris$Sepal.Length
d <- num_vars(iris)    # Saving numeric variables
f <- iris$Species      # Factor
# Simple statistics
fmean(v)               # vector
fmean(qM(d))           # matrix (qM is a faster as.matrix)
fmean(d)               # data.frame
# Preserving data structure
fmean(qM(d), drop = FALSE)     # Still a matrix
fmean(d, drop = FALSE)         # Still a data.frame
# Weighted statistics, supported by most functions...
w <- abs(rnorm(fnrow(iris)))
fmean(d, w = w)
# Grouped statistics...
fmean(d, f)
# Groupwise-weighted statistics...
fmean(d, f, w)
# Simple Transformations...
head(fmode(d, TRA = "replace"))    # Replacing values with the mode
head(fmedian(d, TRA = "-"))        # Subtracting the median
head(fsum(d, TRA = "%"))           # Computing percentages
head(fsd(d, TRA = "/"))            # Dividing by the standard deviation (scaling), etc...
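# As a quick sketch of the mechanics: calling a statistical function with a TRA
# argument is equivalent to computing the statistic first and sweeping it out with TRA()
stats <- fmedian(d)                # Column medians
all.equal(fmedian(d, TRA = "-"), TRA(d, stats, "-"))  # Should be TRUE
rm(stats)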
# Weighted Transformations...
head(fnth(d, 0.75, w = w, TRA = "replace"))  # Replacing by the weighted 3rd quartile
# Grouped Transformations...
head(fvar(d, f, TRA = "replace"))  # Replacing values with the group variance
head(fsd(d, f, TRA = "/"))         # Grouped scaling
head(fmin(d, f, TRA = "-"))        # Setting the minimum value in each species to 0
head(fsum(d, f, TRA = "/"))        # Dividing by the sum (proportions)
head(fmedian(d, f, TRA = "-"))     # Groupwise de-median
head(ffirst(d, f, TRA = "%%"))     # Data modulo the first group value, etc...
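# A small consistency check: the groupwise proportions from TRA = "/" sum to 1
# within each group
fsum(fsum(d, f, TRA = "/"), f)     # Should be a table of ones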
# Grouped and weighted transformations...
head(fsd(d, f, w, "/"), 3)         # weighted scaling
head(fmedian(d, f, w, "-"), 3)     # subtracting the weighted group-median
head(fmode(d, f, w, "replace"), 3) # replace with weighted statistical mode
## Some more advanced transformations...
head(fbetween(d))                             # Averaging (faster than: fmean(d, TRA = "replace"))
head(fwithin(d))                              # Centering (faster than: fmean(d, TRA = "-"))
head(fwithin(d, f, w))                        # Grouped and weighted (same as fmean(d, f, w, "-"))
head(fwithin(d, f, w, mean = 5))              # Setting a custom mean
head(fwithin(d, f, w, theta = 0.76))          # Quasi-centering i.e. d - theta*fbetween(d, f, w)
head(fwithin(d, f, w, mean = "overall.mean")) # Preserving the overall mean of the data
head(fscale(d))                               # Scaling and centering
head(fscale(d, mean = 5, sd = 3))             # Custom scaling and centering
head(fscale(d, mean = FALSE, sd = 3))         # Mean preserving scaling
head(fscale(d, f, w))                         # Grouped and weighted scaling and centering
head(fscale(d, f, w, mean = 5, sd = 3))       # Custom grouped and weighted scaling and centering
head(fscale(d, f, w, mean = FALSE,            # Preserving group means
            sd = "within.sd"))                # and setting group-sd to fsd(fwithin(d, f, w), w = w)
head(fscale(d, f, w, mean = "overall.mean",   # Full harmonization of group means and variances,
            sd = "within.sd"))                # while preserving the level and scale of the data.
head(get_vars(iris, 1:2))                      # Use get_vars for fast selecting, gv is a shortcut
head(fhdbetween(gv(iris, 1:2), gv(iris, 3:5))) # Linear prediction with factors and covariates
head(fhdwithin(gv(iris, 1:2), gv(iris, 3:5)))  # Linear partialling out factors and covariates
ss(iris, 1:10, 1:2)                            # Similarly fsubset/ss for fast subsetting rows
# Simple Time Computations...
head(flag(AirPassengers, -1:3))                # One lead, the series itself, and three lags
head(fdiff(EuStockMarkets,                     # Suitably lagged first and second differences
      c(1, frequency(EuStockMarkets)), diff = 1:2))
head(fdiff(EuStockMarkets, rho = 0.87))        # Quasi-differences (x_t - rho*x_t-1)
head(fdiff(EuStockMarkets, log = TRUE))        # Log-differences
head(fgrowth(EuStockMarkets))                  # Exact growth rates (percentage change)
head(fgrowth(EuStockMarkets, logdiff = TRUE))  # Log-difference growth rates (percentage change)
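# A brief sketch relating these functions: the exact growth rate is simply
# (x / lag(x) - 1) * 100
all.equal(unattrib(fgrowth(AirPassengers)),
          unattrib((AirPassengers / flag(AirPassengers) - 1) * 100))  # Should be TRUE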
# Note that it is not necessary to use factors for grouping.
fmean(gv(mtcars, -c(2,8:9)), mtcars$cyl) # Can also use a vector (internally converted using qF())
fmean(gv(mtcars, -c(2,8:9)),
      gv(mtcars, c(2,8:9)))              # or a list of vectors (internally grouped using GRP())
g <- GRP(mtcars, ~ cyl + vs + am)        # It is also possible to create grouping objects
print(g)                                 # These are instructive to learn about the grouping,
plot(g)                                  # and are directly handed down to C++ code
fmean(gv(mtcars, -c(2,8:9)), g)          # This can speed up multiple computations over same groups
fsd(gv(mtcars, -c(2,8:9)), g)
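# GRP objects are plain lists storing the grouping information, e.g.
g[["N.groups"]]                          # Number of groups
g[["group.sizes"]]                       # Size of each group
head(g[["group.id"]])                    # Integer group-id for each row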
# Factors can efficiently be created using qF()
f1 <- qF(mtcars$cyl)                     # Unlike GRP objects, factors are checked for NA's
f2 <- qF(mtcars$cyl, na.exclude = FALSE) # This can however be avoided through this option
class(f2)                                # Note the added class
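# A related sketch: qG() creates an even lighter 'qG' object (a grouped integer
# vector) that can likewise be used for grouping
g2 <- qG(mtcars$cyl, na.exclude = FALSE)
class(g2)                                # Also has the 'na.included' class
fmean(mtcars$mpg, g2)
rm(g2)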
# }
# NOT RUN {
library(microbenchmark)
microbenchmark(fmean(mtcars, f1), fmean(mtcars, f2)) # A minor difference, larger on larger data
# }
# NOT RUN {
with(mtcars, finteraction(cyl, vs, am))  # Efficient interactions of vectors and/or factors
finteraction(gv(mtcars, c(2,8:9)))       # .. or lists of vectors/factors
# Simple row- or column-wise computations on matrices or data frames with dapply()
dapply(mtcars, quantile)                 # Column quantiles
dapply(mtcars, quantile, MARGIN = 1)     # Row quantiles
  # dapply preserves the data structure of any matrices / data frames passed
  # Some fast matrix row/column functions are also provided by the matrixStats package
# Similarly, BY performs grouped computations
BY(mtcars, f2, quantile)
BY(mtcars, f2, quantile, expand.wide = TRUE)
# For efficient (grouped) replacing and sweeping out computed statistics, use TRA()
sds <- fsd(mtcars)
head(TRA(mtcars, sds, "/"))     # Simple scaling (if sd's not needed, use fsd(mtcars, TRA = "/"))
# }
# NOT RUN {
microbenchmark(TRA(mtcars, sds, "/"), sweep(mtcars, 2, sds, "/")) # A remarkable performance gain..
# }
# NOT RUN {
sds <- fsd(mtcars, f2)
head(TRA(mtcars, sds, "/", f2)) # Grouped scaling (if sd's not needed: fsd(mtcars, f2, TRA = "/"))
# All functions above preserve the structure of matrices / data frames
# If conversions are required, use these efficient functions:
mtcarsM <- qM(mtcars)                      # Matrix from data.frame
head(qDF(mtcarsM))                         # data.frame from matrix columns
head(mrtl(mtcarsM, TRUE, "data.frame"))    # data.frame from matrix rows, etc..
head(qDT(mtcarsM, "cars"))                 # Saving row.names when converting matrix to data.table
head(qDT(mtcars, "cars"))                  # Same, using a data.frame
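# A quick round-trip check: qM() and qDF() preserve all names, so the
# conversion should be lossless
all.equal(qDF(qM(mtcars)), mtcars)       # Should be TRUE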
## Now let's get some real data and see how we can use this power for data manipulation
library(magrittr)
head(wlddev) # World Bank World Development Data: 216 countries, 61 years, 5 series (columns 9-13)
# Starting with some descriptive tools...
namlab(wlddev, class = TRUE)           # Show variable names, labels and classes
fnobs(wlddev)                          # Observation count
pwnobs(wlddev)                         # Pairwise observation count
head(fnobs(wlddev, wlddev$country))    # Grouped observation count
fndistinct(wlddev)                     # Distinct values
descr(wlddev)                          # Describe data
varying(wlddev, ~ country)             # Show which variables vary within countries
qsu(wlddev, pid = ~ country,           # Panel-summarize columns 9 through 12 of this data
    cols = 9:12, vlabels = TRUE)       # (between and within countries)
qsu(wlddev, ~ region, ~ country,       # Do all of that by region and also compute higher moments
    cols = 9:12, higher = TRUE)        # -> returns a 4D array
qsu(wlddev, ~ region, ~ country, cols = 9:12,
    higher = TRUE, array = FALSE) %>%                           # Return as a list of matrices..
unlist2d(c("Variable","Trans"), row.names = "Region") %>% head  # and turn into a tidy data.frame
pwcor(num_vars(wlddev), P = TRUE)                           # Pairwise correlations with p-value
pwcor(fmean(num_vars(wlddev), wlddev$country), P = TRUE)    # Correlating country means
pwcor(fwithin(num_vars(wlddev), wlddev$country), P = TRUE)  # Within-country correlations
psacf(wlddev, ~country, ~year, cols = 9:12)                 # Panel-data Autocorrelation function
pspacf(wlddev, ~country, ~year, cols = 9:12)                # Partial panel-autocorrelations
psmat(wlddev, ~iso3c, ~year, cols = 9:12) %>% plot          # Convert panel to 3D array and plot
## collapse offers a few very efficient functions for data manipulation:
# Fast selecting and replacing columns
series <- get_vars(wlddev, 9:12)     # Same as wlddev[9:12] but 2x faster
series <- fselect(wlddev, PCGDP:ODA) # Same thing: > 100x faster than dplyr::select
get_vars(wlddev, 9:12) <- series     # Replace: 8x faster than wlddev[9:12] <- series (and replaces names)
fselect(wlddev, PCGDP:ODA) <- series # Same thing
# Fast subsetting
head(fsubset(wlddev, country == "Ireland", -country, -iso3c))
head(fsubset(wlddev, country == "Ireland" & year > 1990, year, PCGDP:ODA))
ss(wlddev, 1:10, 1:10) # This is an order of magnitude faster than wlddev[1:10, 1:10]
# Fast transforming
head(ftransform(wlddev, ODA_GDP = ODA / PCGDP, ODA_LIFEEX = sqrt(ODA) / LIFEEX))
settransform(wlddev, ODA_GDP = ODA / PCGDP, ODA_LIFEEX = sqrt(ODA) / LIFEEX) # by reference
head(ftransform(wlddev, PCGDP = NULL, ODA = NULL, GINI_sum = fsum(GINI)))
head(ftransformv(wlddev, 9:12, log))           # ftransformv transforms a selection of columns
head(ftransformv(wlddev, 9:12, fscale, apply = FALSE)) # apply = FALSE invokes fscale.data.frame
settransformv(wlddev, 9:12, fscale, apply = FALSE)     # Changing the data by reference
ftransform(wlddev) <- fscale(gv(wlddev, 9:12))         # Same thing (using replacement method)
wlddev %<>% ftransformv(9:12, fscale, apply = FALSE) # Same thing, using magrittr
wlddev %>% ftransform(gv(., 9:12) %>%              # With compound pipes: Scaling and lagging
                        fscale %>% flag(0:2, iso3c, year)) %>% head
# Fast reordering
head(roworder(wlddev, -country, year))
head(colorder(wlddev, country, year))
# Fast renaming
head(frename(wlddev, country = Ctry, year = Yr))
setrename(wlddev, country = Ctry, year = Yr)     # By reference
head(frename(wlddev, tolower, cols = 9:12))
# Fast grouping
fgroup_by(wlddev, Ctry, decade) %>% fgroup_vars %>% head # fgroup_by is faster than dplyr::group_by
rm(wlddev)                                       # .. but only works with collapse functions
## Now let's start putting things together
wlddev %>% fsubset(year > 1990, region, income, PCGDP:ODA) %>%
  fgroup_by(region, income) %>% fmean            # Fast aggregation using the mean
# }
# NOT RUN {
# Same thing using dplyr manipulation verbs
library(dplyr)
wlddev %>% filter(year > 1990) %>% select(region, income, PCGDP:ODA) %>%
  group_by(region, income) %>% fmean       # This is already a lot faster than summarize_all(mean)
# }
# NOT RUN {
wlddev %>% fsubset(year > 1990, region, income, PCGDP:POP) %>%
  fgroup_by(region, income) %>% fmean(POP)     # Weighted group means
wlddev %>% fsubset(year > 1990, region, income, PCGDP:POP) %>%
  fgroup_by(region, income) %>% fsd(POP)       # Weighted group standard deviations
wlddev %>% na_omit(cols = "POP") %>% fgroup_by(region, income) %>%
  fselect(PCGDP:POP) %>% fnth(0.75, POP)       # Weighted group third quartile
wlddev %>% fgroup_by(country) %>% fselect(PCGDP:ODA) %>%
fwithin %>% head                               # Within transformation
wlddev %>% fgroup_by(country) %>% fselect(PCGDP:ODA) %>%
fmedian(TRA = "-") %>% head                    # Grouped centering using the median
# Replacing data points by the weighted first quartile:
wlddev %>% na_omit(cols = "POP") %>% fgroup_by(country) %>%
  fselect(country, year, PCGDP:POP) %>%
  ftransform(fselect(., -country, -year) %>%
  fnth(0.25, POP, "replace_fill")) %>% head
wlddev %>% fgroup_by(country) %>% fselect(PCGDP:ODA) %>% fscale %>% head    # Standardizing
wlddev %>% fgroup_by(country) %>% fselect(PCGDP:POP) %>%
   fscale(POP) %>% head  # Weighted scaling
wlddev %>% fselect(country, year, PCGDP:ODA) %>%  # Adding 1 lead and 2 lags of each variable
  fgroup_by(country) %>% flag(-1:2, year) %>% head
wlddev %>% fselect(country, year, PCGDP:ODA) %>%  # Adding 1 lead and 10-year growth rates
  fgroup_by(country) %>% fgrowth(c(0:1,10), 1, year) %>% head
# etc...
# Aggregation with multiple functions
wlddev %>% fsubset(year > 1990, region, income, PCGDP:ODA) %>%
  fgroup_by(region, income) %>% {
    add_vars(fgroup_vars(., "unique"),
             fmedian(., keep.group_vars = FALSE) %>% add_stub("median_"),
             fmean(., keep.group_vars = FALSE) %>% add_stub("mean_"),
             fsd(., keep.group_vars = FALSE) %>% add_stub("sd_"))
  } %>% head
# Transformation with multiple functions
wlddev %>% fselect(country, year, PCGDP:ODA) %>%
  fgroup_by(country) %>% {
    add_vars(fdiff(., c(1,10), 1, year) %>% flag(0:2, year),  # Sequence of lagged differences
             ftransform(., fselect(., PCGDP:ODA) %>% fwithin %>% add_stub("W.")) %>%
               flag(0:2, year, keep.ids = FALSE))             # Sequence of lagged demeaned vars
  } %>% head
# With ftransform, one can also easily do one or more grouped mutations on the fly...
settransform(wlddev, median_ODA = fmedian(ODA, list(region, income), TRA = "replace_fill"))
settransform(wlddev, sd_ODA = fsd(ODA, list(region, income), TRA = "replace_fill"),
                     mean_GDP = fmean(PCGDP, country, TRA = "replace_fill"))
wlddev %<>% ftransform(fmedian(list(median_ODA = ODA, median_GDP = PCGDP),
                               list(region, income), TRA = "replace_fill"))
# On a grouped data frame it is also possible to transform certain columns by groups
# but perform aggregate operations on others:
wlddev %>% fgroup_by(region, income) %>%
    ftransform(gmedian_GDP = fmedian(PCGDP, GRP(.), TRA = "replace"),
               omedian_GDP = fmedian(PCGDP, TRA = "replace"),  # "replace" preserves NA's
               omedian_GDP_fill = fmedian(PCGDP)) %>% tail
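# A minimal sketch of the difference between these two TRA options:
x <- c(1, NA, 3)
fmean(x, TRA = "replace")                # Missing value preserved: 2 NA 2
fmean(x, TRA = "replace_fill")           # Missing value overwritten: 2 2 2
rm(x)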
rm(wlddev)
## For multi-type data aggregation, the function collap offers ease and flexibility
# Aggregate this data by country and decade: Numeric columns with mean, categorical with mode
head(collap(wlddev, ~ country + decade, fmean, fmode))
# Taking the weighted mean and the weighted mode:
head(collap(wlddev, ~ country + decade, fmean, fmode, w = ~ POP, wFUN = fsum))
# Multi-function aggregation of certain columns
head(collap(wlddev, ~ country + decade,
            list(fmean, fmedian, fsd),
            list(ffirst, flast), cols = c(3,9:12)))
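# cols can also select columns by name; numeric columns go to FUN and
# categorical columns to catFUN (a brief sketch)
head(collap(wlddev, ~ country + decade, fmean, fmode,
            cols = c("PCGDP", "LIFEEX", "region")))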
# Customized Aggregation: Assign columns to functions
head(collap(wlddev, ~ country + decade,
            custom = list(fmean = 9:10, fsd = 9:12, flast = 3, ffirst = 6:8)))
# For grouped data frames use collapg
wlddev %>% fsubset(year > 1990, country, region, income, PCGDP:ODA) %>%
  fgroup_by(country) %>% collapg(fmean, ffirst) %>%
  ftransform(AMGDP = PCGDP > fmedian(PCGDP, list(region, income), TRA = "replace_fill"),
             AMODA = ODA > fmedian(ODA, income, TRA = "replace_fill")) %>% head
## Additional flexibility for data transformation tasks is offered by tidy transformation operators
# Within-transformation (centering on overall mean)
head(W(wlddev, ~ country, cols = 9:12, mean = "overall.mean"))
# }
# NOT RUN {
# Partialling out country and year fixed effects
head(HDW(wlddev, PCGDP + LIFEEX ~ qF(country) + qF(year)))
# Same, adding ODA as continuous regressor
head(HDW(wlddev, PCGDP + LIFEEX ~ qF(country) + qF(year) + ODA))
# }
# NOT RUN {
# Standardizing (scaling and centering) by country
head(STD(wlddev, ~ country, cols = 9:12))
# Computing 1 lead and 3 lags of the 4 series
head(L(wlddev, -1:3, ~ country, ~year, cols = 9:12))
# Computing the 1- and 10-year first differences
head(D(wlddev, c(1,10), 1, ~ country, ~year, cols = 9:12))
head(D(wlddev, c(1,10), 1:2, ~ country, ~year, cols = 9:12))     # ..first and second differences
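# Dlog is a further operator computing log-differences, i.e. Dlog(x) = D(log(x))
head(Dlog(wlddev, c(1,10), 1, ~ country, ~year, cols = 9:12))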
# Computing the 1- and 10-year growth rates
head(G(wlddev, c(1,10), 1, ~ country, ~year, cols = 9:12))
# Adding growth rate variables to dataset
add_vars(wlddev) <- G(wlddev, c(1, 10), 1, ~ country, ~year, cols = 9:12, keep.ids = FALSE)
get_vars(wlddev, "G1.", regex = TRUE) <- NULL # Deleting again
# }
# NOT RUN {
# These operators can conveniently be used in regression formulas:
# Using a Mundlak (1978) procedure to estimate the effect of OECD on LIFEEX, controlling for PCGDP
lm(LIFEEX ~ log(PCGDP) + OECD + B(log(PCGDP), country),
   wlddev %>% fselect(country, OECD, PCGDP, LIFEEX) %>% na_omit)
# Adding 10-year lagged life-expectancy to allow for some convergence effects (dynamic panel model)
lm(LIFEEX ~ L(LIFEEX, 10, country) + log(PCGDP) + OECD + B(log(PCGDP), country),
   wlddev %>% fselect(country, OECD, PCGDP, LIFEEX) %>% na_omit)
# Transformation functions and operators also support plm panel data classes:
library(plm)
pwlddev <- pdata.frame(wlddev, index = c("country","year"))
head(W(pwlddev$PCGDP))                      # Country-demeaning
head(W(pwlddev, cols = 9:12))
head(W(pwlddev$PCGDP, effect = 2))          # Time-demeaning
head(W(pwlddev, effect = 2, cols = 9:12))
head(HDW(pwlddev$PCGDP))                    # Country- and time-demeaning
head(HDW(pwlddev, cols = 9:12))
head(STD(pwlddev$PCGDP))                    # Standardizing by country
head(STD(pwlddev, cols = 9:12))
head(L(pwlddev$PCGDP, -1:3))                # Panel-lags
head(L(pwlddev, -1:3, 9:12))
head(G(pwlddev$PCGDP))                      # Panel growth rates
head(G(pwlddev, 1, 1, 9:12))
rm(pwlddev)
# }
# NOT RUN {
# Remove all objects used in this example section
rm(v, d, w, f, f1, f2, g, mtcarsM, sds, series, wlddev)
# }