Last chance! 50% off unlimited learning
Sale ends in
wfm
- Generate a word frequency matrix by grouping variable(s).
wfdf
- Generate a word frequency data frame by grouping variable.
wfm_expanded
- Expand a word frequency matrix to have multiple rows
for each word.
wfm_combine
- Combines words (rows) of a word frequency matrix
(wfdf
) together.
weight
- Weight a word frequency matrix for analysis where such
weighting is sensible.
weight.wfdf
- Weight a word frequency matrix for analysis where such
weighting is sensible.
as.wfm
- Attempts to coerce a matrix to a wfm
.
wfm(
text.var = NULL,
grouping.var = NULL,
output = "raw",
stopwords = NULL,
char2space = "~~",
...
)
# S3 method for wfdf
wfm(
text.var = NULL,
grouping.var = NULL,
output = "raw",
stopwords = NULL,
char2space = "~~",
...
)
# S3 method for character
wfm(
text.var = NULL,
grouping.var = NULL,
output = "raw",
stopwords = NULL,
char2space = "~~",
...
)
# S3 method for factor
wfm(
text.var = NULL,
grouping.var = NULL,
output = "raw",
stopwords = NULL,
char2space = "~~",
...
)
wfdf(
text.var,
grouping.var = NULL,
stopwords = NULL,
margins = FALSE,
output = "raw",
digits = 2,
char2space = "~~",
...
)
wfm_expanded(text.var, grouping.var = NULL, ...)
wfm_combine(wf.obj, word.lists, matrix = TRUE)
# S3 method for wfm
weight(x, type = "prop", ...)
# S3 method for wfdf
weight(x, type = "prop", ...)
as.wfm(x, ...)
# S3 method for matrix
as.wfm(x, ...)
# S3 method for default
as.wfm(x, ...)
# S3 method for TermDocumentMatrix
as.wfm(x, ...)
# S3 method for DocumentTermMatrix
as.wfm(x, ...)
# S3 method for data.frame
as.wfm(x, ...)
# S3 method for wfdf
as.wfm(x, ...)
# S3 method for Corpus
as.wfm(x, col = "docs", row = "text", ...)
# S3 method for Corpus
wfm(text.var, ...)
The text variable.
The grouping variables. Default NULL
generates
one word list for all text. Also takes a single grouping variable or a list
of 1 or more grouping variables.
Output type (either "raw", "proportion", or "percent"); the default is "raw".
A vector of stop words to remove.
A vector of characters to be turned into spaces. If
char.keep
is NULL
, char2space
will activate this
argument.
Logical. If TRUE, provides grouping.var and word variable totals.
An integer indicating the number of decimal places (round) or significant digits (signif) to be used. Negative values are allowed.
A wfm
or wfdf
object.
A list of character vectors of words to pass to
wfm_combine
An object with words for row names and integer values.
The type of weighting to use: c("prop"
, "max"
,
"scaled"
). All weight by column. "prop"
uses a proportion
weighting and all columns sum to 1. "max"
weights in proportion to
the max value; all values are integers and column sums may not be equal.
"scaled"
uses scale
to scale with
center = FALSE
; output is not integer and column sums may not be
equal.
The column name (generally not used).
The row name (generally not used).
Other arguments supplied to Corpus
or
TermDocumentMatrix
. For as.wfm, these are other arguments passed to as.wfm methods (currently ignored).
wfm
- returns a word frequency matrix (an object of the class matrix).
wfdf
- returns a word frequency data frame (an object of the class data.frame) with
a words column and optional margin sums.
wfm_expanded
- returns a matrix similar to a word frequency
matrix (wfm
) but the rows are expanded to represent the maximum usages
of the word and cells are dummy coded to indicate that number of uses.
wfm_combine
- returns a word frequency matrix (wfm
) or
dataframe (wfdf
) with counts for the combined word.lists merged and
remaining terms (else
).
weight
- Returns a weighted matrix for use with other R
packages. The output is not of the class "wfm".
as.wfm
- Returns a matrix of the class "wfm".
# NOT RUN {
## Word frequency matrix (wfm) examples:
## Group the `state` text by two variables (sex and adult); show rows 1-15.
with(DATA, wfm(state, list(sex, adult)))[1:15, ]
## Group by a single variable (person) instead.
with(DATA, wfm(state, person))[1:15, ]
## Filter() applied directly to a freshly built wfm, followed by the
## unfiltered matrix for comparison.
Filter(with(DATA, wfm(state, list(sex, adult))), 5)
with(DATA, wfm(state, list(sex, adult)))
## Filter particular words based on max/min values in wfm.
## NOTE(review): the 2nd and 3rd positional arguments look like the min and
## max bounds -- confirm their exact meaning in ?Filter.
v <- with(DATA, wfm(state, list(sex, adult)))
Filter(v, 5)
Filter(v, 5, count.apostrophe = FALSE)
Filter(v, 5, 7)
Filter(v, 4, 4)
Filter(v, 3, 4)
## Same bounds, additionally passing the Top25Words list as stopwords.
Filter(v, 3, 4, stopwords = Top25Words)
## Insert a double tilde ("~~") to keep phrases together (e.g., first and
## last name). wfm()'s char2space argument defaults to "~~", so those
## characters are turned into spaces.
## NOTE(review): space_fill() presumably replaces the spaces inside the
## strings in `alts` with "~~" -- confirm in ?space_fill.
alts <- c(" fun", "I ")
state2 <- space_fill(DATA$state, alts, rm.extra = FALSE)
with(DATA, wfm(state2, list(sex, adult)))[1:18, ]
## Word frequency data frame (wfdf) examples:
## Same groupings as the wfm examples, but the result is a data.frame with
## a words column; show rows 1-15.
with(DATA, wfdf(state, list(sex, adult)))[1:15, ]
with(DATA, wfdf(state, person))[1:15, ]
## wfm_expanded example:
## Build a wfm grouped by person, then expand it so that a word gets
## multiple rows (up to its maximum number of uses).
z <- wfm(DATA$state, DATA$person)
wfm_expanded(z)[30:45, ] # this row range shows two "you" rows
## wfm_combine examples:
#===================
## Inputs: wfm_combine() needs raw counts, with or without margins.
## raw no margins (will work)
x <- wfm(DATA$state, DATA$person)
## raw with margin (will work)
y <- wfdf(DATA$state, DATA$person, margins = TRUE)
## Proportion matrix (not raw counts; used below to show a failing call)
z2 <- wfm(DATA$state, DATA$person, output = "proportion")

## Word lists in the different shapes wfm_combine() accepts.
## NOTE: the original examples spelled "you're" as "your're" in WL4-WL7 and
## in one inline call; the spelling is corrected here so that every list
## names the same three words as WL2/WL3.
WL1 <- c(y[, 1])   # first column of the wfdf (the words column)
WL2 <- list(c("read", "the", "a"), c("you", "your", "you're"))
WL3 <- list(bob = c("read", "the", "a"), yous = c("you", "your", "you're"))
## WL4 lists "a" in both vectors (see the commented-out call below).
WL4 <- list(bob = c("read", "the", "a"), yous = c("a", "you", "your", "you're"))
WL5 <- list(yous = c("you", "your", "you're"))
WL6 <- list(c("you", "your", "you're")) # no name so will be called words 1
WL7 <- c("you", "your", "you're")

wfm_combine(z2, WL2) # Won't work: not a raw frequency matrix
wfm_combine(x, WL2)  # Works (raw and no margins)
wfm_combine(y, WL2)  # Works (raw with margins)
wfm_combine(y, c("you", "your", "you're"))
wfm_combine(y, WL1)
wfm_combine(y, WL3)
## wfm_combine(y, WL4) # Error (presumably because "a" is in both lists)
wfm_combine(y, WL5)
wfm_combine(y, WL6)
wfm_combine(y, WL7)

## Combine a set of words, then run chi-squared tests on the combined
## table and on the full word frequency matrix.
worlis <- c("you", "it", "it's", "no", "not", "we")
y <- wfdf(DATA$state, list(DATA$sex, DATA$adult), margins = TRUE)
z <- wfm_combine(y, worlis)
chisq.test(z)
chisq.test(wfm(y))
## Dendrogram
## Word frequency matrix of the 2012 presidential debates, grouped by
## person and time.
presdeb <- with(pres_debates2012, wfm(dialogue, list(person, time)))
library(sjPlot)
## The matrix is transposed before clustering.
## NOTE(review): 2:4 is presumably the set of group counts to mark on the
## dendrogram -- confirm in ?sjc.dend.
sjc.dend(t(presdeb), 2:4)
## Words correlated within turns of talk
## EXAMPLE 1
library(qdapTools)
## Grouping factor: one level per act/turn combination, pasted as "act|turn".
## NOTE(review): TOT(tot) presumably extracts the turn-of-talk index and
## pad() pads it to a fixed width -- confirm in the qdap/qdapTools docs.
x <- factor(with(rajSPLIT, paste(act, pad(TOT(tot)), sep = "|")))
dat <- wfm(rajSPLIT$dialogue, x)
## Transpose so the selected words can be picked as columns, then correlate
## those word columns with each other.
cor(t(dat)[, c("romeo", "juliet")])
cor(t(dat)[, c("romeo", "banished")])
cor(t(dat)[, c("romeo", "juliet", "hate", "love")])
## Pass the same four-word correlation matrix to qheat() for plotting.
qheat(cor(t(dat)[, c("romeo", "juliet", "hate", "love")]),
diag.na = TRUE, values = TRUE, digits = 3, by.column = NULL)
## Same idea on DATA, grouping by id(DATA) and correlating all words.
## NOTE(review): id(DATA) presumably yields one id per row -- confirm.
dat2 <- wfm(DATA$state, id(DATA))
qheat(cor(t(dat2)), low = "yellow", high = "red",
grid = "grey90", diag.na = TRUE, by.column = NULL)
## EXAMPLE 2
## Grouping factor for the debates: "time|turn" labels (same construction
## as Example 1, using time instead of act).
x2 <- factor(with(pres_debates2012, paste(time, pad(TOT(tot)), sep = "|")))
## Note: this overwrites the dat2 object created in Example 1.
dat2 <- wfm(pres_debates2012$dialogue, x2)
## Build a word list excluding "it's", "that's" and the Top200Words list.
wrds <- word_list(pres_debates2012$dialogue,
stopwords = c("it's", "that's", Top200Words))
## Take the first column of the first element of wrds$rfswl, sort it and
## lower-case it; these are the words to correlate.
wrds2 <- tolower(sort(wrds$rfswl[[1]][, 1]))
qheat(word_cor(t(dat2), word = wrds2, r = NULL),
diag.na = TRUE, values = TRUE, digits = 3, by.column = NULL,
high="red", low="yellow", grid=NULL)
## EXAMPLE 3
## Relies on x2 and wrds2 created in Example 2.
library(gridExtra); library(ggplot2); library(grid)
## One wfm per candidate: inside the function, x is the candidate name and
## both the dialogue and the x2 grouping factor are subset by the same
## person mask.
dat3 <- lapply(qcv(OBAMA, ROMNEY), function(x) {
with(pres_debates2012, wfm(dialogue[person == x], x2[person == x]))
})
# Presidential debates by person
# Keep only the rows spoken by the two candidates.
dat5 <- pres_debates2012
dat5 <- dat5[dat5$person %in% qcv(ROMNEY, OBAMA), ]
disp <- with(dat5, dispersion_plot(dialogue, wrds2, grouping.var = person,
total.color = NULL, rm.vars=time))
# Word correlations for each candidate's matrix.
cors <- lapply(dat3, function(m) {
word_cor(t(m), word = wrds2, r = NULL)
})
# plot = FALSE: the results are stored and extended with ggplot2 layers
# below instead of being drawn here.
plots <- lapply(cors, function(x) {
qheat(x, diag.na = TRUE, values = TRUE, digits = 3, plot = FALSE,
by.column = NULL, high="red", low="yellow", grid=NULL)
})
# Add a title per candidate (same order as dat3), blank the x-axis title
# and set the plot margins to zero.
plots <- lapply(1:2, function(i) {
plots[[i]] + ggtitle(qcv(OBAMA, ROMNEY)[i]) +
theme(axis.title.x = element_blank(),
plot.margin = unit(rep(0, 4), "lines"))
})
# Two-column layout: dispersion plot on the left, the two heat maps
# stacked in a single column on the right.
grid.arrange(disp, arrangeGrob(plots[[1]], plots[[2]], ncol=1), ncol=2)
## With `word_cor`
## Named word lists; each name becomes a combined term in wfm_combine().
worlis <- list(
pronouns = c("you", "it", "it's", "we", "i'm", "i"),
negative = qcv(no, dumb, distrust, not, stinks),
literacy = qcv(computer, talking, telling)
)
## Word frequency data frame grouped by qdapTools::id(DATA, prefix = TRUE).
## NOTE(review): presumably one prefixed id per row of DATA -- confirm.
y <- wfdf(DATA$state, qdapTools::id(DATA, prefix = TRUE))
## Merge the counts for each word list, then correlate the combined terms
## (selected by the list names).
z <- wfm_combine(y, worlis)
word_cor(t(z), word = names(worlis), r = NULL)
## Plotting method
## NOTE(review): meaning of the second positional argument (TRUE) is not
## shown here -- see the package's plot method documentation.
plot(y, TRUE)
plot(z)
## Correspondence Analysis
library(ca)
## Keep only the rows spoken by the two candidates.
dat <- pres_debates2012
dat <- dat[dat$person %in% qcv(ROMNEY, OBAMA), ]
## Stem the dialogue before counting words.
speech <- stemmer(dat$dialogue)
## First fit: word frequency matrix by person and time, with the
## Top25Words list removed as stopwords.
mytable1 <- with(dat, wfm(speech, list(person, time), stopwords = Top25Words))
fit <- ca(mytable1)
summary(fit)
plot(fit)
plot3d.ca(fit, labels=1)
## Second fit: same analysis with the larger Top200Words stopword list.
mytable2 <- with(dat, wfm(speech, list(person, time), stopwords = Top200Words))
fit2 <- ca(mytable2)
summary(fit2)
plot(fit2)
plot3d.ca(fit2, labels=1)
## Weight a wfm
## All weighting is done by column; the weighted output is no longer of
## class "wfm".
WFM <- with(DATA, wfm(state, list(sex, adult)))
## Plot the "scaled" weighting.
plot(weight(WFM, "scaled"), TRUE)
## "prop": proportion weighting; every column sums to 1.
weight(WFM, "prop")
## "max": weights in proportion to the max value; column sums may differ.
weight(WFM, "max")
## "scaled": uses scale with center = FALSE; output is not integer.
weight(WFM, "scaled")
# }
Run the code above in your browser using DataLab