library("quanteda")
# Toy example: build a three-document dfm and tabulate feature frequencies.
# NOTE(review): in quanteda >= 3.0, textstat_frequency() appears to be provided
# by the separate quanteda.textstats package -- confirm it is attached.
set.seed(20)
dfmat1 <- c("a a b b c d", "a d d d", "a a a") %>%
  tokens() %>%
  dfm()

# Frequencies over all documents combined.
textstat_frequency(dfmat1)

# Frequencies within groups (documents 1 and 3 form group "one"), using two
# different ties_method settings.
textstat_frequency(
  dfmat1,
  groups = c("one", "two", "one"),
  ties_method = "first"
)
textstat_frequency(
  dfmat1,
  groups = c("one", "two", "one"),
  ties_method = "average"
)
# Feature frequencies for the inaugural addresses where President == "Obama",
# after dropping punctuation and English stopwords.
dfmat2 <- data_corpus_inaugural %>%
  corpus_subset(President == "Obama") %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(stopwords("en")) %>%
  dfm()

tstat1 <- textstat_frequency(dfmat2)

# Show the first ten rows of the frequency table.
head(tstat1, n = 10)
# Take the head() of the inaugural corpus, tokenise it without punctuation or
# English stopwords, and build a dfm.
dfmat3 <- data_corpus_inaugural %>%
  head() %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(stopwords("en")) %>%
  dfm()

# Frequencies grouped by the President document variable, with n = 2.
textstat_frequency(dfmat3, n = 2, groups = President)
# Plotting examples. Guarded by if (FALSE) so that sourcing this script never
# executes them; they rely on tstat1 created earlier in the script.
if (FALSE) {
  library("ggplot2")

  # plot 20 most frequent words
  # head() returns only the rows that exist, whereas indexing with 1:20 would
  # pad the data with all-NA rows when fewer than 20 features are present.
  ggplot(
    head(tstat1, 20),
    aes(x = reorder(feature, frequency), y = frequency)
  ) +
    geom_point() +
    coord_flip() +
    labs(x = NULL, y = "Frequency")

  # plot relative frequencies by group
  # (note: this overwrites the dfmat3 object created earlier in the script)
  dfmat3 <- data_corpus_inaugural %>%
    corpus_subset(Year > 2000) %>%
    tokens(remove_punct = TRUE) %>%
    tokens_remove(stopwords("en")) %>%
    dfm() %>%
    dfm_group(groups = President) %>%
    dfm_weight(scheme = "prop")

  # calculate relative frequency by president
  tstat2 <- textstat_frequency(dfmat3, n = 10, groups = President)

  # plot frequencies
  # Each row gets its own x position, counted down from nrow(tstat2) to 1, and
  # is labelled with its feature. rev(seq_len(n)) equals n:1 for n >= 1 but is
  # empty (rather than c(0, 1)) when the table has no rows.
  ggplot(
    data = tstat2,
    aes(x = factor(rev(seq_len(nrow(tstat2)))), y = frequency)
  ) +
    geom_point() +
    facet_wrap(~ group, scales = "free") +
    coord_flip() +
    scale_x_discrete(
      breaks = rev(seq_len(nrow(tstat2))),
      labels = tstat2$feature
    ) +
    labs(x = NULL, y = "Relative frequency")
}
# Run the code above in your browser using DataLab