library("quanteda")
# bigram and trigram collocations from the first two inaugural addresses
corp <- data_corpus_inaugural[1:2]
head(cols <- textstat_collocations(corp, size = 2, min_count = 2), 10)
head(cols <- textstat_collocations(corp, size = 3, min_count = 2), 10)
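# keeping only collocations above a chosen z-score threshold
# (a sketch: the returned object is a data.frame, so ordinary subsetting works)
head(cols[cols$z > 3, ], 10)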
# extracting multi-part proper nouns (capitalized terms)
toks1 <- tokens(data_corpus_inaugural)
toks2 <- tokens_remove(toks1, pattern = stopwords("english"), padding = TRUE)
toks3 <- tokens_select(toks2, pattern = "^([A-Z][a-z\\-]{2,})", valuetype = "regex",
                       case_insensitive = FALSE, padding = TRUE)
tstat <- textstat_collocations(toks3, size = 3, tolower = FALSE)
head(tstat, 10)
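# compounding the detected proper-noun sequences back into the tokens
# (a sketch: tokens_compound() also accepts a collocations object as its pattern)
toks_comp <- tokens_compound(toks3, tstat, case_insensitive = FALSE)
head(grep("_", types(toks_comp), value = TRUE), 10)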
# vectorized size
txt <- c(". . . . a b c . . a b c . . . c d e",
"a b . . a b . . a b . . a b . a b",
"b c d . . b c . b c . . . b c")
textstat_collocations(txt, size = 2:3)
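# the same vectorized call, ordered by frequency
# (a sketch: the result is a data.frame, so base-R ordering applies)
res <- textstat_collocations(txt, size = 2:3)
res[order(res$count, decreasing = TRUE), ]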
# compounding tokens from collocations
toks <- tokens("This is the European Union.")
colls <- tokens("The new European Union is not the old European Union.") %>%
textstat_collocations(size = 2, min_count = 1, tolower = FALSE)
colls
tokens_compound(toks, colls, case_insensitive = FALSE)
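# restricting compounding to collocations seen at least twice
# (a sketch: ordinary data.frame subsetting on the count column)
tokens_compound(toks, colls[colls$count >= 2, ], case_insensitive = FALSE)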
# from a collocations object
(coll <- textstat_collocations(tokens("a b c a b d e b d a b")))
phrase(coll)
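# phrase() converts the collocations into a list of multi-word patterns, which
# pattern-matching functions such as tokens_select() accept directly
# (a sketch of selecting the collocated sequences from a new tokens object)
tokens_select(tokens("a b c a b d e b d a b"), pattern = phrase(coll))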