sequences(x, features = "*", valuetype = c("glob", "regex", "fixed"),
case_insensitive = TRUE, min_count = 2, max_size = 5, nested = TRUE,
ordered = FALSE)

is.sequences(x)

"glob" for "glob"-style wildcard expressions; "regex" for regular expressions; or "fixed" for exact matching. See valuetype for details.
TRUE
If TRUE, collect all the subsequences of a longer sequence as separate entities. For example, in a sequence of capitalized words "United States Congress", "States Congress" is considered a subsequence, but "United States" is not a subsequence because it is followed by "Congress".

is.sequences
returns TRUE if the object is of class sequences, FALSE otherwise.

toks <- tokens(corpus_segment(data_corpus_inaugural, what = "sentence"))
toks <- tokens_select(toks, stopwords("english"), "remove", padding = TRUE)
# extracting multi-part proper nouns (capitalized terms)
seqs <- sequences(toks, "^([A-Z][a-z\\-]{2,})", valuetype="regex", case_insensitive = FALSE)
head(seqs, 10)
# more efficient when applied to the same tokens object
toks_comp <- tokens_compound(toks, seqs)
toks_comp_ir <- tokens_compound(tokens(data_corpus_irishbudget2010), seqs)
# types can be any words
seqs2 <- sequences(toks, "^([a-z]+)$", valuetype="regex", case_insensitive = FALSE,
min_count = 2, ordered = TRUE)
head(seqs2, 10)
# convert to tokens object
as.tokens(seqs2)
Run the code above in your browser using DataLab