This function selects or discards features from a variety of objects,
such as tokenized texts, a dfm, or a list of collocations. The most common
usage for removeFeatures
will be to eliminate stop words from a text
or text-based object, or to select only features from a list of regular
expressions.
selectFeatures(x, features, ...)
# S3 method for dfm
selectFeatures(x, features, selection = c("keep", "remove"),
valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
verbose = FALSE, ...)
# S3 method for tokenizedTexts
selectFeatures(x, features, selection = c("keep",
"remove"), valuetype = c("glob", "regex", "fixed"),
case_insensitive = TRUE, padding = FALSE, indexing = FALSE,
verbose = FALSE, ...)
# S3 method for tokens
selectFeatures(x, features, selection = c("keep", "remove"),
valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
padding = FALSE, ...)
# S3 method for collocations
selectFeatures(x, features, selection = c("keep",
"remove"), valuetype = c("glob", "regex", "fixed"),
case_insensitive = TRUE, verbose = TRUE, pos = 1:3, ...)
object whose features will be selected
supplementary arguments passed to the underlying functions in
stri_detect_regex
. (This is how
case_insensitive
is passed, but you may wish to pass others.)
whether to keep or remove the features
the type of pattern matching: "glob"
for
"glob"-style wildcard expressions; "regex"
for regular expressions;
or "fixed"
for exact matching. See valuetype for details.
ignore the case of dictionary values if TRUE
if TRUE
print message about how many features were
removed
(only for tokenizedTexts
objects) if TRUE
, leave
an empty string where the removed tokens previously existed. This is
useful if a positional match is needed between the pre- and post-selected
features, for instance if a window of adjacency needs to be computed.
use dfm-based index to efficiently process large tokenizedTexts object
indexes of the word positions to check when called on collocations: a
collocation is removed if the word at position
pos
is a stopword
A dfm after the feature selection has been applied.
When features
is a dfm object, then the returned object
will be identical in its feature set to the dfm supplied as the
features
argument. This means that any features in x
not in
features
will be discarded, and that any features found in the
dfm supplied as features
but not found in x
will be added
with all zero counts. This is useful when you have trained a model on one dfm, and
need to project this onto a test set whose features must be identical.
# NOT RUN {
# Load SOTUCorpus from the quantedaData package and tokenize it,
# dropping punctuation.
data(SOTUCorpus, package = "quantedaData")
toks <- tokenize(SOTUCorpus, remove_punct = TRUE)
# toks <- tokenize(tokenize(SOTUCorpus, what='sentence', simplify = TRUE), remove_punct = TRUE)

# head to head, old v. new
system.time(
  selectFeaturesOLD(toks, stopwords("english"), "remove", verbose = FALSE)
)
system.time(
  selectFeatures(toks, stopwords("english"), "remove", verbose = FALSE)
)
system.time(
  selectFeaturesOLD(toks, c("and", "of"), "remove", verbose = FALSE,
                    valuetype = "regex")
)
system.time(
  selectFeatures(toks, c("and", "of"), "remove", verbose = FALSE,
                 valuetype = "regex")
)

# In every benchmark below, "old" labels selectFeaturesOLD() and "new"
# labels selectFeatures(), so the reported timings are attributed to the
# right implementation.
microbenchmark::microbenchmark(
  old = selectFeaturesOLD(toks, stopwords("english"), "remove",
                          verbose = FALSE),
  new = selectFeatures(toks, stopwords("english"), "remove",
                       verbose = FALSE),
  times = 5, unit = "relative")

microbenchmark::microbenchmark(
  old = selectFeaturesOLD(toks, c("and", "of"), "remove", verbose = FALSE,
                          valuetype = "regex"),
  new = selectFeatures(toks, c("and", "of"), "remove", verbose = FALSE,
                       valuetype = "regex"),
  times = 2, unit = "relative")

# Collect every token type that contains at least one digit and use those
# types as the fixed patterns to remove.
types <- unique(unlist(toks))
numbers <- types[stringi::stri_detect_regex(types, '[0-9]')]
microbenchmark::microbenchmark(
  old = selectFeaturesOLD(toks, numbers, "remove", verbose = FALSE,
                          valuetype = "fixed"),
  new = selectFeatures(toks, numbers, "remove", verbose = FALSE,
                       valuetype = "fixed"),
  times = 2, unit = "relative")

# removing tokens before dfm, versus after
microbenchmark::microbenchmark(
  pre = dfm(selectFeaturesOLD(toks, stopwords("english"), "remove"),
            verbose = FALSE),
  post = dfm(toks, remove = stopwords("english"), verbose = FALSE),
  times = 5, unit = "relative")

## with simple examples
toks <- tokenize(c("This is a sentence.", "This is a second sentence."),
                 remove_punct = TRUE)
selectFeatures(toks, c("is", "a", "this"), selection = "remove",
               valuetype = "fixed", padding = TRUE, case_insensitive = TRUE)

# how case_insensitive works
selectFeatures(toks, c("is", "a", "this"), selection = "remove",
               valuetype = "fixed", padding = TRUE, case_insensitive = FALSE)
selectFeatures(toks, c("is", "a", "this"), selection = "remove",
               valuetype = "fixed", padding = TRUE, case_insensitive = TRUE)
selectFeatures(toks, c("is", "a", "this"), selection = "remove",
               valuetype = "glob", padding = TRUE, case_insensitive = TRUE)
selectFeatures(toks, c("is", "a", "this"), selection = "remove",
               valuetype = "glob", padding = TRUE, case_insensitive = FALSE)

# with longer texts
toks <- tokenize(data_corpus_inaugural[1:2])
selectFeatures(toks, stopwords("english"), "remove")
selectFeatures(toks, stopwords("english"), "keep")
selectFeatures(toks, stopwords("english"), "remove", padding = TRUE)
selectFeatures(toks, stopwords("english"), "keep", padding = TRUE)
selectFeatures(tokenize(data_corpus_inaugural[2]), stopwords("english"),
               "remove", padding = TRUE)
# }
# NOT RUN {
# Two short documents whose tokens differ in letter case ("This"/"this",
# "SAMPLE"/"sample"), so the effect of case_insensitive is visible.
sample_toks <- tokens(c(doc1 = "This is a SAMPLE text",
                        doc2 = "this sample text is better"))
targets <- c("this", "sample", "is")

# selection = "keep": with and without padding, then again with
# case-sensitive matching
selectFeatures(sample_toks, targets, selection = "keep")
selectFeatures(sample_toks, targets, selection = "keep", padding = TRUE)
selectFeatures(sample_toks, targets, selection = "keep",
               case_insensitive = FALSE)
selectFeatures(sample_toks, targets, selection = "keep", padding = TRUE,
               case_insensitive = FALSE)

# selection = "remove": the same four variations
selectFeatures(sample_toks, targets, selection = "remove")
selectFeatures(sample_toks, targets, selection = "remove", padding = TRUE)
selectFeatures(sample_toks, targets, selection = "remove",
               case_insensitive = FALSE)
selectFeatures(sample_toks, targets, selection = "remove", padding = TRUE,
               case_insensitive = FALSE)
# }
# NOT RUN {
# }
# NOT RUN {
## collocations example
# The outer parentheses print the collocations object as it is assigned.
(inaug_collocs <- collocations(data_corpus_inaugural[1:3], n = 20))
# Apply the English stopword list with selection "remove".
selectFeatures(inaug_collocs, stopwords("english"), "remove")
# }
Run the code above in your browser using DataLab